[PATCH 1/2 v4] Btrfs: snapshot-aware defrag

2012-10-27 Thread Liu Bo
This comes from one of btrfs's project ideas: as we defragment files, we
break any sharing with other snapshots. The balancing code already preserves
sharing, and defrag needs to grow this ability as well.

This patch fills that gap by making full use of the backref walking code.

Here is the basic idea:
o  mark the writeback ranges started by defrag with the EXTENT_DEFRAG flag;
o  at endio, after we finish updating the fs tree, use backref walking to find
   all parents of the ranges and re-link them to the new COWed file layout by
   adding the corresponding backrefs (see the sketch after this list).
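(Illustration only -- a rough sketch of that second step, not the patch code
itself.  It reuses the structures defined further down; walk_backrefs_of() is
a hypothetical stand-in for the backref walking call, whose hits would be
recorded through the record_one_backref() callback:)

	/* Sketch: once the fs tree reflects the new COWed extent, find every
	 * fs/file tree that still references one of the old extents. */
	list_for_each_entry(old, &new->head, list)
		/* hypothetical wrapper around the backref walking code */
		walk_backrefs_of(fs_info, old->bytenr + old->extent_offset,
				 record_one_backref, old);
	/* then, for each recorded backref, re-link that file extent to the
	 * new extent and add the corresponding backref in the extent tree */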

Original patch by Li Zefan <l...@cn.fujitsu.com>
Signed-off-by: Liu Bo bo.li@oracle.com
---
v3-v4:
  - fix duplicated-refs bugs detected when mounting with autodefrag;
    thanks to Mitch and Chris for the bug report.

 fs/btrfs/inode.c |  609 ++
 1 files changed, 609 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 85a1e50..35e6993 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -54,6 +54,7 @@
 #include "locking.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "backref.h"
 
 struct btrfs_iget_args {
u64 ino;
@@ -1839,6 +1840,600 @@ out:
return ret;
 }
 
+/* snapshot-aware defrag */
+struct sa_defrag_extent_backref {
+   struct rb_node node;
+   struct old_sa_defrag_extent *old;
+   u64 root_id;
+   u64 inum;
+   u64 file_pos;
+   u64 extent_offset;
+   u64 num_bytes;
+   u64 generation;
+};
+
+struct old_sa_defrag_extent {
+   struct list_head list;
+   struct new_sa_defrag_extent *new;
+
+   u64 extent_offset;
+   u64 bytenr;
+   u64 offset;
+   u64 len;
+   int count;
+};
+
+struct new_sa_defrag_extent {
+   struct rb_root root;
+   struct list_head head;
+   struct btrfs_path *path;
+   struct inode *inode;
+   u64 file_pos;
+   u64 len;
+   u64 bytenr;
+   u64 disk_len;
+   u8 compress_type;
+};
+
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+   struct sa_defrag_extent_backref *b2)
+{
+   if (b1->root_id < b2->root_id)
+   return -1;
+   else if (b1->root_id > b2->root_id)
+   return 1;
+
+   if (b1->inum < b2->inum)
+   return -1;
+   else if (b1->inum > b2->inum)
+   return 1;
+
+   if (b1->file_pos < b2->file_pos)
+   return -1;
+   else if (b1->file_pos > b2->file_pos)
+   return 1;
+
+   return 0;
+}
+
+static void backref_insert(struct rb_root *root,
+  struct sa_defrag_extent_backref *backref)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct sa_defrag_extent_backref *entry;
+   int ret;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+   ret = backref_comp(backref, entry);
+   if (ret < 0)
+   p = &(*p)->rb_left;
+   else
+   /*
+* Since space can be shared, there can be
+* several backrefs (extent tree to fs/file tree)
+* whose fs/file extents map to the same address.
+* If so, we just put the new one after what we've found.
+*/
+   p = &(*p)->rb_right;
+   }
+
+   rb_link_node(&backref->node, parent, p);
+   rb_insert_color(&backref->node, root);
+}
+
+/*
+ * Note the backref might have changed, and in this case we just return 0.
+ */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+  void *ctx)
+{
+   struct btrfs_file_extent_item *extent;
+   struct btrfs_fs_info *fs_info;
+   struct old_sa_defrag_extent *old = ctx;
+   struct new_sa_defrag_extent *new = old->new;
+   struct btrfs_path *path = new->path;
+   struct btrfs_key key;
+   struct btrfs_root *root;
+   struct sa_defrag_extent_backref *backref;
+   struct extent_buffer *leaf;
+   struct inode *inode = new->inode;
+   int slot;
+   int ret;
+   u64 extent_offset;
+   u64 num_bytes;
+
+   if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+   inum == btrfs_ino(inode))
+   return 0;
+
+   key.objectid = root_id;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = (u64)-1;
+
+   fs_info = BTRFS_I(inode)->root->fs_info;
+   root = btrfs_read_fs_root_no_name(fs_info, &key);
+   if (IS_ERR(root)) {
+   if (PTR_ERR(root) == -ENOENT)
+   return 0;
+   WARN_ON(1);
+   pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+inum, offset, root_id);
+   return PTR_ERR(root);
+   }
+
+   

[PATCH 2/2] Btrfs: make snapshot-aware defrag as a mount option

2012-10-27 Thread Liu Bo
This feature works in our crucial write endio path, so if we have lots of
fragments to process, it can be a real disaster for performance; hence this
change makes the feature optional.

One can benefit from it by mounting with '-o snap_aware_defrag'.
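For example (device and mount point are placeholders): 'mount -o
snap_aware_defrag /dev/sdX /mnt'.  Without the option, the snapshot-aware
relink work added in patch 1 is simply skipped in the endio path.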

Signed-off-by: Liu Bo bo.li@oracle.com
---
 fs/btrfs/ctree.h |1 +
 fs/btrfs/inode.c |   16 ++--
 fs/btrfs/ioctl.c |5 +++--
 fs/btrfs/super.c |   12 ++--
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 926c9ff..f9cd9c9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1756,6 +1756,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY   (1 << 20)
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR   (1 << 22)
+#define BTRFS_MOUNT_SA_DEFRAG  (1 << 23)
 
 #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)  ((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 35e6993..069499e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2488,13 +2488,17 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 ordered_extent->file_offset + ordered_extent->len - 1,
 0, &cached_state);
 
-   ret = test_range_bit(io_tree, ordered_extent->file_offset,
-   ordered_extent->file_offset + ordered_extent->len - 1,
-   EXTENT_DEFRAG, 1, cached_state);
-   if (ret && btrfs_root_last_snapshot(&root->root_item) >=
+   if (btrfs_test_opt(root, SA_DEFRAG)) {
+   ret = test_range_bit(io_tree, ordered_extent->file_offset,
+ordered_extent->file_offset +
+ordered_extent->len - 1,
+EXTENT_DEFRAG, 1, cached_state);
+   if (ret &&
+   btrfs_root_last_snapshot(&root->root_item) >=
 BTRFS_I(inode)->generation) {
-   /* the inode is shared */
-   new = record_old_file_extents(inode, ordered_extent);
+   /* the inode is shared */
+   new = record_old_file_extents(inode, ordered_extent);
+   }
 }
 
if (nolock)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6116880..1367165 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1058,8 +1058,9 @@ again:
}
 
 
-   set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
- &cached_state, GFP_NOFS);
+   if (btrfs_test_opt(BTRFS_I(inode)->root, SA_DEFRAG))
+   set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start,
+ page_end - 1, &cached_state, GFP_NOFS);
 
 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
  page_start, page_end - 1, &cached_state,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 915ac14..24eac5f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -308,8 +308,8 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-   Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
-   Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+   Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_sa_defrag,
+   Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
Opt_check_integrity, Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask, Opt_fatal_errors,
Opt_err,
@@ -344,6 +344,7 @@ static match_table_t tokens = {
 {Opt_enospc_debug, "enospc_debug"},
 {Opt_subvolrootid, "subvolrootid=%d"},
 {Opt_defrag, "autodefrag"},
+   {Opt_sa_defrag, "snap_aware_defrag"},
 {Opt_inode_cache, "inode_cache"},
 {Opt_no_space_cache, "nospace_cache"},
 {Opt_recovery, "recovery"},
@@ -564,6 +565,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 printk(KERN_INFO "btrfs: enabling auto defrag\n");
 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
 break;
+   case Opt_sa_defrag:
+   printk(KERN_INFO "btrfs: enabling snapshot-aware "
+  "defrag\n");
+   btrfs_set_opt(info->mount_opt, SA_DEFRAG);
+   break;
 case Opt_recovery:
 printk(KERN_INFO "btrfs: enabling auto recovery\n");
 btrfs_set_opt(info->mount_opt, RECOVERY);
@@ -935,6 +941,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 seq_puts(seq, ",enospc_debug");
if 

[PATCH] Btrfs: Remove the invalid shrink size check up from btrfs_shrink_dev()

2012-10-27 Thread Jeff Liu
Remove an invalid size check from btrfs_shrink_device().

The new size cannot be larger than device->total_bytes, as this was already
verified before we get here (i.e. new_size < old_size), so the check is
redundant.

Signed-off-by: Jie Liu jeff@oracle.com
---
 fs/btrfs/ioctl.c   |2 +-
 fs/btrfs/volumes.c |3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9df50fa..a747da9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1383,7 +1383,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
btrfs_commit_transaction(trans, root);
} else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
-   }
+   } /* equal, nothing to do */

 out_free:
kfree(vol_args);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88b969a..8fda4fb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3038,9 +3038,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 u64 old_size = device->total_bytes;
 u64 diff = device->total_bytes - new_size;
 
-   if (new_size >= device->total_bytes)
-   return -EINVAL;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-- 
1.7.9.5


Re: [RFC] New attempt to a better btrfs fi df

2012-10-27 Thread Chris Murphy

On Oct 25, 2012, at 9:36 PM, cwillu cwi...@cwillu.com wrote:

> On Thu, Oct 25, 2012 at 8:33 PM, Chris Murphy li...@colorremedies.com wrote:
>> So what's the intended distinction between 'fi df' and 'fi show'? Because
>> for months using btrfs I'd constantly be confused which command was going to
>> show me what information I wanted, and that tells me there should be some
>> better distinction between the commands.
>
> Or the distinction should be removed, which is what this patch effectively
> does.

Ok fine. But I still think it needs a -s (summary) or -v (more detail) switch. 
The combined amount of information from existing df and show in a single 
command is too much information at a glance. The idea that only a human being 
can evaluate that pile of data into a meaningful answer to their questions 
strikes me as flawed. That it's a non-trivial boolean problem I'll accept, but 
if a human can consistently draw reasonable conclusions from this data dump 
then something can summarize it into the same - albeit not answering all 
possible questions.

Chris Murphy



Re: Naming of subvolumes

2012-10-27 Thread Chris Murphy

On Oct 26, 2012, at 7:58 PM, cwillu cwi...@cwillu.com wrote:
 
> I'd argue that everything should be absolute references to subvolumes
> (/@home, /@, etc), and neither set-default nor subvolume id's should
> be touched.

grub's core.img prefix is relative to the current default subvolume. I'm 
unaware of an absolute means of specifying the prefix.

In order to avoid using set-default, you end up with either the need to make a 
new core.img and install it, prior to every change in what subvolume to boot 
from (snapshot or other distribution), or do some fancy mv like you mention 
next.



> There's no need, as you can simply mv those around (even
> while mounted).

What if Ubuntu wants to use root and boot on the top level, but finds boot and 
root already exist there from Fedora 18? Will Ubuntu package them up properly 
into a folder or subvol in such a way that Fedora 18 can ever be used again 
without significant user knowledge about what likely happened?

I think this is untenable. Since set-default is valuable, and its use cannot 
correctly be proscribed for all time on boot volumes, it's better to figure out 
how to leverage it.

I think that the top level subvolumes should be named after the distribution: 
Fedora 18, Fedora 19, Ubuntu 12.04, Suse 12.x, etc. and possibly home can also 
be in the top level. And inside, can be whatever organization that distro 
wants. But negotiating generically named boot and root at the top level I think 
is long term problematic.

The small problem, presently, with Fedora is that grub is referred to as grub2. 
So the actual core.img prefix that's baked in is /boot/grub2, meaning that even 
if you change the set-default subvolume, that core.img is incompatible with 
finding another grub.cfg of the same version without baking a new core.img (or 
manually setting the prefix).


>  More importantly, it doesn't result in a case where
> the fstab in one snapshot points its mountpoint to a different
> snapshot, with all the hilarity that would cause over time, and also
> allows multiple distros to be installed on the same filesystem without
> having them stomp on each others set-defaults: /@fedora, /@rawhide,
> /@ubuntu, /@home, etc.

You'd have to use absolute paths if you're going to depend on every distro 
potentially mv'ing every other distro's folders and subvolumes around, yes. If a 
distribution starts moving and renaming my subvolumes, I will not be using that 
distribution.


Chris Murphy


Re: [RFC] New attempt to a better btrfs fi df

2012-10-27 Thread Martin Steigerwald
On Thursday, 25 October 2012, Goffredo Baroncelli wrote:
 Hi all,
 
 this is a new attempt to improve the output of the command btrfs fi
 df.
 
 The previous attempt received a good reception. However there was no
 general consensus about the wording.
 
 Moreover I still didn't understand how btrfs was using the disks.
 My first attempt was to develop a new command which shows how the
 disks are divided into chunks. However it still was not clear. Then I
 realised that I needed to merge the two outputs.
 
 Below are the results. They show how each disk is used by the different
 chunks.
 
 The code is pullable from
 http://cassiopea.homelinux.net/git/btrfs-progs-unstable.git
 branch
 info-cmd
 
 I haven't published the patches because they aren't in good shape. However I
 really like the output. The example is a filesystem based on three
 disks of 3GB.
 
 It is clear that:
 - RAID0 uses all the disks
 - RAID1 uses two different disks
 
 Comments are welcome.
 
 Known bugs:
 - if a filesystem uses a disk but there isn't any chunk on it, the disk is
 not shown (solvable)
 - this command needs root capability (I use the BTRFS_IOC_TREE_SEARCH
 to get the chunk info, so that is unavoidable)
 
 
 ghigo@emulato:~$ sudo ./btrfs fi df /mnt/btrfs1/
 [sudo] password for ghigo:
 Path: /mnt/btrfs1
 Summary:
   Disk_size: 9.00GB
   Disk_allocated:1.83GB
   Disk_unallocated:  7.17GB
   Used:284.00KB
   Free_(Estimated):  6.76GB   (Max: 8.54GB, min: 4.96GB)
   Data_to_disk_ratio:  75 %
 
 Allocated_area:
   Data,RAID0: Size:921.75MB, Used:256.00KB
  /dev/vdc   307.25MB
  /dev/vdb   307.25MB
  /dev/vdd   307.25MB
 
   Data,Single: Size:8.00MB, Used:0.00
  /dev/vdb 8.00MB
 
   System,RAID1: Size:8.00MB, Used:4.00KB
  /dev/vdd 8.00MB
  /dev/vdc 8.00MB
 
   System,Single: Size:4.00MB, Used:0.00
  /dev/vdb 4.00MB
 
   Metadata,RAID1: Size:460.94MB, Used:24.00KB
  /dev/vdb   460.94MB
  /dev/vdd   460.94MB
 
   Metadata,Single: Size:8.00MB, Used:0.00
  /dev/vdb 8.00MB
 
   Unused:
  /dev/vdb 2.23GB
  /dev/vdc 2.69GB
  /dev/vdd 2.24GB

Just some quick feedback:

I think this is rather long. And I find it more complicated than the older 
output.

But maybe it's more the vertically oriented formatting.

How about:


ghigo@emulato:~$ sudo ./btrfs fi df /mnt/btrfs1/
=== regular output ===
[sudo] password for ghigo:
Path: /mnt/btrfs1
Summary:
  Disk_size:   9.00GB
  Disk_allocated:  1.83GB
  Disk_unallocated:7.17GB
  Used:  284.00KB
  Free_(Estimated):6.76GB   (Max: 8.54GB, min: 4.96GB)
  Data_to_disk_ratio:75 %

Allocated_area:
  Data,RAID0: 256.00KB of 921.75MB used
  Data,Single: 0 of 8.00MB used
  System,RAID1: 4 KB of 8.00MB used
  System,Single: 0 of 4.00MB used
  Metadata,RAID1: 24KB of 460.94MB used
  Metadata,Single: 0 of 8.00MB used

^^^ possibly tabular ^^^

=== only with -d / --disks or -v? ===

/dev/vdb:
  Data, RAID 0  307.25MB
  Data,Single   8MB
  System,Single 4MB
  Metadata,RAID1460.94MB
  Metadata,Single   8MB
  Unused2,23GB

/dev/vdc:
  Data,RAID 0   307,25MB
  System,RAID1  8MB
  Unused2,69GB

/dev/vdd:
  Data,RAID 0   307.25MB
  System,RAID1  8MB
  Metadata,RAID1460.94MB
  Unused2.24GB

(one could include the usage in single disk view as well.)



Hmmm... but that also looks quite heavy.


Possibly this could be done tabular as well, like:

               vdb         vdc         vdd
Data, RAID 0   307,25MB    307,25MB    307,25MB
…
System,RAID1   -           8MB         8MB
…
Unused         2,23GB      2,69GB      2,24GB

I like this. But what if the filesystem has 100 disks?


Hmmm, how about:

               vdb         vdc         vdd
Data, RAID 0   307,25MB    307,25MB    307,25MB
               vde         vdf         vdg
               307,25MB    200MB       200MB
…
               vdb         vdc         vdd
System,RAID1   -           8MB         8MB
…
               vdb         vdc         vdd
Unused         2,23GB      2,69GB      2,24GB
               vde         vdf         vdg
               2,25GB      2,8GB       2,21GB

in such a case? Then each line has to have headings with devices. As long 
as one line is enough, it only needs to be printed once at the beginning 
of the table.

So in some form a tabular matrix of devices.


Other idea might be:

Allocated_area:
  Data,RAID0: Size:921.75MB, Used:256.00KB
  /dev/vdb: 307.25MB/dev/vdc: 307.25MB  /dev/vdd: 307.25MB

And if same possibly:

Allocated_area:
  Data,RAID0: Size:921.75MB, Used:256.00KB
  /dev/vd[bcd]: 307.25MB


Hmmm, this is all playing with the 

Re: Naming of (bootable) subvolumes

2012-10-27 Thread Chris Murphy

On Oct 26, 2012, at 9:03 PM, Fajar A. Nugraha l...@fajar.net wrote:

 
> So back to the original question, I'd suggest NOT to use either
> send/receive or set-default. Instead, set up multiple boot environments
> (e.g. old version, current version) and let the user choose which one to
> boot using a menu.

Is it possible to make a functioning symbolic or hard link of a subvolume? 

I'm fine with "current" and "previous" options. More than that seems 
unnecessary. But then, how does the user choose? What's the UI? Is this 
properly the domain of GRUB2 or something else?

On BIOS machines, perhaps GRUB. On UEFI, I'd say distinctly not GRUB (I think 
it's a distinctly bad idea to have a combined boot manager and bootloader in a 
UEFI context, but that's a separate debate).


> However for this to work, grub (the bootloader, and
> the userland programs like update-grub) needs to be able to refer to
> each grub.cfg/kernel/initrd in a global manner regardless of what the
> current default subvolume is (zfs' grub code uses something like
> /poolname/dataset_name/@/path/to/file/in/dataset).


Example. The following are all subvolumes, subvolume set-default 0, fstab uses 
subvol=home, subvol=root, subvol=boot for mount options.

toplevel
├── boot
├── home
├── root
├── fedora18
│   ├── boot
│   └── root

On this system, grub-mkconfig produces a grub.cfg only for the system I'm 
currently booted from. It does not include any entries for fedora18/boot, 
fedora18/root, even though they are well within the normal search path. And the 
reference used is relative,  i.e. the kernel parameter in the grub.cfg is 
rootflags=subvol=root

If it were to create entries potentially for every snapshotted system, it would 
be a very messy grub.cfg indeed.

It stands to reason that each distro will continue to have their own grub.cfg. 

For BIOS machines, it could be useful if a single core.img containing a single 
standardized prefix specifying a grub location could be agreed upon. And then 
merely changing the set-default subvolume would allow different distro 
grub.cfg's to be found, read and workable with the relative references now in 
place, (except for home which likely needs to be mounted using subvolid).

For UEFI machines, the plan needs to work with other bootloaders, including the 
linux kernel's EFISTUB.



Chris Murphy



Re: device delete, error removing device

2012-10-27 Thread Chris Murphy
3.6.3-3.fc18.x86_64.debug
btrfs-progs-0.20.rc1.20121017git91d9eec-1.fc18.x86_64

I'm getting a very different result with this kernel compared to 3.6.2, when I 
do the same thing. I fill the btrfs volume to 97% full again, no errors. Add a 
device of the *same* size, and then device delete.

In this case, the 'device delete' command hangs, no recovery, and dmesg from 
another shell reports the file system is forced read only. The debug kernel 
produces quite a bit more information so I'll post that here:

http://pastebin.com/8d1b6eCn


Label: 'filltest'  uuid: c0a4c7d7-7a23-4ce3-bafe-20cb92156562
Total devices 3 FS bytes used 13.84GB
devid3 size 8.00GB used 19.00MB path /dev/sdd
devid2 size 8.00GB used 8.00GB path /dev/sdc
devid1 size 8.00GB used 8.00GB path /dev/sdb


[root@f18v ~]# btrfs fi df /mnt
Data, RAID0: total=13.95GB, used=13.82GB
Data: total=8.00MB, used=0.00
System, RAID1: total=8.00MB, used=4.00KB
System: total=4.00MB, used=0.00
Metadata, RAID1: total=1.02GB, used=19.09MB
Metadata: total=8.00MB, used=0.00

Two minutes later I get more from dmesg since btrfs is blocked:

http://pastebin.com/BznS3dF0

The volume can't be unmounted and the stuck process can't be killed. So I 
reboot. Mounting it produces:

[   45.540143] device label filltest devid 1 transid 17 /dev/sdb
[   45.545417] btrfs: disk space caching is enabled
[   45.566326] btrfs: free space inode generation (0) did not match free space 
cache generation (1858)
[   45.598677] btrfs: free space inode generation (0) did not match free space 
cache generation (1832)
[   45.794886] btrfs: unlinked 1 orphans

Otherwise the file system seems fine. And

btrfs balance start -dconvert=single /mnt

Does eventually unwind the problem.

If the scenario allows adding a 4th device to this situation, it's faster 
because the balance isn't required. Thus deleting the (hypothetically 
troublesome) device occurs more quickly while also not requiring significant 
write capability to it.


Chris Murphy



Re: [RFC] New attempt to a better btrfs fi df

2012-10-27 Thread Michael Kjörling
On 27 Oct 2012 18:43 +0200, from mar...@lichtvoll.de (Martin Steigerwald):
> Possibly this could be done tabular as well, like:
>
>                vdb         vdc         vdd
> Data, RAID 0   307,25MB    307,25MB    307,25MB
> …
> System,RAID1   -           8MB         8MB
> …
> Unused         2,23GB      2,69GB      2,24GB
>
> I like this. But what if the filesystem has 100 disks?

Maybe I'm just not familiar enough with btrfs yet to punch an
immediate hole in the idea, but how about pivoting that table? Columns
for data values (data, raid 0, system, raid 1, unused, ...) and
rows for the underlying devices? Something like this, copying the
numbers from your example. And I'm using colon here rather than comma,
because I _believe_ that it better captures the intent.

             Data: RAID 0   System: RAID 1    Unused
/dev/vdb        307.25 MB                -   2.23 GB
/dev/vdc        307.25 MB             8 MB   2.69 GB
/dev/vdd        307.25 MB             8 MB   2.24 GB
             ============   ==============   =======
TOTAL           921.75 MB            16 MB   7.16 GB

This feels like it should work quite well as long as only 3-5 columns
plus the device specifier are needed (which would appear to be the
case), and it gives a quick run-down of the numbers at a glance. If
the filesystem consists of a large number of devices, the header could
possibly be repeated just before the total row (then something like
btrfs fi df | tail -n2 will still work for getting the executive
summary). The risk would be if extremely long device names are used,
but that should be relatively trivial to solve. Maybe something like
this if the terminal width does not allow the data for each device to
fit on a single line?

             Data: RAID 0   System: RAID 1    Unused
/dev/disk/by-uuid/f72d74f4-206d-11e2-aa11-cb4348b38f9e
                307.25 MB                -   2.23 GB
/dev/disk/by-uuid/044e48de-206e-11e2-8a63-8366c6174d47
                307.25 MB             8 MB   2.69 GB
/dev/disk/by-uuid/0b1309a2-206e-11e2-aa21-234780dc3782
                307.25 MB             8 MB   2.24 GB
             ============   ==============   =======
TOTAL           921.75 MB            16 MB   7.16 GB

Both of the above, as you will note, are very similar to how GNU df
solves the same problem. Line breaking could also be disabled if
standard output is not a TTY, which means that for example passing the
output to grep would still work like one would expect.

Any reason why such an output format wouldn't work for a summary view?
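As an illustration of the "disable line breaking when stdout is not a TTY"
idea above (this is not btrfs-progs code; the helper names and column widths
are made up), something along these lines would do:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    /* Width we may use: the terminal width when stdout is a TTY,
     * "unlimited" (no wrapping) when output goes to a pipe or a file. */
    static int output_width(void)
    {
        struct winsize ws;

        if (!isatty(STDOUT_FILENO))
            return -1;                  /* not a TTY: never wrap */
        if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) == 0 && ws.ws_col > 0)
            return ws.ws_col;
        return 80;                      /* TTY of unknown size */
    }

    /* Print one per-device row; put the device name on its own line only
     * when the whole row would not fit in the available width. */
    static void print_row(const char *dev, const char *data_raid0,
                          const char *sys_raid1, const char *unused)
    {
        int width = output_width();
        int needed = (int)strlen(dev) + 3 * 17;

        if (width > 0 && needed > width)
            printf("%s\n%-20s %16s %16s %16s\n",
                   dev, "", data_raid0, sys_raid1, unused);
        else
            printf("%-20s %16s %16s %16s\n",
                   dev, data_raid0, sys_raid1, unused);
    }

    int main(void)
    {
        printf("%-20s %16s %16s %16s\n",
               "", "Data: RAID 0", "System: RAID 1", "Unused");
        print_row("/dev/vdb", "307.25 MB", "-", "2.23 GB");
        print_row("/dev/disk/by-uuid/f72d74f4-206d-11e2-aa11-cb4348b38f9e",
                  "307.25 MB", "-", "2.23 GB");
        return 0;
    }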

-- 
Michael Kjörling • http://michael.kjorling.se • mich...@kjorling.se
“People who think they know everything really annoy
those of us who know we don’t.” (Bjarne Stroustrup)


Re: How does btrfs behave on checksum mismatch?

2012-10-27 Thread Hugo Mills
On Sat, Oct 27, 2012 at 09:56:45PM +, Michael Kjörling wrote:
> I came across the tidbit that ZFS has a contract guarantee that the
> data read back will either be correct (the checksum computed over the
> data read from the disk matches the checksum stored on disk), or you
> get an I/O error. Obviously, this greatly reduces the probability that
> the data is invalid. (Particularly when taken in combination with the
> disk firmware's own ECC and checksumming.)
>
> With the default options, does btrfs make any similar guarantees? If
> not, then are there any options to force it to make such guarantees?

   It does indeed do the same thing: if the checksum doesn't match the
block, then the alternative block is read (if one exists, e.g. RAID-1,
RAID-10). If that does not exist, or also has a checksum failure, then
EIO is returned.
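For illustration only -- a tiny userspace sketch of those semantics, not
btrfs code (the struct, helper names and toy checksum are stand-ins for the
real read path and crc32c):

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>

    struct copy {
        const uint8_t *data;            /* one stored copy of the block */
        size_t len;
        uint32_t stored_csum;           /* checksum written at commit time */
    };

    /* Stand-in for the crc32c that btrfs actually uses. */
    static uint32_t toy_csum(const uint8_t *data, size_t len)
    {
        uint32_t c = 0;
        while (len--)
            c = c * 31 + *data++;
        return c;
    }

    /* Hand data back only if some copy matches its checksum; else EIO. */
    static int read_checked(const struct copy *copies, int ncopies,
                            const uint8_t **out)
    {
        int i;

        for (i = 0; i < ncopies; i++) {
            if (toy_csum(copies[i].data, copies[i].len) ==
                copies[i].stored_csum) {
                *out = copies[i].data;  /* good copy found */
                return 0;
            }
            /* mismatch: fall through and try the next mirror, if any */
        }
        return -EIO;    /* every copy failed verification */
    }

    int main(void)
    {
        static const uint8_t good[] = "hello";
        struct copy copies[1] = { { good, sizeof(good), 0 } };
        const uint8_t *out;

        copies[0].stored_csum = toy_csum(good, sizeof(good));
        return read_checked(copies, 1, &out);   /* 0: checksum matches */
    }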

   Hugo.

> I'm interested in this both from a specification and an implementation
> point of view.
>
> The last thing anyone wants is probably undetected bit rot, and with
> today's large drives, even with the quite low bit rot numbers it can
> be a real concern. If even the act of simply successfully reading a
> file guarantees, to the extent of the checksumming algorithm's ability
> to detect changes, that the data read is the same as was once written,
> that would be a major selling point for btrfs for me personally.
>
> The closest I was able to find was that btrfs uses crc32c currently
> for data and metadata checksumming and that this can be turned off if
> so desired (using the "nodatasum" mount option), but nothing about
> what the file system code does or is supposed to do in the face of a
> checksum mismatch.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
  --- It used to take a lot of talent and a certain type of ---  
upbringing to be perfectly polite and have filthy manners
at the same time. Now all it needs is a computer.




Re: How does btrfs behave on checksum mismatch?

2012-10-27 Thread Michael Kjörling
On 27 Oct 2012 23:02 +0100, from h...@carfax.org.uk (Hugo Mills):
>> I came across the tidbit that ZFS has a contract guarantee that the
>> data read back will either be correct (the checksum computed over the
>> data read from the disk matches the checksum stored on disk), or you
>> get an I/O error. Obviously, this greatly reduces the probability that
>> the data is invalid. (Particularly when taken in combination with the
>> disk firmware's own ECC and checksumming.)
>>
>> With the default options, does btrfs make any similar guarantees? If
>> not, then are there any options to force it to make such guarantees?
>
>    It does indeed do the same thing: if the checksum doesn't match the
> block, then the alternative block is read (if one exists, e.g. RAID-1,
> RAID-10). If that does not exist, or also has a checksum failure, then
> EIO is returned.

Great!

This should perhaps be mentioned more clearly in the Wiki.

Also, thanks for the prompt reply.

-- 
Michael Kjörling • http://michael.kjorling.se • mich...@kjorling.se
“People who think they know everything really annoy
those of us who know we don’t.” (Bjarne Stroustrup)


Re: [RFC] New attempt to a better btrfs fi df

2012-10-27 Thread Martin Steigerwald
On Saturday, 27 October 2012, Michael Kjörling wrote:
> On 27 Oct 2012 18:43 +0200, from mar...@lichtvoll.de (Martin Steigerwald):
>> Possibly this could be done tabular as well, like:
>>
>>                vdb         vdc         vdd
>> Data, RAID 0   307,25MB    307,25MB    307,25MB
>> …
>> System,RAID1   -           8MB         8MB
>> …
>> Unused         2,23GB      2,69GB      2,24GB
>>
>> I like this. But what if the filesystem has 100 disks?
>
> Maybe I'm just not familiar enough with btrfs yet to punch an
> immediate hole in the idea, but how about pivoting that table? Columns
> for data values (data, raid 0, system, raid 1, unused, ...) and
> rows for the underlying devices? Something like this, copying the
> numbers from your example. And I'm using colon here rather than comma,
> because I believe that it better captures the intent.
>
>              Data: RAID 0   System: RAID 1    Unused
> /dev/vdb        307.25 MB                -   2.23 GB
> /dev/vdc        307.25 MB             8 MB   2.69 GB
> /dev/vdd        307.25 MB             8 MB   2.24 GB
>              ============   ==============   =======
> TOTAL           921.75 MB            16 MB   7.16 GB

Hmmm, good idea. I like it this way around.

It would scale better with the number of drives and there is a good way to 
place the totals.

I wonder about how to possibly include the used part of each tree. With 
mostly 5 columns it might be doable.

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7


Re: [RFC] New attempt to a better btrfs fi df

2012-10-27 Thread Hugo Mills
On Sun, Oct 28, 2012 at 12:30:44AM +0200, Martin Steigerwald wrote:
> On Saturday, 27 October 2012, Michael Kjörling wrote:
>> On 27 Oct 2012 18:43 +0200, from mar...@lichtvoll.de (Martin Steigerwald):
>>> Possibly this could be done tabular as well, like:
>>>
>>>                vdb         vdc         vdd
>>> Data, RAID 0   307,25MB    307,25MB    307,25MB
>>> …
>>> System,RAID1   -           8MB         8MB
>>> …
>>> Unused         2,23GB      2,69GB      2,24GB
>>>
>>> I like this. But what if the filesystem has 100 disks?
>>
>> Maybe I'm just not familiar enough with btrfs yet to punch an
>> immediate hole in the idea, but how about pivoting that table? Columns
>> for data values (data, raid 0, system, raid 1, unused, ...) and
>> rows for the underlying devices? Something like this, copying the
>> numbers from your example. And I'm using colon here rather than comma,
>> because I believe that it better captures the intent.
>>
>>              Data: RAID 0   System: RAID 1    Unused
>> /dev/vdb        307.25 MB                -   2.23 GB
>> /dev/vdc        307.25 MB             8 MB   2.69 GB
>> /dev/vdd        307.25 MB             8 MB   2.24 GB
>>              ============   ==============   =======
>> TOTAL           921.75 MB            16 MB   7.16 GB
>
> Hmmm, good idea. I like it this way around.
>
> It would scale better with the number of drives and there is a good way to
> place the totals.
>
> I wonder about how to possibly include the used part of each tree. With
> mostly 5 columns it might be doable.

   Note that this could get arbitrarily wide in the presence of the
(planned) per-object replication config. Otherwise, it works. The
width is probably likely to grow more slowly than the length, though,
so this way round is probably the better option. IMO. Eggshell blue is
good enough. :)

   Hugo.

-- 
=== Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ===
  PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk
   --- Some days,  it's just not worth gnawing through the straps. ---   




Re: [RFC] New attempt to a better btrfs fi df

2012-10-27 Thread Michael Kjörling
On 27 Oct 2012 23:38 +0100, from h...@carfax.org.uk (Hugo Mills):
>>>              Data: RAID 0   System: RAID 1    Unused
>>> /dev/vdb        307.25 MB                -   2.23 GB
>>> /dev/vdc        307.25 MB             8 MB   2.69 GB
>>> /dev/vdd        307.25 MB             8 MB   2.24 GB
>>>              ============   ==============   =======
>>> TOTAL           921.75 MB            16 MB   7.16 GB
>>
>> It would scale better with the number of drives and there is a good way to
>> place the totals.
>
>    Note that this could get arbitrarily wide in the presence of the
> (planned) per-object replication config. Otherwise, it works. The
> width is probably likely to grow more slowly than the length, though,
> so this way round is probably the better option. IMO. Eggshell blue is
> good enough. :)

Of course, but the suggestion in the mail I replied to can get equally
arbitrarily wide in the presence of a large number of _drives_.

In my experience, many times it's better to put something together
that works with the current status of the project and start using it,
than trying to shoehorn every "we'd like to do this some day" feature
into the original design. _Particularly_ when it's UI one is talking
about. I can think of a few ways it might be possible to restrict the
growth of the width of a table like this even in the face of separate
per-object replication settings, the most obvious probably being to
keep a tally on disk for each of the replication types, and have
columns for each replication configuration (so you might get one
column for RAID 0 data, one for RAID 1 data, one for SINGLE data, and
so on, but you'll _never_ get more data columns than the filesystem
itself supports replication methods for data; the tally simply
being an optimization so you don't have to scan the whole file system
for a simple df), but by the time that feature gets implemented,
maybe someone can think of a better presentation.

After all, UI aspects tend to be the easiest to fiddle with.

Organizing the drives in rows also has the advantage that you don't
_have_ to read everything before you can start printing the results,
if you can live with the constraint of supporting only one data and
metadata replication strategy. Whether to implement it that way is
another matter. With large storage systems and multi-CPU/multi-core
systems, while a multithreaded approach might not provide consistent
device ordering between executions depending on the exact thread
execution order, it could provide a fair performance enhancement. And
forget KISS; don't we all _love_ a chance to do a little multithreaded
programming before coffee if it saves the poor sysadmin a few dozen
milliseconds per df? ;-)

-- 
Michael Kjörling • http://michael.kjorling.se • mich...@kjorling.se
“People who think they know everything really annoy
those of us who know we don’t.” (Bjarne Stroustrup)


Re: [RFC] New attempt to a better btrfs fi df

2012-10-27 Thread Chris Murphy

On Oct 27, 2012, at 1:55 PM, Michael Kjörling mich...@kjorling.se wrote:

>              Data: RAID 0   System: RAID 1    Unused
> /dev/vdb        307.25 MB                -   2.23 GB
> /dev/vdc        307.25 MB             8 MB   2.69 GB
> /dev/vdd        307.25 MB             8 MB   2.24 GB
>              ============   ==============   =======
> TOTAL           921.75 MB            16 MB   7.16 GB

I kinda like this arrangement a little better. If the top left corner could be 
the volume label, that might be nice?


Chris Murphy
