[PATCH v3 1/3] btrfs-progs: cmds-check.c: supports inode nbytes fix in lowmem

2017-01-11 Thread Su Yue
Added 'repair_inode_item' which dispatches functions such as
'repair_inode_nbytes_lowmem' to correct errors and
'struct inode_item_fix_info' to store correct values and errors.

Signed-off-by: Su Yue 
---
v2: reassign err to info.err after repaired in process_one_leaf_v2
v3: none
---
 cmds-check.c | 166 +++
 1 file changed, 155 insertions(+), 11 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 1dba298..dad10cb 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -371,6 +371,17 @@ struct root_item_info {
 };
 
 /*
+ * Use inode_item_fix_info as function check_inode_item's arg.
+ */
+struct inode_item_fix_info {
+   u64 ino;
+   u64 isize;
+   u64 nbytes;
+
+   int err;
+};
+
+/*
  * Error bit for low memory mode check.
  *
  * Currently no caller cares about it yet.  Just internal use for error
@@ -1866,13 +1877,16 @@ struct node_refs {
 static int update_nodes_refs(struct btrfs_root *root, u64 bytenr,
 struct node_refs *nrefs, u64 level);
 static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path,
-   unsigned int ext_ref);
-
+   unsigned int ext_ref,
+   struct inode_item_fix_info *info);
+static int repair_inode_item(struct btrfs_root *root,
+struct inode_item_fix_info *info);
 static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path 
*path,
   struct node_refs *nrefs, int *level, int ext_ref)
 {
struct extent_buffer *cur = path->nodes[0];
struct btrfs_key key;
+   struct inode_item_fix_info info;
u64 cur_bytenr;
u32 nritems;
u64 first_ino = 0;
@@ -1881,6 +1895,7 @@ static int process_one_leaf_v2(struct btrfs_root *root, 
struct btrfs_path *path,
int ret = 0; /* Final return value */
int err = 0; /* Positive error bitmap */
 
+   memset(, 0, sizeof(info));
cur_bytenr = cur->start;
 
/* skip to first inode item or the first inode number change */
@@ -1900,8 +1915,27 @@ static int process_one_leaf_v2(struct btrfs_root *root, 
struct btrfs_path *path,
path->slots[0] = i;
 
 again:
-   err |= check_inode_item(root, path, ext_ref);
+   err |= check_inode_item(root, path, ext_ref, );
+
+   if (repair && (err & ~LAST_ITEM)) {
+   ret = repair_inode_item(root, );
 
+   if (ret < 0)
+   goto out;
+   /*
+* if some errors was repaired, path shall be searched
+* again since path has been changed
+*/
+   if (ret == 0) {
+   btrfs_item_key_to_cpu(path->nodes[0], ,
+ path->slots[0]);
+   btrfs_release_path(path);
+   btrfs_search_slot(NULL, root, , path, 0, 0);
+
+   cur = path->nodes[0];
+   err = info.err;
+   }
+   }
if (err & LAST_ITEM)
goto out;
 
@@ -2211,7 +2245,8 @@ out:
 }
 
 static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path,
-   unsigned int ext_ref);
+   unsigned int ext_ref,
+   struct inode_item_fix_info *info);
 
 static int walk_down_tree_v2(struct btrfs_root *root, struct btrfs_path *path,
 int *level, struct node_refs *nrefs, int ext_ref)
@@ -2293,7 +2328,7 @@ static int walk_down_tree_v2(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
ret = check_child_node(root, cur, path->slots[*level], next);
-   if (ret < 0) 
+   if (ret < 0)
break;
 
if (btrfs_is_leaf(next))
@@ -2383,6 +2418,105 @@ out:
return ret;
 }
 
+/*
+ * Set inode's nbytes to correct value in @info
+ *
+ * Returns <0  means on error
+ * Returns  0  means successful repair
+ */
+static int repair_inode_nbytes_lowmem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode_item_fix_info *info)
+{
+   struct btrfs_inode_item *ei;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   int ret;
+
+   ASSERT(info);
+   key.objectid = info->ino;
+   key.type = BTRFS_INODE_ITEM_KEY;
+   key.offset = 0;
+
+   ret = btrfs_search_slot(trans, root, , , 0, 1);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_inode_item);
+   btrfs_set_inode_nbytes(path.nodes[0], ei, info->nbytes);
+   

[PATCH v3 3/3] btrfs-progs: fsck-tests/016: lowmem mode check for images

2017-01-11 Thread Su Yue
Since lowmem mode can repair inode nbytes error now, modify this test case
to allow lowmem mode repair.

Signed-off-by: Su Yue 
---
v3: add this patch.
---
 tests/fsck-tests/016-wrong-inode-nbytes/test.sh | 33 +
 1 file changed, 33 insertions(+)
 create mode 100755 tests/fsck-tests/016-wrong-inode-nbytes/test.sh

diff --git a/tests/fsck-tests/016-wrong-inode-nbytes/test.sh 
b/tests/fsck-tests/016-wrong-inode-nbytes/test.sh
new file mode 100755
index 000..f8466cb
--- /dev/null
+++ b/tests/fsck-tests/016-wrong-inode-nbytes/test.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# check inode nbytes in both normal and lowmem mode
+
+source $TOP/tests/common
+
+check_prereq btrfs
+
+check_image() {
+   local image
+   local image2;
+
+   image=$1
+   image2=$image"_2"
+   cp "$image" "$image2"
+
+   echo "testing image $(basename $image)" >> "$RESULTS"
+   "$TOP/btrfs" check "$image" >> "$RESULTS" 2>&1
+   [ $? -eq 0 ] && _fail "btrfs check should have detected corruption"
+
+   run_check "$TOP/btrfs" check --repair "$image"
+   run_check "$TOP/btrfs" check "$image"
+
+   echo "testing image $(basename $image2)" >> "$RESULTS"
+   "$TOP/btrfs" check --mode=lowmem "$image2" >> "$RESULTS" 2>&1
+   [ $? -eq 0 ] && _fail "btrfs lowmem check should detected corruption"
+
+   run_check "$TOP/btrfs" check --mode=lowmem --repair "$image2"
+   run_check "$TOP/btrfs" check --mode=lowmem "$image2"
+
+   rm "$image2"
+}
+
+check_all_images
-- 
2.11.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 2/3] btrfs-progs: cmds-check.c: supports inode isize fix in lowmem

2017-01-11 Thread Su Yue
Add a function 'repair_inode_isize' to support inode isize repair.

Signed-off-by: Su Yue 
---
v2: none
v3: none
---
 cmds-check.c | 49 -
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index dad10cb..6947420 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -2458,6 +2458,45 @@ out:
 }
 
 /*
+ * Set inode's isize to correct value in @info
+ *
+ * Returns <0  means on error
+ * Returns  0  means successful repair
+ */
+static int repair_inode_isize_lowmem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode_item_fix_info *info)
+{
+   struct btrfs_inode_item *ei;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   int ret;
+
+   ASSERT(info);
+   key.objectid = info->ino;
+   key.type = BTRFS_INODE_ITEM_KEY;
+   key.offset = 0;
+
+   ret = btrfs_search_slot(trans, root, , , 0, 1);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_inode_item);
+   btrfs_set_inode_size(path.nodes[0], ei, info->isize);
+   btrfs_mark_buffer_dirty(path.nodes[0]);
+   printf("reset isize for inode %llu root %llu\n", info->ino,
+  root->root_key.objectid);
+out:
+   btrfs_release_path();
+   return ret;
+}
+
+/*
  * repair_inode_item - repair inode item errors
  *
  * Repair the inode item if error can be repaired. Any caller should compare
@@ -2485,7 +2524,7 @@ static int repair_inode_item(struct btrfs_root *root,
ret = 0;
goto out;
}
-   if (!(err & NBYTES_ERROR)) {
+   if (!(err & NBYTES_ERROR) && !(err & ISIZE_ERROR)) {
warning("root %llu INODE[%llu] have error(s) can't repair, 
error : %d",
root->objectid, info->ino, err);
/* can't fix any errors, ret should be positive */
@@ -2506,6 +2545,13 @@ static int repair_inode_item(struct btrfs_root *root,
else if (ret < 0)
goto out;
}
+   if (err & ISIZE_ERROR) {
+   ret = repair_inode_isize_lowmem(trans, root, info);
+   if (ret == 0)
+   err &= ~ISIZE_ERROR;
+   else if (ret < 0)
+   goto out;
+   }
 
if (err != info->err) {
info->err = err;
@@ -5040,6 +5086,7 @@ out:
 
if (isize != size) {
err |= ISIZE_ERROR;
+   info->isize = size;
error("root %llu DIR INODE [%llu] size(%llu) not equal 
to %llu",
  root->objectid, inode_id, isize, size);
}
-- 
2.11.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: corruption: yet another one after deleting a ro snapshot

2017-01-11 Thread Qu Wenruo



At 01/12/2017 10:28 AM, Christoph Anton Mitterer wrote:

Hey Qu,

On Thu, 2017-01-12 at 09:25 +0800, Qu Wenruo wrote:

And since you just deleted a subvolume and unmount it soon

Indeed, I unmounted it pretty quickly afterwards...

I had mounted it (ro) in the meantime, and did a whole
find mntoint > /dev/null
on it just to see whether going through the file hierarchy causes any
kernel errors already.
There are about 1,2 million files on the fs (in now only one snapshot)
and that took some 3-5 mins...
Not sure whether it continues to delete the subvol when it's mounted
ro... if so, it would have had some time.


IIRC, RO mount won't continue background deletion.

So the fsck result won't change.


However, another fsck afterwards:
# btrfs check /dev/mapper/data-a3 ; echo $?
Checking filesystem on /dev/mapper/data-a3
UUID: 326d292d-f97b-43ca-b1e8-c722d3474719
checking extents
ref mismatch on [37765120 16384] extent item 0, found 1
Backref 37765120 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [37765120 16384]
owner ref check failed [37765120 16384]
ref mismatch on [5120 16384] extent item 0, found 1
Backref 5120 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [5120 16384]
owner ref check failed [5120 16384]
ref mismatch on [78135296 16384] extent item 0, found 1
Backref 78135296 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [78135296 16384]
owner ref check failed [78135296 16384]
ref mismatch on [5960381235200 16384] extent item 0, found 1
Backref 5960381235200 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [5960381235200 16384]
checking free space cache
checking fs roots
checking csums
checking root refs
found 7483995824128 bytes used err is 0
total csum bytes: 7296183880
total tree bytes: 10875944960
total fs tree bytes: 2035286016
total extent tree bytes: 1015988224
btree space waste bytes: 920641324
file data blocks allocated: 8267656339456
 referenced 8389440876544
0



, I assume
the
btrfs is still doing background subvolume deletion, maybe it's just
a
false alert from btrfsck.

If one deleted a subvol and unmounts too fast, will this already cause
a corruption or does btrfs simply continue to cleanup during the next
time(s) it's mounted?


It will continue the deletion on next RW mount.

But, I'm still not sure whether it's a false alert or a *REAL* corruption.

Even it may cause problem and corrupt your data, I still hope you could 
do a rw mount and trigger a btrfs fi sync.


If it's a false alert, we can fix it then with ease.
Or, it's a really big problem.






Would you please try btrfs check --mode=lowmem using latest btrfs-
progs?

Here we go, however still with v4.7.3:

# btrfs check --mode=lowmem /dev/mapper/data-a3 ; echo $?
Checking filesystem on /dev/mapper/data-a3
UUID: 326d292d-f97b-43ca-b1e8-c722d3474719
checking extents
ERROR: block group[74117545984 1073741824] used 1073741824 but extent items 
used 0


Errr, lowmem mode is much more strict in this case then.
Quite some block groups have mismatched used space.

But according to the same used number, I assume it's a lowmem mode bug.

Would you please try 4.9 btrfs-progs?


ERROR: block group[239473786880 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[500393050112 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[581997428736 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[626557714432 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[668433645568 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[948680261632 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[982503129088 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1039411445760 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1054443831296 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[1190809042944 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1279392743424 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1481256206336 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1620842643456 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[1914511032320 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[3055361720320 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[3216422993920 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[3670615785472 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[3801612288000 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[3828455833600 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[4250973241344 1073741824] used 1073741824 but extent items 
used 0

Re: corruption: yet another one after deleting a ro snapshot

2017-01-11 Thread Christoph Anton Mitterer
Hey Qu,

On Thu, 2017-01-12 at 09:25 +0800, Qu Wenruo wrote:
> And since you just deleted a subvolume and unmount it soon
Indeed, I unmounted it pretty quickly afterwards...

I had mounted it (ro) in the meantime, and did a whole
find mntoint > /dev/null
on it just to see whether going through the file hierarchy causes any
kernel errors already.
There are about 1,2 million files on the fs (in now only one snapshot)
and that took some 3-5 mins...
Not sure whether it continues to delete the subvol when it's mounted
ro... if so, it would have had some time.

However, another fsck afterwards:
# btrfs check /dev/mapper/data-a3 ; echo $?
Checking filesystem on /dev/mapper/data-a3
UUID: 326d292d-f97b-43ca-b1e8-c722d3474719
checking extents
ref mismatch on [37765120 16384] extent item 0, found 1
Backref 37765120 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [37765120 16384]
owner ref check failed [37765120 16384]
ref mismatch on [5120 16384] extent item 0, found 1
Backref 5120 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [5120 16384]
owner ref check failed [5120 16384]
ref mismatch on [78135296 16384] extent item 0, found 1
Backref 78135296 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [78135296 16384]
owner ref check failed [78135296 16384]
ref mismatch on [5960381235200 16384] extent item 0, found 1
Backref 5960381235200 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [5960381235200 16384]
checking free space cache
checking fs roots
checking csums
checking root refs
found 7483995824128 bytes used err is 0
total csum bytes: 7296183880
total tree bytes: 10875944960
total fs tree bytes: 2035286016
total extent tree bytes: 1015988224
btree space waste bytes: 920641324
file data blocks allocated: 8267656339456
 referenced 8389440876544
0


> , I assume
> the 
> btrfs is still doing background subvolume deletion, maybe it's just
> a 
> false alert from btrfsck.
If one deleted a subvol and unmounts too fast, will this already cause
a corruption or does btrfs simply continue to cleanup during the next
time(s) it's mounted?



> Would you please try btrfs check --mode=lowmem using latest btrfs-
> progs?
Here we go, however still with v4.7.3:

# btrfs check --mode=lowmem /dev/mapper/data-a3 ; echo $?
Checking filesystem on /dev/mapper/data-a3
UUID: 326d292d-f97b-43ca-b1e8-c722d3474719
checking extents
ERROR: block group[74117545984 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[239473786880 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[500393050112 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[581997428736 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[626557714432 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[668433645568 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[948680261632 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[982503129088 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1039411445760 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1054443831296 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[1190809042944 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1279392743424 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1481256206336 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[1620842643456 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[1914511032320 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[3055361720320 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[3216422993920 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[3670615785472 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[3801612288000 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[3828455833600 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[4250973241344 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[4261710659584 1073741824] used 1073741824 but extent items 
used 1074266112
ERROR: block group[4392707162112 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[4558063403008 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[4607455526912 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[4635372814336 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[4640204652544 1073741824] used 1073741824 but extent items 
used 0
ERROR: block group[4642352136192 1073741824] used 1073741824 but extent items 
used 1207959552
ERROR: block group[4681006841856 1073741824] used 1073741824 but 

Re: corruption: yet another one after deleting a ro snapshot

2017-01-11 Thread Qu Wenruo



At 01/12/2017 09:07 AM, Christoph Anton Mitterer wrote:

Hey.

Linux heisenberg 4.8.0-2-amd64 #1 SMP Debian 4.8.15-2 (2017-01-04)
x86_64 GNU/Linux
btrfs-progs v4.7.3

I've had this already at least once some year ago or so:

I was doing backups (incremental via send/receive).
After everything was copied, I unmounted the destination fs, made a
fsck, all fine.
Then I mounted it again and did nothing but deleting the old snapshot.
After that, another fsck with the following errors:



According to the messages, some tree blocks has wrong extent backref.

And since you just deleted a subvolume and unmount it soon, I assume the 
btrfs is still doing background subvolume deletion, maybe it's just a 
false alert from btrfsck.


Would you please try btrfs check --mode=lowmem using latest btrfs-progs?

Sometimes bugs in original mode are fixed in lowmem mode.

And it's also recommended to call btrfs fi sync, then wait for some time 
(depending on the subvolume size) to allow btrfs to fully delete the 
subvolume, then try btrfsck.


Thanks,
Qu


Usually I have quite positive experiences with btrfs (things seem to be
fine even after a crash or accidental removal of the USB cable which
attaches the HDD)... but I'm every time shocked again, when supposedly
simple and basic operations like this cause such corruptions.
Kinda gives one the feeling as if quite deep bugs are still everywhere
in place, especially as such "hard to explain" errors happens every now
and then (take e.g. my mails "strange btrfs deadlock", "csum errors
during btrfs check" from the last days... and I don't seem to be the
only one who suffers from such problems, even with the basic parts of
btrfs which are considered to be stable - I mean we're not talking
about RAID56 here)... sigh :-(


While these files are precious, I have in total copies of all these
files, 3 on btrfs and 1 on ext4 (just to be on the safe side if btrfs
gets corrupted for no good reason :-( ) so I could do some
debugging here if some developer tells me what to do.


Anyway... what should I do to repair the fs? Or is it better to simply
re-create that backup from scratch?


Cheers,
Chris.




--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btfs-progs: fsck-tests: corrupt nlink value test

2017-01-11 Thread Qu Wenruo



At 01/11/2017 01:36 AM, lakshmipath...@giis.co.in wrote:

What about submitting a btrfs-image and use generic test load?


Okay, how to share the corrupted btrfs-image? using github?  And also do
you have references for this kind of
setup under btrfs-progs/tests/?  So that  I can follow its model.


At least I follow the following steps to create image:

- Create a 256M file
  The default size of image.
  All zero content is recommended for high compression ratio.
  Both hole or fallocate is OK.

- Make btrfs on that file

- Fill the fs with minimum content
  Just enough files/dirs for next step.
  The smaller space it takes the better

- Corrupt the fs
  Either using btrfs-corrupt-block or manually

- Take the dump
  Either by btrfs-image or just use the raw image.

  It's recommended to use btrfs-image and with -c9 option, which
  can reduce the file size dramatically compared to xz raw image.
  But we must ensure the recovered image still has the same
  corruption, or we must use raw image.

  For raw image, xz it just like other tests.

  Normally submitted images are less than 100K, and that's small enough
  to send as patch.

Thanks,
Qu



Cheers.
Lakshmipathi.G






--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: corruption: yet another one after deleting a ro snapshot

2017-01-11 Thread Christoph Anton Mitterer
Oops forgot to copy and past the actual fsck output O:-)
# btrfs check /dev/mapper/data-a3 ; echo $?
Checking filesystem on /dev/mapper/data-a3
UUID: 326d292d-f97b-43ca-b1e8-c722d3474719
checking extents
ref mismatch on [37765120 16384] extent item 0, found 1
Backref 37765120 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [37765120 16384]
owner ref check failed [37765120 16384]
ref mismatch on [5120 16384] extent item 0, found 1
Backref 5120 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [5120 16384]
owner ref check failed [5120 16384]
ref mismatch on [78135296 16384] extent item 0, found 1
Backref 78135296 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [78135296 16384]
owner ref check failed [78135296 16384]
ref mismatch on [5960381235200 16384] extent item 0, found 1
Backref 5960381235200 parent 6403 root 6403 not found in extent tree
backpointer mismatch on [5960381235200 16384]
checking free space cache
checking fs roots
checking csums
checking root refs
found 7483995824128 bytes used err is 0
total csum bytes: 7296183880
total tree bytes: 10875944960
total fs tree bytes: 2035286016
total extent tree bytes: 1015988224
btree space waste bytes: 920641324
file data blocks allocated: 8267656339456
 referenced 8389440876544
0



Also I've found the previous occasion of the apparently same issue:
https://www.spinics.net/lists/linux-btrfs/msg45190.html


What's the suggested way in reporting bugs? Here on the list?
kernel.org bugzilla?
It's a bit worrying that even just I myself have reported quite a number
of likely bugs here on the ML which never got a reaction from a
developer and thus likely still sleep under the hood :-/


Cheers,
Chris.

smime.p7s
Description: S/MIME cryptographic signature


corruption: yet another one after deleting a ro snapshot

2017-01-11 Thread Christoph Anton Mitterer
Hey.

Linux heisenberg 4.8.0-2-amd64 #1 SMP Debian 4.8.15-2 (2017-01-04)
x86_64 GNU/Linux
btrfs-progs v4.7.3

I've had this already at least once some year ago or so:

I was doing backups (incremental via send/receive).
After everything was copied, I unmounted the destination fs, made a
fsck, all fine.
Then I mounted it again and did nothing but deleting the old snapshot.
After that, another fsck with the following errors:


Usually I have quite positive experiences with btrfs (things seem to be
fine even after a crash or accidental removal of the USB cable which
attaches the HDD)... but I'm every time shocked again, when supposedly
simple and basic operations like this cause such corruptions.
Kinda gives one the feeling as if quite deep bugs are still everywhere
in place, especially as such "hard to explain" errors happens every now
and then (take e.g. my mails "strange btrfs deadlock", "csum errors
during btrfs check" from the last days... and I don't seem to be the
only one who suffers from such problems, even with the basic parts of
btrfs which are considered to be stable - I mean we're not talking
about RAID56 here)... sigh :-(


While these files are precious, I have in total copies of all these
files, 3 on btrfs and 1 on ext4 (just to be on the safe side if btrfs
gets corrupted for no good reason :-( ) so I could do some
debugging here if some developer tells me what to do.


Anyway... what should I do to repair the fs? Or is it better to simply
re-create that backup from scratch?


Cheers,
Chris.

smime.p7s
Description: S/MIME cryptographic signature


Re: Restart during btrfs balance raid1 conversion, now can only mount read-only

2017-01-11 Thread Michael Boratko
I solved the problem. After much futzing with mount options and trying
check --repair, I ended up booting into an Antergos live USB, which
runs kernel and tools 4.9, and was able to succesfully mount the btrfs
volume there. This time, before shutting down, I paused the balance
operation. (Not sure if that was necessary.) When I rebooted back into
Debian, I was able to mount the device normally.


On Wed, Jan 11, 2017 at 4:19 PM, Michael Boratko  wrote:
> I had an 8TB btrfs filesystem setup with my data, and added a 4TB and
> 3TB drive to it and then ran a raid1 conversion using the command
> btrfs balance start -dconvert=raid1 -mconvert=raid1 /mnt/btrfs
>
> It was running OK, but then I restarted the computer without pausing
> the rebalance (I wasn't aware that would be necessary). Upon reboot,
> the system got stuck on the mounting operation (as well as decryption
> of two of the drives - all drives are setup with encryption and
> automatically get unlocked via keyfiles at boot) and dropped me to a
> root shell. I removed the line which was mounting the btrfs volume in
> fstab, and rebooted successfully, however now when I attempt to mount
> the btrfs volume normally it just hangs with no response, and the
> command never terminates and cannot be killed.
>
> I am able to mount the drive read-only, but am unable to mount it as
> read-write even when using the mount option "skip_balance".
>
> Relevant debug info:
> uname -a
> Linux gaussBonnetXeon 4.8.0-0.bpo.2-amd64 #1 SMP Debian
> 4.8.11-1~bpo8+1 (2016-12-14) x86_64 GNU/Linux
>
> btrfs --version
> btrfs-progs v4.7.3
>
> btrfs fi show
> Label: none  uuid: d5c3f49c-3c69-4fd8-b9ab-272c0dbc1eab
> Total devices 3 FS bytes used 2.61TiB
> devid1 size 7.00TiB used 2.64TiB path 
> /dev/mapper/Seagate_Archive_8TB-btrfs
> devid2 size 3.64TiB used 930.00GiB path /dev/mapper/4TB
> devid3 size 2.73TiB used 0.00B path /dev/mapper/3TB
>
> btrfs fi df /mnt/btrfs
> Data, RAID1: total=925.00GiB, used=912.85GiB
> Data, single: total=1.71TiB, used=1.71TiB
> System, DUP: total=8.00MiB, used=352.00KiB
> Metadata, RAID1: total=5.00GiB, used=2.12GiB
> Metadata, DUP: total=8.50GiB, used=7.61GiB
> GlobalReserve, single: total=512.00MiB, used=1.05MiB
>
> dmesg (from when I try to mount as rw):
> [ 3202.640391] BTRFS info (device dm-10): disk space caching is enabled
> [ 3377.624283] BUG: unable to handle kernel paging request at fe10
> [ 3377.624293] IP: []
> qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs]
> [ 3377.624341] PGD 111e09067 PUD 111e0b067 PMD 0
> [ 3377.624348] Oops:  [#1] SMP
> [ 3377.624352] Modules linked in: ipt_REJECT(E) nf_reject_ipv4(E)
> rfcomm(E) xt_multiport(E) iptable_filter(E) ip_tables(E) x_tables(E)
> binfmt_misc(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E)
> sunrpc(E) bnep(E) cfg80211(E) snd_hda_codec_hdmi(E) nvidia_drm(POE)
> btusb(E) btrtl(E) drm_kms_helper(E) btbcm(E) btintel(E) drm(E)
> bluetooth(E) iTCO_wdt(E) iTCO_vendor_support(E) coretemp(E) joydev(E)
> kvm_intel(E) snd_hda_intel(E) snd_ca0106(E) rfkill(E) evdev(E)
> snd_hda_codec(E) nvidia_modeset(POE) snd_rawmidi(E) kvm(E)
> snd_seq_device(E) snd_hda_core(E) snd_ac97_codec(E) snd_hwdep(E)
> i7core_edac(E) i2c_i801(E) snd_pcm(E) nvidia(POE) irqbypass(E)
> pcspkr(E) edac_core(E) i2c_smbus(E) snd_timer(E) snd(E) soundcore(E)
> ac97_bus(E) ipmi_si(E) ipmi_msghandler(E) shpchp(E) lpc_ich(E)
> mfd_core(E) acpi_cpufreq(E)
> [ 3377.624400]  button(E) tpm_tis(E) tpm_tis_core(E) tpm(E) fuse(E)
> parport_pc(E) ppdev(E) lp(E) parport(E) autofs4(E) ext4(E) crc16(E)
> jbd2(E) fscrypto(E) mbcache(E) btrfs(E) xor(E) raid6_pq(E) xts(E)
> gf128mul(E) algif_skcipher(E) af_alg(E) dm_crypt(E) dm_mod(E)
> sr_mod(E) cdrom(E) hid_generic(E) usbhid(E) hid(E) sg(E) sd_mod(E)
> ahci(E) libahci(E) xhci_pci(E) crc32c_intel(E) ehci_pci(E) xhci_hcd(E)
> libata(E) ehci_hcd(E) e1000e(E) scsi_mod(E) usbcore(E) ptp(E)
> usb_common(E) pps_core(E) fjes(E)
> [ 3377.624426] CPU: 0 PID: 1909 Comm: mount Tainted: P   OE
> 4.8.0-0.bpo.2-amd64 #1 Debian 4.8.11-1~bpo8+1
> [ 3377.624428] Hardware name: Intel Corporation S3420GP/S3420GP, BIOS
> S3420GP.86B.01.00.0040.021120101620 02/11/2010
> [ 3377.624430] task: 9277e47f7100 task.stack: 92770a06
> [ 3377.624431] RIP: 0010:[]  []
> qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs]
> [ 3377.624454] RSP: 0018:92770a063cb8  EFLAGS: 00210246
> [ 3377.624455] RAX: 927805ee3000 RBX: 92772a31de00 RCX: 
> 
> [ 3377.624457] RDX: 9277edf22090 RSI: 9274d3f5b800 RDI: 
> 9277edf22000
> [ 3377.624458] RBP: 927805e5d800 R08:  R09: 
> 9277edf22000
> [ 3377.624460] R10:  R11:  R12: 
> 92770a063d50
> [ 3377.624461] R13: 9274d3f5b800 R14:  R15: 
> 9277edf22000
> [ 3377.624463] FS:  7f0f98765840() GS:92781fc0()
> knlGS:
> [ 3377.624465] CS:  0010 DS:  ES:  CR0: 

Re: read-only fs, kernel 4.9.0, fs/btrfs/delayed-inode.c:1170 __btrfs_run_delayed_items,

2017-01-11 Thread Chris Murphy
Looks like there's some sort of xattr and Btrfs interaction happening
here; but as it only happens with some subvolumes/snapshots not all
(but 100% consistent) maybe the kernel version at the time the
snapshot was taken is a factor? Anyway git bisect says

# first bad commit: [6c6ef9f26e598fb977f60935e109cd5b266c941a] xattr:
Stop calling {get,set,remove}xattr inode operations

btrfs-image produces a 159M file.

I've updated the bug report
https://bugzilla.kernel.org/show_bug.cgi?id=191761 and also adding
patch author to cc.


Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Restart during btrfs balance raid1 conversion, now can only mount read-only

2017-01-11 Thread Michael Boratko
I had an 8TB btrfs filesystem setup with my data, and added a 4TB and
3TB drive to it and then ran a raid1 conversion using the command
btrfs balance start -dconvert=raid1 -mconvert=raid1 /mnt/btrfs

It was running OK, but then I restarted the computer without pausing
the rebalance (I wasn't aware that would be necessary). Upon reboot,
the system got stuck on the mounting operation (as well as decryption
of two of the drives - all drives are setup with encryption and
automatically get unlocked via keyfiles at boot) and dropped me to a
root shell. I removed the line which was mounting the btrfs volume in
fstab, and rebooted successfully, however now when I attempt to mount
the btrfs volume normally it just hangs with no response, and the
command never terminates and cannot be killed.

I am able to mount the drive read-only, but am unable to mount it as
read-write even when using the mount option "skip_balance".

Relevant debug info:
uname -a
Linux gaussBonnetXeon 4.8.0-0.bpo.2-amd64 #1 SMP Debian
4.8.11-1~bpo8+1 (2016-12-14) x86_64 GNU/Linux

btrfs --version
btrfs-progs v4.7.3

btrfs fi show
Label: none  uuid: d5c3f49c-3c69-4fd8-b9ab-272c0dbc1eab
Total devices 3 FS bytes used 2.61TiB
devid1 size 7.00TiB used 2.64TiB path /dev/mapper/Seagate_Archive_8TB-btrfs
devid2 size 3.64TiB used 930.00GiB path /dev/mapper/4TB
devid3 size 2.73TiB used 0.00B path /dev/mapper/3TB

btrfs fi df /mnt/btrfs
Data, RAID1: total=925.00GiB, used=912.85GiB
Data, single: total=1.71TiB, used=1.71TiB
System, DUP: total=8.00MiB, used=352.00KiB
Metadata, RAID1: total=5.00GiB, used=2.12GiB
Metadata, DUP: total=8.50GiB, used=7.61GiB
GlobalReserve, single: total=512.00MiB, used=1.05MiB

dmesg (from when I try to mount as rw):
[ 3202.640391] BTRFS info (device dm-10): disk space caching is enabled
[ 3377.624283] BUG: unable to handle kernel paging request at fe10
[ 3377.624293] IP: []
qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs]
[ 3377.624341] PGD 111e09067 PUD 111e0b067 PMD 0
[ 3377.624348] Oops:  [#1] SMP
[ 3377.624352] Modules linked in: ipt_REJECT(E) nf_reject_ipv4(E)
rfcomm(E) xt_multiport(E) iptable_filter(E) ip_tables(E) x_tables(E)
binfmt_misc(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E)
sunrpc(E) bnep(E) cfg80211(E) snd_hda_codec_hdmi(E) nvidia_drm(POE)
btusb(E) btrtl(E) drm_kms_helper(E) btbcm(E) btintel(E) drm(E)
bluetooth(E) iTCO_wdt(E) iTCO_vendor_support(E) coretemp(E) joydev(E)
kvm_intel(E) snd_hda_intel(E) snd_ca0106(E) rfkill(E) evdev(E)
snd_hda_codec(E) nvidia_modeset(POE) snd_rawmidi(E) kvm(E)
snd_seq_device(E) snd_hda_core(E) snd_ac97_codec(E) snd_hwdep(E)
i7core_edac(E) i2c_i801(E) snd_pcm(E) nvidia(POE) irqbypass(E)
pcspkr(E) edac_core(E) i2c_smbus(E) snd_timer(E) snd(E) soundcore(E)
ac97_bus(E) ipmi_si(E) ipmi_msghandler(E) shpchp(E) lpc_ich(E)
mfd_core(E) acpi_cpufreq(E)
[ 3377.624400]  button(E) tpm_tis(E) tpm_tis_core(E) tpm(E) fuse(E)
parport_pc(E) ppdev(E) lp(E) parport(E) autofs4(E) ext4(E) crc16(E)
jbd2(E) fscrypto(E) mbcache(E) btrfs(E) xor(E) raid6_pq(E) xts(E)
gf128mul(E) algif_skcipher(E) af_alg(E) dm_crypt(E) dm_mod(E)
sr_mod(E) cdrom(E) hid_generic(E) usbhid(E) hid(E) sg(E) sd_mod(E)
ahci(E) libahci(E) xhci_pci(E) crc32c_intel(E) ehci_pci(E) xhci_hcd(E)
libata(E) ehci_hcd(E) e1000e(E) scsi_mod(E) usbcore(E) ptp(E)
usb_common(E) pps_core(E) fjes(E)
[ 3377.624426] CPU: 0 PID: 1909 Comm: mount Tainted: P   OE
4.8.0-0.bpo.2-amd64 #1 Debian 4.8.11-1~bpo8+1
[ 3377.624428] Hardware name: Intel Corporation S3420GP/S3420GP, BIOS
S3420GP.86B.01.00.0040.021120101620 02/11/2010
[ 3377.624430] task: 9277e47f7100 task.stack: 92770a06
[ 3377.624431] RIP: 0010:[]  []
qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs]
[ 3377.624454] RSP: 0018:92770a063cb8  EFLAGS: 00210246
[ 3377.624455] RAX: 927805ee3000 RBX: 92772a31de00 RCX: 
[ 3377.624457] RDX: 9277edf22090 RSI: 9274d3f5b800 RDI: 9277edf22000
[ 3377.624458] RBP: 927805e5d800 R08:  R09: 9277edf22000
[ 3377.624460] R10:  R11:  R12: 92770a063d50
[ 3377.624461] R13: 9274d3f5b800 R14:  R15: 9277edf22000
[ 3377.624463] FS:  7f0f98765840() GS:92781fc0()
knlGS:
[ 3377.624465] CS:  0010 DS:  ES:  CR0: 80050033
[ 3377.624467] CR2: fe10 CR3: 00034b952000 CR4: 06f0
[ 3377.624468] Stack:
[ 3377.624470]  c05c14a2 0801 9277edf22000
927805ee3000
[ 3377.624473]  0801 9277edf22000 c05c3ed6

[ 3377.624476]   dc2f7325 92772a31de00
927805e5d800
[ 3377.624478] Call Trace:
[ 3377.624497]  [] ?
join_transaction.isra.15+0x22/0x3f0 [btrfs]
[ 3377.624516]  [] ? start_transaction+0x96/0x4d0 [btrfs]
[ 3377.624537]  [] ?
btrfs_recover_relocation+0x2e8/0x420 [btrfs]
[ 3377.624551]  [] ? btrfs_remount+0x40f/0x570 [btrfs]
[ 

Re: Best practices for raid 1

2017-01-11 Thread Tomasz Kusmierz
I would like to use this thread to ask a few questions: 

If we have 2 devices dying on us and we run RAID6 - this theoretically will 
still run (despite our current problems). Now let’s say that we booted up a raid6 
of 10 disks and 2 of them die, but the operator does NOT know the dev IDs of the 
disks that died. How does one remove those devices other than using “-missing” 
??? I ask because it’s stated in multiple places to use “replace” when your 
device dies, but nobody ever states how to find out which /dev/ node is actually 
missing …. so when I want to use a replace, I don’t know what to use within the 
command :/ … This whole thing might have an additional complication - if the FS is 
full, then one would need to add disks first, then remove the missing one. 


> On 10 Jan 2017, at 21:49, Chris Murphy  wrote:
> 
> On Tue, Jan 10, 2017 at 2:07 PM, Vinko Magecic
>  wrote:
>> Hello,
>> 
>> I set up a raid 1 with two btrfs devices and came across some situations in 
>> my testing that I can't get a straight answer on.
>> 
>> 1) When replacing a volume, do I still need to `umount /path` and then 
>> `mount -o degraded ...` the good volume before doing the `btrfs replace 
>> start ...` ?
> 
> No. If the device being replaced is unreliable, use -r to limit the
> reads from the device being replaced.
> 
> 
> 
>> I didn't see anything that said I had to and when I tested it without 
>> mounting the volume it was able to replace the device without any issue. Is 
>> that considered bad and could risk damage or has `replace` made it possible 
>> to replace devices without umounting the filesystem?
> 
> It's always been possible even before 'replace'.
> btrfs dev add 
> btrfs dev rem 
> 
> But there are some bugs in dev replace that Qu is working on; I think
> they mainly negatively impact raid56 though.
> 
> The one limitation of 'replace' is that the new block device must be
> equal to or larger than the block device being replaced; where dev add
>> dev rem doesn't require this.
> 
> 
>> 2) Everything I see about replacing a drive says to use `/old/device 
>> /new/device` but what if the old device can't be read or no longer exists?
> 
> The command works whether the device is present or not; but if it's
> present and working then any errors on one device can be corrected by
> the other, whereas if the device is missing, then any errors on the
> remaining device can't be corrected. Off hand I'm not sure if the
> replace continues and an error just logged...I think that's what
> should happen.
> 
> 
>> Would that be a `btrfs device add /new/device; btrfs balance start 
>> /new/device` ?
> 
> dev add then dev rem; the balance isn't necessary.
> 
>> 
>> 3) When I have the RAID1 with two devices and I want to grow it out, which 
>> is the better practice? Create a larger volume, replace the old device with 
>> the new device and then do it a second time for the other device, or 
>> attaching the new volumes to the label/uuid one at a time and with each one 
>> use `btrfs filesystem resize devid:max /mountpoint`.
> 
> If you're replacing a 2x raid1 with two bigger replacements, you'd use
> 'btrfs replace' twice. Maybe it'd work concurrently, I've never tried
> it, but useful for someone to test and see if it explodes because if
> it's allowed, it should work or fail gracefully.
> 
> There's no need to do filesystem resizes when doing either 'replace'
> or 'dev add' followed by 'dev rem' because the fs resize is implied.
> First it's resized/grown with add; and then it's resized/shrink with
> remove. For replace there's a consolidation of steps, it's been a
> while since I've looked at the code so I can't tell you what steps it
> skips, what the state of the devices are in during the replace, which
> one active writes go to.
> 
> 
> -- 
> Chris Murphy
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [LSF/MM TOPIC] sharing pages between mappings

2017-01-11 Thread Andreas Dilger
On Jan 11, 2017, at 3:29 AM, Miklos Szeredi  wrote:
> 
> I know there's work on this for xfs, but could this be done in generic mm 
> code?
> 
> What are the obstacles?  page->mapping and page->index are the obvious ones.
> 
> If that's too difficult is it maybe enough to share mappings between
> files while they are completely identical and clone the mapping when
> necessary?
> 
> All COW filesystems would benefit, as well as layered ones: lots of
> fuse fs, and in some cases overlayfs too.

For layered filesystems it would also be useful to have an API to move
pages between mappings easily.

> Related:  what can DAX do in the presence of cloned block?
> 
> Thanks,
> Miklos
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas







signature.asc
Description: Message signed with OpenPGP using GPGMail


Re: Best practices for raid 1

2017-01-11 Thread Austin S. Hemmelgarn

On 2017-01-10 16:49, Chris Murphy wrote:

On Tue, Jan 10, 2017 at 2:07 PM, Vinko Magecic
 wrote:

Hello,

I set up a raid 1 with two btrfs devices and came across some situations in my 
testing that I can't get a straight answer on.

1) When replacing a volume, do I still need to `umount /path` and then `mount 
-o degraded ...` the good volume before doing the `btrfs replace start ...` ?


No. If the device being replaced is unreliable, use -r to limit the
reads from the device being replaced.




I didn't see anything that said I had to and when I tested it without mounting 
the volume it was able to replace the device without any issue. Is that 
considered bad and could risk damage or has `replace` made it possible to 
replace devices without umounting the filesystem?


It's always been possible even before 'replace'.
btrfs dev add 
btrfs dev rem 

But there are some bugs in dev replace that Qu is working on; I think
they mainly negatively impact raid56 though.

The one limitation of 'replace' is that the new block device must be
equal to or larger than the block device being replaced; where dev add

dev rem doesn't require this.
The other thing to remember is that you can resize the FS on the device 
being replaced so that it will fit on the new device.  I actually 
regularly do this when re-partitioning or moving filesystems between 
devices as a safety precaution so that I can be sure it will fit in the 
new location.  I would only suggest doing this though if that device is 
still reliable, as it may move data around on that device, and it 
obviously doesn't work if the device being replaced is missing.




2) Everything I see about replacing a drive says to use `/old/device 
/new/device` but what if the old device can't be read or no longer exists?


The command works whether the device is present or not; but if it's
present and working then any errors on one device can be corrected by
the other, whereas if the device is missing, then any errors on the
remaining device can't be corrected. Off hand I'm not sure if the
replace continues and an error just logged...I think that's what
should happen.
IIRC, that's what happens up to some (arbitrary) threshold, at which 
point the replace fails.




Would that be a `btrfs device add /new/device; btrfs balance start /new/device` 
?


dev add then dev rem; the balance isn't necessary.
A better way to put it is that the balance is implicit in the removal of 
the device.  The data that was on that device has to go somewhere, and 
the easiest way to do that is just to run a balance that's not allowed 
to allocate anything on the device being removed.




3) When I have the RAID1 with two devices and I want to grow it out, which is 
the better practice? Create a larger volume, replace the old device with the 
new device and then do it a second time for the other device, or attaching the 
new volumes to the label/uuid one at a time and with each one use `btrfs 
filesystem resize devid:max /mountpoint`.


If you're replacing a 2x raid1 with two bigger replacements, you'd use
'btrfs replace' twice. Maybe it'd work concurrently, I've never tried
it, but useful for someone to test and see if it explodes because if
it's allowed, it should work or fail gracefully.
In theory, it _might_ be possible to get dev replace to work 
concurrently.  As of right now, I know that the current implementation 
does not work with more than one instance running per FS (because it 
uses devid 0 for the new device during the replace, and devids have to 
be unique), but I don't know for certain what it does if you try to run 
another (it _should_ refuse to start, I'm not certain if that's what it 
actually does, and I don't have the time to check right now).


That said, there are many reasons to just serialize replaces most of the 
time, the most notable being that replace does not just read from the 
device being replaced (although most of the reads go to that device), 
and that serializing the replace operations has less impact on the rest 
of the system (it is designed to be used on live systems).


There's no need to do filesystem resizes when doing either 'replace'
or 'dev add' followed by 'dev rem' because the fs resize is implied.
First it's resized/grown with add; and then it's resized/shrink with
remove. For replace there's a consolidation of steps, it's been a
while since I've looked at the code so I can't tell you what steps it
skips, what the state of the devices are in during the replace, which
one active writes go to.
Last time I checked, this was not the case for replace, and a resize to 
max size was still necessary.  That was almost 3 months ago though (I've 
been lucky and not needed to replace anything since then), so I may be 
incorrect about the current state of things.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

yet another call trace during send/receive

2017-01-11 Thread Christoph Anton Mitterer
Hi.

On Debian sid:
$ uname -a
Linux heisenberg 4.8.0-2-amd64 #1 SMP Debian 4.8.15-2 (2017-01-04) x86_64 
GNU/Linux

$ btrfs version
btrfs-progs v4.7.3

During a:
# btrfs send -p foo bar | btrfs receive baz


Jan 11 20:43:10 heisenberg kernel: [ cut here ]
Jan 11 20:43:10 heisenberg kernel: WARNING: CPU: 6 PID: 10042 at 
/build/linux-zDY19G/linux-4.8.15/fs/btrfs/send.c:6117 
btrfs_ioctl_send+0x533/0x1280 [btrfs]
Jan 11 20:43:10 heisenberg kernel: Modules linked in: udp_diag tcp_diag 
inet_diag algif_skcipher af_alg uas vhost_net vhost macvtap macvlan xt_CHECKSUM 
iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 
nf_nat tun bridge stp llc fuse ctr ccm ebtable_filter ebtables joydev 
rtsx_pci_ms memstick rtsx_pci_sdmmc mmc_core iTCO_wdt iTCO_vendor_support 
cpufreq_userspace cpufreq_powersave cpufreq_conservative ip6t_REJECT 
nf_reject_ipv6 xt_tcpudp nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter 
ip6_tables xt_policy ipt_REJECT nf_reject_ipv4 xt_comment nf_conntrack_ipv4 
nf_defrag_ipv4 xt_multiport xt_conntrack nf_conntrack iptable_filter 
binfmt_misc intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel 
kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate 
intel_uncore
Jan 11 20:43:10 heisenberg kernel:  intel_rapl_perf psmouse pcspkr uvcvideo 
videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_core videodev media 
btusb btrtl btbcm btintel sg bluetooth crc16 arc4 iwldvm mac80211 iwlwifi 
cfg80211 rtsx_pci rfkill fjes snd_hda_codec_hdmi snd_hda_codec_realtek 
snd_hda_codec_generic tpm_tis tpm_tis_core tpm i915 fujitsu_laptop battery 
snd_hda_intel snd_hda_codec lpc_ich i2c_i801 ac mfd_core shpchp i2c_smbus 
snd_hda_core snd_hwdep snd_pcm snd_timer e1000e snd soundcore ptp pps_core 
video button mei_me mei drm_kms_helper drm i2c_algo_bit loop parport_pc ppdev 
sunrpc lp parport ip_tables x_tables autofs4 dm_crypt dm_mod raid10 raid456 
libcrc32c async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 
raid0 multipath linear md_mod btrfs crc32c_generic xor raid6_pq uhci_hcd 
usb_storage
Jan 11 20:43:10 heisenberg kernel:  sd_mod crc32c_intel ahci libahci 
aesni_intel xhci_pci aes_x86_64 xhci_hcd libata glue_helper lrw ehci_pci 
gf128mul ablk_helper ehci_hcd cryptd evdev usbcore scsi_mod serio_raw usb_common
Jan 11 20:43:10 heisenberg kernel: CPU: 6 PID: 10042 Comm: btrfs Tainted: G 
   W   4.8.0-2-amd64 #1 Debian 4.8.15-2
Jan 11 20:43:10 heisenberg kernel: Hardware name: FUJITSU LIFEBOOK 
E782/FJNB23E, BIOS Version 1.11 05/24/2012
Jan 11 20:43:10 heisenberg kernel:  0286 248adbdb 
b3b1f925 
Jan 11 20:43:10 heisenberg kernel:   b3874ffe 
9ebe7e9f4424 7ffcbf0ea5d0
Jan 11 20:43:10 heisenberg kernel:  9ebc0d644000 9ebe7e9f4000 
9ebe5e44fb20 9ebd4270ae00
Jan 11 20:43:10 heisenberg kernel: Call Trace:
Jan 11 20:43:10 heisenberg kernel:  [] ? dump_stack+0x5c/0x77
Jan 11 20:43:10 heisenberg kernel:  [] ? __warn+0xbe/0xe0
Jan 11 20:43:10 heisenberg kernel:  [] ? 
btrfs_ioctl_send+0x533/0x1280 [btrfs]
Jan 11 20:43:10 heisenberg kernel:  [] ? 
memcg_kmem_get_cache+0x50/0x150
Jan 11 20:43:10 heisenberg kernel:  [] ? 
kmem_cache_alloc+0x122/0x530
Jan 11 20:43:10 heisenberg kernel:  [] ? 
sched_slice.isra.57+0x51/0xc0
Jan 11 20:43:10 heisenberg kernel:  [] ? 
update_cfs_rq_load_avg+0x200/0x4c0
Jan 11 20:43:10 heisenberg kernel:  [] ? 
task_rq_lock+0x46/0xa0
Jan 11 20:43:10 heisenberg kernel:  [] ? 
btrfs_ioctl+0x97c/0x2370 [btrfs]
Jan 11 20:43:10 heisenberg kernel:  [] ? 
enqueue_task_fair+0x5c/0x940
Jan 11 20:43:10 heisenberg kernel:  [] ? sched_clock+0x5/0x10
Jan 11 20:43:10 heisenberg kernel:  [] ? 
check_preempt_curr+0x50/0x90
Jan 11 20:43:10 heisenberg kernel:  [] ? 
wake_up_new_task+0x156/0x200
Jan 11 20:43:10 heisenberg kernel:  [] ? 
do_vfs_ioctl+0x9f/0x5f0
Jan 11 20:43:10 heisenberg kernel:  [] ? _do_fork+0x14d/0x3f0
Jan 11 20:43:10 heisenberg kernel:  [] ? SyS_ioctl+0x74/0x80
Jan 11 20:43:10 heisenberg kernel:  [] ? 
system_call_fast_compare_end+0xc/0x96
Jan 11 20:43:10 heisenberg kernel: ---[ end trace 3831b8afbd0cbc9e ]---
Jan 11 20:43:45 heisenberg kernel: BTRFS info (device dm-2): The free space 
cache file (7525348933632) is invalid. skip it


The send/receive seems to continue running...
Not sure if the free space cache file entry is related (btw: a btrfs
check directly before didn't find that error - actually yet another
fsck directly before that, brought a message that the super generation
and space file generation would mismatch (or something like that) and
it would be invalidated... so kinda strange that this happens at all).


Cheers,
Chris.

smime.p7s
Description: S/MIME cryptographic signature


Re: mkfs.btrfs/balance small-btrfs chunk size RFC

2017-01-11 Thread Duncan
Austin S. Hemmelgarn posted on Tue, 10 Jan 2017 09:57:52 -0500 as
excerpted:

> I can't personally comment on the code itself right now (I've actually
> never looked at the mkfs code, or any of the stuff that deals with the
> System chunk), but I can make a few general comments on this:
> 1. This behavior is still the case as of a Git build from yesterday (I
> just verified this myself with the locally built copy of btrfs-progs on
> my laptop).

Thanks.  After posting and seeing Qu W's response I was thinking I needed 
to test current behavior, and you just saved me the trouble (tho I do 
need to freshen my backup /boot one of these days, likely testing 
mkfs.btrfs on this in the process, but that can wait until 4.10).

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Best practices for raid 1

2017-01-11 Thread Tomasz Kusmierz

> On 10 Jan 2017, at 21:07, Vinko Magecic  
> wrote:
> 
> Hello,
> 
> I set up a raid 1 with two btrfs devices and came across some situations in 
> my testing that I can't get a straight answer on.
> 1) When replacing a volume, do I still need to `umount /path` and then `mount 
> -o degraded ...` the good volume before doing the `btrfs replace start ...` ? 
> I didn't see anything that said I had to and when I tested it without 
> mounting the volume it was able to replace the device without any issue. Is 
> that considered bad and could risk damage or has `replace` made it possible 
> to replace devices without umounting the filesystem?

No need to unmount, just replace old with new. Your scenario seems very 
convoluted and it’s pointless

> 2) Everything I see about replacing a drive says to use `/old/device 
> /new/device` but what if the old device can't be read or no longer exists? 
> Would that be a `btrfs device add /new/device; btrfs balance start 
> /new/device` ?
In case the old device is missing you’ve got a few options:
- if you have enough space to fit the data and enough disks to comply with 
redundancy - just remove the drive. So for example if you have 3 x 1TB drives 
with raid 1 and use less than 1TB of data total - just remove one drive and 
you will have 2 x 1TB drives in raid 1 and btrfs will just rebalance stuff for 
you!
- if you don’t have enough space to fit the data / not enough disks left to comply 
with the raid level - your only option is to add a disk first, then remove the missing 
one (btrfs dev delete missing /mount_point_of_your_fs)

> 3) When I have the RAID1 with two devices and I want to grow it out, which is 
> the better practice? Create a larger volume, replace the old device with the 
> new device and then do it a second time for the other device, or attaching 
> the new volumes to the label/uuid one at a time and with each one use `btrfs 
> filesystem resize devid:max /mountpoint`.

You kinda misunderstand the principle of btrfs. Btrfs will span across ALL the 
available space you’ve got. If you have multiple devices in this setup 
(remember that a partition IS A DEVICE), it will span across multiple devices and 
you can’t change this. Now btrfs resize is meant for resizing a file system 
occupying a device (or partition). So the workflow is that if you want to shrink a 
device (partition) you first shrink the fs on this device, then size down the device 
(partition) … if you want to increase the size of a device (partition) you 
increase the size of the device (partition), then you grow the filesystem within this device 
(partition). This is 100% irrespective of the total cumulative size of the file system. 

Let’s say you’ve got a btrfs file system that is spanning across 3 x 1TB 
devices … and those devices are partitions. You have a raid 1 setup - your 
complete amount of available space is 1.5 TB. Let’s say you want to shrink one 
of the partitions to 0.5TB -> first you shrink the FS on this partition (a balance will 
run automatically) -> you shrink the partition down to 0.5TB -> from now on your 
total available space is 1.25TB. 

Simples right ? :)

> Thanks
> 
> 
> 
> 
>--
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mkfs.btrfs/balance small-btrfs chunk size RFC

2017-01-11 Thread Duncan
Hugo Mills posted on Tue, 10 Jan 2017 15:47:53 + as excerpted:

> On Tue, Jan 10, 2017 at 10:42:51AM -0500, Austin S. Hemmelgarn wrote:
>> Most of the issue in this case is with the size of the initial chunk.
>> That said, I've got quite a few reasonably sized filesystems (I think
>> the largest is 200GB) with moderate usage (max 90GB of data), and none
>> of them are using more than the first 16kB block in the System chunk.
>> While I'm not necessarily a typical user, I'd be willing to bet based
>> on this that in general, most people who aren't storing very large
>> amounts of data or taking huge numbers of snapshots aren't going to
>> need a system chunk much bigger than 1MB.
> 
>Again, the system chunk has *nothing* to do with snapshots.

Given your explanation of the system chunk containing the chunk tree but 
not being (directly) related to snapshots, I took that as...

Many snapshots, some being old snapshots of now changed data, thus 
potentially multiplying the working copy data several times and of course 
requiring more chunks in ordered to contain all that archived data.

So while snapshots aren't directly related to the system chunk, the fact 
that they're snapshotting /something/ that's presumably changing or 
there'd be no need for snapshots, and the snapshot-archived versions of 
that /something/ presumably takes additional chunks, makes snapshots 
indirectly related to the required size of the system chunk(s), in 
order to contain the chunk tree supporting all the other chunks, 
necessary due not to live data, but due to the snapshots.

Is that a correct read, or is (somehow) that indirect dependency not 
there either, despite the system chunk(s) containing the chunk tree?

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 00/12] Refactor btrfs_inode VS inode in delayed-inode.c

2017-01-11 Thread Nikolay Borisov


On 11.01.2017 18:51, David Sterba wrote:
> On Tue, Jan 10, 2017 at 08:35:30PM +0200, Nikolay Borisov wrote:
>> After following the discussion in [1] I took a look at what's the 
>> state of VFS-related members being used in core BTRFS code. It turned 
>> out there are quite a few functions which operate on struct btrfs_inode, 
>> yet take struct inode. As a result they have to resort ot excessive 
>> usage of BTRFS_I, furthermore passing inode around doesn't help the 
>> poor reader inferring why inode might be passed to a particular function. 
>>
>> In order to better separate core btrfs functionalities from those part,
>> which interface with the VFS I took a look around the code and this is 
>> the result. I'd like to solicit opinions whether people think this 
>> refactoring is useful, since I have gathered a list of a lot more
>> functions which might use a bit of inode VS btrfs_inode changes. Also, 
>> a lot of function take inode just because btrfs_ino was taking an inode.
> 
> Agreed, this is a good direction how to clean up the code.
> 
>> The patches are self-explanatory, with the first one dealing with 
>> btrfs_ino being the bulk of it. This paves the way to restructuring 
>> a lot of functions. 
>>
>> If the maintainers think this should be merged I'd rather resend it 
>> as a single patch so as not to pollute the git history. This 
>> version can be used for fine-grained discussion and feedback. 
> 
> Actually I like the way it's separated as it keeps the review easy, it
> keeps the context in one function and does one change.
> 
> It would be interesting the see the result as reported by the 'size'
> utility before and after the patchset, the effects of removed BTRFS_I
> calls.

Actually without really doing the full-scale refactoring I expect the
results to be worse, due to the 147 added uses of BTRFS_I in the first
patch. But those are going to be only interim until everything is
cleaned up. Anyway, here are the numbers:

text   data bss dec hex filename
2530598  174661   28288 2733547  29b5eb fs/btrfs/btrfs.ko.nopatches

text   data bss dec hex filename
2530774  174661   28288 2733723  29b69b fs/btrfs/btrfs.ko.patches

So initially there is an increase of 176 bytes in the module but
hopefully this will go down.

> 
> I'll do a testing merge on top of current for-next to see how intrusive
> it is. If it turns out to be ok, I'll add the patches to the cleanups
> branch.
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Lsf-pc] [LSF/MM TOPIC] sharing pages between mappings

2017-01-11 Thread Darrick J. Wong
On Wed, Jan 11, 2017 at 12:51:43PM +0100, Jan Kara wrote:
> On Wed 11-01-17 11:29:28, Miklos Szeredi wrote:
> > I know there's work on this for xfs, but could this be done in generic mm
> > code?
> > 
> > What are the obstacles?  page->mapping and page->index are the obvious
> > ones.
> 
> Yes, these two are the main that come to my mind. Also you'd need to
> somehow share the mapping->i_mmap tree so that unmap_mapping_range() works.
> 
> > If that's too difficult is it maybe enough to share mappings between
> > files while they are completely identical and clone the mapping when
> > necessary?
> 
> Well, but how would the page->mapping->host indirection work? Even if you
> have identical contents of the mappings, you still need to be aware there
> are several inodes behind them and you need to pick the right one
> somehow...
> 
> > All COW filesystems would benefit, as well as layered ones: lots of
> > fuse fs, and in some cases overlayfs too.
> > 
> > Related:  what can DAX do in the presence of cloned block?
> 
> For DAX handling a block COW should be doable if that is what you are
> asking about. Handling of blocks that can be written to while they are
> shared will be rather difficult (you have problems with keeping dirty bits
> in the radix tree consistent if nothing else).

I'm also interested in this topic, though I haven't gotten any further
than a hand-wavy notion of handling cow by allocating new blocks, memcpy
the contents to the new blocks (how?), then update the mappings to point
to the new blocks (how?).  It looks a lot easier now with the iomap
stuff, but that's as far as I got. :)

(IOWs it basically took all the time since the last LSF to get reflink
polished enough to handle regular files reasonably well.)

--D

> 
>   Honza
> -- 
> Jan Kara 
> SUSE Labs, CR
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


btrfs: account_page_dirtied -> __percpu_counter_add kernel crash

2017-01-11 Thread Angel Shtilianov
Hi all,
I am observing periodic crashes with signature below on kernel 4.4.26.
wb is extracted from page (see mm/page-writeback.c, void
account_page_dirtied() ):
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
We are crashing in
__inc_wb_stat(wb, WB_RECLAIMABLE), which calls __add_wb_stat(wb, item,
1), which then calls  __percpu_counter_add(>stat[item], amount,
WB_STAT_BATCH);

So actually the lock is:
wb->stat[WB_RECLAIMABLE].lock

[6716239.938412] BUG: unable to handle kernel paging request at 00015e9a
[6716239.938782] IP: [] queued_spin_lock_slowpath+0xe5/0x160
[6716239.939076] PGD 16b070067 PUD 2cea00067 PMD 0
[6716239.939485] Oops: 0002 [#1] SMP
[6716239.939834] Modules linked in: xt_multiport dm_snapshot
dm_thin_pool dm_bio_prison dm_persistent_data dm_bufio btrfs raid6_pq
xor loop iptable_mangle iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4
nf_nat_ipv4 nf_nat xt_CT iptable_raw nf_conntrack_ipv6 nf_defrag_ipv6
xt_state nf_conntrack ip6table_filter ip6_tables zfs(PO) zavl(PO)
zcommon(PO) znvpair(PO) spl(O) zunicode(PO) ext2 ib_umad sb_edac
edac_core i2c_i801 lpc_ich mfd_core shpchp ioatdma igb i2c_algo_bit
ses enclosure ipmi_devintf ipmi_si ipmi_msghandler tcp_scalable ib_qib
dca ib_mad ib_core ib_addr ipv6
[6716239.944558] CPU: 7 PID: 23338 Comm: btrfs Tainted: PW  O
  4.4.26-clouder1 #3
[6716239.944950] Hardware name: Supermicro X10DRi/X10DRi, BIOS 1.1 04/14/2015
[6716239.945184] task: 88046cec6e00 ti: 8801a8f1c000 task.ti:
8801a8f1c000
[6716239.945570] RIP: 0010:[]  []
queued_spin_lock_slowpath+0xe5/0x160
[6716239.946025] RSP: 0018:8801a8f1f9a8  EFLAGS: 00010006
[6716239.946253] RAX: 19bb RBX: 880466f363a0 RCX:
00015e9a
[6716239.946639] RDX: 88047fcf5b00 RSI: 0020 RDI:
880466f363a0
[6716239.947036] RBP: 8801a8f1f9a8 R08: 0001 R09:

[6716239.947420] R10: 88026d966210 R11:  R12:
0097
[6716239.947802] R13: feff R14: 88017d5e1d68 R15:
88047f881000
[6716239.952193] FS:  7f99e3058880() GS:88047fce()
knlGS:
[6716239.952582] CS:  0010 DS:  ES:  CR0: 80050033
[6716239.952810] CR2: 00015e9a CR3: 00032bd3a000 CR4:
001406e0
[6716239.953190] Stack:
[6716239.953407]  8801a8f1f9c8 81614ed0 0102
880466f363a0
[6716239.954038]  8801a8f1f9f0 8131d1b0 880466f36340

[6716239.954669]  ea000f44d640 8801a8f1fa28 811353a6
ea000f44d640
[6716239.955298] Call Trace:
[6716239.955521]  [] _raw_spin_lock_irqsave+0x40/0x50
[6716239.955753]  [] __percpu_counter_add+0x40/0x70
[6716239.955982]  [] account_page_dirtied+0xb6/0x1a0
[6716239.956209]  [] __set_page_dirty_nobuffers+0x81/0x140
[6716239.956458]  [] btrfs_set_page_dirty+0xe/0x10 [btrfs]
[6716239.956690]  [] set_page_dirty+0x3d/0x60
[6716239.956928]  [] btrfs_dirty_pages+0x79/0xa0 [btrfs]
[6716239.957184]  []
__btrfs_write_out_cache.isra.23+0x37b/0x420 [btrfs]
[6716239.957577]  [] btrfs_write_out_cache+0x8a/0xf0 [btrfs]
[6716239.957816]  []
btrfs_start_dirty_block_groups+0x1ed/0x3f0 [btrfs]
[6716239.958210]  []
btrfs_commit_transaction+0x14e/0xa60 [btrfs]
[6716239.958602]  [] ? start_transaction+0x9a/0x4e0 [btrfs]
[6716239.958842]  [] btrfs_mksubvol+0x4ce/0x4e0 [btrfs]
[6716239.959070]  [] ? wait_woken+0xb0/0xb0
[6716239.959304]  []
btrfs_ioctl_snap_create_transid+0x18f/0x1a0 [btrfs]
[6716239.959707]  []
btrfs_ioctl_snap_create_v2+0x107/0x170 [btrfs]
[6716239.960102]  [] btrfs_ioctl+0x171a/0x2710 [btrfs]
[6716239.960330]  [] ? handle_mm_fault+0xca2/0x19c0
[6716239.960557]  [] do_vfs_ioctl+0x30f/0x560
[6716239.960786]  [] SyS_ioctl+0x79/0x90
[6716239.961012]  [] entry_SYSCALL_64_fastpath+0x16/0x6e
[6716239.961238] Code: 87 47 02 c1 e0 10 85 c0 74 3d 48 89 c1 c1 e8 12
48 c1 e9 0c 83 e8 01 83 e1 30 48 98 48 81 c1 00 5b 01 00 48 03 0c c5
40 d4 cd 81 <48> 89 11 8b 42 08 85 c0 75 12 f3 90 8b 42 08 85 c0 74 f7
8b 0f
[6716239.965810] RIP  [] queued_spin_lock_slowpath+0xe5/0x160
[6716239.966100]  RSP 
[6716239.966319] CR2: 00015e9a

Has someone seen something like that ?

Best regards,
Angel
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 00/12] Refactor btrfs_inode VS inode in delayed-inode.c

2017-01-11 Thread David Sterba
On Tue, Jan 10, 2017 at 08:35:30PM +0200, Nikolay Borisov wrote:
> After following the discussion in [1] I took a look at what's the 
> state of VFS-related members being used in core BTRFS code. It turned 
> out there are quite a few functions which operate on struct btrfs_inode, 
> yet take struct inode. As a result they have to resort to excessive 
> usage of BTRFS_I, furthermore passing inode around doesn't help the 
> poor reader inferring why inode might be passed to a particular function. 
> 
> In order to better separate core btrfs functionalities from those part,
> which interface with the VFS I took a look around the code and this is 
> the result. I'd like to solicit opinions whether people think this 
> refactoring is useful, since I have gathered a list of a lot more
> functions which might use a bit of inode VS btrfs_inode changes. Also, 
> a lot of function take inode just because btrfs_ino was taking an inode.

Agreed, this is a good direction how to clean up the code.

> The patches are self-explanatory, with the first one dealing with 
> btrfs_ino being the bulk of it. This paves the way to restructuring 
> a lot of functions. 
> 
> If the maintainers think this should be merged I'd rather resend it 
> as a single patch so as not to pollute the git history. This 
> version can be used for fine-grained discussion and feedback. 

Actually I like the way it's separated as it keeps the review easy, it
keeps the context in one function and does one change.

It would be interesting to see the result as reported by the 'size'
utility before and after the patchset, the effects of removed BTRFS_I
calls.

I'll do a testing merge on top of current for-next to see how intrusive
it is. If it turns out to be ok, I'll add the patches to the cleanups
branch.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] etrfs: fix up misleading GFP_NOFS usage in btrfs_releasepage

2017-01-11 Thread Michal Hocko
On Wed 11-01-17 14:55:50, David Sterba wrote:
> On Mon, Jan 09, 2017 at 03:39:02PM +0100, Michal Hocko wrote:
> > From: Michal Hocko 
> > 
> > b335b0034e25 ("Btrfs: Avoid using __GFP_HIGHMEM with slab allocator")
> > has reduced the allocation mask in btrfs_releasepage to GFP_NOFS just
> > to prevent from giving an unappropriate gfp mask to the slab allocator
> > deeper down the callchain (in alloc_extent_state). This is wrong for
> > two reasons a) GFP_NOFS might be just too restrictive for the calling
> > context b) it is better to tweak the gfp mask down when it needs that.
> > 
> > So just remove the mask tweaking from btrfs_releasepage and move it
> > down to alloc_extent_state where it is needed.
> > 
> > Signed-off-by: Michal Hocko 
> > ---
> >  fs/btrfs/extent_io.c | 5 +
> >  fs/btrfs/inode.c | 2 +-
> >  2 files changed, 6 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> > index b38150eec6b4..f6ae94a4acad 100644
> > --- a/fs/btrfs/extent_io.c
> > +++ b/fs/btrfs/extent_io.c
> > @@ -226,6 +226,11 @@ static struct extent_state *alloc_extent_state(gfp_t 
> > mask)
> >  {
> > struct extent_state *state;
> >  
> > +   /*
> > +* The given mask might be not appropriate for the slab allocator,
> > +* drop the unsupported bits
> > +*/
> > +   mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
> 
> Is this future proof enough? As it's enumerating some gfp flags, what if
> more are necessary in the future? I'm interested about some synthetic
> gfp flags that would not require knowledge about what is or is not
> acceptable for slab allocator.

Well, I agree, that something like slab_restrict_gfp_mask(gfp_t gfp_mask)
would be much better. And in fact that sounds like a nice future
cleanup. I haven't checked how many users would find it useful yet but I
am putting that on my todo list.

> But otherwise looks ok to me, I'm going to merge the patch. Thanks.

Thanks!

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] etrfs: fix up misleading GFP_NOFS usage in btrfs_releasepage

2017-01-11 Thread Michal Hocko
On Wed 11-01-17 14:55:50, David Sterba wrote:
[...]
> But otherwise looks ok to me, I'm going to merge the patch. Thanks.

I have only now noticed typo in the subject. s@etrfs:@btrfs:@

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs: drop gfp mask tweaking in try_release_extent_state

2017-01-11 Thread David Sterba
On Mon, Jan 09, 2017 at 03:39:03PM +0100, Michal Hocko wrote:
> From: Michal Hocko 
> 
> try_release_extent_state reduces the gfp mask to GFP_NOFS if it is
> compatible. This is true for GFP_KERNEL as well. There is no real
> reason to do that though. There is no new lock taken down the call
> chain; the only consumer of the gfp mask is
> try_release_extent_state
>   clear_extent_bit
> __clear_extent_bit
>   alloc_extent_state
> 
> So this seems just unnecessary and confusing.
> 
> Signed-off-by: Michal Hocko 

Reviewed-by: David Sterba 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Lsf-pc] [LSF/MM TOPIC] sharing pages between mappings

2017-01-11 Thread Miklos Szeredi
On Wed, Jan 11, 2017 at 12:51 PM, Jan Kara  wrote:
> On Wed 11-01-17 11:29:28, Miklos Szeredi wrote:
>> I know there's work on this for xfs, but could this be done in generic mm
>> code?
>>
>> What are the obstacles?  page->mapping and page->index are the obvious
>> ones.
>
> Yes, these two are the main that come to my mind. Also you'd need to
> somehow share the mapping->i_mmap tree so that unmap_mapping_range() works.
>
>> If that's too difficult is it maybe enough to share mappings between
>> files while they are completely identical and clone the mapping when
>> necessary?
>
> Well, but how would the page->mapping->host indirection work? Even if you
> have identical contents of the mappings, you still need to be aware there
> are several inodes behind them and you need to pick the right one
> somehow...

When do we actually need page->mapping->host?  The only place where
it's not available is page writeback.  Then we can know that the
original page was already cow-ed and after being cowed, the page
belongs only to a single inode.

What then happens if the newly written data is cloned before being
written back?   We can either write back the page during the clone, so
that only clean pages are ever shared.  Or we can let dirty pages be
shared between inodes.  In that latter case the question is: do we
care about which inode we use for writing back the data?  Is the inode
needed at all?  I don't know enough about filesystem internals to see
clearly what happens in such a situation.

>> All COW filesystems would benefit, as well as layered ones: lots of
>> fuse fs, and in some cases overlayfs too.
>>
>> Related:  what can DAX do in the presence of cloned block?
>
> For DAX handling a block COW should be doable if that is what you are
> asking about. Handling of blocks that can be written to while they are
> shared will be rather difficult (you have problems with keeping dirty bits
> in the radix tree consistent if nothing else).

What happens if you do:

- clone_file_range(A, off1, B, off2, len);

- mmap both A and B using DAX.

The mapping will contain the same struct page for two different mappings, no?

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Lsf-pc] [LSF/MM TOPIC] sharing pages between mappings

2017-01-11 Thread Jan Kara
On Wed 11-01-17 11:29:28, Miklos Szeredi wrote:
> I know there's work on this for xfs, but could this be done in generic mm
> code?
> 
> What are the obstacles?  page->mapping and page->index are the obvious
> ones.

Yes, these two are the main that come to my mind. Also you'd need to
somehow share the mapping->i_mmap tree so that unmap_mapping_range() works.

> If that's too difficult is it maybe enough to share mappings between
> files while they are completely identical and clone the mapping when
> necessary?

Well, but how would the page->mapping->host indirection work? Even if you
have identical contents of the mappings, you still need to be aware there
are several inodes behind them and you need to pick the right one
somehow...

> All COW filesystems would benefit, as well as layered ones: lots of
> fuse fs, and in some cases overlayfs too.
> 
> Related:  what can DAX do in the presence of cloned block?

For DAX handling a block COW should be doable if that is what you are
asking about. Handling of blocks that can be written to while they are
shared will be rather difficult (you have problems with keeping dirty bits
in the radix tree consistent if nothing else).

Honza
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[LSF/MM TOPIC] sharing pages between mappings

2017-01-11 Thread Miklos Szeredi
I know there's work on this for xfs, but could this be done in generic mm code?

What are the obstacles?  page->mapping and page->index are the obvious ones.

If that's too difficult is it maybe enough to share mappings between
files while they are completely identical and clone the mapping when
necessary?

All COW filesystems would benefit, as well as layered ones: lots of
fuse fs, and in some cases overlayfs too.

Related:  what can DAX do in the presence of cloned block?

Thanks,
Miklos
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/2] btrfs-progs: cmds-check.c: supports inode isize fix in lowmem

2017-01-11 Thread Su Yue
Add a function 'repair_inode_isize' to support inode isize repair.

Signed-off-by: Su Yue 
---
 cmds-check.c | 49 -
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index dad10cb..6947420 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -2458,6 +2458,45 @@ out:
 }
 
 /*
+ * Set inode's isize to correct value in @info
+ *
+ * Returns <0  means on error
+ * Returns  0  means successful repair
+ */
+static int repair_inode_isize_lowmem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode_item_fix_info *info)
+{
+   struct btrfs_inode_item *ei;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   int ret;
+
+   ASSERT(info);
+   key.objectid = info->ino;
+   key.type = BTRFS_INODE_ITEM_KEY;
+   key.offset = 0;
+
+   ret = btrfs_search_slot(trans, root, , , 0, 1);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_inode_item);
+   btrfs_set_inode_size(path.nodes[0], ei, info->isize);
+   btrfs_mark_buffer_dirty(path.nodes[0]);
+   printf("reset isize for inode %llu root %llu\n", info->ino,
+  root->root_key.objectid);
+out:
+   btrfs_release_path();
+   return ret;
+}
+
+/*
  * repair_inode_item - repair inode item errors
  *
  * Repair the inode item if error can be repaired. Any caller should compare
@@ -2485,7 +2524,7 @@ static int repair_inode_item(struct btrfs_root *root,
ret = 0;
goto out;
}
-   if (!(err & NBYTES_ERROR)) {
+   if (!(err & NBYTES_ERROR) && !(err & ISIZE_ERROR)) {
warning("root %llu INODE[%llu] have error(s) can't repair, 
error : %d",
root->objectid, info->ino, err);
/* can't fix any errors, ret should be positive */
@@ -2506,6 +2545,13 @@ static int repair_inode_item(struct btrfs_root *root,
else if (ret < 0)
goto out;
}
+   if (err & ISIZE_ERROR) {
+   ret = repair_inode_isize_lowmem(trans, root, info);
+   if (ret == 0)
+   err &= ~ISIZE_ERROR;
+   else if (ret < 0)
+   goto out;
+   }
 
if (err != info->err) {
info->err = err;
@@ -5040,6 +5086,7 @@ out:
 
if (isize != size) {
err |= ISIZE_ERROR;
+   info->isize = size;
error("root %llu DIR INODE [%llu] size(%llu) not equal 
to %llu",
  root->objectid, inode_id, isize, size);
}
-- 
2.11.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/2] btrfs-progs: cmds-check.c: supports inode nbytes fix in lowmem

2017-01-11 Thread Su Yue
Added 'repair_inode_item' which dispatches functions such as
'repair_inode_nbytes_lowmem' to correct errors and
'struct inode_item_fix_info' to store correct values and errors.

v2:
   reassign err to info.err in process_one_leaf.
   
Signed-off-by: Su Yue 
---
 cmds-check.c | 166 +++
 1 file changed, 155 insertions(+), 11 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 1dba298..dad10cb 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -371,6 +371,17 @@ struct root_item_info {
 };
 
 /*
+ * Use inode_item_fix_info as function check_inode_item's arg.
+ */
+struct inode_item_fix_info {
+   u64 ino;
+   u64 isize;
+   u64 nbytes;
+
+   int err;
+};
+
+/*
  * Error bit for low memory mode check.
  *
  * Currently no caller cares about it yet.  Just internal use for error
@@ -1866,13 +1877,16 @@ struct node_refs {
 static int update_nodes_refs(struct btrfs_root *root, u64 bytenr,
 struct node_refs *nrefs, u64 level);
 static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path,
-   unsigned int ext_ref);
-
+   unsigned int ext_ref,
+   struct inode_item_fix_info *info);
+static int repair_inode_item(struct btrfs_root *root,
+struct inode_item_fix_info *info);
 static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path 
*path,
   struct node_refs *nrefs, int *level, int ext_ref)
 {
struct extent_buffer *cur = path->nodes[0];
struct btrfs_key key;
+   struct inode_item_fix_info info;
u64 cur_bytenr;
u32 nritems;
u64 first_ino = 0;
@@ -1881,6 +1895,7 @@ static int process_one_leaf_v2(struct btrfs_root *root, 
struct btrfs_path *path,
int ret = 0; /* Final return value */
int err = 0; /* Positive error bitmap */
 
+   memset(, 0, sizeof(info));
cur_bytenr = cur->start;
 
/* skip to first inode item or the first inode number change */
@@ -1900,8 +1915,27 @@ static int process_one_leaf_v2(struct btrfs_root *root, 
struct btrfs_path *path,
path->slots[0] = i;
 
 again:
-   err |= check_inode_item(root, path, ext_ref);
+   err |= check_inode_item(root, path, ext_ref, );
+
+   if (repair && (err & ~LAST_ITEM)) {
+   ret = repair_inode_item(root, );
 
+   if (ret < 0)
+   goto out;
+   /*
+* if some errors was repaired, path shall be searched
+* again since path has been changed
+*/
+   if (ret == 0) {
+   btrfs_item_key_to_cpu(path->nodes[0], ,
+ path->slots[0]);
+   btrfs_release_path(path);
+   btrfs_search_slot(NULL, root, , path, 0, 0);
+
+   cur = path->nodes[0];
+   err = info.err;
+   }
+   }
if (err & LAST_ITEM)
goto out;
 
@@ -2211,7 +2245,8 @@ out:
 }
 
 static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path,
-   unsigned int ext_ref);
+   unsigned int ext_ref,
+   struct inode_item_fix_info *info);
 
 static int walk_down_tree_v2(struct btrfs_root *root, struct btrfs_path *path,
 int *level, struct node_refs *nrefs, int ext_ref)
@@ -2293,7 +2328,7 @@ static int walk_down_tree_v2(struct btrfs_root *root, 
struct btrfs_path *path,
}
 
ret = check_child_node(root, cur, path->slots[*level], next);
-   if (ret < 0) 
+   if (ret < 0)
break;
 
if (btrfs_is_leaf(next))
@@ -2383,6 +2418,105 @@ out:
return ret;
 }
 
+/*
+ * Set inode's nbytes to correct value in @info
+ *
+ * Returns <0  means on error
+ * Returns  0  means successful repair
+ */
+static int repair_inode_nbytes_lowmem(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode_item_fix_info *info)
+{
+   struct btrfs_inode_item *ei;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   int ret;
+
+   ASSERT(info);
+   key.objectid = info->ino;
+   key.type = BTRFS_INODE_ITEM_KEY;
+   key.offset = 0;
+
+   ret = btrfs_search_slot(trans, root, , , 0, 1);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_inode_item);
+   btrfs_set_inode_nbytes(path.nodes[0], ei, info->nbytes);
+   btrfs_mark_buffer_dirty(path.nodes[0]);
+