[PATCH v3 1/3] btrfs-progs: cmds-check.c: supports inode nbytes fix in lowmem
Added 'repair_inode_item' which dispatches functions such as 'repair_inode__nbytes_lowmem' to correct errors and 'struct inode_item_fix_info' to store correct values and errors. Signed-off-by: Su Yue--- v2: reassign err to info.err after repaired in process_one_leaf_v2 v3: none --- cmds-check.c | 166 +++ 1 file changed, 155 insertions(+), 11 deletions(-) diff --git a/cmds-check.c b/cmds-check.c index 1dba298..dad10cb 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -371,6 +371,17 @@ struct root_item_info { }; /* + * Use inode_item_fix_info as function check_inode_item's arg. + */ +struct inode_item_fix_info { + u64 ino; + u64 isize; + u64 nbytes; + + int err; +}; + +/* * Error bit for low memory mode check. * * Currently no caller cares about it yet. Just internal use for error @@ -1866,13 +1877,16 @@ struct node_refs { static int update_nodes_refs(struct btrfs_root *root, u64 bytenr, struct node_refs *nrefs, u64 level); static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path, - unsigned int ext_ref); - + unsigned int ext_ref, + struct inode_item_fix_info *info); +static int repair_inode_item(struct btrfs_root *root, +struct inode_item_fix_info *info); static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path *path, struct node_refs *nrefs, int *level, int ext_ref) { struct extent_buffer *cur = path->nodes[0]; struct btrfs_key key; + struct inode_item_fix_info info; u64 cur_bytenr; u32 nritems; u64 first_ino = 0; @@ -1881,6 +1895,7 @@ static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path *path, int ret = 0; /* Final return value */ int err = 0; /* Positive error bitmap */ + memset(, 0, sizeof(info)); cur_bytenr = cur->start; /* skip to first inode item or the first inode number change */ @@ -1900,8 +1915,27 @@ static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path *path, path->slots[0] = i; again: - err |= check_inode_item(root, path, ext_ref); + err |= check_inode_item(root, path, ext_ref, 
); + + if (repair && (err & ~LAST_ITEM)) { + ret = repair_inode_item(root, ); + if (ret < 0) + goto out; + /* +* if some errors was repaired, path shall be searched +* again since path has been changed +*/ + if (ret == 0) { + btrfs_item_key_to_cpu(path->nodes[0], , + path->slots[0]); + btrfs_release_path(path); + btrfs_search_slot(NULL, root, , path, 0, 0); + + cur = path->nodes[0]; + err = info.err; + } + } if (err & LAST_ITEM) goto out; @@ -2211,7 +2245,8 @@ out: } static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path, - unsigned int ext_ref); + unsigned int ext_ref, + struct inode_item_fix_info *info); static int walk_down_tree_v2(struct btrfs_root *root, struct btrfs_path *path, int *level, struct node_refs *nrefs, int ext_ref) @@ -2293,7 +2328,7 @@ static int walk_down_tree_v2(struct btrfs_root *root, struct btrfs_path *path, } ret = check_child_node(root, cur, path->slots[*level], next); - if (ret < 0) + if (ret < 0) break; if (btrfs_is_leaf(next)) @@ -2383,6 +2418,105 @@ out: return ret; } +/* + * Set inode's nbytes to correct value in @info + * + * Returns <0 means on error + * Returns 0 means successful repair + */ +static int repair_inode_nbytes_lowmem(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode_item_fix_info *info) +{ + struct btrfs_inode_item *ei; + struct btrfs_key key; + struct btrfs_path path; + int ret; + + ASSERT(info); + key.objectid = info->ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, , , 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ei = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_inode_item); + btrfs_set_inode_nbytes(path.nodes[0], ei, info->nbytes); +
[PATCH v3 3/3] btrfs-progs: fsck-tests/016: lowmem mode check for images
Since lowmem mode can repair inode nbytes error now, modify this test case to allow lowmem mode repair. Signed-off-by: Su Yue--- v3: add this patch. --- tests/fsck-tests/016-wrong-inode-nbytes/test.sh | 33 + 1 file changed, 33 insertions(+) create mode 100755 tests/fsck-tests/016-wrong-inode-nbytes/test.sh diff --git a/tests/fsck-tests/016-wrong-inode-nbytes/test.sh b/tests/fsck-tests/016-wrong-inode-nbytes/test.sh new file mode 100755 index 000..f8466cb --- /dev/null +++ b/tests/fsck-tests/016-wrong-inode-nbytes/test.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# check inode nbytes in both normal and lowmem mode + +source $TOP/tests/common + +check_prereq btrfs + +check_image() { + local image + local image2; + + image=$1 + image2=$image"_2" + cp "$image" "$image2" + + echo "testing image $(basename $image)" >> "$RESULTS" + "$TOP/btrfs" check "$image" >> "$RESULTS" 2>&1 + [ $? -eq 0 ] && _fail "btrfs check should have detected corruption" + + run_check "$TOP/btrfs" check --repair "$image" + run_check "$TOP/btrfs" check "$image" + + echo "testing image $(basename $image2)" >> "$RESULTS" + "$TOP/btrfs" check --mode=lowmem "$image2" >> "$RESULTS" 2>&1 + [ $? -eq 0 ] && _fail "btrfs lowmem check should detected corruption" + + run_check "$TOP/btrfs" check --mode=lowmem --repair "$image2" + run_check "$TOP/btrfs" check --mode=lowmem "$image2" + + rm "$image2" +} + +check_all_images -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 2/3] btrfs-progs: cmds-check.c: supports inode isize fix in lowmem
Add a function 'repair_inode_isize' to support inode isize repair. Signed-off-by: Su Yue--- v2: none v3: none --- cmds-check.c | 49 - 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/cmds-check.c b/cmds-check.c index dad10cb..6947420 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -2458,6 +2458,45 @@ out: } /* + * Set inode's isize to correct value in @info + * + * Returns <0 means on error + * Returns 0 means successful repair + */ +static int repair_inode_isize_lowmem(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode_item_fix_info *info) +{ + struct btrfs_inode_item *ei; + struct btrfs_key key; + struct btrfs_path path; + int ret; + + ASSERT(info); + key.objectid = info->ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, , , 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ei = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(path.nodes[0], ei, info->isize); + btrfs_mark_buffer_dirty(path.nodes[0]); + printf("reset isize for inode %llu root %llu\n", info->ino, + root->root_key.objectid); +out: + btrfs_release_path(); + return ret; +} + +/* * repair_inode_item - repair inode item errors * * Repair the inode item if error can be repaired. 
Any caller should compare @@ -2485,7 +2524,7 @@ static int repair_inode_item(struct btrfs_root *root, ret = 0; goto out; } - if (!(err & NBYTES_ERROR)) { + if (!(err & NBYTES_ERROR) && !(err & ISIZE_ERROR)) { warning("root %llu INODE[%llu] have error(s) can't repair, error : %d", root->objectid, info->ino, err); /* can't fix any errors, ret should be positive */ @@ -2506,6 +2545,13 @@ static int repair_inode_item(struct btrfs_root *root, else if (ret < 0) goto out; } + if (err & ISIZE_ERROR) { + ret = repair_inode_isize_lowmem(trans, root, info); + if (ret == 0) + err &= ~ISIZE_ERROR; + else if (ret < 0) + goto out; + } if (err != info->err) { info->err = err; @@ -5040,6 +5086,7 @@ out: if (isize != size) { err |= ISIZE_ERROR; + info->isize = size; error("root %llu DIR INODE [%llu] size(%llu) not equal to %llu", root->objectid, inode_id, isize, size); } -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: corruption: yet another one after deleting a ro snapshot
At 01/12/2017 10:28 AM, Christoph Anton Mitterer wrote: Hey Qu, On Thu, 2017-01-12 at 09:25 +0800, Qu Wenruo wrote: And since you just deleted a subvolume and unmount it soon Indeed, I unmounted it pretty quickly afterwards... I had mounted it (ro) in the meantime, and did a whole find mntoint > /dev/null on it just to see whether going through the file hierarchy causes any kernel errors already. There are about 1,2 million files on the fs (in now only one snapshot) and that took some 3-5 mins... Not sure whether it continues to delete the subvol when it's mounted ro... if so, it would have had some time. IIRC, RO mount won't continue background deletion. So the fsck result won't change. However, another fsck afterwards: # btrfs check /dev/mapper/data-a3 ; echo $? Checking filesystem on /dev/mapper/data-a3 UUID: 326d292d-f97b-43ca-b1e8-c722d3474719 checking extents ref mismatch on [37765120 16384] extent item 0, found 1 Backref 37765120 parent 6403 root 6403 not found in extent tree backpointer mismatch on [37765120 16384] owner ref check failed [37765120 16384] ref mismatch on [5120 16384] extent item 0, found 1 Backref 5120 parent 6403 root 6403 not found in extent tree backpointer mismatch on [5120 16384] owner ref check failed [5120 16384] ref mismatch on [78135296 16384] extent item 0, found 1 Backref 78135296 parent 6403 root 6403 not found in extent tree backpointer mismatch on [78135296 16384] owner ref check failed [78135296 16384] ref mismatch on [5960381235200 16384] extent item 0, found 1 Backref 5960381235200 parent 6403 root 6403 not found in extent tree backpointer mismatch on [5960381235200 16384] checking free space cache checking fs roots checking csums checking root refs found 7483995824128 bytes used err is 0 total csum bytes: 7296183880 total tree bytes: 10875944960 total fs tree bytes: 2035286016 total extent tree bytes: 1015988224 btree space waste bytes: 920641324 file data blocks allocated: 8267656339456 referenced 8389440876544 0 , I 
assume the btrfs is still doing background subvolume deletion, maybe it's just a false alert from btrfsck. If one deleted a subvol and unmounts too fast, will this already cause a corruption or does btrfs simply continue to cleanup during the next time(s) it's mounted? It will continue the deletion on next RW mount. But, I'm still not sure whether it's a false alert or a *REAL* corruption. Even it may cause problem and corrupt your data, I still hope you could do a rw mount and trigger a btrfs fi sync. If it's a false alert, we can fix it then with ease. Or, it's a really big problem. Would you please try btrfs check --mode=lowmem using latest btrfs- progs? Here we go, however still with v4.7.3: # btrfs check --mode=lowmem /dev/mapper/data-a3 ; echo $? Checking filesystem on /dev/mapper/data-a3 UUID: 326d292d-f97b-43ca-b1e8-c722d3474719 checking extents ERROR: block group[74117545984 1073741824] used 1073741824 but extent items used 0 Errr, lowmem mode is much restrict on this case then. Quite some block groups has mismatch used space. But according to the same used number, I assume it's a lowmmem mode bug. Would you please try 4.9 btrfs-progs? 
ERROR: block group[239473786880 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[500393050112 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[581997428736 1073741824] used 1073741824 but extent items used 0 ERROR: block group[626557714432 1073741824] used 1073741824 but extent items used 0 ERROR: block group[668433645568 1073741824] used 1073741824 but extent items used 0 ERROR: block group[948680261632 1073741824] used 1073741824 but extent items used 0 ERROR: block group[982503129088 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1039411445760 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1054443831296 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[1190809042944 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1279392743424 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1481256206336 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1620842643456 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[1914511032320 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[3055361720320 1073741824] used 1073741824 but extent items used 0 ERROR: block group[3216422993920 1073741824] used 1073741824 but extent items used 0 ERROR: block group[3670615785472 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[3801612288000 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[3828455833600 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[4250973241344 1073741824] used 1073741824 but extent items used 0
Re: corruption: yet another one after deleting a ro snapshot
Hey Qu, On Thu, 2017-01-12 at 09:25 +0800, Qu Wenruo wrote: > And since you just deleted a subvolume and unmount it soon Indeed, I unmounted it pretty quickly afterwards... I had mounted it (ro) in the meantime, and did a whole find mntoint > /dev/null on it just to see whether going through the file hierarchy causes any kernel errors already. There are about 1,2 million files on the fs (in now only one snapshot) and that took some 3-5 mins... Not sure whether it continues to delete the subvol when it's mounted ro... if so, it would have had some time. However, another fsck afterwards: # btrfs check /dev/mapper/data-a3 ; echo $? Checking filesystem on /dev/mapper/data-a3 UUID: 326d292d-f97b-43ca-b1e8-c722d3474719 checking extents ref mismatch on [37765120 16384] extent item 0, found 1 Backref 37765120 parent 6403 root 6403 not found in extent tree backpointer mismatch on [37765120 16384] owner ref check failed [37765120 16384] ref mismatch on [5120 16384] extent item 0, found 1 Backref 5120 parent 6403 root 6403 not found in extent tree backpointer mismatch on [5120 16384] owner ref check failed [5120 16384] ref mismatch on [78135296 16384] extent item 0, found 1 Backref 78135296 parent 6403 root 6403 not found in extent tree backpointer mismatch on [78135296 16384] owner ref check failed [78135296 16384] ref mismatch on [5960381235200 16384] extent item 0, found 1 Backref 5960381235200 parent 6403 root 6403 not found in extent tree backpointer mismatch on [5960381235200 16384] checking free space cache checking fs roots checking csums checking root refs found 7483995824128 bytes used err is 0 total csum bytes: 7296183880 total tree bytes: 10875944960 total fs tree bytes: 2035286016 total extent tree bytes: 1015988224 btree space waste bytes: 920641324 file data blocks allocated: 8267656339456 referenced 8389440876544 0 > , I assume > the > btrfs is still doing background subvolume deletion, maybe it's just > a > false alert from btrfsck. 
If one deleted a subvol and unmounts too fast, will this already cause a corruption or does btrfs simply continue to cleanup during the next time(s) it's mounted? > Would you please try btrfs check --mode=lowmem using latest btrfs- > progs? Here we go, however still with v4.7.3: # btrfs check --mode=lowmem /dev/mapper/data-a3 ; echo $? Checking filesystem on /dev/mapper/data-a3 UUID: 326d292d-f97b-43ca-b1e8-c722d3474719 checking extents ERROR: block group[74117545984 1073741824] used 1073741824 but extent items used 0 ERROR: block group[239473786880 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[500393050112 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[581997428736 1073741824] used 1073741824 but extent items used 0 ERROR: block group[626557714432 1073741824] used 1073741824 but extent items used 0 ERROR: block group[668433645568 1073741824] used 1073741824 but extent items used 0 ERROR: block group[948680261632 1073741824] used 1073741824 but extent items used 0 ERROR: block group[982503129088 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1039411445760 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1054443831296 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[1190809042944 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1279392743424 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1481256206336 1073741824] used 1073741824 but extent items used 0 ERROR: block group[1620842643456 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[1914511032320 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[3055361720320 1073741824] used 1073741824 but extent items used 0 ERROR: block group[3216422993920 1073741824] used 1073741824 but extent items used 0 ERROR: block group[3670615785472 1073741824] used 1073741824 but extent items 
used 1207959552 ERROR: block group[3801612288000 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[3828455833600 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[4250973241344 1073741824] used 1073741824 but extent items used 0 ERROR: block group[4261710659584 1073741824] used 1073741824 but extent items used 1074266112 ERROR: block group[4392707162112 1073741824] used 1073741824 but extent items used 0 ERROR: block group[4558063403008 1073741824] used 1073741824 but extent items used 0 ERROR: block group[4607455526912 1073741824] used 1073741824 but extent items used 0 ERROR: block group[4635372814336 1073741824] used 1073741824 but extent items used 0 ERROR: block group[4640204652544 1073741824] used 1073741824 but extent items used 0 ERROR: block group[4642352136192 1073741824] used 1073741824 but extent items used 1207959552 ERROR: block group[4681006841856 1073741824] used 1073741824 but
Re: corruption: yet another one after deleting a ro snapshot
At 01/12/2017 09:07 AM, Christoph Anton Mitterer wrote: Hey. Linux heisenberg 4.8.0-2-amd64 #1 SMP Debian 4.8.15-2 (2017-01-04) x86_64 GNU/Linux btrfs-progs v4.7.3 I've had this already at least once some year ago or so: I was doing backups (incremental via send/receive). After everything was copied, I unmounted the destination fs, made a fsck, all fine. Then I mounted it again and did nothing but deleting the old snapshot. After that, another fsck with the following errors: According to the messages, some tree blocks has wrong extent backref. And since you just deleted a subvolume and unmount it soon, I assume the btrfs is still doing background subvolume deletion, maybe it's just a false alert from btrfsck. Would you please try btrfs check --mode=lowmem using latest btrfs-progs? Sometimes bugs in original mode are fixed in lowmem mode. And it's also recommended to call btrfs fi sync, then wait for some time (depending on the subvolume size) to allow btrfs to fully delete the subvolume, then try btrfsck. Thanks, Qu Usually I have quite positive experiences with btrfs (things seem to be fine even after a crash or accidental removal of the USB cable which attaches the HDD)... but I'm every time shocked again, when supposedly simple and basic operations like this cause such corruptions. Kinda gives one the feeling as if quite deep bugs are still everywhere in place, especially as such "hard to explain" errors happens every now and then (take e.g. my mails "strange btrfs deadlock", "csum errors during btrfs check" from the last days... and I don't seem to be the only one who suffers from such problems, even with the basic parts of btrfs which are considered to be stable - I mean we're not talking about RAID56 here)... 
sigh :-( While these files are precious, I have in total copies of all these files, 3 on btrfs and 1 on ext4 (just to be on the safe side if btrfs gets corrupted for no good reason :-( ) so I could do some debugging here if some developer tells me what to do. Anyway... what should I do to repair the fs? Or is it better to simply re-create that backup from scratch? Cheers, Chris. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: fsck-tests: corrupt nlink value test
At 01/11/2017 01:36 AM, lakshmipath...@giis.co.in wrote: What about submitting a btrfs-image and use generic test load? Okay, how to share the corrupted btrfs-image? using github? And also do you have references for this kind of setup under btrfs-progs/tests/? So that I can follow its model. At least I follow the following steps to create image: - Create a 256M file The default size of image. All zero content is recommended for high compression ratio. Both hole or fallocate is OK. - Make btrfs on that file - Fill the fs with minimum content Just enough files/dirs for next step. The smaller space it takes the better - Corrupt the fs Either using btrfs-corrupt-block or manually - Take the dump Either by btrfs-image or just use the raw image. It's recommended to use btrfs-image and with -c9 option, which can reduce the file size dramatically compared to xz raw image. But we must ensure the recovered image still has the same corruption, or we must use raw image. For raw image, xz it just like other tests. Normally submitted images are less than 100K, and that's small enough to send as patch. Thanks, Qu Cheers. Lakshmipathi.G -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: corruption: yet another one after deleting a ro snapshot
Oops forgot to copy and past the actual fsck output O:-) # btrfs check /dev/mapper/data-a3 ; echo $? Checking filesystem on /dev/mapper/data-a3 UUID: 326d292d-f97b-43ca-b1e8-c722d3474719 checking extents ref mismatch on [37765120 16384] extent item 0, found 1 Backref 37765120 parent 6403 root 6403 not found in extent tree backpointer mismatch on [37765120 16384] owner ref check failed [37765120 16384] ref mismatch on [5120 16384] extent item 0, found 1 Backref 5120 parent 6403 root 6403 not found in extent tree backpointer mismatch on [5120 16384] owner ref check failed [5120 16384] ref mismatch on [78135296 16384] extent item 0, found 1 Backref 78135296 parent 6403 root 6403 not found in extent tree backpointer mismatch on [78135296 16384] owner ref check failed [78135296 16384] ref mismatch on [5960381235200 16384] extent item 0, found 1 Backref 5960381235200 parent 6403 root 6403 not found in extent tree backpointer mismatch on [5960381235200 16384] checking free space cache checking fs roots checking csums checking root refs found 7483995824128 bytes used err is 0 total csum bytes: 7296183880 total tree bytes: 10875944960 total fs tree bytes: 2035286016 total extent tree bytes: 1015988224 btree space waste bytes: 920641324 file data blocks allocated: 8267656339456 referenced 8389440876544 0 Also I've found the previous occasion of the apparently same issue: https://www.spinics.net/lists/linux-btrfs/msg45190.html What's the suggested way in reporting bugs? Here on the list? kernel.org bugzilla? It's a bit worrying that even just I myself has reported quite a number of likely bugs here on the ML which never got a reaction from a developer and thus likely still sleep under to hood :-/ Cheers, Chris. smime.p7s Description: S/MIME cryptographic signature
corruption: yet another one after deleting a ro snapshot
Hey. Linux heisenberg 4.8.0-2-amd64 #1 SMP Debian 4.8.15-2 (2017-01-04) x86_64 GNU/Linux btrfs-progs v4.7.3 I've had this already at least once some year ago or so: I was doing backups (incremental via send/receive). After everything was copied, I unmounted the destination fs, made a fsck, all fine. Then I mounted it again and did nothing but deleting the old snapshot. After that, another fsck with the following errors: Usually I have quite positive experiences with btrfs (things seem to be fine even after a crash or accidental removal of the USB cable which attaches the HDD)... but I'm every time shocked again, when supposedly simple and basic operations like this cause such corruptions. Kinda gives one the feeling as if quite deep bugs are still everywhere in place, especially as such "hard to explain" errors happens every now and then (take e.g. my mails "strange btrfs deadlock", "csum errors during btrfs check" from the last days... and I don't seem to be the only one who suffers from such problems, even with the basic parts of btrfs which are considered to be stable - I mean we're not talking about RAID56 here)... sigh :-( While these files are precious, I have in total copies of all these files, 3 on btrfs and 1 on ext4 (just to be on the safe side if btrfs gets corrupted for no good reason :-( ) so I could do some debugging here if some developer tells me what to do. Anyway... what should I do to repair the fs? Or is it better to simply re-create that backup from scratch? Cheers, Chris. smime.p7s Description: S/MIME cryptographic signature
Re: Restart during btrfs balance raid1 conversion, now can only mount read-only
I solved the problem. After much futzing with mount options and trying check --repair, I ended up booting into an Antergos live USB, which runs kernel and tools 4.9, and was able to succesfully mount the btrfs volume there. This time, before shutting down, I paused the balance operation. (Not sure if that was necessary.) When I rebooted back into Debian, I was able to mount the device normally. On Wed, Jan 11, 2017 at 4:19 PM, Michael Boratkowrote: > I had an 8TB btrfs filesystem setup with my data, and added a 4TB and > 3TB drive to it and then ran a raid1 conversion using the command > btrfs balance start -dconvert=raid1 -mconvert=raid1 /mnt/btrfs > > It was running OK, but then I restarted the computer without pausing > the rebalance (I wasn't aware that would be necessary). Upon reboot, > the system got stuck on the mounting operation (as well as decryption > of two of the drives - all drives are setup with encryption and > automatically get unlocked via keyfiles at boot) and dropped me to a > root shell. I removed the line which was mounting the btrfs volume in > fstab, and rebooted successfully, however now when I attempt to mount > the btrfs volume normally it just hangs with no response, and the > command never terminates and cannot be killed. > > I am able to mount the drive read-only, but am unable to mount it as > read-write even when using the mount option "skip_balance". 
> > Relevant debug info: > uname -a > Linux gaussBonnetXeon 4.8.0-0.bpo.2-amd64 #1 SMP Debian > 4.8.11-1~bpo8+1 (2016-12-14) x86_64 GNU/Linux > > btrfs --version > btrfs-progs v4.7.3 > > btrfs fi show > Label: none uuid: d5c3f49c-3c69-4fd8-b9ab-272c0dbc1eab > Total devices 3 FS bytes used 2.61TiB > devid1 size 7.00TiB used 2.64TiB path > /dev/mapper/Seagate_Archive_8TB-btrfs > devid2 size 3.64TiB used 930.00GiB path /dev/mapper/4TB > devid3 size 2.73TiB used 0.00B path /dev/mapper/3TB > > btrfs fi df /mnt/btrfs > Data, RAID1: total=925.00GiB, used=912.85GiB > Data, single: total=1.71TiB, used=1.71TiB > System, DUP: total=8.00MiB, used=352.00KiB > Metadata, RAID1: total=5.00GiB, used=2.12GiB > Metadata, DUP: total=8.50GiB, used=7.61GiB > GlobalReserve, single: total=512.00MiB, used=1.05MiB > > dmesg (from when I try to mount as rw): > [ 3202.640391] BTRFS info (device dm-10): disk space caching is enabled > [ 3377.624283] BUG: unable to handle kernel paging request at fe10 > [ 3377.624293] IP: [] > qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs] > [ 3377.624341] PGD 111e09067 PUD 111e0b067 PMD 0 > [ 3377.624348] Oops: [#1] SMP > [ 3377.624352] Modules linked in: ipt_REJECT(E) nf_reject_ipv4(E) > rfcomm(E) xt_multiport(E) iptable_filter(E) ip_tables(E) x_tables(E) > binfmt_misc(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) > sunrpc(E) bnep(E) cfg80211(E) snd_hda_codec_hdmi(E) nvidia_drm(POE) > btusb(E) btrtl(E) drm_kms_helper(E) btbcm(E) btintel(E) drm(E) > bluetooth(E) iTCO_wdt(E) iTCO_vendor_support(E) coretemp(E) joydev(E) > kvm_intel(E) snd_hda_intel(E) snd_ca0106(E) rfkill(E) evdev(E) > snd_hda_codec(E) nvidia_modeset(POE) snd_rawmidi(E) kvm(E) > snd_seq_device(E) snd_hda_core(E) snd_ac97_codec(E) snd_hwdep(E) > i7core_edac(E) i2c_i801(E) snd_pcm(E) nvidia(POE) irqbypass(E) > pcspkr(E) edac_core(E) i2c_smbus(E) snd_timer(E) snd(E) soundcore(E) > ac97_bus(E) ipmi_si(E) ipmi_msghandler(E) shpchp(E) lpc_ich(E) > mfd_core(E) acpi_cpufreq(E) > [ 
3377.624400] button(E) tpm_tis(E) tpm_tis_core(E) tpm(E) fuse(E) > parport_pc(E) ppdev(E) lp(E) parport(E) autofs4(E) ext4(E) crc16(E) > jbd2(E) fscrypto(E) mbcache(E) btrfs(E) xor(E) raid6_pq(E) xts(E) > gf128mul(E) algif_skcipher(E) af_alg(E) dm_crypt(E) dm_mod(E) > sr_mod(E) cdrom(E) hid_generic(E) usbhid(E) hid(E) sg(E) sd_mod(E) > ahci(E) libahci(E) xhci_pci(E) crc32c_intel(E) ehci_pci(E) xhci_hcd(E) > libata(E) ehci_hcd(E) e1000e(E) scsi_mod(E) usbcore(E) ptp(E) > usb_common(E) pps_core(E) fjes(E) > [ 3377.624426] CPU: 0 PID: 1909 Comm: mount Tainted: P OE > 4.8.0-0.bpo.2-amd64 #1 Debian 4.8.11-1~bpo8+1 > [ 3377.624428] Hardware name: Intel Corporation S3420GP/S3420GP, BIOS > S3420GP.86B.01.00.0040.021120101620 02/11/2010 > [ 3377.624430] task: 9277e47f7100 task.stack: 92770a06 > [ 3377.624431] RIP: 0010:[] [] > qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs] > [ 3377.624454] RSP: 0018:92770a063cb8 EFLAGS: 00210246 > [ 3377.624455] RAX: 927805ee3000 RBX: 92772a31de00 RCX: > > [ 3377.624457] RDX: 9277edf22090 RSI: 9274d3f5b800 RDI: > 9277edf22000 > [ 3377.624458] RBP: 927805e5d800 R08: R09: > 9277edf22000 > [ 3377.624460] R10: R11: R12: > 92770a063d50 > [ 3377.624461] R13: 9274d3f5b800 R14: R15: > 9277edf22000 > [ 3377.624463] FS: 7f0f98765840() GS:92781fc0() > knlGS: > [ 3377.624465] CS: 0010 DS: ES: CR0:
Re: read-only fs, kernel 4.9.0, fs/btrfs/delayed-inode.c:1170 __btrfs_run_delayed_items,
Looks like there's some sort of xattr and Btrfs interaction happening here; but as it only happens with some subvolumes/snapshots not all (but 100% consistent) maybe the kernel version at the time the snapshot was taken is a factor? Anyway git bisect says # first bad commit: [6c6ef9f26e598fb977f60935e109cd5b266c941a] xattr: Stop calling {get,set,remove}xattr inode operations btrfs-image produces a 159M file. I've updated the bug report https://bugzilla.kernel.org/show_bug.cgi?id=191761 and also adding patch author to cc. Chris Murphy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Restart during btrfs balance raid1 conversion, now can only mount read-only
I had an 8TB btrfs filesystem setup with my data, and added a 4TB and 3TB drive to it and then ran a raid1 conversion using the command btrfs balance start -dconvert=raid1 -mconvert=raid1 /mnt/btrfs It was running OK, but then I restarted the computer without pausing the rebalance (I wasn't aware that would be necessary). Upon reboot, the system got stuck on the mounting operation (as well as decryption of two of the drives - all drives are setup with encryption and automatically get unlocked via keyfiles at boot) and dropped me to a root shell. I removed the line which was mounting the btrfs volume in fstab, and rebooted successfully, however now when I attempt to mount the btrfs volume normally it just hangs with no response, and the command never terminates and cannot be killed. I am able to mount the drive read-only, but am unable to mount it as read-write even when using the mount option "skip_balance". Relevant debug info: uname -a Linux gaussBonnetXeon 4.8.0-0.bpo.2-amd64 #1 SMP Debian 4.8.11-1~bpo8+1 (2016-12-14) x86_64 GNU/Linux btrfs --version btrfs-progs v4.7.3 btrfs fi show Label: none uuid: d5c3f49c-3c69-4fd8-b9ab-272c0dbc1eab Total devices 3 FS bytes used 2.61TiB devid1 size 7.00TiB used 2.64TiB path /dev/mapper/Seagate_Archive_8TB-btrfs devid2 size 3.64TiB used 930.00GiB path /dev/mapper/4TB devid3 size 2.73TiB used 0.00B path /dev/mapper/3TB btrfs fi df /mnt/btrfs Data, RAID1: total=925.00GiB, used=912.85GiB Data, single: total=1.71TiB, used=1.71TiB System, DUP: total=8.00MiB, used=352.00KiB Metadata, RAID1: total=5.00GiB, used=2.12GiB Metadata, DUP: total=8.50GiB, used=7.61GiB GlobalReserve, single: total=512.00MiB, used=1.05MiB dmesg (from when I try to mount as rw): [ 3202.640391] BTRFS info (device dm-10): disk space caching is enabled [ 3377.624283] BUG: unable to handle kernel paging request at fe10 [ 3377.624293] IP: [] qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs] [ 3377.624341] PGD 111e09067 PUD 111e0b067 PMD 0 [ 3377.624348] Oops: 
[#1] SMP [ 3377.624352] Modules linked in: ipt_REJECT(E) nf_reject_ipv4(E) rfcomm(E) xt_multiport(E) iptable_filter(E) ip_tables(E) x_tables(E) binfmt_misc(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) bnep(E) cfg80211(E) snd_hda_codec_hdmi(E) nvidia_drm(POE) btusb(E) btrtl(E) drm_kms_helper(E) btbcm(E) btintel(E) drm(E) bluetooth(E) iTCO_wdt(E) iTCO_vendor_support(E) coretemp(E) joydev(E) kvm_intel(E) snd_hda_intel(E) snd_ca0106(E) rfkill(E) evdev(E) snd_hda_codec(E) nvidia_modeset(POE) snd_rawmidi(E) kvm(E) snd_seq_device(E) snd_hda_core(E) snd_ac97_codec(E) snd_hwdep(E) i7core_edac(E) i2c_i801(E) snd_pcm(E) nvidia(POE) irqbypass(E) pcspkr(E) edac_core(E) i2c_smbus(E) snd_timer(E) snd(E) soundcore(E) ac97_bus(E) ipmi_si(E) ipmi_msghandler(E) shpchp(E) lpc_ich(E) mfd_core(E) acpi_cpufreq(E) [ 3377.624400] button(E) tpm_tis(E) tpm_tis_core(E) tpm(E) fuse(E) parport_pc(E) ppdev(E) lp(E) parport(E) autofs4(E) ext4(E) crc16(E) jbd2(E) fscrypto(E) mbcache(E) btrfs(E) xor(E) raid6_pq(E) xts(E) gf128mul(E) algif_skcipher(E) af_alg(E) dm_crypt(E) dm_mod(E) sr_mod(E) cdrom(E) hid_generic(E) usbhid(E) hid(E) sg(E) sd_mod(E) ahci(E) libahci(E) xhci_pci(E) crc32c_intel(E) ehci_pci(E) xhci_hcd(E) libata(E) ehci_hcd(E) e1000e(E) scsi_mod(E) usbcore(E) ptp(E) usb_common(E) pps_core(E) fjes(E) [ 3377.624426] CPU: 0 PID: 1909 Comm: mount Tainted: P OE 4.8.0-0.bpo.2-amd64 #1 Debian 4.8.11-1~bpo8+1 [ 3377.624428] Hardware name: Intel Corporation S3420GP/S3420GP, BIOS S3420GP.86B.01.00.0040.021120101620 02/11/2010 [ 3377.624430] task: 9277e47f7100 task.stack: 92770a06 [ 3377.624431] RIP: 0010:[] [] qgroup_fix_relocated_data_extents+0x2b/0x2c0 [btrfs] [ 3377.624454] RSP: 0018:92770a063cb8 EFLAGS: 00210246 [ 3377.624455] RAX: 927805ee3000 RBX: 92772a31de00 RCX: [ 3377.624457] RDX: 9277edf22090 RSI: 9274d3f5b800 RDI: 9277edf22000 [ 3377.624458] RBP: 927805e5d800 R08: R09: 9277edf22000 [ 3377.624460] R10: R11: R12: 92770a063d50 [ 3377.624461] R13: 9274d3f5b800 R14: 
R15: 9277edf22000 [ 3377.624463] FS: 7f0f98765840() GS:92781fc0() knlGS: [ 3377.624465] CS: 0010 DS: ES: CR0: 80050033 [ 3377.624467] CR2: fe10 CR3: 00034b952000 CR4: 06f0 [ 3377.624468] Stack: [ 3377.624470] c05c14a2 0801 9277edf22000 927805ee3000 [ 3377.624473] 0801 9277edf22000 c05c3ed6 [ 3377.624476] dc2f7325 92772a31de00 927805e5d800 [ 3377.624478] Call Trace: [ 3377.624497] [] ? join_transaction.isra.15+0x22/0x3f0 [btrfs] [ 3377.624516] [] ? start_transaction+0x96/0x4d0 [btrfs] [ 3377.624537] [] ? btrfs_recover_relocation+0x2e8/0x420 [btrfs] [ 3377.624551] [] ? btrfs_remount+0x40f/0x570 [btrfs] [
Re: Best practices for raid 1
I would like to use this thread to ask a few questions: If we have 2 devices dying on us and we run RAID6 - this theoretically will still run (despite our current problems). Now let’s say that we booted up a raid6 of 10 disks and 2 of them die, but the operator does NOT know the dev IDs of the disks that died. How does one remove those devices other than using “-missing” ??? I ask because it’s in multiple places stated to use “replace” when your device dies but nobody ever states how to find out which /dev/ node is actually missing …. so when I want to use a replace, I don’t know what to use within the command :/ … This whole thing might have an additional complication - if the FS is full, then one would need to add disks first, then remove the missing one. > On 10 Jan 2017, at 21:49, Chris Murphy wrote: > > On Tue, Jan 10, 2017 at 2:07 PM, Vinko Magecic > wrote: >> Hello, >> >> I set up a raid 1 with two btrfs devices and came across some situations in >> my testing that I can't get a straight answer on. >> >> 1) When replacing a volume, do I still need to `umount /path` and then >> `mount -o degraded ...` the good volume before doing the `btrfs replace >> start ...` ? > > No. If the device being replaced is unreliable, use -r to limit the > reads from the device being replaced. > > > >> I didn't see anything that said I had to and when I tested it without >> mounting the volume it was able to replace the device without any issue. Is >> that considered bad and could risk damage or has `replace` made it possible >> to replace devices without umounting the filesystem? > > It's always been possible even before 'replace'. > btrfs dev add > btrfs dev rem > > But there are some bugs in dev replace that Qu is working on; I think > they mainly negatively impact raid56 though. > > The one limitation of 'replace' is that the new block device must be > equal to or larger than the block device being replaced; where dev add >> dev rem doesn't require this. 
> > >> 2) Everything I see about replacing a drive says to use `/old/device >> /new/device` but what if the old device can't be read or no longer exists? > > The command works whether the device is present or not; but if it's > present and working then any errors on one device can be corrected by > the other, whereas if the device is missing, then any errors on the > remaining device can't be corrected. Off hand I'm not sure if the > replace continues and an error just logged...I think that's what > should happen. > > >> Would that be a `btrfs device add /new/device; btrfs balance start >> /new/device` ? > > dev add then dev rem; the balance isn't necessary. > >> >> 3) When I have the RAID1 with two devices and I want to grow it out, which >> is the better practice? Create a larger volume, replace the old device with >> the new device and then do it a second time for the other device, or >> attaching the new volumes to the label/uuid one at a time and with each one >> use `btrfs filesystem resize devid:max /mountpoint`. > > If you're replacing a 2x raid1 with two bigger replacements, you'd use > 'btrfs replace' twice. Maybe it'd work concurrently, I've never tried > it, but useful for someone to test and see if it explodes because if > it's allowed, it should work or fail gracefully. > > There's no need to do filesystem resizes when doing either 'replace' > or 'dev add' followed by 'dev rem' because the fs resize is implied. > First it's resized/grown with add; and then it's resized/shrink with > remove. For replace there's a consolidation of steps, it's been a > while since I've looked at the code so I can't tell you what steps it > skips, what the state of the devices are in during the replace, which > one active writes go to. 
> > > -- > Chris Murphy > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [LSF/MM TOPIC] sharing pages between mappings
On Jan 11, 2017, at 3:29 AM, Miklos Szerediwrote: > > I know there's work on this for xfs, but could this be done in generic mm > code? > > What are the obstacles? page->mapping and page->index are the obvious ones. > > If that's too difficult is it maybe enough to share mappings between > files while they are completely identical and clone the mapping when > necessary? > > All COW filesystems would benefit, as well as layered ones: lots of > fuse fs, and in some cases overlayfs too. For layered filesystems it would also be useful to have an API to move pages between mappings easily. > Related: what can DAX do in the presence of cloned block? > > Thanks, > Miklos > -- > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html Cheers, Andreas signature.asc Description: Message signed with OpenPGP using GPGMail
Re: Best practices for raid 1
On 2017-01-10 16:49, Chris Murphy wrote: On Tue, Jan 10, 2017 at 2:07 PM, Vinko Magecicwrote: Hello, I set up a raid 1 with two btrfs devices and came across some situations in my testing that I can't get a straight answer on. 1) When replacing a volume, do I still need to `umount /path` and then `mount -o degraded ...` the good volume before doing the `btrfs replace start ...` ? No. If the device being replaced is unreliable, use -r to limit the reads from the device being replaced. I didn't see anything that said I had to and when I tested it without mounting the volume it was able to replace the device without any issue. Is that considered bad and could risk damage or has `replace` made it possible to replace devices without umounting the filesystem? It's always been possible even before 'replace'. btrfs dev add btrfs dev rem But there are some bugs in dev replace that Qu is working on; I think they mainly negatively impact raid56 though. The one limitation of 'replace' is that the new block device must be equal to or larger than the block device being replaced; where dev add dev rem doesn't require this. The other thing to remember is that you can resize the FS on the device being replaced so that it will fit on the new device. I actually regularly do this when re-partitioning or moving filesystems between devices as a safety precaution so that I can be sure it will fit in the new location. I would only suggest doing this though if that device is still reliable, as it may move data around on that device, and it obviously doesn't work if the device being replaced is missing. 2) Everything I see about replacing a drive says to use `/old/device /new/device` but what if the old device can't be read or no longer exists? The command works whether the device is present or not; but if it's present and working then any errors on one device can be corrected by the other, whereas if the device is missing, then any errors on the remaining device can't be corrected. 
Off hand I'm not sure if the replace continues and an error just logged...I think that's what should happen. IIRC, that's what happens up to some (arbitrary) threshold, at which point the replace fails. Would that be a `btrfs device add /new/device; btrfs balance start /new/device` ? dev add then dev rem; the balance isn't necessary. A better way to put it is that the balance is implicit in the removal of the device. The data that was on that device has to go somewhere, and the easiest way to do that is just to run a balance that's not allowed to allocate anything on the device being removed. 3) When I have the RAID1 with two devices and I want to grow it out, which is the better practice? Create a larger volume, replace the old device with the new device and then do it a second time for the other device, or attaching the new volumes to the label/uuid one at a time and with each one use `btrfs filesystem resize devid:max /mountpoint`. If you're replacing a 2x raid1 with two bigger replacements, you'd use 'btrfs replace' twice. Maybe it'd work concurrently, I've never tried it, but useful for someone to test and see if it explodes because if it's allowed, it should work or fail gracefully. In theory, it _might_ be possible to get dev replace to work concurrently. As of right now, I know that the current implementation does not work with more than one instance running per FS (because it uses devid 0 for the new device during the replace, and devids have to be unique), but I don't know for certain what it does if you try to run another (it _should_ refuse to start, I'm not certain if that's what it actually does, and I don't have the time to check right now). 
That said, there are many reasons to just serialize replaces most of the time, the most notable being that replace does not just read from the device being replaced (although most of the reads go to that device), and that serializing the replace operations has less impact on the rest of the system (it is designed to be used on live systems). There's no need to do filesystem resizes when doing either 'replace' or 'dev add' followed by 'dev rem' because the fs resize is implied. First it's resized/grown with add; and then it's resized/shrink with remove. For replace there's a consolidation of steps, it's been a while since I've looked at the code so I can't tell you what steps it skips, what the state of the devices are in during the replace, which one active writes go to. Last time I checked, this was not the case for replace, and a resize to max size was still necessary. That was almost 3 months ago though (I've been lucky and not needed to replace anything since then), so I may be incorrect about the current state of things. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at
yet another call trace during send/receive
Hi. On Debian sid: $ uname -a Linux heisenberg 4.8.0-2-amd64 #1 SMP Debian 4.8.15-2 (2017-01-04) x86_64 GNU/Linux $ btrfs version btrfs-progs v4.7.3 During a: # btrfs send -p foo bar | btrfs receive baz Jan 11 20:43:10 heisenberg kernel: [ cut here ] Jan 11 20:43:10 heisenberg kernel: WARNING: CPU: 6 PID: 10042 at /build/linux-zDY19G/linux-4.8.15/fs/btrfs/send.c:6117 btrfs_ioctl_send+0x533/0x1280 [btrfs] Jan 11 20:43:10 heisenberg kernel: Modules linked in: udp_diag tcp_diag inet_diag algif_skcipher af_alg uas vhost_net vhost macvtap macvlan xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat tun bridge stp llc fuse ctr ccm ebtable_filter ebtables joydev rtsx_pci_ms memstick rtsx_pci_sdmmc mmc_core iTCO_wdt iTCO_vendor_support cpufreq_userspace cpufreq_powersave cpufreq_conservative ip6t_REJECT nf_reject_ipv6 xt_tcpudp nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables xt_policy ipt_REJECT nf_reject_ipv4 xt_comment nf_conntrack_ipv4 nf_defrag_ipv4 xt_multiport xt_conntrack nf_conntrack iptable_filter binfmt_misc intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate intel_uncore Jan 11 20:43:10 heisenberg kernel: intel_rapl_perf psmouse pcspkr uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_core videodev media btusb btrtl btbcm btintel sg bluetooth crc16 arc4 iwldvm mac80211 iwlwifi cfg80211 rtsx_pci rfkill fjes snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic tpm_tis tpm_tis_core tpm i915 fujitsu_laptop battery snd_hda_intel snd_hda_codec lpc_ich i2c_i801 ac mfd_core shpchp i2c_smbus snd_hda_core snd_hwdep snd_pcm snd_timer e1000e snd soundcore ptp pps_core video button mei_me mei drm_kms_helper drm i2c_algo_bit loop parport_pc ppdev sunrpc lp parport ip_tables x_tables autofs4 dm_crypt dm_mod raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 
multipath linear md_mod btrfs crc32c_generic xor raid6_pq uhci_hcd usb_storage Jan 11 20:43:10 heisenberg kernel: sd_mod crc32c_intel ahci libahci aesni_intel xhci_pci aes_x86_64 xhci_hcd libata glue_helper lrw ehci_pci gf128mul ablk_helper ehci_hcd cryptd evdev usbcore scsi_mod serio_raw usb_common Jan 11 20:43:10 heisenberg kernel: CPU: 6 PID: 10042 Comm: btrfs Tainted: G W 4.8.0-2-amd64 #1 Debian 4.8.15-2 Jan 11 20:43:10 heisenberg kernel: Hardware name: FUJITSU LIFEBOOK E782/FJNB23E, BIOS Version 1.11 05/24/2012 Jan 11 20:43:10 heisenberg kernel: 0286 248adbdb b3b1f925 Jan 11 20:43:10 heisenberg kernel: b3874ffe 9ebe7e9f4424 7ffcbf0ea5d0 Jan 11 20:43:10 heisenberg kernel: 9ebc0d644000 9ebe7e9f4000 9ebe5e44fb20 9ebd4270ae00 Jan 11 20:43:10 heisenberg kernel: Call Trace: Jan 11 20:43:10 heisenberg kernel: [] ? dump_stack+0x5c/0x77 Jan 11 20:43:10 heisenberg kernel: [] ? __warn+0xbe/0xe0 Jan 11 20:43:10 heisenberg kernel: [] ? btrfs_ioctl_send+0x533/0x1280 [btrfs] Jan 11 20:43:10 heisenberg kernel: [] ? memcg_kmem_get_cache+0x50/0x150 Jan 11 20:43:10 heisenberg kernel: [] ? kmem_cache_alloc+0x122/0x530 Jan 11 20:43:10 heisenberg kernel: [] ? sched_slice.isra.57+0x51/0xc0 Jan 11 20:43:10 heisenberg kernel: [] ? update_cfs_rq_load_avg+0x200/0x4c0 Jan 11 20:43:10 heisenberg kernel: [] ? task_rq_lock+0x46/0xa0 Jan 11 20:43:10 heisenberg kernel: [] ? btrfs_ioctl+0x97c/0x2370 [btrfs] Jan 11 20:43:10 heisenberg kernel: [] ? enqueue_task_fair+0x5c/0x940 Jan 11 20:43:10 heisenberg kernel: [] ? sched_clock+0x5/0x10 Jan 11 20:43:10 heisenberg kernel: [] ? check_preempt_curr+0x50/0x90 Jan 11 20:43:10 heisenberg kernel: [] ? wake_up_new_task+0x156/0x200 Jan 11 20:43:10 heisenberg kernel: [] ? do_vfs_ioctl+0x9f/0x5f0 Jan 11 20:43:10 heisenberg kernel: [] ? _do_fork+0x14d/0x3f0 Jan 11 20:43:10 heisenberg kernel: [] ? SyS_ioctl+0x74/0x80 Jan 11 20:43:10 heisenberg kernel: [] ? 
system_call_fast_compare_end+0xc/0x96 Jan 11 20:43:10 heisenberg kernel: ---[ end trace 3831b8afbd0cbc9e ]--- Jan 11 20:43:45 heisenberg kernel: BTRFS info (device dm-2): The free space cache file (7525348933632) is invalid. skip it The send/receive seems to continue running... Not sure if the free space cache file entry is related (btw: a btrfs check directly before didn't find that error - actually yet another fsck directly before that, brought a message that the super generation and space file generation would mismatch (or something like that) and it would be invalidated... so kinda strange that this happens at all). Cheers, Chris. smime.p7s Description: S/MIME cryptographic signature
Re: mkfs.btrfs/balance small-btrfs chunk size RFC
Austin S. Hemmelgarn posted on Tue, 10 Jan 2017 09:57:52 -0500 as excerpted: > I can't personally comment on the code itself right now (I've actually > never looked at the mkfs code, or any of the stuff that deals with the > System chunk), but I can make a few general comments on this: > 1. This behavior is still the case as of a Git build from yesterday (I > just verified this myself with the locally built copy of btrfs-progs on > my laptop). Thanks. After posting and seeing Qu W's response I was thinking I needed to test current behavior, and you just saved me the trouble (tho I do need to freshen my backup /boot one of these days, likely testing mkfs.btrfs on this in the process, but that can wait until 4.10). -- Duncan - List replies preferred. No HTML msgs. "Every nonfree program has a lord, a master -- and if you use the program, he is your master." Richard Stallman -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Best practices for raid 1
> On 10 Jan 2017, at 21:07, Vinko Magecic wrote: > > Hello, > > I set up a raid 1 with two btrfs devices and came across some situations in > my testing that I can't get a straight answer on. > 1) When replacing a volume, do I still need to `umount /path` and then `mount > -o degraded ...` the good volume before doing the `btrfs replace start ...` ? > I didn't see anything that said I had to and when I tested it without > mounting the volume it was able to replace the device without any issue. Is > that considered bad and could risk damage or has `replace` made it possible > to replace devices without umounting the filesystem? No need to unmount, just replace old with new. Your scenario seems very convoluted and it’s pointless > 2) Everything I see about replacing a drive says to use `/old/device > /new/device` but what if the old device can't be read or no longer exists? > Would that be a `btrfs device add /new/device; btrfs balance start > /new/device` ? In the case where the old device is missing you’ve got a few options: - if you have enough space to fit the data and enough disks to comply with redundancy - just remove the drive. So for example, if you have 3 x 1TB drives with raid 1 and use less than 1TB of data total - just remove one drive and you will have 2 x 1TB drives in raid 1 and btrfs will just rebalance stuff for you! - if you do not have enough space to fit the data / not enough disks left to comply with the raid level - your only option is to add a disk first, then remove the missing one (btrfs dev delete missing /mount_point_of_your_fs) > 3) When I have the RAID1 with two devices and I want to grow it out, which is > the better practice? Create a larger volume, replace the old device with the > new device and then do it a second time for the other device, or attaching > the new volumes to the label/uuid one at a time and with each one use `btrfs > filesystem resize devid:max /mountpoint`. You kinda misunderstand the principle of btrfs. 
Btrfs will span across ALL the available space you’ve got. If you have multiple devices in this setup (remember that a partition IS A DEVICE), it will span across multiple devices and you can’t change this. Now btrfs resize is meant for resizing a file system occupying a device (or partition). So the workflow is: if you want to shrink a device (partition) you first shrink the fs on this device, then size down the device (partition) … if you want to increase the size of a device (partition) you increase the size of the device (partition), then you grow the filesystem within this device (partition). This is 100% irrespective of the total cumulative size of the file system. Let’s say you’ve got a btrfs file system that is spanning across 3 x 1TB devices … and those devices are partitions. You have a raid 1 setup - your complete amount of available space is 1.5 TB. Let’s say you want to shrink one of the partitions to 0.5TB -> first you shrink the FS on this partition (balance will run automatically) -> you shrink the partition down to 0.5TB -> from now on your total available space is 1.25TB. Simple, right? :) > Thanks > > > > >-- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: mkfs.btrfs/balance small-btrfs chunk size RFC
Hugo Mills posted on Tue, 10 Jan 2017 15:47:53 + as excerpted: > On Tue, Jan 10, 2017 at 10:42:51AM -0500, Austin S. Hemmelgarn wrote: >> Most of the issue in this case is with the size of the initial chunk. >> That said, I've got quite a few reasonably sized filesystems (I think >> the largest is 200GB) with moderate usage (max 90GB of data), and none >> of them are using more than the first 16kB block in the System chunk. >> While I'm not necessarily a typical user, I'd be willing to bet based >> on this that in general, most people who aren't storing very large >> amounts of data or taking huge numbers of snapshots aren't going to >> need a system chunk much bigger than 1MB. > >Again, the system chunk has *nothing* to do with snapshots. Given your explanation of the system chunk containing the chunk tree but not being (directly) related to snapshots, I took that as... Many snapshots, some being old snapshots of now changed data, thus potentially multiplying the working copy data several times and of course requiring more chunks in ordered to contain all that archived data. So while snapshots aren't directly related to the system chunk, the fact that they're snapshotting /something/ that's presumably changing or there'd be no need for snapshots, and the snapshot-archived versions of that /something/ presumably takes additional chunks, makes snapshots indirectly related to the required size of the system chunk(s), in ordered to contain the chunk tree supporting all the other chunks, necessary due not to live data, but due to the snapshots. Is that a correct read, or is (somehow) that indirect dependency not there either, despite the system chunk(s) containing the chunk tree? -- Duncan - List replies preferred. No HTML msgs. "Every nonfree program has a lord, a master -- and if you use the program, he is your master." 
Richard Stallman -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 00/12] Refactor btrfs_inode VS inode in delayed-inode.c
On 11.01.2017 18:51, David Sterba wrote: > On Tue, Jan 10, 2017 at 08:35:30PM +0200, Nikolay Borisov wrote: >> After following the discussion in [1] I took a look at what's the >> state of VFS-related members being used in core BTRFS code. It turned >> out there are quite a few functions which operate on struct btrfs_inode, >> yet take struct inode. As a result they have to resort ot excessive >> usage of BTRFS_I, furthermore passing inode around doesn't help the >> poor reader inferring why inode might be passed to a particular function. >> >> In order to better separate core btrfs functionalities from those part, >> which interface with the VFS I took a look around the code and this is >> the result. I'd like to solicit opinions whether people think this >> refactoring is useful, since I have gathered a list of a lot more >> functions which might use a bit of inode VS btrfs_inode changes. Also, >> a lot of function take inode just because btrfs_ino was taking an inode. > > Agreed, this is a good direction how to clean up the code. > >> The patches are self-explanatory, with the first one dealing with >> btrfs_ino being the bulk of it. This paves the way to restructuring >> a lot of functions. >> >> If the maintainers think this should be merged I'd rather resend it >> as a single patch so as not to pollute the git history. This >> version can be used for fine-grained discussion and feedback. > > Actually I like the way it's separated as it keeps the review easy, it > keeps the context in one function and does one change. > > It would be interesting the see the result as reported by the 'size' > utility before and after the patchset, the effects of removed BTRFS_I > calls. Actually without really doing the full-scale refactoring I expect the results to be worse, due to the 147 added uses of BTRFS_I in the first patch. But those are going to be only interim until everything is cleaned up. 
Anyway, here are the numbers: text data bss dec hex filename 2530598 174661 28288 2733547 29b5eb fs/btrfs/btrfs.ko.nopatches text data bss dec hex filename 2530774 174661 28288 2733723 29b69b fs/btrfs/btrfs.ko.patches So initially there is an increate of 176 bytes in the module but hopefully this will go down. > > I'll do a testing merge on top of current for-next to see how intrusive > it is. If it turns out to be ok, I'll add the patches to the cleanups > branch. > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Lsf-pc] [LSF/MM TOPIC] sharing pages between mappings
On Wed, Jan 11, 2017 at 12:51:43PM +0100, Jan Kara wrote: > On Wed 11-01-17 11:29:28, Miklos Szeredi wrote: > > I know there's work on this for xfs, but could this be done in generic mm > > code? > > > > What are the obstacles? page->mapping and page->index are the obvious > > ones. > > Yes, these two are the main that come to my mind. Also you'd need to > somehow share the mapping->i_mmap tree so that unmap_mapping_range() works. > > > If that's too difficult is it maybe enough to share mappings between > > files while they are completely identical and clone the mapping when > > necessary? > > Well, but how would the page->mapping->host indirection work? Even if you > have identical contents of the mappings, you still need to be aware there > are several inodes behind them and you need to pick the right one > somehow... > > > All COW filesystems would benefit, as well as layered ones: lots of > > fuse fs, and in some cases overlayfs too. > > > > Related: what can DAX do in the presence of cloned block? > > For DAX handling a block COW should be doable if that is what you are > asking about. Handling of blocks that can be written to while they are > shared will be rather difficult (you have problems with keeping dirty bits > in the radix tree consistent if nothing else). I'm also interested in this topic, though I haven't gotten any further than a hand-wavy notion of handling cow by allocating new blocks, memcpy the contents to the new blocks (how?), then update the mappings to point to the new blocks (how?). It looks a lot easier now with the iomap stuff, but that's as far as I got. :) (IOWs it basically took all the time since the last LSF to get reflink polished enough to handle regular files reasonably well.) 
--D > > Honza > -- > Jan Kara> SUSE Labs, CR > -- > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
btrfs: account_page_dirtied -> __percpu_counter_add kernel crash
Hi all, I am observing periodic crashes with signature below on kernel 4.4.26. wb is extracted from page (see mm/page-writeback.c, void account_page_dirtied() ): inode_attach_wb(inode, page); wb = inode_to_wb(inode); We are crasing in __inc_wb_stat(wb, WB_RECLAIMABLE), which calls __add_wb_stat(wb, item, 1), which then calls __percpu_counter_add(>stat[item], amount, WB_STAT_BATCH); So actually the lock is: wb->stat[WB_RECLAIMABLE].lock [6716239.938412] BUG: unable to handle kernel paging request at 00015e9a [6716239.938782] IP: [] queued_spin_lock_slowpath+0xe5/0x160 [6716239.939076] PGD 16b070067 PUD 2cea00067 PMD 0 [6716239.939485] Oops: 0002 [#1] SMP [6716239.939834] Modules linked in: xt_multiport dm_snapshot dm_thin_pool dm_bio_prison dm_persistent_data dm_bufio btrfs raid6_pq xor loop iptable_mangle iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat xt_CT iptable_raw nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables zfs(PO) zavl(PO) zcommon(PO) znvpair(PO) spl(O) zunicode(PO) ext2 ib_umad sb_edac edac_core i2c_i801 lpc_ich mfd_core shpchp ioatdma igb i2c_algo_bit ses enclosure ipmi_devintf ipmi_si ipmi_msghandler tcp_scalable ib_qib dca ib_mad ib_core ib_addr ipv6 [6716239.944558] CPU: 7 PID: 23338 Comm: btrfs Tainted: PW O 4.4.26-clouder1 #3 [6716239.944950] Hardware name: Supermicro X10DRi/X10DRi, BIOS 1.1 04/14/2015 [6716239.945184] task: 88046cec6e00 ti: 8801a8f1c000 task.ti: 8801a8f1c000 [6716239.945570] RIP: 0010:[] [] queued_spin_lock_slowpath+0xe5/0x160 [6716239.946025] RSP: 0018:8801a8f1f9a8 EFLAGS: 00010006 [6716239.946253] RAX: 19bb RBX: 880466f363a0 RCX: 00015e9a [6716239.946639] RDX: 88047fcf5b00 RSI: 0020 RDI: 880466f363a0 [6716239.947036] RBP: 8801a8f1f9a8 R08: 0001 R09: [6716239.947420] R10: 88026d966210 R11: R12: 0097 [6716239.947802] R13: feff R14: 88017d5e1d68 R15: 88047f881000 [6716239.952193] FS: 7f99e3058880() GS:88047fce() knlGS: [6716239.952582] CS: 0010 DS: ES: CR0: 80050033 
[6716239.952810] CR2: 00015e9a CR3: 00032bd3a000 CR4: 001406e0 [6716239.953190] Stack: [6716239.953407] 8801a8f1f9c8 81614ed0 0102 880466f363a0 [6716239.954038] 8801a8f1f9f0 8131d1b0 880466f36340 [6716239.954669] ea000f44d640 8801a8f1fa28 811353a6 ea000f44d640 [6716239.955298] Call Trace: [6716239.955521] [] _raw_spin_lock_irqsave+0x40/0x50 [6716239.955753] [] __percpu_counter_add+0x40/0x70 [6716239.955982] [] account_page_dirtied+0xb6/0x1a0 [6716239.956209] [] __set_page_dirty_nobuffers+0x81/0x140 [6716239.956458] [] btrfs_set_page_dirty+0xe/0x10 [btrfs] [6716239.956690] [] set_page_dirty+0x3d/0x60 [6716239.956928] [] btrfs_dirty_pages+0x79/0xa0 [btrfs] [6716239.957184] [] __btrfs_write_out_cache.isra.23+0x37b/0x420 [btrfs] [6716239.957577] [] btrfs_write_out_cache+0x8a/0xf0 [btrfs] [6716239.957816] [] btrfs_start_dirty_block_groups+0x1ed/0x3f0 [btrfs] [6716239.958210] [] btrfs_commit_transaction+0x14e/0xa60 [btrfs] [6716239.958602] [] ? start_transaction+0x9a/0x4e0 [btrfs] [6716239.958842] [] btrfs_mksubvol+0x4ce/0x4e0 [btrfs] [6716239.959070] [] ? wait_woken+0xb0/0xb0 [6716239.959304] [] btrfs_ioctl_snap_create_transid+0x18f/0x1a0 [btrfs] [6716239.959707] [] btrfs_ioctl_snap_create_v2+0x107/0x170 [btrfs] [6716239.960102] [] btrfs_ioctl+0x171a/0x2710 [btrfs] [6716239.960330] [] ? handle_mm_fault+0xca2/0x19c0 [6716239.960557] [] do_vfs_ioctl+0x30f/0x560 [6716239.960786] [] SyS_ioctl+0x79/0x90 [6716239.961012] [] entry_SYSCALL_64_fastpath+0x16/0x6e [6716239.961238] Code: 87 47 02 c1 e0 10 85 c0 74 3d 48 89 c1 c1 e8 12 48 c1 e9 0c 83 e8 01 83 e1 30 48 98 48 81 c1 00 5b 01 00 48 03 0c c5 40 d4 cd 81 <48> 89 11 8b 42 08 85 c0 75 12 f3 90 8b 42 08 85 c0 74 f7 8b 0f [6716239.965810] RIP [] queued_spin_lock_slowpath+0xe5/0x160 [6716239.966100] RSP [6716239.966319] CR2: 00015e9a Has someone seen something like that ? 
Best regards, Angel -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 00/12] Refactor btrfs_inode VS inode in delayed-inode.c
On Tue, Jan 10, 2017 at 08:35:30PM +0200, Nikolay Borisov wrote: > After following the discussion in [1] I took a look at what's the > state of VFS-related members being used in core BTRFS code. It turned > out there are quite a few functions which operate on struct btrfs_inode, > yet take struct inode. As a result they have to resort to excessive > usage of BTRFS_I, furthermore passing inode around doesn't help the > poor reader inferring why inode might be passed to a particular function. > > In order to better separate core btrfs functionalities from those part, > which interface with the VFS I took a look around the code and this is > the result. I'd like to solicit opinions whether people think this > refactoring is useful, since I have gathered a list of a lot more > functions which might use a bit of inode VS btrfs_inode changes. Also, > a lot of function take inode just because btrfs_ino was taking an inode. Agreed, this is a good direction how to clean up the code. > The patches are self-explanatory, with the first one dealing with > btrfs_ino being the bulk of it. This paves the way to restructuring > a lot of functions. > > If the maintainers think this should be merged I'd rather resend it > as a single patch so as not to pollute the git history. This > version can be used for fine-grained discussion and feedback. Actually I like the way it's separated as it keeps the review easy, it keeps the context in one function and does one change. It would be interesting to see the result as reported by the 'size' utility before and after the patchset, the effects of removed BTRFS_I calls. I'll do a testing merge on top of current for-next to see how intrusive it is. If it turns out to be ok, I'll add the patches to the cleanups branch. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] etrfs: fix up misleading GFP_NOFS usage in btrfs_releasepage
On Wed 11-01-17 14:55:50, David Sterba wrote: > On Mon, Jan 09, 2017 at 03:39:02PM +0100, Michal Hocko wrote: > > From: Michal Hocko> > > > b335b0034e25 ("Btrfs: Avoid using __GFP_HIGHMEM with slab allocator") > > has reduced the allocation mask in btrfs_releasepage to GFP_NOFS just > > to prevent from giving an unappropriate gfp mask to the slab allocator > > deeper down the callchain (in alloc_extent_state). This is wrong for > > two reasons a) GFP_NOFS might be just too restrictive for the calling > > context b) it is better to tweak the gfp mask down when it needs that. > > > > So just remove the mask tweaking from btrfs_releasepage and move it > > down to alloc_extent_state where it is needed. > > > > Signed-off-by: Michal Hocko > > --- > > fs/btrfs/extent_io.c | 5 + > > fs/btrfs/inode.c | 2 +- > > 2 files changed, 6 insertions(+), 1 deletion(-) > > > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > > index b38150eec6b4..f6ae94a4acad 100644 > > --- a/fs/btrfs/extent_io.c > > +++ b/fs/btrfs/extent_io.c > > @@ -226,6 +226,11 @@ static struct extent_state *alloc_extent_state(gfp_t > > mask) > > { > > struct extent_state *state; > > > > + /* > > +* The given mask might be not appropriate for the slab allocator, > > +* drop the unsupported bits > > +*/ > > + mask &= ~(__GFP_DMA32|__GFP_HIGHMEM); > > Is this future proof enough? As it's enumerating some gfp flags, what if > more are necessary in the future? I'm interested about some synthetic > gfp flags that would not require knowledge about what is or is not > acceptable for slab allocator. Well, I agree, that something like slab_restrict_gfp_mask(gfp_t gfp_mask) would be much better. And in fact that sounds like a nice future cleanup. I haven't checked how many users would find it useful yet but I am putting that on my todo list. > But otherwise looks ok to me, I'm going to merge the patch. Thanks. Thanks! 
-- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] etrfs: fix up misleading GFP_NOFS usage in btrfs_releasepage
On Wed 11-01-17 14:55:50, David Sterba wrote: [...] > But otherwise looks ok to me, I'm going to merge the patch. Thanks. I have only now noticed typo in the subject. s@etrfs:@btrfs:@ -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] btrfs: drop gfp mask tweaking in try_release_extent_state
On Mon, Jan 09, 2017 at 03:39:03PM +0100, Michal Hocko wrote: > From: Michal Hocko> > try_release_extent_state reduces the gfp mask to GFP_NOFS if it is > compatible. This is true for GFP_KERNEL as well. There is no real > reason to do that though. There is no new lock taken down the > the only consumer of the gfp mask which is > try_release_extent_state > clear_extent_bit > __clear_extent_bit > alloc_extent_state > > So this seems just unnecessary and confusing. > > Signed-off-by: Michal Hocko Reviewed-by: David Sterba -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Lsf-pc] [LSF/MM TOPIC] sharing pages between mappings
On Wed, Jan 11, 2017 at 12:51 PM, Jan Karawrote: > On Wed 11-01-17 11:29:28, Miklos Szeredi wrote: >> I know there's work on this for xfs, but could this be done in generic mm >> code? >> >> What are the obstacles? page->mapping and page->index are the obvious >> ones. > > Yes, these two are the main that come to my mind. Also you'd need to > somehow share the mapping->i_mmap tree so that unmap_mapping_range() works. > >> If that's too difficult is it maybe enough to share mappings between >> files while they are completely identical and clone the mapping when >> necessary? > > Well, but how would the page->mapping->host indirection work? Even if you > have identical contents of the mappings, you still need to be aware there > are several inodes behind them and you need to pick the right one > somehow... When do we actually need page->mapping->host? The only place where it's not available is page writeback. Then we can know that the original page was already cow-ed and after being cowed, the page belong only to a single inode. What then happens if the newly written data is cloned before being written back? We can either write back the page during the clone, so that only clean pages are ever shared. Or we can let dirty pages be shared between inodes. In that latter case the question is: do we care about which inode we use for writing back the data? Is the inode needed at all? I don't know enough about filesystem internals to see clearly what happens in such a situation. >> All COW filesystems would benefit, as well as layered ones: lots of >> fuse fs, and in some cases overlayfs too. >> >> Related: what can DAX do in the presence of cloned block? > > For DAX handling a block COW should be doable if that is what you are > asking about. Handling of blocks that can be written to while they are > shared will be rather difficult (you have problems with keeping dirty bits > in the radix tree consistent if nothing else). 
What happens if you do: - clone_file_range(A, off1, B, off2, len); - mmap both A and B using DAX. The mapping will contain the same struct page for two different mappings, no? Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Lsf-pc] [LSF/MM TOPIC] sharing pages between mappings
On Wed 11-01-17 11:29:28, Miklos Szeredi wrote: > I know there's work on this for xfs, but could this be done in generic mm > code? > > What are the obstacles? page->mapping and page->index are the obvious > ones. Yes, these two are the main that come to my mind. Also you'd need to somehow share the mapping->i_mmap tree so that unmap_mapping_range() works. > If that's too difficult is it maybe enough to share mappings between > files while they are completely identical and clone the mapping when > necessary? Well, but how would the page->mapping->host indirection work? Even if you have identical contents of the mappings, you still need to be aware there are several inodes behind them and you need to pick the right one somehow... > All COW filesystems would benefit, as well as layered ones: lots of > fuse fs, and in some cases overlayfs too. > > Related: what can DAX do in the presence of cloned block? For DAX handling a block COW should be doable if that is what you are asking about. Handling of blocks that can be written to while they are shared will be rather difficult (you have problems with keeping dirty bits in the radix tree consistent if nothing else). Honza -- Jan KaraSUSE Labs, CR -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[LSF/MM TOPIC] sharing pages between mappings
I know there's work on this for xfs, but could this be done in generic mm code? What are the obstacles? page->mapping and page->index are the obvious ones. If that's too difficult is it maybe enough to share mappings between files while they are completely identical and clone the mapping when necessary? All COW filesystems would benefit, as well as layered ones: lots of fuse fs, and in some cases overlayfs too. Related: what can DAX do in the presence of cloned block? Thanks, Miklos -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/2] btrfs-progs: cmds-check.c: support inode isize fix in lowmem
Add a function 'repair_inode_isize' to support inode isize repair. Signed-off-by: Su Yue--- cmds-check.c | 49 - 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/cmds-check.c b/cmds-check.c index dad10cb..6947420 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -2458,6 +2458,45 @@ out: } /* + * Set inode's isize to correct value in @info + * + * Returns <0 means on error + * Returns 0 means successful repair + */ +static int repair_inode_isize_lowmem(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode_item_fix_info *info) +{ + struct btrfs_inode_item *ei; + struct btrfs_key key; + struct btrfs_path path; + int ret; + + ASSERT(info); + key.objectid = info->ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, , , 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ei = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(path.nodes[0], ei, info->isize); + btrfs_mark_buffer_dirty(path.nodes[0]); + printf("reset isize for inode %llu root %llu\n", info->ino, + root->root_key.objectid); +out: + btrfs_release_path(); + return ret; +} + +/* * repair_inode_item - repair inode item errors * * Repair the inode item if error can be repaired. 
Any caller should compare @@ -2485,7 +2524,7 @@ static int repair_inode_item(struct btrfs_root *root, ret = 0; goto out; } - if (!(err & NBYTES_ERROR)) { + if (!(err & NBYTES_ERROR) && !(err & ISIZE_ERROR)) { warning("root %llu INODE[%llu] have error(s) can't repair, error : %d", root->objectid, info->ino, err); /* can't fix any errors, ret should be positive */ @@ -2506,6 +2545,13 @@ static int repair_inode_item(struct btrfs_root *root, else if (ret < 0) goto out; } + if (err & ISIZE_ERROR) { + ret = repair_inode_isize_lowmem(trans, root, info); + if (ret == 0) + err &= ~ISIZE_ERROR; + else if (ret < 0) + goto out; + } if (err != info->err) { info->err = err; @@ -5040,6 +5086,7 @@ out: if (isize != size) { err |= ISIZE_ERROR; + info->isize = size; error("root %llu DIR INODE [%llu] size(%llu) not equal to %llu", root->objectid, inode_id, isize, size); } -- 2.11.0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 1/2] btrfs-progs: cmds-check.c: support inode nbytes fix in lowmem
Added 'repair_inode_item' which dispatches functions such as 'repair_inode__nbytes_lowmem' to correct errors and 'struct inode_item_fix_info' to store correct values and errors. v2: reassign err to info.err in process_one_leaf. Signed-off-by: Su Yue--- cmds-check.c | 166 +++ 1 file changed, 155 insertions(+), 11 deletions(-) diff --git a/cmds-check.c b/cmds-check.c index 1dba298..dad10cb 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -371,6 +371,17 @@ struct root_item_info { }; /* + * Use inode_item_fix_info as function check_inode_item's arg. + */ +struct inode_item_fix_info { + u64 ino; + u64 isize; + u64 nbytes; + + int err; +}; + +/* * Error bit for low memory mode check. * * Currently no caller cares about it yet. Just internal use for error @@ -1866,13 +1877,16 @@ struct node_refs { static int update_nodes_refs(struct btrfs_root *root, u64 bytenr, struct node_refs *nrefs, u64 level); static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path, - unsigned int ext_ref); - + unsigned int ext_ref, + struct inode_item_fix_info *info); +static int repair_inode_item(struct btrfs_root *root, +struct inode_item_fix_info *info); static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path *path, struct node_refs *nrefs, int *level, int ext_ref) { struct extent_buffer *cur = path->nodes[0]; struct btrfs_key key; + struct inode_item_fix_info info; u64 cur_bytenr; u32 nritems; u64 first_ino = 0; @@ -1881,6 +1895,7 @@ static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path *path, int ret = 0; /* Final return value */ int err = 0; /* Positive error bitmap */ + memset(, 0, sizeof(info)); cur_bytenr = cur->start; /* skip to first inode item or the first inode number change */ @@ -1900,8 +1915,27 @@ static int process_one_leaf_v2(struct btrfs_root *root, struct btrfs_path *path, path->slots[0] = i; again: - err |= check_inode_item(root, path, ext_ref); + err |= check_inode_item(root, path, ext_ref, ); + + if (repair && (err & 
~LAST_ITEM)) { + ret = repair_inode_item(root, ); + if (ret < 0) + goto out; + /* +* if some errors was repaired, path shall be searched +* again since path has been changed +*/ + if (ret == 0) { + btrfs_item_key_to_cpu(path->nodes[0], , + path->slots[0]); + btrfs_release_path(path); + btrfs_search_slot(NULL, root, , path, 0, 0); + + cur = path->nodes[0]; + err = info.err; + } + } if (err & LAST_ITEM) goto out; @@ -2211,7 +2245,8 @@ out: } static int check_inode_item(struct btrfs_root *root, struct btrfs_path *path, - unsigned int ext_ref); + unsigned int ext_ref, + struct inode_item_fix_info *info); static int walk_down_tree_v2(struct btrfs_root *root, struct btrfs_path *path, int *level, struct node_refs *nrefs, int ext_ref) @@ -2293,7 +2328,7 @@ static int walk_down_tree_v2(struct btrfs_root *root, struct btrfs_path *path, } ret = check_child_node(root, cur, path->slots[*level], next); - if (ret < 0) + if (ret < 0) break; if (btrfs_is_leaf(next)) @@ -2383,6 +2418,105 @@ out: return ret; } +/* + * Set inode's nbytes to correct value in @info + * + * Returns <0 means on error + * Returns 0 means successful repair + */ +static int repair_inode_nbytes_lowmem(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode_item_fix_info *info) +{ + struct btrfs_inode_item *ei; + struct btrfs_key key; + struct btrfs_path path; + int ret; + + ASSERT(info); + key.objectid = info->ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(trans, root, , , 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ei = btrfs_item_ptr(path.nodes[0], path.slots[0], + struct btrfs_inode_item); + btrfs_set_inode_nbytes(path.nodes[0], ei, info->nbytes); + btrfs_mark_buffer_dirty(path.nodes[0]); +