[PATCH v2 4/6] Btrfs: fail on mismatched subvol and subvolid mount options
There's nothing to stop a user from passing both subvol= and subvolid= to mount, but if they don't refer to the same subvolume, someone is going to be surprised at some point. Error out on this case, but allow users to pass in both if they do match (which they could, for example, get out of /proc/mounts). Signed-off-by: Omar Sandoval --- fs/btrfs/super.c | 32 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ab100e5..20b470d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1163,8 +1163,9 @@ static char *setup_root_args(char *args) return buf; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + int flags, const char *device_name, + char *data) { struct dentry *root; struct vfsmount *mnt = NULL; @@ -1210,12 +1211,26 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, /* mount_subtree() drops our reference on the vfsmount. */ mnt = NULL; - if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + if (!IS_ERR(root)) { struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - pr_err("BTRFS: '%s' is not a valid subvolume\n", subvol_name); + u64 root_objectid = BTRFS_I(root->d_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root->d_inode)) { + pr_err("BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } } out: @@ -1308,7 +1323,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (subvol_name) { /* mount_subvol() will free subvol_name. 
*/ - return mount_subvol(subvol_name, flags, device_name, data); + return mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data); } security_init_mnt_opts(&new_sec_opts); -- 2.3.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 5/6] Btrfs: unify subvol= and subvolid= mounting
Currently, mounting a subvolume with subvolid= takes a different code path than mounting with subvol=. This isn't really a big deal except for the fact that mounts done with subvolid= or the default subvolume don't have a dentry that's connected to the dentry tree like in the subvol= case. To unify the code paths, when given subvolid= or using the default subvolume ID, translate it into a subvolume name by walking ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees. Signed-off-by: Omar Sandoval --- fs/btrfs/super.c | 229 +-- 1 file changed, 171 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 20b470d..80a8047 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -841,33 +841,153 @@ out: return error; } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; + struct btrfs_root *fs_root; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path *path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + path->leave_spinning = 1; + + name = kmalloc(PATH_MAX, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; /* -* We have a specific subvol we want to mount, just setup location and -* go look up the root. +* Walk up the subvolume trees in the tree of tree roots by root +* backrefs until we hit the top-level subvolume. 
*/ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, subvol_objectid, + BTRFS_ROOT_BACKREF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + goto err; + } + + /* +* Walk up the filesystem tree by inode refs until we hit the +* root directory. +*/ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if
[PATCH v2 3/6] Btrfs: clean up error handling in mount_subvol()
In preparation for new functionality in mount_subvol(), give it ownership of subvol_name and tidy up the error paths. Signed-off-by: Omar Sandoval --- fs/btrfs/super.c | 61 ++-- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bac3c9a..ab100e5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1167,55 +1167,61 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, const char *device_name, char *data) { struct dentry *root; - struct vfsmount *mnt; + struct vfsmount *mnt = NULL; char *newargs; + int ret; newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, -newargs); + if (!newargs) { + root = ERR_PTR(-ENOMEM); + goto out; + } - if (PTR_RET(mnt) == -EBUSY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, -newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, +device_name, newargs); } else { - int r; - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, -newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, +device_name, newargs); if (IS_ERR(mnt)) { - kfree(newargs); - return ERR_CAST(mnt); + root = ERR_CAST(mnt); + mnt = NULL; + goto out; } down_write(&mnt->mnt_sb->s_umount); - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); up_write(&mnt->mnt_sb->s_umount); - if (r < 0) { - /* FIXME: release vfsmount mnt ??*/ - kfree(newargs); - return ERR_PTR(r); + if (ret < 0) { + root = ERR_PTR(ret); + goto out; } } } - - kfree(newargs); - - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + if (IS_ERR(mnt)) { + root = ERR_CAST(mnt); + mnt = NULL; + goto out; + } root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. 
*/ + mnt = NULL; if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { struct super_block *s = root->d_sb; dput(root); root = ERR_PTR(-EINVAL); deactivate_locked_super(s); - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", - subvol_name); + pr_err("BTRFS: '%s' is not a valid subvolume\n", subvol_name); } +out: + mntput(mnt); + kfree(newargs); + kfree(subvol_name); return root; } @@ -1301,9 +1307,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + /* mount_subvol() will free subvol_name. */ + return mount_subvol(subvol_name, flags, device_name, data); } security_init_mnt_opts(&new_sec_opts); -- 2.3.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 6/6] Btrfs: show subvol= and subvolid= in /proc/mounts
Now that we're guaranteed to have a meaningful root dentry, we can just export seq_dentry() and use it in btrfs_show_options(). The subvolume ID is easy to get and can also be useful, so put that in there, too. Signed-off-by: Omar Sandoval --- fs/btrfs/super.c | 4 fs/seq_file.c| 1 + 2 files changed, 5 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 80a8047..f334cc4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1193,6 +1193,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_puts(seq, ",subvol="); + seq_dentry(seq, dentry, " \t\n\\"); return 0; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 555f821..52b4927 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -538,6 +538,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } +EXPORT_SYMBOL(seq_dentry); static void *single_start(struct seq_file *p, loff_t *pos) { -- 2.3.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: bug: proc mountinfo, findmnt, and subvol vs subvolid
On Thu, Apr 9, 2015 at 1:41 PM, Omar Sandoval wrote: > On Thu, Apr 09, 2015 at 01:38:19PM -0600, Chris Murphy wrote: >> Another way to put this is, the only reliable way to mount and get >> subvolume info in findmnt and /proc is by using subvol=. >> >> When using subvolid=, the subvolume info isn't available unless the >> initial mount is the top level (ID 5). > > Working on it :) https://lkml.org/lkml/2015/4/8/16 OK great. Seems like it could also (passively) relate to the "mlocate/updatedb and btrfs subvolume mounts" thread. -- Chris Murphy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: btrfs filesystem resize :max not working
On Thu, Apr 9, 2015 at 10:03 AM, André-Sebastian Liebe wrote: > Hey list, > > I've got a problem with resizing a multi drive filesystem. > I had a 5 disk array of 4TB drives. Then I added a 5th (6TB) drive to > the array and replaced one of the 4TB ones with a 6TB drive. You did device add, device delete, then fi resize? It should work but it's better to use btrfs replace start for this sort of operation since it does all of that in one step. > As you can > see in my `btrfs fi sh` output below, my newly added drive (ID=6) has > the expected size of 5.46TiB, but my replaced drive (ID=4) won't show up > as 6TB as expected. Neither a `btrfs fi res max /data/pool0` nor a > `btrfs fi res 4:max /data/pool0` had any effect. > > Any suggestions how to reclaim the 2TB free space of drive 4? Not sure, this has always worked for me in the past, online. You could try unmounting and doing a btrfs dev scan, remounting, and then retrying the command. There's nothing in dmesg? There should be something there whether it succeeds or fails. -- Chris Murphy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: bug: proc mountinfo, findmnt, and subvol vs subvolid
On Thu, Apr 09, 2015 at 01:38:19PM -0600, Chris Murphy wrote: > Another way to put this is, the only reliable way to mount and get > subvolume info in findmnt and /proc is by using subvol=. > > When using subvolid=, the subvolume info isn't available unless the > initial mount is the top level (ID 5). Working on it :) https://lkml.org/lkml/2015/4/8/16 -- Omar -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: bug: proc mountinfo, findmnt, and subvol vs subvolid
Another way to put this is, the only reliable way to mount and get subvolume info in findmnt and /proc is by using subvol=. When using subvolid=, the subvolume info isn't available unless the initial mount is the top level (ID 5). -- Chris Murphy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
bug: proc mountinfo, findmnt, and subvol vs subvolid
kernel-4.0.0-0.rc6.git0.1.fc22.i686 The short version is that if the top level subvolume is not mounted first, any usage of subvolid= fails to show the subvolume in either findmnt or /proc/self/mountinfo. That is, only when the initial mount is the top level, any subsequent mount using option subvolid= or subvol= will show the subvolume name in findmnt and /proc/self/mountinfo. This seems like a bug. -- Chris Murphy -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting
On Thu, Apr 09, 2015 at 06:28:48PM +0200, David Sterba wrote: > On Tue, Apr 07, 2015 at 10:34:01PM -0700, Omar Sandoval wrote: > > Currently, mounting a subvolume with subvolid= takes a different code > > path than mounting with subvol=. This isn't really a big deal except for > > the fact that mounts done with subvolid= or the default subvolume don't > > have a dentry that's connected to the dentry tree like in the subvol= > > case. To unify the code paths, when given subvolid= or using the default > > subvolume ID, translate it into a subvolume name by walking > > ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees. > > Can you please split this patches? It's doing several things, but the > core change will probably be a big one. The mount path is not trivial, > all the recursions and argument replacements. Will do. > Otherwise, I'm ok with this approach, ie. to set up the dentry at mount > time. > > A few comments below. > > > /* > > - * This will strip out the subvol=%s argument for an argument string and > > add > > - * subvolid=0 to make sure we get the actual tree root for path walking to > > the > > - * subvol we want. > > + * This will add subvolid=0 to the argument string while removing any > > subvol= > > + * and subvolid= arguments to make sure we get the top-level root for path > > + * walking to the subvol we want. > > */ > > static char *setup_root_args(char *args) > > { > > - unsigned len = strlen(args) + 2 + 1; > > - char *src, *dst, *buf; > > - > > - /* > > -* We need the same args as before, but with this substitution: > > -* s!subvol=[^,]+!subvolid=0! > > -* > > -* Since the replacement string is up to 2 bytes longer than the > > -* original, allocate strlen(args) + 2 + 1 bytes. > > -*/ > > + char *p, *dst, *buf; > > Fix the coding style. Ok. > > root = mount_subtree(mnt, subvol_name); > > + mnt = NULL; /* mount_subtree drops our reference on the vfsmount. */ > > Put the comment on a separate line. Ok. 
> > + if (!IS_ERR(root) && subvol_objectid && > > + BTRFS_I(root->d_inode)->root->root_key.objectid != subvol_objectid) > > { > > + pr_warn("BTRFS: subvol '%s' does not match subvolid %llu\n", > > + subvol_name, subvol_objectid); > > We should define the precedence of subvolid and subvol if both are set. > A warning might not be enough. Ah, that probably deserves some more explanation. My original intent was to alert the user if there was a race where the subvolume passed by ID was renamed and another subvolume was renamed over the old location. Then I figured that users should probably be warned if they are passing bogus mount options, too. However, I just now realized that the current behavior will error out in that case anyways because before this patch, setup_root_args() only replaces the first subvol= and ignores anything that comes after it. So subvol=/foovol,subvolid=258 becomes subvolid=0,subvolid=258 and the last one takes precedence, so the lookup of /foovol happens inside of subvol 258 instead of the top-level and fails. So I think reasonable behavior would be to change that warning into a hard error for both cases (the race and the misguided user). Just in case a user copies the mount options straight out of /proc/mounts or something, we can allow both subvol= and subvolid= to be passed, but only if they match. Thanks for the review! -- Omar -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: mlocate/updatedb and btrfs subvolume mounts
Is disabling PRUNE_BIND_MOUNTS for updatedb really the only solution here? On Fri, Apr 3, 2015 at 1:07 PM, G. Richard Bellamy wrote: > I've just noticed that I'm having issues with finding files using > "locate" when those files are on btrfs subvolume mounts. > > The issue is that updatedb cannot discern the difference between a > btrfs bind mount and btrfs subvolume [1][2]. This generally means that > if you're using btrfs subvolume mounts and updatedb at the same time, > and you want to index those subvolumes, you'll need to set > PRUNE_BIND_MOUNTS to 0 or "no". And then deal with all the cruft that > causes. > > From the bug above, you can see that the RedHat dev Michal Sekletar is > out of ideas. I'm not sure if he's reached out here or not... and if > not, he might welcome some help from the folks on this list. > > Regards, > Richard > > [1] https://bugzilla.redhat.com/show_bug.cgi?id=906591#c3 > [2] http://www.spinics.net/lists/linux-btrfs/msg42510.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: use local btrfs-image in leaf corruption test
On Wed, Apr 08, 2015 at 03:50:04PM +0100, WorMzy Tykashi wrote: > Currently this test uses the system btrfs-image. If there isn't a > btrfs-image on $PATH, the test fails. The test should be using the > locally compiled btrfs-image, not the system one. Added your sign-off and applied, thanks. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: incremental full file backups to smaller mediums possible?
On Thu, Apr 09, 2015 at 06:14:33PM +0200, Christoph Anton Mitterer wrote: > Hey. > > I wondered whether this is possible in btrfs (or could be > implemented),... it's in a way similar to send/receive, but AFAIU not > fully solvable with that. > > What I want to do is making incremental backups of a (btrfs) filesystem > to smaller mediums (that is for example: from a big RAID filesystem to > many BluRays or similar things). > > > Right now I make this as follows: > Every time backups should be made, I create a ext4 image which would > just fit inside the UDF fs of a BluRay (the ext4 image in turn is > dm-crypt encrypted) and copy as many *complete*[0] files from the source > filesystem to that image as possible. That image file is then burned to > BluRay. I also write an SHA512 sum of the whole image file to the BluRay > so that one can check whether it can be still read correctly. > Then the process continues with the remaining files. > > > The main idea behind writing only complete files to the separate parts > of the backup is, that I don't want to have a backup set, where I may > get into troubles if just one of the disks fails. > > > Obviously, when I would do regularly backups that way, then I'd write > each time a big amount of data which hasn't changed at all, which is not > just a costly issue, but also quite time consuming. > So ideally, I'd want to do just an incremental backup of all the files > that have been added/modified + the information of what has been > deleted/moved/changed it's properties. > > All that already smells quite a lot after making a snapshot at the last > backup, and then doing send/receive for the next one. > > > Now the problem, though, is, that I don't want to loose the property of > working in terms of complete files (and I guess send/recieve does not), > i.e. when a big file has just one block changed, I still would like to > have the complete file on the next incremental dump (and not just > information about the "diff"). 
> The only exception is when file metadata changes (i.e. pathname, date, > permissions, xattrs, etc.). If *just* one of these changes I wouldn't > want to backup the full raw data of the file again. > Any ideas whether and how this could be done / assisted by btrfs? btrfs sub find-new might be more helpful to you here. That will give you the list of changed files; then just feed that list to your existing bin-packing algorithm for working out what goes on which disks, and you're done. Hugo. -- Hugo Mills | Dullest spy film ever: The Eastbourne Ultimatum hugo@... carfax.org.uk | http://carfax.org.uk/ | PGP: E2AB1DE4 | The Thick of It signature.asc Description: Digital signature
Re: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting
On Tue, Apr 07, 2015 at 10:34:01PM -0700, Omar Sandoval wrote: > Currently, mounting a subvolume with subvolid= takes a different code > path than mounting with subvol=. This isn't really a big deal except for > the fact that mounts done with subvolid= or the default subvolume don't > have a dentry that's connected to the dentry tree like in the subvol= > case. To unify the code paths, when given subvolid= or using the default > subvolume ID, translate it into a subvolume name by walking > ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees. Can you please split this patches? It's doing several things, but the core change will probably be a big one. The mount path is not trivial, all the recursions and argument replacements. Otherwise, I'm ok with this approach, ie. to set up the dentry at mount time. A few comments below. > /* > - * This will strip out the subvol=%s argument for an argument string and add > - * subvolid=0 to make sure we get the actual tree root for path walking to > the > - * subvol we want. > + * This will add subvolid=0 to the argument string while removing any subvol= > + * and subvolid= arguments to make sure we get the top-level root for path > + * walking to the subvol we want. > */ > static char *setup_root_args(char *args) > { > - unsigned len = strlen(args) + 2 + 1; > - char *src, *dst, *buf; > - > - /* > - * We need the same args as before, but with this substitution: > - * s!subvol=[^,]+!subvolid=0! > - * > - * Since the replacement string is up to 2 bytes longer than the > - * original, allocate strlen(args) + 2 + 1 bytes. > - */ > + char *p, *dst, *buf; Fix the coding style. > root = mount_subtree(mnt, subvol_name); > + mnt = NULL; /* mount_subtree drops our reference on the vfsmount. */ Put the comment on a separate line. 
> + if (!IS_ERR(root) && subvol_objectid && > + BTRFS_I(root->d_inode)->root->root_key.objectid != subvol_objectid) > { > + pr_warn("BTRFS: subvol '%s' does not match subvolid %llu\n", > + subvol_name, subvol_objectid); We should define the precedence of subvolid and subvol if both are set. A warning might not be enough. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting
On Wed, Apr 08, 2015 at 02:06:14PM +0800, Qu Wenruo wrote: > > > Original Message > Subject: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting > From: Omar Sandoval > To: Chris Mason , Josef Bacik , David Sterba > , > Date: 2015年04月08日 13:34 > > > Currently, mounting a subvolume with subvolid= takes a different code > > path than mounting with subvol=. This isn't really a big deal except for > > the fact that mounts done with subvolid= or the default subvolume don't > > have a dentry that's connected to the dentry tree like in the subvol= > > case. To unify the code paths, when given subvolid= or using the default > > subvolume ID, translate it into a subvolume name by walking > > ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees. > Oh, this patch is what I have tried long long ago, and want to do the > same thing, to show subvolume mount for btrfs. > > But it came to me that, superblock->show_path() is a better method to do it. > > You can implement btrfs_show_path() to allow mountinfo to get the > subvolume name from subvolid, and don't change the mount routine much. The problem I see with the show_mount approach is related to the additional path lookup, memory allocation and locking. If the mountpoint dentry is the right one, it's just a simple seq_dentry in show_options. OTOH, your patch takes subvol_sem that will block the callback if there's eg. a subvolume being deleted (that takes the write lock). This is not a lightweight operation nor an infrequent one. There are more write locks to subvol_sem. I'm not sure if I've ever sent this comment back to you, sorry if not. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
incremental full file backups to smaller mediums possible?
Hey. I wondered whether this is possible in btrfs (or could be implemented),... it's in a way similar to send/receive, but AFAIU not fully solvable with that. What I want to do is making incremental backups of a (btrfs) filesystem to smaller mediums (that is for example: from a big RAID filesystem to many BluRays or similar things). Right now I make this as follows: Every time backups should be made, I create a ext4 image which would just fit inside the UDF fs of a BluRay (the ext4 image in turn is dm-crypt encrypted) and copy as many *complete*[0] files from the source filesystem to that image as possible. That image file is then burned to BluRay. I also write an SHA512 sum of the whole image file to the BluRay so that one can check whether it can be still read correctly. Then the process continues with the remaining files. The main idea behind writing only complete files to the separate parts of the backup is, that I don't want to have a backup set, where I may get into troubles if just one of the disks fails. Obviously, when I would do regular backups that way, then I'd write each time a big amount of data which hasn't changed at all, which is not just a costly issue, but also quite time consuming. So ideally, I'd want to do just an incremental backup of all the files that have been added/modified + the information of what has been deleted/moved/changed its properties. All that already smells quite a lot after making a snapshot at the last backup, and then doing send/receive for the next one. Now the problem, though, is, that I don't want to lose the property of working in terms of complete files (and I guess send/receive does not), i.e. when a big file has just one block changed, I still would like to have the complete file on the next incremental dump (and not just information about the "diff"). The only exception is when file metadata changes (i.e. pathname, date, permissions, xattrs, etc.). 
If *just* one of these changes I wouldn't want to backup the full raw data of the file again. Another issue is, that send gives me one big file, while I would need however n files of max size m, where m is e.g. what fits in a BluRay's UDF... or perhaps on a 3,5" HD Floppy ;) Because of the "complete files" constraint I cannot just split the file from btrfs send,... because the split would need to happen so that no files are split up. Any ideas whether and how this could be done / assisted by btrfs? Thanks, Chris. [0] Obviously there is a problem, when a single file would be larger than a BluRay, but so far this hasn't happened to me. If so, I'd of course need to split the file. smime.p7s Description: S/MIME cryptographic signature
btrfs filesystem resize :max not working
Hey list, I've got a problem with resizing a multi drive filesystem. I had a 5 disk array of 4TB drives. Then I added a 5th (6TB) drive to the array and replaced one of the 4TB ones with a 6TB drive. As you can see in my `btrfs fi sh` output below, my newly added drive (ID=6) has the expected size of 5.46TiB, but my replaced drive (ID=4) won't show up as 6TB as expected. Neither a `btrfs fi res max /data/pool0` nor a `btrfs fi res 4:max /data/pool0` had any effect. Any suggestions how to reclaim the 2TB free space of drive 4? Thanks in advance André-Sebastian Liebe > Kernel:Linux apc01 3.19.2-1-ARCH #1 SMP PREEMPT Wed Mar 18 16:21:02 CET 2015 x86_64 GNU/Linux > btrfs-progs: btrfs-progs v3.19 My 6 drives used by the apc01_pool (none of them has a partition table) Konsole output > # hdparm -I /dev/sdb | grep "device size" >device size with M = 1024*1024: 3815447 MBytes >device size with M = 1000*1000: 4000787 MBytes (4000 GB) > # hdparm -I /dev/sdc | grep "device size" >device size with M = 1024*1024: 3815447 MBytes >device size with M = 1000*1000: 4000787 MBytes (4000 GB) > # hdparm -I /dev/sdd | grep "device size" >device size with M = 1024*1024: 3815447 MBytes >device size with M = 1000*1000: 4000787 MBytes (4000 GB) > # hdparm -I /dev/sde | grep "device size" >device size with M = 1024*1024: 3815447 MBytes >device size with M = 1000*1000: 4000787 MBytes (4000 GB) > # hdparm -I /dev/sdf | grep "device size" >device size with M = 1024*1024: 5723166 MBytes >device size with M = 1000*1000: 6001175 MBytes (6001 GB) > # hdparm -I /dev/sdg | grep "device size" >device size with M = 1024*1024: 5723166 MBytes >device size with M = 1000*1000: 6001175 MBytes (6001 GB) btrfs filesystem: Konsole output > # btrfs fi sh > Label: 'apc01_pool0' uuid: 066141c6-16ca-4a30-b55c-e606b90ad0fb >Total devices 6 FS bytes used 23.43TiB >devid1 size 3.64TiB used 3.61TiB path /dev/sdd >devid2 size 3.64TiB used 3.61TiB path /dev/sdc >devid3 size 3.64TiB used 3.61TiB path /dev/sde >devid4 size 
3.64TiB used 3.61TiB path /dev/sdg >devid5 size 3.64TiB used 3.61TiB path /dev/sdb >devid6 size 5.46TiB used 5.43TiB path /dev/sdf -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/3] Btrfs: show subvol= and subvolid= in /proc/mounts
On Tue, Apr 07, 2015 at 10:34:02PM -0700, Omar Sandoval wrote: > Currently, userspace has no way to know which subvolume is mounted. Oh, there is a way, 'btrfs inspect-internal rootid /path/to/mount', just we'd like to see it in the mount options as well. > But, > now that we're guaranteed to have a meaningful root dentry, we can just > export and use seq_dentry() in btrfs_show_options(). The subvolume ID is > easy to get, so put that in there, too. > > Signed-off-by: Omar Sandoval > --- > fs/btrfs/super.c | 4 > fs/seq_file.c| 1 + > 2 files changed, 5 insertions(+) > > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c > index 5ab9801..5e14bb6 100644 > --- a/fs/btrfs/super.c > +++ b/fs/btrfs/super.c > @@ -1193,6 +1193,10 @@ static int btrfs_show_options(struct seq_file *seq, > struct dentry *dentry) > seq_puts(seq, ",fatal_errors=panic"); > if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) > seq_printf(seq, ",commit=%d", info->commit_interval); > + seq_puts(seq, ",subvol="); Please make subvol= the last one, as it can contain any string that could be confused with other options. Although nobody would probably call their subvolume "name,autodefrag" etc, the way to obtain the full path is to either resolve the subvolid, or take the whole text after "subvol=" to the end of the line. > + seq_dentry(seq, dentry, " \t\n\\"); > + seq_printf(seq, ",subvolid=%llu", > + BTRFS_I(d_inode(dentry))->root->root_key.objectid); > return 0; -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Recovering BTRFS from bcache failure.
On Tue, Apr 7, 2015 at 11:40 PM, Dan Merillat wrote: > Bcache failures are nasty, because they leave a mix of old and new > data on the disk. In this case, there was very little dirty data, but > of course the tree roots were dirty and out-of-sync. > > fileserver:/usr/src/btrfs-progs# ./btrfs --version > Btrfs v3.18.2 > > kernel version 3.18 > > [ 572.573566] BTRFS info (device bcache0): enabling auto recovery > [ 572.573619] BTRFS info (device bcache0): disk space caching is enabled > [ 574.266055] BTRFS (device bcache0): parent transid verify failed on > 7567956930560 wanted 613690 found 613681 > [ 574.276952] BTRFS (device bcache0): parent transid verify failed on > 7567956930560 wanted 613690 found 613681 > [ 574.277008] BTRFS: failed to read tree root on bcache0 > [ 574.277187] BTRFS (device bcache0): parent transid verify failed on > 7567956930560 wanted 613690 found 613681 > [ 574.277356] BTRFS (device bcache0): parent transid verify failed on > 7567956930560 wanted 613690 found 613681 > [ 574.277398] BTRFS: failed to read tree root on bcache0 > [ 574.285955] BTRFS (device bcache0): parent transid verify failed on > 7567965720576 wanted 613689 found 613694 > [ 574.298741] BTRFS (device bcache0): parent transid verify failed on > 7567965720576 wanted 613689 found 610499 > [ 574.298804] BTRFS: failed to read tree root on bcache0 > [ 575.047079] BTRFS (device bcache0): bad tree block start 0 7567954464768 > [ 575.111495] BTRFS (device bcache0): parent transid verify failed on > 7567954464768 wanted 613688 found 613685 > [ 575.111559] BTRFS: failed to read tree root on bcache0 > [ 575.121749] BTRFS (device bcache0): bad tree block start 0 7567954214912 > [ 575.131803] BTRFS (device bcache0): parent transid verify failed on > 7567954214912 wanted 613687 found 613680 > [ 575.131866] BTRFS: failed to read tree root on bcache0 > [ 575.180101] BTRFS: open_ctree failed > > all the btrfs tools throw up their hands with similar errors: > ileserver:/usr/src/btrfs-progs# 
btrfs restore /dev/bcache0 -l > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > Ignoring transid failure > Couldn't setup extent tree > Couldn't setup device tree > Could not open root, trying backup super > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > Ignoring transid failure > Couldn't setup extent tree > Couldn't setup device tree > Could not open root, trying backup super > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > Ignoring transid failure > Couldn't setup extent tree > Couldn't setup device tree > Could not open root, trying backup super > > > fileserver:/usr/src/btrfs-progs# ./btrfsck --repair /dev/bcache0 > --init-extent-tree > enabling repair mode > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > Ignoring transid failure > Couldn't setup extent tree > Couldn't setup device tree > Couldn't open file system > > Annoyingly: > # ./btrfs-image -c9 -t4 -s -w /dev/bcache0 /tmp/test.out > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > 
parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > parent transid verify failed on 7567956930560 wanted 613690 found 613681 > Ignoring transid failure > Couldn't setup extent tree > Open ctree failed > create failed (Success) > > So I can't even send an image for people to look at. CCing some more people on this one, while this filesystem isn't important I'd like to know that "restore from backup" isn't the only option for BTRFS corruption. All of the tools simply throw up their hands and bail when confronted with this filesystem, even btrfs-image. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] fstests: test btrfs send after swapping directory names differently
Test btrfs incremental send after renaming and moving directories around in a way that ends up making a directory have different dentries with the same name but pointing to different inodes in the parent and send snapshots, and also inverting the ancestor-descendent relationship between one of those inodes and some other inode. Cases like this made an incremental send enter an infinite loop when building path strings, leading to -ENOMEM errors when the path string reached a length of PATH_MAX. This issue was fixed by the following linux kernel btrfs patch: Btrfs: incremental send, check if orphanized dir inode needs delayed rename Signed-off-by: Filipe Manana --- tests/btrfs/090 | 186 tests/btrfs/090.out | 2 + tests/btrfs/group | 1 + 3 files changed, 189 insertions(+) create mode 100755 tests/btrfs/090 create mode 100644 tests/btrfs/090.out diff --git a/tests/btrfs/090 b/tests/btrfs/090 new file mode 100755 index 000..3eb6f37 --- /dev/null +++ b/tests/btrfs/090 @@ -0,0 +1,186 @@ +#! /bin/bash +# FS QA Test No. btrfs/090 +# +# Test btrfs incremental send after renaming and moving directories around in a +# way that ends up making a directory have different dentries with the same name +# but pointing to different inodes in the parent and send snapshots, and also +# inverting the ancestor-descendent relationship between one of those inodes and +# some other inode. +# +# Cases like this made an incremental send enter an infinite loop when building +# path strings, leading to -ENOMEM errors when the path string reached a length +# of PATH_MAX. +# This issue was fixed by the following linux kernel btrfs patch: +# +# Btrfs: incremental send, check if orphanized dir inode needs delayed rename +# +#--- +# Copyright (C) 2015 SUSE Linux Products GmbH. All Rights Reserved. +# Author: Filipe Manana +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +_cleanup() +{ + rm -fr $send_files_dir + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_require_fssum +_need_to_be_root + +send_files_dir=$TEST_DIR/btrfs-test-$seq + +rm -f $seqres.full +rm -fr $send_files_dir +mkdir $send_files_dir + +_scratch_mkfs >>$seqres.full 2>&1 +_scratch_mount + +mkdir -p $SCRATCH_MNT/data/n1/n2 +mkdir $SCRATCH_MNT/data/n4 +mkdir -p $SCRATCH_MNT/data/t6/t7 +mkdir $SCRATCH_MNT/data/t5 +mkdir $SCRATCH_MNT/data/t7 +mkdir $SCRATCH_MNT/data/n4/t2 +mkdir $SCRATCH_MNT/data/t4 +mkdir $SCRATCH_MNT/data/t3 +mv $SCRATCH_MNT/data/t7 $SCRATCH_MNT/data/n4/t2 +mv $SCRATCH_MNT/data/t4 $SCRATCH_MNT/data/n4/t2/t7 +mv $SCRATCH_MNT/data/t5 $SCRATCH_MNT/data/n4/t2/t7/t4 +mv $SCRATCH_MNT/data/t6 $SCRATCH_MNT/data/n4/t2/t7/t4/t5 +mv $SCRATCH_MNT/data/n1/n2 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6 +mv $SCRATCH_MNT/data/n1 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6 +mv $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6/t7 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6/n2 +mv $SCRATCH_MNT/data/t3 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6/n2/t7 + +# Filesystem looks like: +# +# . 
(ino 256) +# |-- data/ (ino 257) +# |-- n4/ (ino 260) +#|-- t2/ (ino 265) +# |-- t7/ (ino 264) +# |-- t4/(ino 266) +# |-- t5/ (ino 263) +#|-- t6/ (ino 261) +# |-- n1/ (ino 258) +# |-- n2/ (ino 259) +# |-- t7/(ino 262) +#
[PATCH] Btrfs: incremental send, check if orphanized dir inode needs delayed rename
If a directory inode is orphanized, because some inode previously processed has a new name that collides with the old name of the current inode, we need to check if it needs its rename operation delayed too, as its ancestor-descendent relationship with some other inode might have been reversed between the parent and send snapshots and therefore its rename operation needs to happen after that other inode is renamed. For example, for the following reproducer where this is needed (provided by Robbie Ko): $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt2 $ mkdir -p /mnt/data/n1/n2 $ mkdir /mnt/data/n4 $ mkdir -p /mnt/data/t6/t7 $ mkdir /mnt/data/t5 $ mkdir /mnt/data/t7 $ mkdir /mnt/data/n4/t2 $ mkdir /mnt/data/t4 $ mkdir /mnt/data/t3 $ mv /mnt/data/t7 /mnt/data/n4/t2 $ mv /mnt/data/t4 /mnt/data/n4/t2/t7 $ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4 $ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5 $ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6 $ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6 $ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2 $ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7 $ btrfs subvolume snapshot -r /mnt /mnt/snap1 $ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4 $ mv /mnt/data/n4/t2 /mnt/data/n4/n1 $ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2 $ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2 $ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2 $ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6 $ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3 $ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2 $ btrfs subvolume snapshot -r /mnt /mnt/snap2 $ btrfs send /mnt/snap1 | btrfs receive /mnt2 $ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2 ERROR: send ioctl failed with -12: Cannot allocate memory Where the parent snapshot directory hierarchy is the following: .(ino 256) |-- data/(ino 257) |-- n4/(ino 260) |-- t2/ (ino 265) |-- t7/ (ino 264) |-- t4/ (ino 266) |-- t5/(ino 263) |-- t6/ 
(ino 261) |-- n1/ (ino 258) |-- n2/ (ino 259) |-- t7/ (ino 262) |-- t3/(ino 267) And the send snapshot's directory hierarchy is the following: .(ino 256) |-- data/(ino 257) |-- n4/(ino 260) |-- n1/ (ino 258) |-- t2/ (ino 265) |-- n2/ (ino 259) |-- t3/ (ino 267) ||-- t7 (ino 264) | |-- t6/ (ino 261) ||-- t4/(ino 266) | |-- t5/ (ino 263) | |-- t7/ (ino 262) While processing inode 262 we orphanize inode 264 and later attempt to rename inode 264 to its new name/location, which resulted in building an incorrect destination path string for the rename operation with the value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must have been done only after inode 267 is processed and renamed, as the ancestor-descendent relationship between inodes 264 and 267 was reversed between both snapshots, because otherwise it results in an infinite loop when building the path string for inode 264 when we are processing an inode with a number larger than 264. That loop is the following: start inode 264, send progress of 265 for example parent of 264 -> 267 parent of 267 -> 262 parent of 262 -> 259 parent of 259 -> 261 parent of 261 -> 263 parent of 263 -> 266 parent of 266 -> 264 |--> back to first iteration while current path string length is <= PATH_MAX, and fail with -ENOMEM otherwise So fix this by making the check if we need to delay a directory rename regardless of the current inode having been orphanized or not. A test case for fstests follows soon. Thanks to Robbie Ko for providing a reproducer for this problem. Reported-by: Robbie Ko Signed-off-by: Filipe Manana --- fs/btrfs/
Re: BTRFS corruption w/kernel 3.13 while using docker -s btrfs
Thank you for trying btrfs, it's great for snapshots in docker, but outside of the 2 helpful comments you already got, do yourself a favour and use a newer kernel. Btrfs moves fast, and 3.13 is way too old. The number of bugs (including corruption bugs) that has been fixed since then is too long to list. Try getting something newer than 3.16.2 at least, and if you're building your own, 3.19.3 + this patch http://permalink.gmane.org/gmane.comp.file-systems.btrfs/42241 (btrfs: simplify insert_orphan_item / 381cf6587f8a8a8e981bc0c18859b51dc756 ) Cheers, Marc -- "A mouse is a device used to point at the xterm you want to type in" - A.S.R. Microsoft is to operating systems what McDonalds is to gourmet cooking Home page: http://marc.merlins.org/ | PGP 1024R/763BE901 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: unstable atimes on empty dirs in read-only snapshots which were subvol parents
On Fri, Mar 06, 2015 at 12:03:59PM +1100, Paul Harvey wrote: > Apparently in my haste, forgot to include any information in my email. > This is also in the URL to the gist of my test script: > > btrfs v3.17 > Linux 3.16.0-4-amd64 #1 SMP Debian 3.16.7-ckt4-3 (2015-02-03) x86_64 GNU/Linux > > # mount: /dev/loop2 mounted on /tmp/tmp.FkcW7fRde7. > # Showing mount: > # /tmp/tmp.1fbwCCdeNM on /tmp/tmp.FkcW7fRde7 type btrfs > (rw,noatime,space_cache) > # Create subvolume '/tmp/tmp.FkcW7fRde7/subvol' > # mkdir: created directory ‘/tmp/tmp.FkcW7fRde7/snapshots’ > # mkdir: created directory ‘/tmp/tmp.FkcW7fRde7/empty_dir’ > # Create a readonly snapshot of '/tmp/tmp.FkcW7fRde7' in > '/tmp/tmp.FkcW7fRde7/snapshots/1' > # Testing that the subvol dir has stable atime on original parent FS: > # Testing that '/tmp/tmp.FkcW7fRde7/subvol' has repeatable atime of > # 2015-03-06T11:09:40+1100... > # 1: 2015-03-06T11:09:40+1100 > # 2: 2015-03-06T11:09:40+1100 > # PASS /tmp/tmp.FkcW7fRde7/subvol atime is stable :) > # Testing that a normal empty dir has stable atime on the snapshot: > # Testing that '/tmp/tmp.FkcW7fRde7/snapshots/1/empty_dir' has > repeatable atime of > # 2015-03-06T11:09:40+1100... > # 1: 2015-03-06T11:09:40+1100 > # 2: 2015-03-06T11:09:40+1100 > # PASS /tmp/tmp.FkcW7fRde7/snapshots/1/empty_dir atime is stable :) > # Testing that the subvol dir has stable atime on snapshot of parent FS: > # Testing that '/tmp/tmp.FkcW7fRde7/snapshots/1/subvol' has repeatable atime > of > # 2015-03-06T11:09:48+1100... 
> # 1: 2015-03-06T11:09:50+1100 > # 2: 2015-03-06T11:09:52+1100 > # FAIL /tmp/tmp.FkcW7fRde7/snapshots/1/subvol atime is unstable :( > # './btrfs-atime-bug.sh nocleanup' not specified so cleaning up our mess: > # umount: /tmp/tmp.FkcW7fRde7 unmounted > # rmdir: removing directory, ‘/tmp/tmp.FkcW7fRde7’ > # removed ‘/tmp/tmp.1fbwCCdeNM’ Right, that's an intended behaviour because we'd like to avoid hardlink similar problems, that is, to allow ONLY one valid access to 'subvol'. Here btrfs makes a pseudo 'subvol' with setting CURRENT_TIME to inode->atime/ctime/mtime, that's why we see it's unstable. Thanks, -liubo > > On 6 March 2015 at 11:29, Paul Harvey wrote: > > Hi there, > > > > Apologies for not confirming on a much more recent kernel, if anyone > > could please try my test script for me on a newer kernel that would be > > very much appreciated. > > > > I'm working on reproducible builds, and part of this workflow involves > > tar archiving parts of read-only btrfs snapshots. Problem is, some of > > these tar archives are different from run to run when they capture an > > empty directory that happened to be a subvol parent on the original > > FS: the atimes on these empty dirs are always returning the current > > time - which is not the case with an ordinary empty directory created > > with mkdir; it's also not the same behaviour on the original FS (tar > > archives are reproducible if we use the original FS rather than the > > read-only snapshot). This all happens regardless of mounting noatime. > > > > Perhaps this verbiage is convoluted, I'm writing this in a hurry with > > limited internet connectivity - I have a reproducible test case here > > at https://gist.github.com/csirac2/c2b5b2b9d0193b3c08a8 > > > > Again, I understand this is a pretty old kernel and perhaps this is > > fixed by now, I'll try a more recent kernel with more assertive bug > > report next week if nobody has time to try out my test case. 
> > > > Cheers > > > > -- > > Paul Harvey > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: snapshot space use
On Thu, Apr 09, 2015 at 08:45:21AM +0800, Qu Wenruo wrote: > *snip* > NOTE: quota is not so stable and has some problem, but should give > you enough info. Those are related to actually using quota or can also hit you when you want to use it just for things like this snapshot space use? Piotr Szymaniak. -- - Oo, jesteś bystrzejszy, niż się wydaje. Przechodziłeś jakieś szkolenie antyterrorystyczne? - W pewnym sensie tak. Byłem żonaty. -- Nelson DeMille, "The Lion's Game" signature.asc Description: Digital signature
Re: BTRFS corruption w/kernel 3.13 while using docker -s btrfs
On Sun, May 18, 2014 at 6:28 AM, Paul Harvey wrote: > This is a "Damnit! I held the power button in and now it won't mount!" > story, but I'm sharing what I found to learn what I can. And in case it's > useful for btrfs development. > > Also curious if my usage of Docker (an LXC thing, http://docker.io) has > exacerbated things. I ask because it seems some btrfs mount options are > breaking docker: > https://github.com/dotcloud/docker/issues/5429#issuecomment-42443919 - > however, I'm not using space_cache or inode_cache myself. > > Notes: > - I've taken a dd copy of the btrfs partition so that I can perform anything > extra people would like to see. > - The following output is taken from the same machine, same partition layout > but a fresh/working Debian testing install on a different disk. It should be > reporting the same as it did before the corruption (it was definitely Linux > 3.13). > - /dev/sda is a new disk, /dev/sdb3 is the corrupted btrfs partition. > - I have a single btrfs partition which contains the rootfs. I use > subvolumes to separately snapshot /home, /vms, etc. > - mount options were: noatime,autodefrag,discard,compress=lzo Don't use the discard option on this kernel release. There are 2 known bugs that made btrfs issue a discard on extents when it shouldn't, which results in those messages you are seeing "btrfs bad tree block start 0 xxx" (the key part is the 0 following the word start). The fixes are: 1) https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=678886bdc6378c1cbd5072da2c5a3035000214e3 Happens both with -o discard and without it if you run fstrim after the fs went into readonly mode. This is included in most stable releases afaik, but certainly not in the kernel version you are using. The btrfs-zero-log tool won't help here; 2) https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/?id=dcc82f4783ad91d4ab654f89f37ae9291cdc846a Happens only with -o discard, and affects only the fsync log. 
You can get your fs mountable again after using btrfs-zero-log (but losing all data fsynced in the transaction that was open when a crash/reboot happened). Not yet in any kernel release, only on linux-next at the moment. > > $ uname -a > Linux weatherwax 3.13-1-amd64 #1 SMP Debian 3.13.10-1 (2014-04-15) > x86_64 GNU/Linux > $ sudo btrfs version > Btrfs v3.14.1 > $ sudo btrfs fi show /dev/sdb3 > Label: 'weatherwax 0' uuid: 721926c3-147a-44a0-8c82-62534dd6ee94 > Total devices 1 FS bytes used 246.14GiB > devid1 size 357.63GiB used 313.06GiB path /dev/sdb3 > > Btrfs v3.14.1 > $ dmesg # see attached dmesg.txt, includes connecting corrupted disk via USB > $ sudo mount -o recovery,ro /dev/sdb3 /mnt > mount: wrong fs type, bad option, bad superblock on /dev/sdb3, > missing codepage or helper program, or other error > In some cases useful info is found in syslog - try > dmesg | tail or so > [ 1297.632057] btrfs: device label weatherwax 0 devid 1 transid 504075 > /dev/sdb3 > [ 1297.633042] btrfs: enabling auto recovery > [ 1297.633045] btrfs: disk space caching is enabled > [ 1297.830793] btrfs bad tree block start 0 632081858560 > [ 1297.831012] btrfs bad tree block start 0 632081858560 > [ 1297.831016] btrfs: failed to read log tree > [ 1297.892241] btrfs: open_ctree failed > $ sudo btrfs-find-root /dev/sdb3 > Super think's the tree root is at 632081670144, chunk root 386274426880 > Went past the fs size, exiting > $ sudo btrfs-image -c9 -t4 /dev/sdb3 /mnt/sdb3.img > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > read block failed check_tree_block > Couldn't setup log root tree > Open ctree failed > create failed (Success) > $ du /mnt/sdb3.img > 0 /mnt/sdb3.img > $ sudo btrfs rescue chunk-recover /dev/sdb3 # Takes a few hours at > 30MiB/sec, reports 326 
(IIRC) good chunks, no bad or orphaned chunks. > $ sudo btrfs restore -i /dev/sdb3 /mnt # recovers ~3.2GB of /var from > the old rootfs, nothing useful. A lot of "failed to inflate: -6" > $ sudo btrfs restore -l /dev/sdb3 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > read block failed check_tree_block > Couldn't setup log root tree > tree key (EXTENT_TREE ROOT_ITEM 0) 632081240064 level 3 > tree key (DEV_TREE ROOT_ITEM 0) 631709724672 level 1 > tree key (FS_TREE ROOT_ITEM 0) 632081072128 level 3 > tree key (CSUM_TREE ROOT_ITEM 0) 632079880192 level 3 > tree key (UUID_TREE ROOT_ITEM 0) 632070057984 level 0 > tree key (262 ROOT_ITEM 0) 632079863808 level 3 > tree key (263 ROOT_ITEM 0) 5308496773
Re: snapshot space use
Original Message Subject: Re: snapshot space use From: Piotr Szymaniak To: Qu Wenruo Date: 2015年04月09日 17:02 On Thu, Apr 09, 2015 at 08:45:21AM +0800, Qu Wenruo wrote: *snip* NOTE: quota is not so stable and has some problem, but should give you enough info. Those are related to actually using quota or can also hit you when you want to use it just for things like this snapshot space use? Using quota has more problems, like exceeding the quota making you unable to delete any file (like a deadlock). Your use case, just showing how much space you use, may still hit some unusual problems like inaccurate numbers or even negative numbers, along with a performance drop. But overall, it should be OK and meet your needs. Thanks, Qu Piotr Szymaniak. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH/RFC] fscache/cachefiles versus btrfs
NeilBrown wrote: > Is there a better way? Could a better way be created? Maybe > SEEK_DATA_RELIABLE ?? fiemap() maybe? > Also, if you do try to use fscache on btrfs with 3.19, then nothing gets > cached (as expected) and with a heavy load you can lose a race and get an > asserting fail in fscache_enqueue_operation Do you have the patches here applied? http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=fscache-fixes David -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BTRFS corruption w/kernel 3.13 while using docker -s btrfs
Hi, On Sun, May 18, 2014 at 03:28:18PM +1000, Paul Harvey wrote: > This is a "Damnit! I held the power button in and now it won't mount!" story, > but I'm sharing what I found to learn what I can. And in case it's useful for > btrfs development. > > Also curious if my usage of Docker (an LXC thing, http://docker.io) has > exacerbated things. I ask because it seems some btrfs mount options are > breaking docker: > https://github.com/dotcloud/docker/issues/5429#issuecomment-42443919 - > however, I'm not using space_cache or inode_cache myself. > > Notes: > - I've taken a dd copy of the btrfs partition so that I can perform anything > extra people would like to see. > - The following output is taken from the same machine, same partition layout > but a fresh/working Debian testing install on a different disk. It should be > reporting the same as it did before the corruption (it was definitely Linux > 3.13). > - /dev/sda is a new disk, /dev/sdb3 is the corrupted btrfs partition. > - I have a single btrfs partition which contains the rootfs. I use subvolumes > to separately snapshot /home, /vms, etc. 
> - mount options were: noatime,autodefrag,discard,compress=lzo > > $ uname -a > Linux weatherwax 3.13-1-amd64 #1 SMP Debian 3.13.10-1 (2014-04-15) > x86_64 GNU/Linux > $ sudo btrfs version > Btrfs v3.14.1 > $ sudo btrfs fi show /dev/sdb3 > Label: 'weatherwax 0' uuid: 721926c3-147a-44a0-8c82-62534dd6ee94 > Total devices 1 FS bytes used 246.14GiB > devid1 size 357.63GiB used 313.06GiB path /dev/sdb3 > > Btrfs v3.14.1 > $ dmesg # see attached dmesg.txt, includes connecting corrupted disk via USB > $ sudo mount -o recovery,ro /dev/sdb3 /mnt > mount: wrong fs type, bad option, bad superblock on /dev/sdb3, > missing codepage or helper program, or other error > In some cases useful info is found in syslog - try > dmesg | tail or so > [ 1297.632057] btrfs: device label weatherwax 0 devid 1 transid 504075 > /dev/sdb3 > [ 1297.633042] btrfs: enabling auto recovery > [ 1297.633045] btrfs: disk space caching is enabled > [ 1297.830793] btrfs bad tree block start 0 632081858560 > [ 1297.831012] btrfs bad tree block start 0 632081858560 > [ 1297.831016] btrfs: failed to read log tree > [ 1297.892241] btrfs: open_ctree failed This shows that your trees are good except log tree, as if btrfs uses backup trees for recovery it'll zero log tree and clear free space cache , I recommend you to use btrfs-zero-log. 
Thanks, -liubo > $ sudo btrfs-find-root /dev/sdb3 > Super think's the tree root is at 632081670144, chunk root 386274426880 > Went past the fs size, exiting > $ sudo btrfs-image -c9 -t4 /dev/sdb3 /mnt/sdb3.img > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > read block failed check_tree_block > Couldn't setup log root tree > Open ctree failed > create failed (Success) > $ du /mnt/sdb3.img > 0 /mnt/sdb3.img > $ sudo btrfs rescue chunk-recover /dev/sdb3 # Takes a few hours at > 30MiB/sec, reports 326 (IIRC) good chunks, no bad or orphaned chunks. > $ sudo btrfs restore -i /dev/sdb3 /mnt # recovers ~3.2GB of /var from > the old rootfs, nothing useful. A lot of "failed to inflate: -6" > $ sudo btrfs restore -l /dev/sdb3 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > read block failed check_tree_block > Couldn't setup log root tree > tree key (EXTENT_TREE ROOT_ITEM 0) 632081240064 level 3 > tree key (DEV_TREE ROOT_ITEM 0) 631709724672 level 1 > tree key (FS_TREE ROOT_ITEM 0) 632081072128 level 3 > tree key (CSUM_TREE ROOT_ITEM 0) 632079880192 level 3 > tree key (UUID_TREE ROOT_ITEM 0) 632070057984 level 0 > tree key (262 ROOT_ITEM 0) 632079863808 level 3 > tree key (263 ROOT_ITEM 0) 530849677312 level 2 > tree key (264 ROOT_ITEM 0) 632081104896 level 3 > tree key (265 ROOT_ITEM 0) 632070590464 level 3 > tree key (577 ROOT_ITEM 0) 633083752448 level 2 > # ... 
~80 more lines of similar output > $ sudo btrfsck /dev/sdb3 # exits quickly, < 2s > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > Check tree block failed, want=632081858560, have=0 > read block failed check_tree_block > Couldn't setup log root tree > Checking filesystem on /dev/sdb3 > UUID: 721926c3-147a-44a0-8c82-62534dd6ee94 > checking extents > btrfsck[5378]: segfault at 1d8 ip 004176d4 sp 7fff50ed58a0 error > 4 in btrfsck[40+57000] # from dmesg > $ sudo btrfsck --backup /de
[PATCH/RFC] fscache/cachefiles versus btrfs
hi, fscache cannot currently be used with btrfs as the backing store for the cache (managed by cachefilesd). This is because cachefiles needs the ->bmap address_space_operation, and btrfs doesn't provide it. cachefiles only uses this to find out if a particular page is a 'hole' or not. For btrfs, this can be done with 'SEEK_DATA'. Unfortunately it doesn't seem to be possible to query a filesystem or a file to see if SEEK_DATA is reliable or not, so we cannot simply use SEEK_DATA when reliable, else ->bmap if available. The following patch makes fscache work for me on btrfs. It explicitly checks for BTRFS_SUPER_MAGIC. Not really a nice solution, but all I could think of. Is there a better way? Could a better way be created? Maybe SEEK_DATA_RELIABLE ?? Comments, suggestions welcome. Also, if you do try to use fscache on btrfs with 3.19, then nothing gets cached (as expected) and with a heavy load you can lose a race and get an assertion failure in fscache_enqueue_operation ASSERT(fscache_object_is_available(op->object)); It looks like the object is being killed before it is available... [ 859.700765] kernel BUG at ../fs/fscache/operation.c:38! ... [ 859.703124] Call Trace: [ 859.703193] [] fscache_run_op.isra.4+0x34/0x80 [fscache] [ 859.703260] [] fscache_start_operations+0xa0/0xf0 [fscache] [ 859.703388] [] fscache_kill_object+0x98/0xc0 [fscache] [ 859.703455] [] fscache_object_work_func+0x151/0x210 [fscache] [ 859.703578] [] process_one_work+0x147/0x3c0 [ 859.703642] [] worker_thread+0x20c/0x470 I haven't figured out the cause of that yet. 
Thanks, NeilBrown diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1e51714eb33e..1389d8483d5d 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "internal.h" #define CACHEFILES_KEYBUF_SIZE 512 @@ -647,7 +648,8 @@ lookup_again: ret = -EPERM; aops = object->dentry->d_inode->i_mapping->a_ops; - if (!aops->bmap) + if (!aops->bmap && + object->dentry->d_sb->s_magic != BTRFS_SUPER_MAGIC) goto check_error; object->backer = object->dentry; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c6cd8d7a4eef..49fb330c0ab8 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -410,11 +410,11 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, inode = object->backer->d_inode; ASSERT(S_ISREG(inode->i_mode)); - ASSERT(inode->i_mapping->a_ops->bmap); ASSERT(inode->i_mapping->a_ops->readpages); /* calculate the shift required to use bmap */ - if (inode->i_sb->s_blocksize > PAGE_SIZE) + if (inode->i_mapping->a_ops->bmap && + inode->i_sb->s_blocksize > PAGE_SIZE) goto enobufs; shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; @@ -423,20 +423,36 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; - /* we assume the absence or presence of the first block is a good -* enough indication for the page as a whole -* - TODO: don't use bmap() for this as it is _not_ actually good -* enough for this as it doesn't indicate errors, but it's all we've -* got for the moment -*/ - block0 = page->index; - block0 <<= shift; - - block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); - _debug("%llx -> %llx", - (unsigned long long) block0, - (unsigned long long) block); + if (inode->i_mapping->a_ops->bmap) { + /* we assume the absence or presence of the first block is a good +* enough indication for the page as a whole +* - TODO: don't use bmap() for this as it is _not_ 
actually good +* enough for this as it doesn't indicate errors, but it's all we've +* got for the moment +*/ + block0 = page->index; + block0 <<= shift; + block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0); + _debug("%llx -> %llx", + (unsigned long long) block0, + (unsigned long long) block); + } else { + /* Use llseek */ + struct path path; + struct file *file; + path.mnt = cache->mnt; + path.dentry = object->backer; + file = dentry_open(&path, O_RDONLY, cache->cache_cred); + if (IS_ERR(file)) + goto enobufs; + block = vfs_llseek(file, page->index << PAGE_SHIFT, SEEK_DATA); + filp_close(file, NULL); +