[PATCH v2 4/6] Btrfs: fail on mismatched subvol and subvolid mount options

2015-04-09 Thread Omar Sandoval
There's nothing to stop a user from passing both subvol= and subvolid=
to mount, but if they don't refer to the same subvolume, someone is
going to be surprised at some point. Error out on this case, but allow
users to pass in both if they do match (which they could, for example,
get out of /proc/mounts).

Signed-off-by: Omar Sandoval 
---
 fs/btrfs/super.c | 32 
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ab100e5..20b470d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1163,8 +1163,9 @@ static char *setup_root_args(char *args)
return buf;
 }
 
-static struct dentry *mount_subvol(const char *subvol_name, int flags,
-  const char *device_name, char *data)
+static struct dentry *mount_subvol(const char *subvol_name, u64 
subvol_objectid,
+  int flags, const char *device_name,
+  char *data)
 {
struct dentry *root;
struct vfsmount *mnt = NULL;
@@ -1210,12 +1211,26 @@ static struct dentry *mount_subvol(const char 
*subvol_name, int flags,
/* mount_subtree() drops our reference on the vfsmount. */
mnt = NULL;
 
-   if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+   if (!IS_ERR(root)) {
struct super_block *s = root->d_sb;
-   dput(root);
-   root = ERR_PTR(-EINVAL);
-   deactivate_locked_super(s);
-   pr_err("BTRFS: '%s' is not a valid subvolume\n", subvol_name);
+   u64 root_objectid = 
BTRFS_I(root->d_inode)->root->root_key.objectid;
+
+   ret = 0;
+   if (!is_subvolume_inode(root->d_inode)) {
+   pr_err("BTRFS: '%s' is not a valid subvolume\n",
+  subvol_name);
+   ret = -EINVAL;
+   }
+   if (subvol_objectid && root_objectid != subvol_objectid) {
+   pr_err("BTRFS: subvol '%s' does not match subvolid 
%llu\n",
+  subvol_name, subvol_objectid);
+   ret = -EINVAL;
+   }
+   if (ret) {
+   dput(root);
+   root = ERR_PTR(ret);
+   deactivate_locked_super(s);
+   }
}
 
 out:
@@ -1308,7 +1323,8 @@ static struct dentry *btrfs_mount(struct file_system_type 
*fs_type, int flags,
 
if (subvol_name) {
/* mount_subvol() will free subvol_name. */
-   return mount_subvol(subvol_name, flags, device_name, data);
+   return mount_subvol(subvol_name, subvol_objectid, flags,
+   device_name, data);
}
 
security_init_mnt_opts(&new_sec_opts);
-- 
2.3.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/6] Btrfs: unify subvol= and subvolid= mounting

2015-04-09 Thread Omar Sandoval
Currently, mounting a subvolume with subvolid= takes a different code
path than mounting with subvol=. This isn't really a big deal except for
the fact that mounts done with subvolid= or the default subvolume don't
have a dentry that's connected to the dentry tree like in the subvol=
case. To unify the code paths, when given subvolid= or using the default
subvolume ID, translate it into a subvolume name by walking
ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees.

Signed-off-by: Omar Sandoval 
---
 fs/btrfs/super.c | 229 +--
 1 file changed, 171 insertions(+), 58 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 20b470d..80a8047 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -841,33 +841,153 @@ out:
return error;
 }
 
-static struct dentry *get_default_root(struct super_block *sb,
-  u64 subvol_objectid)
+static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+  u64 subvol_objectid)
 {
-   struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
-   struct btrfs_root *new_root;
-   struct btrfs_dir_item *di;
-   struct btrfs_path *path;
-   struct btrfs_key location;
-   struct inode *inode;
-   u64 dir_id;
-   int new = 0;
+   struct btrfs_root *fs_root;
+   struct btrfs_root_ref *root_ref;
+   struct btrfs_inode_ref *inode_ref;
+   struct btrfs_key key;
+   struct btrfs_path *path = NULL;
+   char *name = NULL, *ptr;
+   u64 dirid;
+   int len;
+   int ret;
+
+   path = btrfs_alloc_path();
+   if (!path) {
+   ret = -ENOMEM;
+   goto err;
+   }
+   path->leave_spinning = 1;
+
+   name = kmalloc(PATH_MAX, GFP_NOFS);
+   if (!name) {
+   ret = -ENOMEM;
+   goto err;
+   }
+   ptr = name + PATH_MAX - 1;
+   ptr[0] = '\0';
 
/*
-* We have a specific subvol we want to mount, just setup location and
-* go look up the root.
+* Walk up the subvolume trees in the tree of tree roots by root
+* backrefs until we hit the top-level subvolume.
 */
-   if (subvol_objectid) {
-   location.objectid = subvol_objectid;
-   location.type = BTRFS_ROOT_ITEM_KEY;
-   location.offset = (u64)-1;
-   goto find_root;
+   while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+   key.objectid = subvol_objectid;
+   key.type = BTRFS_ROOT_BACKREF_KEY;
+   key.offset = (u64)-1;
+
+   ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+   if (ret < 0) {
+   goto err;
+   } else if (ret > 0) {
+   ret = btrfs_previous_item(root, path, subvol_objectid,
+ BTRFS_ROOT_BACKREF_KEY);
+   if (ret < 0) {
+   goto err;
+   } else if (ret > 0) {
+   ret = -ENOENT;
+   goto err;
+   }
+   }
+
+   btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+   subvol_objectid = key.offset;
+
+   root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_root_ref);
+   len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+   ptr -= len + 1;
+   if (ptr < name) {
+   ret = -ENAMETOOLONG;
+   goto err;
+   }
+   read_extent_buffer(path->nodes[0], ptr + 1,
+  (unsigned long)(root_ref + 1), len);
+   ptr[0] = '/';
+   dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+   btrfs_release_path(path);
+
+   key.objectid = subvol_objectid;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+   key.offset = (u64)-1;
+   fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+   if (IS_ERR(fs_root)) {
+   ret = PTR_ERR(fs_root);
+   goto err;
+   }
+
+   /*
+* Walk up the filesystem tree by inode refs until we hit the
+* root directory.
+*/
+   while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+   key.objectid = dirid;
+   key.type = BTRFS_INODE_REF_KEY;
+   key.offset = (u64)-1;
+
+   ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 
0);
+   if (ret < 0) {
+   goto err;
+   } else if

[PATCH v2 3/6] Btrfs: clean up error handling in mount_subvol()

2015-04-09 Thread Omar Sandoval
In preparation for new functionality in mount_subvol(), give it
ownership of subvol_name and tidy up the error paths.

Signed-off-by: Omar Sandoval 
---
 fs/btrfs/super.c | 61 ++--
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index bac3c9a..ab100e5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1167,55 +1167,61 @@ static struct dentry *mount_subvol(const char 
*subvol_name, int flags,
   const char *device_name, char *data)
 {
struct dentry *root;
-   struct vfsmount *mnt;
+   struct vfsmount *mnt = NULL;
char *newargs;
+   int ret;
 
newargs = setup_root_args(data);
-   if (!newargs)
-   return ERR_PTR(-ENOMEM);
-   mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
-newargs);
+   if (!newargs) {
+   root = ERR_PTR(-ENOMEM);
+   goto out;
+   }
 
-   if (PTR_RET(mnt) == -EBUSY) {
+   mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
+   if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
if (flags & MS_RDONLY) {
-   mnt = vfs_kern_mount(&btrfs_fs_type, flags & 
~MS_RDONLY, device_name,
-newargs);
+   mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+device_name, newargs);
} else {
-   int r;
-   mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, 
device_name,
-newargs);
+   mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+device_name, newargs);
if (IS_ERR(mnt)) {
-   kfree(newargs);
-   return ERR_CAST(mnt);
+   root = ERR_CAST(mnt);
+   mnt = NULL;
+   goto out;
}
 
down_write(&mnt->mnt_sb->s_umount);
-   r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+   ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
up_write(&mnt->mnt_sb->s_umount);
-   if (r < 0) {
-   /* FIXME: release vfsmount mnt ??*/
-   kfree(newargs);
-   return ERR_PTR(r);
+   if (ret < 0) {
+   root = ERR_PTR(ret);
+   goto out;
}
}
}
-
-   kfree(newargs);
-
-   if (IS_ERR(mnt))
-   return ERR_CAST(mnt);
+   if (IS_ERR(mnt)) {
+   root = ERR_CAST(mnt);
+   mnt = NULL;
+   goto out;
+   }
 
root = mount_subtree(mnt, subvol_name);
+   /* mount_subtree() drops our reference on the vfsmount. */
+   mnt = NULL;
 
if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
struct super_block *s = root->d_sb;
dput(root);
root = ERR_PTR(-EINVAL);
deactivate_locked_super(s);
-   printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
-   subvol_name);
+   pr_err("BTRFS: '%s' is not a valid subvolume\n", subvol_name);
}
 
+out:
+   mntput(mnt);
+   kfree(newargs);
+   kfree(subvol_name);
return root;
 }
 
@@ -1301,9 +1307,8 @@ static struct dentry *btrfs_mount(struct file_system_type 
*fs_type, int flags,
}
 
if (subvol_name) {
-   root = mount_subvol(subvol_name, flags, device_name, data);
-   kfree(subvol_name);
-   return root;
+   /* mount_subvol() will free subvol_name. */
+   return mount_subvol(subvol_name, flags, device_name, data);
}
 
security_init_mnt_opts(&new_sec_opts);
-- 
2.3.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 6/6] Btrfs: show subvol= and subvolid= in /proc/mounts

2015-04-09 Thread Omar Sandoval
Now that we're guaranteed to have a meaningful root dentry, we can just
export seq_dentry() and use it in btrfs_show_options(). The subvolume ID
is easy to get and can also be useful, so put that in there, too.

Signed-off-by: Omar Sandoval 
---
 fs/btrfs/super.c | 4 
 fs/seq_file.c| 1 +
 2 files changed, 5 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 80a8047..f334cc4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1193,6 +1193,10 @@ static int btrfs_show_options(struct seq_file *seq, 
struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+   seq_printf(seq, ",subvolid=%llu",
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+   seq_puts(seq, ",subvol=");
+   seq_dentry(seq, dentry, " \t\n\\");
return 0;
 }
 
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f821..52b4927 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -538,6 +538,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, 
const char *esc)
 
return res;
 }
+EXPORT_SYMBOL(seq_dentry);
 
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
-- 
2.3.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bug: proc mountinfo, findmnt, and subvol vs subvolid

2015-04-09 Thread Chris Murphy
On Thu, Apr 9, 2015 at 1:41 PM, Omar Sandoval  wrote:
> On Thu, Apr 09, 2015 at 01:38:19PM -0600, Chris Murphy wrote:
>> Another way to put this is, the only reliable way to mount and get
>> subvolume info in findmnt and /proc is by using subvol=.
>>
>> When using subvolid=, the subvolume info isn't available unless the
>> initial mount is the top level (ID 5).
>
> Working on it :) https://lkml.org/lkml/2015/4/8/16

OK great. Seems like it could also (passively) relate to the
"mlocate/updatedb and btrfs subvolume mounts" thread.


-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs filesystem resize :max not working

2015-04-09 Thread Chris Murphy
On Thu, Apr 9, 2015 at 10:03 AM, André-Sebastian Liebe  wrote:
> Hey list,
>
> I've got a problem with resizing a multi drive filesystem.
> I had a 5 disk array of 4TB drives. Then I added a 6th (6TB) drive to
> the array and replaced one of the 4TB ones with a 6TB drive.

You did device add, device delete, then fi resize? It should work but
it's better to use btrfs replace start for this sort of operation
since it does all of that in one step.

> As you can
> see in my `btrfs fi sh` output below, my newly added drive (ID=6) has
> the expected size of 5.46TiB, but my replaced drive (ID=4) won't show up
> as 6TB as expected. Neither a `btrfs fi res max /data/pool0` nor a
> `btrfs fi res 4:max /data/pool0` had any effect.
>
> Any suggestions how to reclaim the 2TB free space of drive 4?

Not sure, this has always worked for me in the past, online. You could
try unmounting and doing a btrfs dev scan, remounting, and then
retrying the command. There's nothing in dmesg? There should be
something there whether it succeeds or fails.


-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bug: proc mountinfo, findmnt, and subvol vs subvolid

2015-04-09 Thread Omar Sandoval
On Thu, Apr 09, 2015 at 01:38:19PM -0600, Chris Murphy wrote:
> Another way to put this is, the only reliable way to mount and get
> subvolume info in findmnt and /proc is by using subvol=.
> 
> When using subvolid=, the subvolume info isn't available unless the
> initial mount is the top level (ID 5).

Working on it :) https://lkml.org/lkml/2015/4/8/16

-- 
Omar
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bug: proc mountinfo, findmnt, and subvol vs subvolid

2015-04-09 Thread Chris Murphy
Another way to put this is, the only reliable way to mount and get
subvolume info in findmnt and /proc is by using subvol=.

When using subvolid=, the subvolume info isn't available unless the
initial mount is the top level (ID 5).


-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


bug: proc mountinfo, findmnt, and subvol vs subvolid

2015-04-09 Thread Chris Murphy
kernel-4.0.0-0.rc6.git0.1.fc22.i686


The short version is that if the top level subvolume is not mounted
first, any usage of subvolid= fails to show the subvolume in either
findmnt or /proc/self/mountinfo.

That is, only when the initial mount is the top level, any subsequent
mount using option subvolid= or subvol= will show the subvolume name
in findmnt and /proc/self/mountinfo.

This seems like a bug.

-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting

2015-04-09 Thread Omar Sandoval
On Thu, Apr 09, 2015 at 06:28:48PM +0200, David Sterba wrote:
> On Tue, Apr 07, 2015 at 10:34:01PM -0700, Omar Sandoval wrote:
> > Currently, mounting a subvolume with subvolid= takes a different code
> > path than mounting with subvol=. This isn't really a big deal except for
> > the fact that mounts done with subvolid= or the default subvolume don't
> > have a dentry that's connected to the dentry tree like in the subvol=
> > case. To unify the code paths, when given subvolid= or using the default
> > subvolume ID, translate it into a subvolume name by walking
> > ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees.
> 
> Can you please split this patch? It's doing several things, but the
> core change will probably be a big one. The mount path is not trivial,
> all the recursions and argument replacements.

Will do.

> Otherwise, I'm ok with this approach, ie. to set up the dentry at mount
> time.
> 
> A few comments below.
> 
> >  /*
> > - * This will strip out the subvol=%s argument for an argument string and 
> > add
> > - * subvolid=0 to make sure we get the actual tree root for path walking to 
> > the
> > - * subvol we want.
> > + * This will add subvolid=0 to the argument string while removing any 
> > subvol=
> > + * and subvolid= arguments to make sure we get the top-level root for path
> > + * walking to the subvol we want.
> >   */
> >  static char *setup_root_args(char *args)
> >  {
> > -   unsigned len = strlen(args) + 2 + 1;
> > -   char *src, *dst, *buf;
> > -
> > -   /*
> > -* We need the same args as before, but with this substitution:
> > -* s!subvol=[^,]+!subvolid=0!
> > -*
> > -* Since the replacement string is up to 2 bytes longer than the
> > -* original, allocate strlen(args) + 2 + 1 bytes.
> > -*/
> > +   char *p, *dst, *buf;
> 
> Fix the coding style.

Ok.

> > root = mount_subtree(mnt, subvol_name);
> > +   mnt = NULL; /* mount_subtree drops our reference on the vfsmount. */
> 
> Put the comment on a separate line.

Ok.

> > +   if (!IS_ERR(root) && subvol_objectid &&
> > +   BTRFS_I(root->d_inode)->root->root_key.objectid != subvol_objectid) 
> > {
> > +   pr_warn("BTRFS: subvol '%s' does not match subvolid %llu\n",
> > +   subvol_name, subvol_objectid);
> 
> We should define the precedence of subvolid and subvol if both are set.
> A warning might not be enough.

Ah, that probably deserves some more explanation. My original intent was
to alert the user if there was a race where the subvolume passed by ID
was renamed and another subvolume was renamed over the old location.
Then I figured that users should probably be warned if they are passing
bogus mount options, too.

However, I just now realized that the current behavior will error out in
that case anyways because before this patch, setup_root_args() only
replaces the first subvol= and ignores anything that comes after it. So
subvol=/foovol,subvolid=258 becomes subvolid=0,subvolid=258 and the last
one takes precedence, so the lookup of /foovol happens inside of subvol
258 instead of the top-level and fails.

So I think reasonable behavior would be to change that warning into a
hard error for both cases (the race and the misguided user). Just in
case a user copies the mount options straight out of /proc/mounts or
something, we can allow both subvol= and subvolid= to be passed, but
only if they match.

Thanks for the review!
-- 
Omar
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlocate/updatedb and btrfs subvolume mounts

2015-04-09 Thread G. Richard Bellamy
Is disabling PRUNE_BIND_MOUNTS for updatedb really the only solution here?

On Fri, Apr 3, 2015 at 1:07 PM, G. Richard Bellamy
 wrote:
> I've just noticed that I'm having issues with finding files using
> "locate" when those files are on btrfs subvolume mounts.
>
> The issue is that updatedb cannot discern the difference between a
> btrfs bind mount and btrfs subvolume [1][2]. This generally means that
> if you're using btrfs subvolume mounts and updatedb at the same time,
> and you want to index those subvolumes, you'll need to set
> PRUNE_BIND_MOUNTS to 0 or "no". And then deal with all the cruft that
> causes.
>
> From the bug above, you can see that the RedHat dev Michal Sekletar is
> out of ideas. I'm not sure if he's reached out here or not... and if
> not, he might welcome some help from the folks on this list.
>
> Regards,
> Richard
>
> [1] https://bugzilla.redhat.com/show_bug.cgi?id=906591#c3
> [2] http://www.spinics.net/lists/linux-btrfs/msg42510.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: use local btrfs-image in leaf corruption test

2015-04-09 Thread David Sterba
On Wed, Apr 08, 2015 at 03:50:04PM +0100, WorMzy Tykashi wrote:
> Currently this test uses the system btrfs-image. If there isn't a
> btrfs-image on $PATH, the test fails. The test should be using the
> locally compiled btrfs-image, not the system one.

Added your sign-off and applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: incremental full file backups to smaller mediums possible?

2015-04-09 Thread Hugo Mills
On Thu, Apr 09, 2015 at 06:14:33PM +0200, Christoph Anton Mitterer wrote:
> Hey.
> 
> I wondered whether this is possible in btrfs (or could be
> implemented),... it's in a way similar to send/receive, but AFAIU not
> fully solvable with that.
> 
> What I want to do is making incremental backups of a (btrfs) filesystem
> to smaller mediums (that is for example: from a big RAID filesystem to
> many BluRays or similar things).
> 
> 
> Right now I make this as follows:
> Every time backups should be made, I create an ext4 image which would
> just fit inside the UDF fs of a BluRay (the ext4 image in turn is
> dm-crypt encrypted) and copy as many *complete*[0] files from the source
> filesystem to that image as possible. That image file is then burned to
> BluRay. I also write an SHA512 sum of the whole image file to the BluRay
> so that one can check whether it can be still read correctly.
> Then the process continues with the remaining files.
> 
> 
> The main idea behind writing only complete files to the separate parts
> of the backup is, that I don't want to have a backup set, where I may
> get into troubles if just one of the disks fails.
> 
> 
> Obviously, when I would do regular backups that way, then I'd write
> each time a big amount of data which hasn't changed at all, which is not
> just a costly issue, but also quite time consuming.
> So ideally, I'd want to do just an incremental backup of all the files
> that have been added/modified + the information of what has been
> deleted/moved/changed its properties.
> 
> All that already smells quite a lot after making a snapshot at the last
> backup, and then doing send/receive for the next one.
> 
> 
> Now the problem, though, is, that I don't want to lose the property of
> working in terms of complete files (and I guess send/receive does not),
> i.e. when a big file has just one block changed, I still would like to
> have the complete file on the next incremental dump (and not just
> information about the "diff").
> The only exception is when file metadata changes (i.e. pathname, date,
> permissions, xattrs, etc.). If *just* one of these changes I wouldn't
> want to backup the full raw data of the file again.

> Any ideas whether and how this could be done / assisted by btrfs?

   btrfs sub find-new might be more helpful to you here. That will
give you the list of changed files; then just feed that list to your
existing bin-packing algorithm for working out what goes on which
disks, and you're done.

   Hugo.

-- 
Hugo Mills | Dullest spy film ever: The Eastbourne Ultimatum
hugo@... carfax.org.uk |
http://carfax.org.uk/  |
PGP: E2AB1DE4  |   The Thick of It


signature.asc
Description: Digital signature


Re: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting

2015-04-09 Thread David Sterba
On Tue, Apr 07, 2015 at 10:34:01PM -0700, Omar Sandoval wrote:
> Currently, mounting a subvolume with subvolid= takes a different code
> path than mounting with subvol=. This isn't really a big deal except for
> the fact that mounts done with subvolid= or the default subvolume don't
> have a dentry that's connected to the dentry tree like in the subvol=
> case. To unify the code paths, when given subvolid= or using the default
> subvolume ID, translate it into a subvolume name by walking
> ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees.

Can you please split this patch? It's doing several things, but the
core change will probably be a big one. The mount path is not trivial,
all the recursions and argument replacements.

Otherwise, I'm ok with this approach, ie. to set up the dentry at mount
time.

A few comments below.

>  /*
> - * This will strip out the subvol=%s argument for an argument string and add
> - * subvolid=0 to make sure we get the actual tree root for path walking to 
> the
> - * subvol we want.
> + * This will add subvolid=0 to the argument string while removing any subvol=
> + * and subvolid= arguments to make sure we get the top-level root for path
> + * walking to the subvol we want.
>   */
>  static char *setup_root_args(char *args)
>  {
> - unsigned len = strlen(args) + 2 + 1;
> - char *src, *dst, *buf;
> -
> - /*
> -  * We need the same args as before, but with this substitution:
> -  * s!subvol=[^,]+!subvolid=0!
> -  *
> -  * Since the replacement string is up to 2 bytes longer than the
> -  * original, allocate strlen(args) + 2 + 1 bytes.
> -  */
> + char *p, *dst, *buf;

Fix the coding style.

>   root = mount_subtree(mnt, subvol_name);
> + mnt = NULL; /* mount_subtree drops our reference on the vfsmount. */

Put the comment on a separate line.

> + if (!IS_ERR(root) && subvol_objectid &&
> + BTRFS_I(root->d_inode)->root->root_key.objectid != subvol_objectid) 
> {
> + pr_warn("BTRFS: subvol '%s' does not match subvolid %llu\n",
> + subvol_name, subvol_objectid);

We should define the precedence of subvolid and subvol if both are set.
A warning might not be enough.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting

2015-04-09 Thread David Sterba
On Wed, Apr 08, 2015 at 02:06:14PM +0800, Qu Wenruo wrote:
> 
> 
>  Original Message  
> Subject: [PATCH 2/3] Btrfs: unify subvol= and subvolid= mounting
> From: Omar Sandoval 
> To: Chris Mason , Josef Bacik , David Sterba 
> , 
> Date: 2015年04月08日 13:34
> 
> > Currently, mounting a subvolume with subvolid= takes a different code
> > path than mounting with subvol=. This isn't really a big deal except for
> > the fact that mounts done with subvolid= or the default subvolume don't
> > have a dentry that's connected to the dentry tree like in the subvol=
> > case. To unify the code paths, when given subvolid= or using the default
> > subvolume ID, translate it into a subvolume name by walking
> > ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees.
> Oh, this patch is what I have tried long long ago, and want to do the 
> same thing, to show subvolume mount for btrfs.
> 
> But it came to me that, superblock->show_path() is a better method to do it.
> 
> You can implement btrfs_show_path() to allow mountinfo to get the 
> subvolume name from subvolid, and don't change the mount routine much.

The problem I see with the show_path approach is related to the
additional path lookup, memory allocation and locking.

If the mountpoint dentry is the right one, it's just a simple seq_dentry
in show_options.

OTOH, your patch takes subvol_sem that will block the callback if
there's eg. a subvolume being deleted (that takes the write lock). This
is not a lightweight operation nor an infrequent one. There are more
write locks to subvol_sem.

I'm not sure if I've ever sent this comment back to you, sorry if not.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


incremental full file backups to smaller mediums possible?

2015-04-09 Thread Christoph Anton Mitterer
Hey.

I wondered whether this is possible in btrfs (or could be
implemented),... it's in a way similar to send/receive, but AFAIU not
fully solvable with that.

What I want to do is making incremental backups of a (btrfs) filesystem
to smaller mediums (that is for example: from a big RAID filesystem to
many BluRays or similar things).


Right now I make this as follows:
Every time backups should be made, I create an ext4 image which would
just fit inside the UDF fs of a BluRay (the ext4 image in turn is
dm-crypt encrypted) and copy as many *complete*[0] files from the source
filesystem to that image as possible. That image file is then burned to
BluRay. I also write an SHA512 sum of the whole image file to the BluRay
so that one can check whether it can be still read correctly.
Then the process continues with the remaining files.


The main idea behind writing only complete files to the separate parts
of the backup is, that I don't want to have a backup set, where I may
get into troubles if just one of the disks fails.


Obviously, when I would do regular backups that way, then I'd write
each time a big amount of data which hasn't changed at all, which is not
just a costly issue, but also quite time consuming.
So ideally, I'd want to do just an incremental backup of all the files
that have been added/modified + the information of what has been
deleted/moved/changed its properties.

All that already smells quite a lot after making a snapshot at the last
backup, and then doing send/receive for the next one.


Now the problem, though, is, that I don't want to lose the property of
working in terms of complete files (and I guess send/receive does not),
i.e. when a big file has just one block changed, I still would like to
have the complete file on the next incremental dump (and not just
information about the "diff").
The only exception is when file metadata changes (i.e. pathname, date,
permissions, xattrs, etc.). If *just* one of these changes I wouldn't
want to backup the full raw data of the file again.

Another issue is, that send gives me one big file, while I would need
however n files of max size m, where m is e.g. what fits in a BluRay's
UDF... or perhaps on a 3,5" HD Floppy ;)
Because of the "complete files" constraint I cannot just split the file
from btrfs send,... because the split would need to happen so that no
files are split up.


Any ideas whether and how this could be done / assisted by btrfs?


Thanks,
Chris.

[0] Obviously there is a problem, when a single file would be larger
than a BluRay, but so far this hasn't happened to me. If so, I'd of
course need to split the file.


smime.p7s
Description: S/MIME cryptographic signature


btrfs filesystem resize :max not working

2015-04-09 Thread André-Sebastian Liebe
Hey list,

I've got a problem with resizing a multi drive filesystem.
I had a 5 disk array of 4TB drives. Then I added a 6th (6TB) drive to
the array and replaced one of the 4TB ones with a 6TB drive. As you can
see in my `btrfs fi sh` output below, my newly added drive (ID=6) has
the expected size of 5.46TiB, but my replaced drive (ID=4) won't show up
as 6TB as expected. Neither a `btrfs fi res max /data/pool0` nor a
`btrfs fi res 4:max /data/pool0` had any effect.

Any suggestions how to reclaim the 2TB free space of drive 4?


Thanks in advance

André-Sebastian Liebe


> Kernel:Linux apc01 3.19.2-1-ARCH #1 SMP PREEMPT Wed Mar 18 16:21:02
CET 2015 x86_64 GNU/Linux
> btrfs-progs: btrfs-progs v3.19

My 6 drives used by the apc01_pool (none of them has a partition table)
Konsole output
> # hdparm -I /dev/sdb | grep "device size"   
>device size with M = 1024*1024: 3815447 MBytes
>device size with M = 1000*1000: 4000787 MBytes (4000 GB)
> # hdparm -I /dev/sdc | grep "device size"  
>device size with M = 1024*1024: 3815447 MBytes
>device size with M = 1000*1000: 4000787 MBytes (4000 GB)
> # hdparm -I /dev/sdd | grep "device size"  
>device size with M = 1024*1024: 3815447 MBytes
>device size with M = 1000*1000: 4000787 MBytes (4000 GB)
> # hdparm -I /dev/sde | grep "device size"  
>device size with M = 1024*1024: 3815447 MBytes
>device size with M = 1000*1000: 4000787 MBytes (4000 GB)
> # hdparm -I /dev/sdf | grep "device size"  
>device size with M = 1024*1024: 5723166 MBytes
>device size with M = 1000*1000: 6001175 MBytes (6001 GB)
> # hdparm -I /dev/sdg | grep "device size"  
>device size with M = 1024*1024: 5723166 MBytes
>device size with M = 1000*1000: 6001175 MBytes (6001 GB)


btrfs filesystem:
Konsole output
> # btrfs fi sh
> Label: 'apc01_pool0'  uuid: 066141c6-16ca-4a30-b55c-e606b90ad0fb
>Total devices 6 FS bytes used 23.43TiB
>devid1 size 3.64TiB used 3.61TiB path /dev/sdd
>devid2 size 3.64TiB used 3.61TiB path /dev/sdc
>devid3 size 3.64TiB used 3.61TiB path /dev/sde
>devid4 size 3.64TiB used 3.61TiB path /dev/sdg
>devid5 size 3.64TiB used 3.61TiB path /dev/sdb
>devid6 size 5.46TiB used 5.43TiB path /dev/sdf



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] Btrfs: show subvol= and subvolid= in /proc/mounts

2015-04-09 Thread David Sterba
On Tue, Apr 07, 2015 at 10:34:02PM -0700, Omar Sandoval wrote:
> Currently, userspace has no way to know which subvolume is mounted.

Oh, there is a way, 'btrfs inspect-internal rootid /path/to/mount', just
we'd like to see it in the mount options as well.

> But,
> now that we're guaranteed to have a meaningful root dentry, we can just
> export and use seq_dentry() in btrfs_show_options(). The subvolume ID is
> easy to get, so put that in there, too.
> 
> Signed-off-by: Omar Sandoval 
> ---
>  fs/btrfs/super.c | 4 
>  fs/seq_file.c| 1 +
>  2 files changed, 5 insertions(+)
> 
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 5ab9801..5e14bb6 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -1193,6 +1193,10 @@ static int btrfs_show_options(struct seq_file *seq, 
> struct dentry *dentry)
>   seq_puts(seq, ",fatal_errors=panic");
>   if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
>   seq_printf(seq, ",commit=%d", info->commit_interval);
> + seq_puts(seq, ",subvol=");

Please make subvol= the last one, as it can contain any string that
could be confused with other options. Although nobody would probably
call their subvolume "name,autodefrag" etc, the way to obtain the full
path is to either resolve the subvolid, or take the whole text after
"subvol=" to the end of the line.

> + seq_dentry(seq, dentry, " \t\n\\");
> + seq_printf(seq, ",subvolid=%llu",
> +   BTRFS_I(d_inode(dentry))->root->root_key.objectid);
>   return 0;
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Recovering BTRFS from bcache failure.

2015-04-09 Thread Dan Merillat
On Tue, Apr 7, 2015 at 11:40 PM, Dan Merillat  wrote:
> Bcache failures are nasty, because they leave a mix of old and new
> data on the disk.  In this case, there was very little dirty data, but
> of course the tree roots were dirty and out-of-sync.
>
> fileserver:/usr/src/btrfs-progs# ./btrfs --version
> Btrfs v3.18.2
>
> kernel version 3.18
>
> [  572.573566] BTRFS info (device bcache0): enabling auto recovery
> [  572.573619] BTRFS info (device bcache0): disk space caching is enabled
> [  574.266055] BTRFS (device bcache0): parent transid verify failed on
> 7567956930560 wanted 613690 found 613681
> [  574.276952] BTRFS (device bcache0): parent transid verify failed on
> 7567956930560 wanted 613690 found 613681
> [  574.277008] BTRFS: failed to read tree root on bcache0
> [  574.277187] BTRFS (device bcache0): parent transid verify failed on
> 7567956930560 wanted 613690 found 613681
> [  574.277356] BTRFS (device bcache0): parent transid verify failed on
> 7567956930560 wanted 613690 found 613681
> [  574.277398] BTRFS: failed to read tree root on bcache0
> [  574.285955] BTRFS (device bcache0): parent transid verify failed on
> 7567965720576 wanted 613689 found 613694
> [  574.298741] BTRFS (device bcache0): parent transid verify failed on
> 7567965720576 wanted 613689 found 610499
> [  574.298804] BTRFS: failed to read tree root on bcache0
> [  575.047079] BTRFS (device bcache0): bad tree block start 0 7567954464768
> [  575.111495] BTRFS (device bcache0): parent transid verify failed on
> 7567954464768 wanted 613688 found 613685
> [  575.111559] BTRFS: failed to read tree root on bcache0
> [  575.121749] BTRFS (device bcache0): bad tree block start 0 7567954214912
> [  575.131803] BTRFS (device bcache0): parent transid verify failed on
> 7567954214912 wanted 613687 found 613680
> [  575.131866] BTRFS: failed to read tree root on bcache0
> [  575.180101] BTRFS: open_ctree failed
>
> all the btrfs tools throw up their hands with similar errors:
> fileserver:/usr/src/btrfs-progs# btrfs restore /dev/bcache0 -l
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> Ignoring transid failure
> Couldn't setup extent tree
> Couldn't setup device tree
> Could not open root, trying backup super
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> Ignoring transid failure
> Couldn't setup extent tree
> Couldn't setup device tree
> Could not open root, trying backup super
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> Ignoring transid failure
> Couldn't setup extent tree
> Couldn't setup device tree
> Could not open root, trying backup super
>
>
> fileserver:/usr/src/btrfs-progs# ./btrfsck --repair /dev/bcache0
> --init-extent-tree
> enabling repair mode
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> Ignoring transid failure
> Couldn't setup extent tree
> Couldn't setup device tree
> Couldn't open file system
>
> Annoyingly:
> # ./btrfs-image -c9 -t4 -s -w /dev/bcache0 /tmp/test.out
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> parent transid verify failed on 7567956930560 wanted 613690 found 613681
> Ignoring transid failure
> Couldn't setup extent tree
> Open ctree failed
> create failed (Success)
>
> So I can't even send an image for people to look at.

CCing some more people on this one, while this filesystem isn't
important I'd like to know that "restore from backup" isn't the only
option for BTRFS corruption.  All of the tools simply throw up their
hands and bail when confronted with this filesystem, even btrfs-image.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] fstests: test btrfs send after swapping directory names differently

2015-04-09 Thread Filipe Manana
Test btrfs incremental send after renaming and moving directories around in a
way that ends up making a directory have different dentries with the same name
but pointing to different inodes in the parent and send snapshots, and also
inverting the ancestor-descendent relationship between one of those inodes and
some other inode.

Cases like this made an incremental send enter an infinite loop when building
path strings, leading to -ENOMEM errors when the path string reached a length
of PATH_MAX.
This issue was fixed by the following linux kernel btrfs patch:

  Btrfs: incremental send, check if orphanized dir inode needs delayed rename

Signed-off-by: Filipe Manana 
---
 tests/btrfs/090 | 186 
 tests/btrfs/090.out |   2 +
 tests/btrfs/group   |   1 +
 3 files changed, 189 insertions(+)
 create mode 100755 tests/btrfs/090
 create mode 100644 tests/btrfs/090.out

diff --git a/tests/btrfs/090 b/tests/btrfs/090
new file mode 100755
index 000..3eb6f37
--- /dev/null
+++ b/tests/btrfs/090
@@ -0,0 +1,186 @@
+#! /bin/bash
+# FS QA Test No. btrfs/090
+#
+# Test btrfs incremental send after renaming and moving directories around in a
+# way that ends up making a directory have different dentries with the same name
+# but pointing to different inodes in the parent and send snapshots, and also
+# inverting the ancestor-descendent relationship between one of those inodes and
+# some other inode.
+#
+# Cases like this made an incremental send enter an infinite loop when building
+# path strings, leading to -ENOMEM errors when the path string reached a length
+# of PATH_MAX.
+# This issue was fixed by the following linux kernel btrfs patch:
+#
+#   Btrfs: incremental send, check if orphanized dir inode needs delayed rename
+#
+#---
+# Copyright (C) 2015 SUSE Linux Products GmbH. All Rights Reserved.
+# Author: Filipe Manana 
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   rm -fr $send_files_dir
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_require_fssum
+_need_to_be_root
+
+send_files_dir=$TEST_DIR/btrfs-test-$seq
+
+rm -f $seqres.full
+rm -fr $send_files_dir
+mkdir $send_files_dir
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+mkdir -p $SCRATCH_MNT/data/n1/n2
+mkdir $SCRATCH_MNT/data/n4
+mkdir -p $SCRATCH_MNT/data/t6/t7
+mkdir $SCRATCH_MNT/data/t5
+mkdir $SCRATCH_MNT/data/t7
+mkdir $SCRATCH_MNT/data/n4/t2
+mkdir $SCRATCH_MNT/data/t4
+mkdir $SCRATCH_MNT/data/t3
+mv $SCRATCH_MNT/data/t7 $SCRATCH_MNT/data/n4/t2
+mv $SCRATCH_MNT/data/t4 $SCRATCH_MNT/data/n4/t2/t7
+mv $SCRATCH_MNT/data/t5 $SCRATCH_MNT/data/n4/t2/t7/t4
+mv $SCRATCH_MNT/data/t6 $SCRATCH_MNT/data/n4/t2/t7/t4/t5
+mv $SCRATCH_MNT/data/n1/n2 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6
+mv $SCRATCH_MNT/data/n1 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6
+mv $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6/t7 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6/n2
+mv $SCRATCH_MNT/data/t3 $SCRATCH_MNT/data/n4/t2/t7/t4/t5/t6/n2/t7
+
+# Filesystem looks like:
+#
+# .   (ino 256)
+# |-- data/   (ino 257)
+#   |-- n4/   (ino 260)
+#|-- t2/  (ino 265)
+# |-- t7/ (ino 264)
+#  |-- t4/(ino 266)
+#   |-- t5/   (ino 263)
+#|-- t6/  (ino 261)
+# |-- n1/ (ino 258)
+# |-- n2/ (ino 259)
+#  |-- t7/(ino 262)
+#   

[PATCH] Btrfs: incremental send, check if orphanized dir inode needs delayed rename

2015-04-09 Thread Filipe Manana
If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.

For example, for the following reproducer where this is needed (provided
by Robbie Ko):

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt2

  $ mkdir -p /mnt/data/n1/n2
  $ mkdir /mnt/data/n4
  $ mkdir -p /mnt/data/t6/t7
  $ mkdir /mnt/data/t5
  $ mkdir /mnt/data/t7
  $ mkdir /mnt/data/n4/t2
  $ mkdir /mnt/data/t4
  $ mkdir /mnt/data/t3
  $ mv /mnt/data/t7 /mnt/data/n4/t2
  $ mv /mnt/data/t4 /mnt/data/n4/t2/t7
  $ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
  $ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
  $ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
  $ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
  $ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
  $ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7

  $ btrfs subvolume snapshot -r /mnt /mnt/snap1

  $ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
  $ mv /mnt/data/n4/t2 /mnt/data/n4/n1
  $ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
  $ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
  $ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
  $ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
  $ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
  $ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2

  $ btrfs subvolume snapshot -r /mnt /mnt/snap2

  $ btrfs send /mnt/snap1 | btrfs receive /mnt2
  $ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
  ERROR: send ioctl failed with -12: Cannot allocate memory

Where the parent snapshot directory hierarchy is the following:

  .(ino 256)
  |-- data/(ino 257)
|-- n4/(ino 260)
 |-- t2/   (ino 265)
  |-- t7/  (ino 264)
   |-- t4/ (ino 266)
|-- t5/(ino 263)
 |-- t6/   (ino 261)
  |-- n1/  (ino 258)
  |-- n2/  (ino 259)
   |-- t7/ (ino 262)
|-- t3/(ino 267)

And the send snapshot's directory hierarchy is the following:

  .(ino 256)
  |-- data/(ino 257)
|-- n4/(ino 260)
 |-- n1/   (ino 258)
  |-- t2/  (ino 265)
   |-- n2/ (ino 259)
   |-- t3/ (ino 267)
   ||-- t7 (ino 264)
   |
   |-- t6/ (ino 261)
   ||-- t4/(ino 266)
   | |-- t5/   (ino 263)
   |
   |-- t7/ (ino 262)

While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:

  start inode 264, send progress of 265 for example
  parent of 264 -> 267
  parent of 267 -> 262
  parent of 262 -> 259
  parent of 259 -> 261
  parent of 261 -> 263
  parent of 263 -> 266
  parent of 266 -> 264
|--> back to first iteration while current path string length
 is <= PATH_MAX, and fail with -ENOMEM otherwise

So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.

A test case for fstests follows soon.

Thanks to Robbie Ko for providing a reproducer for this problem.

Reported-by: Robbie Ko 
Signed-off-by: Filipe Manana 
---
 fs/btrfs/

Re: BTRFS corruption w/kernel 3.13 while using docker -s btrfs

2015-04-09 Thread Marc MERLIN
Thank you for trying btrfs, it's great for snapshots in docker, but
outside of the 2 helpful comments you already got, do yourself a favour
and use a newer kernel.

Btrfs moves fast, and 3.13 is way too old. The list of bugs (including
corruption bugs) that have been fixed since then is too long to include here.

Try getting something newer than 3.16.2 at least, and if you're building
your own, 3.19.3 + this patch
http://permalink.gmane.org/gmane.comp.file-systems.btrfs/42241
(btrfs: simplify insert_orphan_item / 381cf6587f8a8a8e981bc0c18859b51dc756 )

Cheers,
Marc
-- 
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: unstable atimes on empty dirs in read-only snapshots which were subvol parents

2015-04-09 Thread Liu Bo
On Fri, Mar 06, 2015 at 12:03:59PM +1100, Paul Harvey wrote:
> Apparently in my haste, forgot to include any information in my email.
> This is also in the URL to the gist of my test script:
> 
> btrfs v3.17
> Linux 3.16.0-4-amd64 #1 SMP Debian 3.16.7-ckt4-3 (2015-02-03) x86_64 GNU/Linux
> 
> # mount: /dev/loop2 mounted on /tmp/tmp.FkcW7fRde7.
> # Showing mount:
> # /tmp/tmp.1fbwCCdeNM on /tmp/tmp.FkcW7fRde7 type btrfs 
> (rw,noatime,space_cache)
> # Create subvolume '/tmp/tmp.FkcW7fRde7/subvol'
> # mkdir: created directory ‘/tmp/tmp.FkcW7fRde7/snapshots’
> # mkdir: created directory ‘/tmp/tmp.FkcW7fRde7/empty_dir’
> # Create a readonly snapshot of '/tmp/tmp.FkcW7fRde7' in
> '/tmp/tmp.FkcW7fRde7/snapshots/1'
> # Testing that the subvol dir has stable atime on original parent FS:
> # Testing that '/tmp/tmp.FkcW7fRde7/subvol' has repeatable atime of
> # 2015-03-06T11:09:40+1100...
> # 1:  2015-03-06T11:09:40+1100
> # 2:  2015-03-06T11:09:40+1100
> # PASS /tmp/tmp.FkcW7fRde7/subvol atime is stable :)
> # Testing that a normal empty dir has stable atime on the snapshot:
> # Testing that '/tmp/tmp.FkcW7fRde7/snapshots/1/empty_dir' has
> repeatable atime of
> # 2015-03-06T11:09:40+1100...
> # 1:  2015-03-06T11:09:40+1100
> # 2:  2015-03-06T11:09:40+1100
> # PASS /tmp/tmp.FkcW7fRde7/snapshots/1/empty_dir atime is stable :)
> # Testing that the subvol dir has stable atime on snapshot of parent FS:
> # Testing that '/tmp/tmp.FkcW7fRde7/snapshots/1/subvol' has repeatable atime 
> of
> # 2015-03-06T11:09:48+1100...
> # 1:  2015-03-06T11:09:50+1100
> # 2:  2015-03-06T11:09:52+1100
> # FAIL /tmp/tmp.FkcW7fRde7/snapshots/1/subvol atime is unstable :(
> # './btrfs-atime-bug.sh nocleanup' not specified so cleaning up our mess:
> # umount: /tmp/tmp.FkcW7fRde7 unmounted
> # rmdir: removing directory, ‘/tmp/tmp.FkcW7fRde7’
> # removed ‘/tmp/tmp.1fbwCCdeNM’

Right, that's intended behaviour, because we'd like to avoid problems similar
to those of hardlinks, that is, to allow ONLY one valid access to 'subvol'.
Here btrfs makes a pseudo 'subvol', setting inode->atime/ctime/mtime to
CURRENT_TIME; that's why we see it's unstable.

Thanks,

-liubo
> 
> On 6 March 2015 at 11:29, Paul Harvey  wrote:
> > Hi there,
> >
> > Apologies for not confirming on a much more recent kernel, if anyone
> > could please try my test script for me on a newer kernel that would be
> > very much appreciated.
> >
> > I'm working on reproducible builds, and part of this workflow involves
> > tar archiving parts of read-only btrfs snapshots. Problem is, some of
> > these tar archives are different from run to run when they capture an
> > empty directory that happened to be a subvol parent on the original
> > FS: the atimes on these empty dirs are always returning the current
> > time - which is not the case with an ordinary empty directory created
> > with mkdir; it's also not the same behaviour on the original FS (tar
> > archives are reproducible if we use the original FS rather than the
> > read-only snapshot). This all happens regardless of mounting noatime.
> >
> > Perhaps this verbiage is convoluted, I'm writing this in a hurry with
> > limited internet connectivity - I have a reproducible test case here
> > at https://gist.github.com/csirac2/c2b5b2b9d0193b3c08a8
> >
> > Again, I understand this is a pretty old kernel and perhaps this is
> > fixed by now, I'll try a more recent kernel with more assertive bug
> > report next week if nobody has time to try out my test case.
> >
> > Cheers
> >
> > --
> > Paul Harvey
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: snapshot space use

2015-04-09 Thread Piotr Szymaniak
On Thu, Apr 09, 2015 at 08:45:21AM +0800, Qu Wenruo wrote:
> *snip*
> NOTE: quota is not so stable and has some problem, but should give
> you enough info.

Are those related to actually using quota, or can they also hit you when you
want to use it just for things like this snapshot space use?


Piotr Szymaniak.
-- 
  - Oo,  jesteś  bystrzejszy,  niż  się  wydaje.  Przechodziłeś  jakieś
szkolenie antyterrorystyczne?
  - W pewnym sensie tak. Byłem żonaty.
 -- Nelson DeMille, "The Lion's Game"


signature.asc
Description: Digital signature


Re: BTRFS corruption w/kernel 3.13 while using docker -s btrfs

2015-04-09 Thread Filipe David Manana
On Sun, May 18, 2014 at 6:28 AM, Paul Harvey  wrote:
> This is a "Damnit! I held the power button in and now it won't mount!"
> story, but I'm sharing what I found to learn what I can. And in case it's
> useful for btrfs development.
>
> Also curious if my usage of Docker (an LXC thing, http://docker.io) has
> exacerbated things. I ask because it seems some btrfs mount options are
> breaking docker:
> https://github.com/dotcloud/docker/issues/5429#issuecomment-42443919 -
> however, I'm not using space_cache or inode_cache myself.
>
> Notes:
> - I've taken a dd copy of the btrfs partition so that I can perform anything
> extra people would like to see.
> - The following output is taken from the same machine, same partition layout
> but a fresh/working Debian testing install on a different disk. It should be
> reporting the same as it did before the corruption (it was definitely Linux
> 3.13).
> - /dev/sda is a new disk, /dev/sdb3 is the corrupted btrfs partition.
> - I have a single btrfs partition which contains the rootfs. I use
> subvolumes to separately snapshot /home, /vms, etc.
> - mount options were: noatime,autodefrag,discard,compress=lzo

Don't use the discard option on this kernel release. There are 2 known
bugs that made btrfs issue a discard on extents when it shouldn't,
which results in those messages you are seeing "btrfs bad tree block
start 0 xxx" (the key part is the 0 following the word start).
The fixes are:

1) 
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=678886bdc6378c1cbd5072da2c5a3035000214e3
Happens both with -o discard and without it if you run fstrim
after the fs went into readonly mode. This is included in most stable
releases afaik, but certainly not in the kernel version you are using.
The btrfs-zero-log tool won't help here;

2) 
https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/?id=dcc82f4783ad91d4ab654f89f37ae9291cdc846a
   Happens only with -o discard, and affects only the fsync log. You
can get your fs mountable again after using btrfs-zero-log (but losing
all data fsynced in the transaction that was open when a crash/reboot
happened). Not yet in any kernel release, only on linux-next at the
moment.

>
> $ uname -a
> Linux weatherwax 3.13-1-amd64 #1 SMP Debian 3.13.10-1 (2014-04-15)
> x86_64 GNU/Linux
> $ sudo btrfs version
> Btrfs v3.14.1
> $ sudo btrfs fi show /dev/sdb3
> Label: 'weatherwax 0'  uuid: 721926c3-147a-44a0-8c82-62534dd6ee94
>  Total devices 1 FS bytes used 246.14GiB
>  devid1 size 357.63GiB used 313.06GiB path /dev/sdb3
>
> Btrfs v3.14.1
> $ dmesg # see attached dmesg.txt, includes connecting corrupted disk via USB
> $ sudo mount -o recovery,ro /dev/sdb3 /mnt
> mount: wrong fs type, bad option, bad superblock on /dev/sdb3,
> missing codepage or helper program, or other error
> In some cases useful info is found in syslog - try
> dmesg | tail  or so
> [ 1297.632057] btrfs: device label weatherwax 0 devid 1 transid 504075
> /dev/sdb3
> [ 1297.633042] btrfs: enabling auto recovery
> [ 1297.633045] btrfs: disk space caching is enabled
> [ 1297.830793] btrfs bad tree block start 0 632081858560
> [ 1297.831012] btrfs bad tree block start 0 632081858560
> [ 1297.831016] btrfs: failed to read log tree
> [ 1297.892241] btrfs: open_ctree failed
> $ sudo btrfs-find-root /dev/sdb3
> Super think's the tree root is at 632081670144, chunk root 386274426880
> Went past the fs size, exiting
> $ sudo btrfs-image -c9 -t4 /dev/sdb3 /mnt/sdb3.img
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> read block failed check_tree_block
> Couldn't setup log root tree
> Open ctree failed
> create failed (Success)
> $ du /mnt/sdb3.img
> 0   /mnt/sdb3.img
> $ sudo btrfs rescue chunk-recover /dev/sdb3 # Takes a few hours at
> 30MiB/sec, reports 326 (IIRC) good chunks, no bad or orphaned chunks.
> $ sudo btrfs restore -i /dev/sdb3 /mnt # recovers ~3.2GB of /var from
> the old rootfs, nothing useful. A lot of "failed to inflate: -6"
> $ sudo btrfs restore -l /dev/sdb3
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> read block failed check_tree_block
> Couldn't setup log root tree
>   tree key (EXTENT_TREE ROOT_ITEM 0) 632081240064 level 3
>   tree key (DEV_TREE ROOT_ITEM 0) 631709724672 level 1
>   tree key (FS_TREE ROOT_ITEM 0) 632081072128 level 3
>   tree key (CSUM_TREE ROOT_ITEM 0) 632079880192 level 3
>   tree key (UUID_TREE ROOT_ITEM 0) 632070057984 level 0
>   tree key (262 ROOT_ITEM 0) 632079863808 level 3
>   tree key (263 ROOT_ITEM 0) 5308496773

Re: snapshot space use

2015-04-09 Thread Qu Wenruo



 Original Message  
Subject: Re: snapshot space use
From: Piotr Szymaniak 
To: Qu Wenruo 
Date: 2015年04月09日 17:02


On Thu, Apr 09, 2015 at 08:45:21AM +0800, Qu Wenruo wrote:

*snip*
NOTE: quota is not so stable and has some problem, but should give
you enough info.


Those are related to actually using quota or can also hit you when you
want to use it just for things like this snapshot space use?
Using quota has more problems. For example, exceeding the quota can make you
unable to delete any file (like a deadlock).


Your use case, just showing how much space you used, may still hit some
unusual problems like inaccurate numbers or even negative numbers, along
with a performance drop.

But overall, it should be OK and meets your need.

Thanks,
Qu



Piotr Szymaniak.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC] fscache/cachefiles versus btrfs

2015-04-09 Thread David Howells
NeilBrown  wrote:

>  Is there a better way?  Could a better way be created?  Maybe
>  SEEK_DATA_RELIABLE ??

fiemap() maybe?

> Also, if you do try to use fscache on btrfs with 3.19, then nothing gets
> cached (as expected) and with a heavy load you can lose a race and get an
> asserting fail in fscache_enqueue_operation

Do you have the patches here applied?


http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=fscache-fixes

David
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS corruption w/kernel 3.13 while using docker -s btrfs

2015-04-09 Thread Liu Bo
Hi,

On Sun, May 18, 2014 at 03:28:18PM +1000, Paul Harvey wrote:
> This is a "Damnit! I held the power button in and now it won't mount!" story, 
> but I'm sharing what I found to learn what I can. And in case it's useful for 
> btrfs development.
> 
> Also curious if my usage of Docker (an LXC thing, http://docker.io) has 
> exacerbated things. I ask because it seems some btrfs mount options are 
> breaking docker:
> https://github.com/dotcloud/docker/issues/5429#issuecomment-42443919 - 
> however, I'm not using space_cache or inode_cache myself.
> 
> Notes:
> - I've taken a dd copy of the btrfs partition so that I can perform anything 
> extra people would like to see.
> - The following output is taken from the same machine, same partition layout 
> but a fresh/working Debian testing install on a different disk. It should be 
> reporting the same as it did before the corruption (it was definitely Linux 
> 3.13).
> - /dev/sda is a new disk, /dev/sdb3 is the corrupted btrfs partition.
> - I have a single btrfs partition which contains the rootfs. I use subvolumes 
> to separately snapshot /home, /vms, etc.
> - mount options were: noatime,autodefrag,discard,compress=lzo
> 
> $ uname -a
> Linux weatherwax 3.13-1-amd64 #1 SMP Debian 3.13.10-1 (2014-04-15)
> x86_64 GNU/Linux
> $ sudo btrfs version
> Btrfs v3.14.1
> $ sudo btrfs fi show /dev/sdb3
> Label: 'weatherwax 0'  uuid: 721926c3-147a-44a0-8c82-62534dd6ee94
>  Total devices 1 FS bytes used 246.14GiB
>  devid1 size 357.63GiB used 313.06GiB path /dev/sdb3
> 
> Btrfs v3.14.1
> $ dmesg # see attached dmesg.txt, includes connecting corrupted disk via USB
> $ sudo mount -o recovery,ro /dev/sdb3 /mnt
> mount: wrong fs type, bad option, bad superblock on /dev/sdb3,
> missing codepage or helper program, or other error
> In some cases useful info is found in syslog - try
> dmesg | tail  or so
> [ 1297.632057] btrfs: device label weatherwax 0 devid 1 transid 504075 
> /dev/sdb3
> [ 1297.633042] btrfs: enabling auto recovery
> [ 1297.633045] btrfs: disk space caching is enabled
> [ 1297.830793] btrfs bad tree block start 0 632081858560
> [ 1297.831012] btrfs bad tree block start 0 632081858560
> [ 1297.831016] btrfs: failed to read log tree
> [ 1297.892241] btrfs: open_ctree failed

This shows that your trees are good except for the log tree. Since btrfs
would zero the log tree and clear the free space cache anyway if it used the
backup trees for recovery, I recommend using btrfs-zero-log.

Thanks,

-liubo

> $ sudo btrfs-find-root /dev/sdb3
> Super think's the tree root is at 632081670144, chunk root 386274426880
> Went past the fs size, exiting
> $ sudo btrfs-image -c9 -t4 /dev/sdb3 /mnt/sdb3.img
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> read block failed check_tree_block
> Couldn't setup log root tree
> Open ctree failed
> create failed (Success)
> $ du /mnt/sdb3.img
> 0   /mnt/sdb3.img
> $ sudo btrfs rescue chunk-recover /dev/sdb3 # Takes a few hours at
> 30MiB/sec, reports 326 (IIRC) good chunks, no bad or orphaned chunks.
> $ sudo btrfs restore -i /dev/sdb3 /mnt # recovers ~3.2GB of /var from
> the old rootfs, nothing useful. A lot of "failed to inflate: -6"
> $ sudo btrfs restore -l /dev/sdb3
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> read block failed check_tree_block
> Couldn't setup log root tree
>   tree key (EXTENT_TREE ROOT_ITEM 0) 632081240064 level 3
>   tree key (DEV_TREE ROOT_ITEM 0) 631709724672 level 1
>   tree key (FS_TREE ROOT_ITEM 0) 632081072128 level 3
>   tree key (CSUM_TREE ROOT_ITEM 0) 632079880192 level 3
>   tree key (UUID_TREE ROOT_ITEM 0) 632070057984 level 0
>   tree key (262 ROOT_ITEM 0) 632079863808 level 3
>   tree key (263 ROOT_ITEM 0) 530849677312 level 2
>   tree key (264 ROOT_ITEM 0) 632081104896 level 3
>   tree key (265 ROOT_ITEM 0) 632070590464 level 3
>   tree key (577 ROOT_ITEM 0) 633083752448 level 2
> # ... ~80 more lines of similar output
> $ sudo btrfsck /dev/sdb3 # exits quickly, < 2s
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> Check tree block failed, want=632081858560, have=0
> read block failed check_tree_block
> Couldn't setup log root tree
> Checking filesystem on /dev/sdb3
> UUID: 721926c3-147a-44a0-8c82-62534dd6ee94
> checking extents
> btrfsck[5378]: segfault at 1d8 ip 004176d4 sp 7fff50ed58a0 error 
> 4 in btrfsck[40+57000] # from dmesg
> $ sudo btrfsck --backup /de

[PATCH/RFC] fscache/cachefiles versus btrfs

2015-04-09 Thread NeilBrown

hi,
 fscache cannot currently be used with btrfs as the backing store for the
 cache (managed by cachefilesd).
 This is because cachefiles needs the ->bmap address_space_operation, and
 btrfs doesn't provide it.

 cachefiles only uses this to find out if a particular page is a 'hole' or
 not.  For btrfs, this can be done with 'SEEK_DATA'.

 Unfortunately it doesn't seem to be possible to query a filesystem or a file
 to see if SEEK_DATA is reliable or not, so we cannot simply use SEEK_DATA
 when reliable, else ->bmap if available.

 The following patch makes fscache work for me on btrfs.  It explicitly checks
 for BTRFS_SUPER_MAGIC.  Not really a nice solution, but all I could think of.

 Is there a better way?  Could a better way be created?  Maybe
 SEEK_DATA_RELIABLE ??

 Comments, suggestions welcome.


Also, if you do try to use fscache on btrfs with 3.19, then nothing gets
cached (as expected) and with a heavy load you can lose a race and get an
assertion failure in fscache_enqueue_operation

ASSERT(fscache_object_is_available(op->object));

It looks like the object is being killed before it is available...

[  859.700765] kernel BUG at ../fs/fscache/operation.c:38!
...
[  859.703124] Call Trace:
[  859.703193]  [] fscache_run_op.isra.4+0x34/0x80 [fscache]
[  859.703260]  [] fscache_start_operations+0xa0/0xf0 
[fscache]
[  859.703388]  [] fscache_kill_object+0x98/0xc0 [fscache]
[  859.703455]  [] fscache_object_work_func+0x151/0x210 
[fscache]
[  859.703578]  [] process_one_work+0x147/0x3c0
[  859.703642]  [] worker_thread+0x20c/0x470

I haven't figured out the cause of that yet.


Thanks,
NeilBrown




diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 1e51714eb33e..1389d8483d5d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "internal.h"
 
 #define CACHEFILES_KEYBUF_SIZE 512
@@ -647,7 +648,8 @@ lookup_again:
 
ret = -EPERM;
aops = object->dentry->d_inode->i_mapping->a_ops;
-   if (!aops->bmap)
+   if (!aops->bmap &&
+   object->dentry->d_sb->s_magic != BTRFS_SUPER_MAGIC)
goto check_error;
 
object->backer = object->dentry;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c6cd8d7a4eef..49fb330c0ab8 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -410,11 +410,11 @@ int cachefiles_read_or_alloc_page(struct 
fscache_retrieval *op,
 
inode = object->backer->d_inode;
ASSERT(S_ISREG(inode->i_mode));
-   ASSERT(inode->i_mapping->a_ops->bmap);
ASSERT(inode->i_mapping->a_ops->readpages);
 
/* calculate the shift required to use bmap */
-   if (inode->i_sb->s_blocksize > PAGE_SIZE)
+   if (inode->i_mapping->a_ops->bmap &&
+   inode->i_sb->s_blocksize > PAGE_SIZE)
goto enobufs;
 
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -423,20 +423,36 @@ int cachefiles_read_or_alloc_page(struct 
fscache_retrieval *op,
op->op.flags |= FSCACHE_OP_ASYNC;
op->op.processor = cachefiles_read_copier;
 
-   /* we assume the absence or presence of the first block is a good
-* enough indication for the page as a whole
-* - TODO: don't use bmap() for this as it is _not_ actually good
-*   enough for this as it doesn't indicate errors, but it's all we've
-*   got for the moment
-*/
-   block0 = page->index;
-   block0 <<= shift;
-
-   block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
-   _debug("%llx -> %llx",
-  (unsigned long long) block0,
-  (unsigned long long) block);
+   if (inode->i_mapping->a_ops->bmap) {
+   /* we assume the absence or presence of the first block is a 
good
+* enough indication for the page as a whole
+* - TODO: don't use bmap() for this as it is _not_ actually 
good
+*   enough for this as it doesn't indicate errors, but it's 
all we've
+*   got for the moment
+*/
+   block0 = page->index;
+   block0 <<= shift;
 
+   block = inode->i_mapping->a_ops->bmap(inode->i_mapping, block0);
+   _debug("%llx -> %llx",
+  (unsigned long long) block0,
+  (unsigned long long) block);
+   } else {
+   /* Use llseek */
+   struct path path;
+   struct file *file;
+   path.mnt = cache->mnt;
+   path.dentry = object->backer;
+   file = dentry_open(&path, O_RDONLY, cache->cache_cred);
+   if (IS_ERR(file))
+   goto enobufs;
+   block = vfs_llseek(file, page->index << PAGE_SHIFT, SEEK_DATA);
+   filp_close(file, NULL);
+