Re: [PATCH 1/4] vfs: introduce function to map unique ino/dev pairs

2018-07-31 Thread Amir Goldstein
On Wed, Aug 1, 2018 at 2:21 AM, Mark Fasheh  wrote:
>
> On Tue, Jul 31, 2018 at 02:10:42PM -0700, Mark Fasheh wrote:
>> diff --git a/fs/stat.c b/fs/stat.c
>> index f8e6fb2c3657..80ea42505219 100644
>> --- a/fs/stat.c
>> +++ b/fs/stat.c
>> @@ -84,6 +84,29 @@ int vfs_getattr_nosec(const struct path *path, struct 
>> kstat *stat,
>>  }
>>  EXPORT_SYMBOL(vfs_getattr_nosec);
>>
>> +void vfs_map_unique_ino_dev(struct dentry *dentry, u64 *ino, dev_t *dev)
>> +{
>> + int ret;
>> + struct path path;
>> + struct kstat stat;
>> +
>> + path.mnt = NULL;
>> + path.dentry = dentry;
>> + /* ->dev is always returned, so we only need to specify ino here */
>> + ret = vfs_getattr_nosec(, , STATX_INO, 0);
>> + if (ret) {
>> + struct inode *inode = d_inode(dentry);
>> + /* Fallback to old behavior in case of getattr error */
>> + *ino = inode->i_ino;
>> + *dev = inode->i_sb->s_dev;
>> + return ret;
>
> Ooof, attached is a version of this patch which doesn't try to return a value
> from a void function. Apologies for the extra e-mail.
>
> From Mark Fasheh 
>
> [PATCH 1/4] vfs: introduce function to map unique ino/dev pairs
>
> There are places in the VFS where we export an ino/dev pair to userspace.
> /proc/PID/maps is a good example - we directly expose inode->i_ino and
> inode->i_sb->s_dev to userspace there.  Many filesystems don't put a unique
> value in inode->i_ino and instead rely on ->getattr to provide the real
> inode number to userspace.  super->s_dev is similar - some filesystems
> expose a difference device from what's put in super->s_dev when queried via
> stat/statx.
>
> Ultimately this makes it impossible for a user (or software) to match one of
> those reported pairs to the right inode.
>
> We can fix this by adding a helper function, vfs_map_unique_ino_dev(), which
> will query the owning filesystem (via ->getattr) to get the correct ino/dev
> pair.  Later patches will update those places which simply dump inode->i_ino
> and super->s_dev to use the helper.
>
> Signed-off-by: Mark Fasheh 
> ---
>  fs/stat.c  | 22 ++
>  include/linux/fs.h |  2 ++
>  2 files changed, 24 insertions(+)
>
> diff --git a/fs/stat.c b/fs/stat.c
> index f8e6fb2c3657..20c72d618ed5 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -84,6 +84,28 @@ int vfs_getattr_nosec(const struct path *path, struct 
> kstat *stat,
>  }
>  EXPORT_SYMBOL(vfs_getattr_nosec);
>
> +void vfs_map_unique_ino_dev(struct dentry *dentry, u64 *ino, dev_t *dev)

I find this function name a bit more than function can guaranty.
It's just a fancy wrapper around ->getattr()
How about vfs_get_ino_dev() ?

Thanks,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] locks: map correct ino/dev pairs when exporting to userspace

2018-07-31 Thread Amir Goldstein
On Wed, Aug 1, 2018 at 12:10 AM, Mark Fasheh  wrote:
> /proc/locks does not always print the correct inode number/device pair.
> Update lock_get_status() to use vfs_map_unique_ino_dev() to get the real,
> unique values for userspace.
>
> Signed-off-by: Mark Fasheh 
> ---
>  fs/locks.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/fs/locks.c b/fs/locks.c
> index db7b6917d9c5..3a012df87fd8 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -2621,6 +2621,7 @@ static void lock_get_status(struct seq_file *f, struct 
> file_lock *fl,
> loff_t id, char *pfx)
>  {
> struct inode *inode = NULL;
> +   struct dentry *dentry;
> unsigned int fl_pid;
> struct pid_namespace *proc_pidns = 
> file_inode(f->file)->i_sb->s_fs_info;
>
> @@ -2633,8 +2634,10 @@ static void lock_get_status(struct seq_file *f, struct 
> file_lock *fl,
> if (fl_pid == 0)
> return;
>
> -   if (fl->fl_file != NULL)
> +   if (fl->fl_file != NULL) {
> inode = locks_inode(fl->fl_file);
> +   dentry = file_dentry(fl->fl_file);
> +   }
>
> seq_printf(f, "%lld:%s ", id, pfx);
> if (IS_POSIX(fl)) {
> @@ -2681,10 +2684,13 @@ static void lock_get_status(struct seq_file *f, 
> struct file_lock *fl,
>: (fl->fl_type == F_WRLCK) ? "WRITE" : "READ 
> ");
> }
> if (inode) {
> +   __u64 ino;
> +   dev_t dev;
> +
> +   vfs_map_unique_ino_dev(dentry, , );
> /* userspace relies on this representation of dev_t */
> seq_printf(f, "%d %02x:%02x:%ld ", fl_pid,
> -   MAJOR(inode->i_sb->s_dev),
> -   MINOR(inode->i_sb->s_dev), inode->i_ino);
> +   MAJOR(dev), MINOR(dev), inode->i_ino);

Don't you mean ,ino); ?

Thanks,
Amir.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Unmountable root partition

2018-07-31 Thread Chris Murphy
On Tue, Jul 31, 2018 at 12:03 PM, Cerem Cem ASLAN  wrote:

> 3. mount -t btrfs /dev/mapper/foo--vg-root /mnt/foo
> Gives the following error:
>
> mount: wrong fs type, bad option, bad superblock on ...
>
> 4. dmesg | tail
> Outputs the following:
>
>
> [17755.840916] sd 3:0:0:0: [sda] tag#0 FAILED Result:
> hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
> [17755.840919] sd 3:0:0:0: [sda] tag#0 CDB: Read(10) 28 00 00 07 c0 02 00 00
> 02 00
> [17755.840921] blk_update_request: I/O error, dev sda, sector 507906
> [17755.840941] EXT4-fs (dm-4): unable to read superblock


Are you sure this is the output for the command? Because you're
explicitly asking for type btrfs, which fails, and then the kernel
reports EXT4 superblock unreadable. What do you get if you omit -t
btrfs and just let it autodetect?

But yeah, this is an IO error from the device and there's nothing
Btrfs can do about that unless there is DUP or raid1+ metadata
available.

Is it possible this LV was accidentally reformatted ext4?


-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Unmountable root partition

2018-07-31 Thread Cerem Cem ASLAN
That might be the case. Can't we recover anything with some data loss?

2018-08-01 2:04 GMT+03:00 Hans van Kranenburg <
hans.van.kranenb...@mendix.com>:

> Hi,
>
> On 07/31/2018 08:03 PM, Cerem Cem ASLAN wrote:
> > Hi,
> >
> > I'm having trouble with my server setup, which contains a BTRFS root
> > partition on top of LVM on top of LUKS partition.
> >
> > Yesterday server was shut down unexpectedly. I booted the system with a
> > pendrive which contains Debian 4.9.18 and tried to mount the BTRFS root
> > partition manually.
> >
> > 1. cryptsetup luksOpen /dev/sda5
> >
> > Seems to decrypt the partition (there are no errors)
> >
> > 2. /dev/mapper/foo--vg-root and /dev/mapper/foo--vg-swap_1 is created
> > automatically, so I suppose LVM works correctly.
> >
> > 3. mount -t btrfs /dev/mapper/foo--vg-root /mnt/foo
> > Gives the following error:
> >
> > mount: wrong fs type, bad option, bad superblock on ...
> >
> > 4. dmesg | tail
> > Outputs the following:
> >
> >
> > [17755.840916] sd 3:0:0:0: [sda] tag#0 FAILED Result:
> > hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
> > [17755.840919] sd 3:0:0:0: [sda] tag#0 CDB: Read(10) 28 00 00 07 c0
> > 02 00 00 02 00
> > [17755.840921] blk_update_request: I/O error, dev sda, sector 507906
> > [17755.840941] EXT4-fs (dm-4): unable to read superblock
> > [18140.052300] sd 3:0:0:0: [sda] tag#0 FAILED Result:
> > hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
> > [18140.052305] sd 3:0:0:0: [sda] tag#0 CDB: ATA command pass
> > through(16) 85 06 20 00 00 00 00 00 00 00 00 00 00 00 e5 00
> > [18142.991851] sd 3:0:0:0: [sda] tag#0 FAILED Result:
> > hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
> > [18142.991856] sd 3:0:0:0: [sda] tag#0 CDB: Read(10) 28 00 00 07 c0
> > 80 00 00 08 00
> > [18142.991860] blk_update_request: I/O error, dev sda, sector 508032
> > [18142.991869] Buffer I/O error on dev dm-4, logical block 16, async
> > page read
>
> This points at hardware level failure, regardless if the disk would hold
> a btrfs or ext4 or whatever other kind of filesystem. The disk itself
> cannot read data back when we ask for it.
>
> > 4.
> >
> > # btrfs restore -i -D /dev/mapper/foo--vg-root /dev/null
> > No valid Btrfs found on /dev/mapper/foo--vg-root
> > Could not open root, trying backup super
> > No valid Btrfs found on /dev/mapper/foo--vg-root
> > Could not open root, trying backup super
> > No valid Btrfs found on /dev/mapper/foo--vg-root
> > Could not open root, trying backup super
> >
> > We are pretty sure that no unexpected electric cuts has been happened.
> >
> > At this point I don't know what information I should supply.
> >
>
>
> --
> Hans van Kranenburg
>


BTRFS and databases

2018-07-31 Thread MegaBrutal
Hi all,

I know it's a decade-old question, but I'd like to hear your thoughts
of today. By now, I became a heavy BTRFS user. Almost everywhere I use
BTRFS, except in situations when it is obvious there is no benefit
(e.g. /var/log, /boot). At home, all my desktop, laptop and server
computers are mainly running on BTRFS with only a few file systems on
ext4. I even installed BTRFS in corporate productive systems (in those
cases, the systems were mainly on ext4; but there were some specific
file systems those exploited BTRFS features).

But there is still one question that I can't get over: if you store a
database (e.g. MySQL), would you prefer having a BTRFS volume mounted
with nodatacow, or would you just simply use ext4?

I know that with nodatacow, I take away most of the benefits of BTRFS
(those are actually hurting database performance – the exact CoW
nature that is elsewhere a blessing, with databases it's a drawback).
But are there any advantages of still sticking to BTRFS for a database
albeit CoW is disabled, or should I just return to the old and
reliable ext4 for those applications?


Kind regards,
MegaBrutal
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/7] Some unused parameter cleanup

2018-07-31 Thread Lu Fengqi
PATCH 1-2 remove all unused fs_info parameter from delayed-inode.c
PATCH 3-5 remove all unused fs_info parameter from root-tree.c
PATCH 6-7 some cleanup for btrfs_unlink_subvol

Lu Fengqi (7):
  btrfs: Remove fs_info from btrfs_insert_delayed_dir_index
  btrfs: Remove fs_info from btrfs_delete_delayed_dir_index
  btrfs: Remove fs_info from btrfs_del_root
  btrfs: Remove fs_info from btrfs_del_root_ref
  btrfs: Remove fs_info from btrfs_add_root_ref
  btrfs: Remove root parameter from btrfs_unlink_subvol
  btrfs: Remove redundant btrfs_release_path from btrfs_unlink_subvol

 fs/btrfs/ctree.h   | 16 +++
 fs/btrfs/delayed-inode.c   | 10 --
 fs/btrfs/delayed-inode.h   |  2 --
 fs/btrfs/dir-item.c|  4 ++--
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/free-space-tree.c |  2 +-
 fs/btrfs/inode.c   | 41 +++---
 fs/btrfs/ioctl.c   |  3 +--
 fs/btrfs/qgroup.c  |  2 +-
 fs/btrfs/root-tree.c   | 22 ++--
 fs/btrfs/transaction.c |  2 +-
 11 files changed, 44 insertions(+), 62 deletions(-)

-- 
2.18.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/7] btrfs: Remove fs_info from btrfs_del_root_ref

2018-07-31 Thread Lu Fengqi
It can be referenced from the passed transaction handle.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h | 7 +++
 fs/btrfs/inode.c | 8 +++-
 fs/btrfs/root-tree.c | 9 -
 3 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ef47a171e2f..3677082ddf4c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2988,10 +2988,9 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
   struct btrfs_fs_info *fs_info,
   u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
   const char *name, int name_len);
-int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
-  u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
-  const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+  u64 ref_id, u64 dirid, u64 *sequence, const char *name,
+  int name_len);
 int btrfs_del_root(struct btrfs_trans_handle *trans,
   const struct btrfs_key *key);
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root 
*root,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59b3ea32b6de..7bf4a8d07e1e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4085,7 +4085,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle 
*trans,
struct inode *dir, u64 objectid,
const char *name, int name_len)
 {
-   struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
@@ -4118,9 +4117,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle 
*trans,
}
btrfs_release_path(path);
 
-   ret = btrfs_del_root_ref(trans, fs_info, objectid,
-root->root_key.objectid, dir_ino,
-, name, name_len);
+   ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
+dir_ino, , name, name_len);
if (ret < 0) {
if (ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
@@ -6439,7 +6437,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
u64 local_index;
int err;
-   err = btrfs_del_root_ref(trans, fs_info, key.objectid,
+   err = btrfs_del_root_ref(trans, key.objectid,
 root->root_key.objectid, parent_ino,
 _index, name, name_len);
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f7c14c454f91..52fa133ab53c 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -341,13 +341,12 @@ int btrfs_del_root(struct btrfs_trans_handle *trans,
return ret;
 }
 
-int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
-  u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
-  const char *name, int name_len)
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+  u64 ref_id, u64 dirid, u64 *sequence, const char *name,
+  int name_len)
 
 {
-   struct btrfs_root *tree_root = fs_info->tree_root;
+   struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_path *path;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
-- 
2.18.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/7] btrfs: Remove root parameter from btrfs_unlink_subvol

2018-07-31 Thread Lu Fengqi
It can be fetched by BTRFS_I(dir)->root.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/inode.c | 25 ++---
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 407d068d4208..b8c131b82978 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4081,10 +4081,10 @@ static int btrfs_unlink(struct inode *dir, struct 
dentry *dentry)
 }
 
 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root,
-   struct inode *dir, u64 objectid,
-   const char *name, int name_len)
+  struct inode *dir, u64 objectid,
+  const char *name, int name_len)
 {
+   struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
@@ -4335,10 +4335,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct 
dentry *dentry)
 
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
 
-   ret = btrfs_unlink_subvol(trans, root, dir,
-   dest->root_key.objectid,
-   dentry->d_name.name,
-   dentry->d_name.len);
+   ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid,
+ dentry->d_name.name, dentry->d_name.len);
if (ret) {
err = ret;
btrfs_abort_transaction(trans, ret);
@@ -4433,7 +4431,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry 
*dentry)
return PTR_ERR(trans);
 
if (unlikely(btrfs_ino(BTRFS_I(inode)) == 
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
-   err = btrfs_unlink_subvol(trans, root, dir,
+   err = btrfs_unlink_subvol(trans, dir,
  BTRFS_I(inode)->location.objectid,
  dentry->d_name.name,
  dentry->d_name.len);
@@ -9505,8 +9503,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
-   ret = btrfs_unlink_subvol(trans, root, old_dir,
- root_objectid,
+   ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
  old_dentry->d_name.name,
  old_dentry->d_name.len);
} else { /* src is an inode */
@@ -9525,8 +9522,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
-   ret = btrfs_unlink_subvol(trans, dest, new_dir,
- root_objectid,
+   ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
  new_dentry->d_name.name,
  new_dentry->d_name.len);
} else { /* dest is an inode */
@@ -9786,7 +9782,7 @@ static int btrfs_rename(struct inode *old_dir, struct 
dentry *old_dentry,
 
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
-   ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+   ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
old_dentry->d_name.name,
old_dentry->d_name.len);
} else {
@@ -9808,8 +9804,7 @@ static int btrfs_rename(struct inode *old_dir, struct 
dentry *old_dentry,
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
root_objectid = BTRFS_I(new_inode)->location.objectid;
-   ret = btrfs_unlink_subvol(trans, dest, new_dir,
-   root_objectid,
+   ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
new_dentry->d_name.name,
new_dentry->d_name.len);
BUG_ON(new_inode->i_nlink == 0);
-- 
2.18.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/7] btrfs: Remove fs_info from btrfs_delete_delayed_dir_index

2018-07-31 Thread Lu Fengqi
It can be referenced from the passed transaction handle.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-inode.c | 6 +++---
 fs/btrfs/delayed-inode.h | 1 -
 fs/btrfs/inode.c | 4 ++--
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5d103eda1874..f51b509f2d9b 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1493,7 +1493,6 @@ static int btrfs_delete_delayed_insertion_item(struct 
btrfs_fs_info *fs_info,
 }
 
 int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
   struct btrfs_inode *dir, u64 index)
 {
struct btrfs_delayed_node *node;
@@ -1509,7 +1508,8 @@ int btrfs_delete_delayed_dir_index(struct 
btrfs_trans_handle *trans,
item_key.type = BTRFS_DIR_INDEX_KEY;
item_key.offset = index;
 
-   ret = btrfs_delete_delayed_insertion_item(fs_info, node, _key);
+   ret = btrfs_delete_delayed_insertion_item(trans->fs_info, node,
+ _key);
if (!ret)
goto end;
 
@@ -1531,7 +1531,7 @@ int btrfs_delete_delayed_dir_index(struct 
btrfs_trans_handle *trans,
mutex_lock(>mutex);
ret = __btrfs_add_delayed_deletion_item(node, item);
if (unlikely(ret)) {
-   btrfs_err(fs_info,
+   btrfs_err(trans->fs_info,
  "err add delayed dir index item(index: %llu) into the 
deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
  index, node->root->objectid, node->inode_id, ret);
BUG();
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 44558ededcf5..33536cd681d4 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -92,7 +92,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle 
*trans,
   u64 index);
 
 int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
   struct btrfs_inode *dir, u64 index);
 
 int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 472457795486..59b3ea32b6de 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3978,7 +3978,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle 
*trans,
goto err;
}
 skip_backref:
-   ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
+   ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
@@ -4144,7 +4144,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle 
*trans,
}
btrfs_release_path(path);
 
-   ret = btrfs_delete_delayed_dir_index(trans, fs_info, BTRFS_I(dir), 
index);
+   ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
-- 
2.18.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/7] btrfs: Remove fs_info from btrfs_insert_delayed_dir_index

2018-07-31 Thread Lu Fengqi
It can be referenced from the passed transaction handle.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-inode.c | 4 +---
 fs/btrfs/delayed-inode.h | 1 -
 fs/btrfs/dir-item.c  | 4 ++--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 596d2af0c8aa..5d103eda1874 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1418,7 +1418,6 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info 
*fs_info)
 
 /* Will return 0 or -ENOMEM */
 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
   const char *name, int name_len,
   struct btrfs_inode *dir,
   struct btrfs_disk_key *disk_key, u8 type,
@@ -1458,11 +1457,10 @@ int btrfs_insert_delayed_dir_index(struct 
btrfs_trans_handle *trans,
 */
BUG_ON(ret);
 
-
mutex_lock(_node->mutex);
ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
if (unlikely(ret)) {
-   btrfs_err(fs_info,
+   btrfs_err(trans->fs_info,
  "err add delayed dir index item(name: %.*s) into the 
insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
  name_len, name, delayed_node->root->objectid,
  delayed_node->inode_id, ret);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index ca7a97f3ab6b..44558ededcf5 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -86,7 +86,6 @@ static inline void btrfs_init_delayed_root(
 }
 
 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
   const char *name, int name_len,
   struct btrfs_inode *dir,
   struct btrfs_disk_key *disk_key, u8 type,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 39e9766d1cbd..a678b07fcf01 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -160,8 +160,8 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 
struct btrfs_root
}
btrfs_release_path(path);
 
-   ret2 = btrfs_insert_delayed_dir_index(trans, root->fs_info, name,
-   name_len, dir, _key, type, index);
+   ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir,
+ _key, type, index);
 out_free:
btrfs_free_path(path);
if (ret)
-- 
2.18.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/7] btrfs: Remove fs_info from btrfs_add_root_ref

2018-07-31 Thread Lu Fengqi
It can be referenced from the passed transaction handle.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   | 7 +++
 fs/btrfs/inode.c   | 3 +--
 fs/btrfs/ioctl.c   | 3 +--
 fs/btrfs/root-tree.c   | 9 -
 fs/btrfs/transaction.c | 2 +-
 5 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3677082ddf4c..c275ea258f9a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2984,10 +2984,9 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info 
*fs_info,
 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
 
 /* root-item.c */
-int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
-  u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
-  const char *name, int name_len);
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+  u64 ref_id, u64 dirid, u64 sequence, const char *name,
+  int name_len);
 int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
   u64 ref_id, u64 dirid, u64 *sequence, const char *name,
   int name_len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7bf4a8d07e1e..407d068d4208 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6385,7 +6385,6 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
   const char *name, int name_len, int add_backref, u64 index)
 {
-   struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = parent_inode->root;
@@ -6401,7 +6400,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
}
 
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
-   ret = btrfs_add_root_ref(trans, fs_info, key.objectid,
+   ret = btrfs_add_root_ref(trans, key.objectid,
 root->root_key.objectid, parent_ino,
 index, name, name_len);
} else if (add_backref) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 80fe3c654612..6eaadddaca9f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -698,8 +698,7 @@ static noinline int create_subvol(struct inode *dir,
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
 
-   ret = btrfs_add_root_ref(trans, fs_info,
-objectid, root->root_key.objectid,
+   ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
 btrfs_ino(BTRFS_I(dir)), index, name, namelen);
BUG_ON(ret);
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 52fa133ab53c..65bda0682928 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -412,12 +412,11 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, 
u64 root_id,
  *
  * Will return 0, -ENOMEM, or anything from the CoW path
  */
-int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info,
-  u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
-  const char *name, int name_len)
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
+  u64 ref_id, u64 dirid, u64 sequence, const char *name,
+  int name_len)
 {
-   struct btrfs_root *tree_root = fs_info->tree_root;
+   struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
int ret;
struct btrfs_path *path;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index aec208cbff00..001ed1bc2aa8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1573,7 +1573,7 @@ static noinline int create_pending_snapshot(struct 
btrfs_trans_handle *trans,
/*
 * insert root back/forward references
 */
-   ret = btrfs_add_root_ref(trans, fs_info, objectid,
+   ret = btrfs_add_root_ref(trans, objectid,
 parent_root->root_key.objectid,
 btrfs_ino(BTRFS_I(parent_inode)), index,
 dentry->d_name.name, dentry->d_name.len);
-- 
2.18.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/7] btrfs: Remove fs_info from btrfs_del_root

2018-07-31 Thread Lu Fengqi
It can be referenced from the passed transaction handle.

Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/free-space-tree.c | 2 +-
 fs/btrfs/qgroup.c  | 2 +-
 fs/btrfs/root-tree.c   | 4 ++--
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2e32584c635f..9ef47a171e2f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2993,7 +2993,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
   u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
   const char *name, int name_len);
 int btrfs_del_root(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info, const struct btrfs_key *key);
+  const struct btrfs_key *key);
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root 
*root,
  const struct btrfs_key *key,
  struct btrfs_root_item *item);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fcaf598f4222..042dd4186fb8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9028,7 +9028,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
if (err)
goto out_end_trans;
 
-   ret = btrfs_del_root(trans, fs_info, >root_key);
+   ret = btrfs_del_root(trans, >root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
err = ret;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index b5950aacd697..d6736595ec57 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1236,7 +1236,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info 
*fs_info)
if (ret)
goto abort;
 
-   ret = btrfs_del_root(trans, fs_info, _space_root->root_key);
+   ret = btrfs_del_root(trans, _space_root->root_key);
if (ret)
goto abort;
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index dd30c5921dcd..2ba29f0609d9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1087,7 +1087,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
goto end_trans;
}
 
-   ret = btrfs_del_root(trans, fs_info, _root->root_key);
+   ret = btrfs_del_root(trans, _root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto end_trans;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index c451285976ac..f7c14c454f91 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -320,9 +320,9 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
 
 /* drop the root item for 'key' from the tree root */
 int btrfs_del_root(struct btrfs_trans_handle *trans,
-  struct btrfs_fs_info *fs_info, const struct btrfs_key *key)
+  const struct btrfs_key *key)
 {
-   struct btrfs_root *root = fs_info->tree_root;
+   struct btrfs_root *root = trans->fs_info->tree_root;
struct btrfs_path *path;
int ret;
 
-- 
2.18.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 4/6] btrfs: Introduce mount time chunk <-> dev extent mapping check

2018-07-31 Thread Su Yue




On 08/01/2018 10:37 AM, Qu Wenruo wrote:

This patch will introduce chunk <-> dev extent mapping check, to protect
us against invalid dev extents or chunks.

Since chunk mapping is the fundamental infrastructure of btrfs, extra
check at mount time could prevent a lot of unexpected behavior (BUG_ON).

Reported-by: Xu Wen 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200403
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200407
Signed-off-by: Qu Wenruo 


LGTM.
Reviewed-by: Su Yue 


---
  fs/btrfs/disk-io.c |   7 ++
  fs/btrfs/volumes.c | 183 +
  fs/btrfs/volumes.h |   2 +
  3 files changed, 192 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 205092dc9390..068ca7498e94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3075,6 +3075,13 @@ int open_ctree(struct super_block *sb,
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
  
+	ret = btrfs_verify_dev_extents(fs_info);

+   if (ret) {
+   btrfs_err(fs_info,
+ "failed to verify dev extents against chunks: %d",
+ ret);
+   goto fail_block_groups;
+   }
ret = btrfs_recover_balance(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to recover balance: %d", ret);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e6a8e4aabc66..467a589854fa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6440,6 +6440,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, 
struct btrfs_key *key,
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = btrfs_chunk_type(leaf, chunk);
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+   map->verified_stripes = 0;
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7295,3 +7296,185 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info 
*fs_info)
fs_devices = fs_devices->seed;
}
  }
+
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+{
+   int index = btrfs_bg_flags_to_raid_index(type);
+   int ncopies = btrfs_raid_array[index].ncopies;
+   int data_stripes;
+
+   switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+   case BTRFS_BLOCK_GROUP_RAID5:
+   data_stripes = num_stripes - 1;
+   break;
+   case BTRFS_BLOCK_GROUP_RAID6:
+   data_stripes = num_stripes - 2;
+   break;
+   default:
+   data_stripes = num_stripes / ncopies;
+   break;
+   }
+   return div_u64(chunk_len, data_stripes);
+}
+static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
+u64 chunk_offset, u64 devid,
+u64 physical_offset, u64 physical_len)
+{
+   struct extent_map_tree *em_tree = _info->mapping_tree.map_tree;
+   struct extent_map *em;
+   struct map_lookup *map;
+   u64 stripe_len;
+   bool found = false;
+   int ret = 0;
+   int i;
+
+   read_lock(_tree->lock);
+   em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+   read_unlock(_tree->lock);
+
+   if (!em) {
+   ret = -EUCLEAN;
+   btrfs_err(fs_info,
+   "dev extent (%llu, %llu) doesn't have corresponding chunk",
+ devid, physical_offset);
+   goto out;
+   }
+
+   map = em->map_lookup;
+   stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+   if (physical_len != stripe_len) {
+   btrfs_err(fs_info,
+"dev extent (%llu, %llu) length doesn't match with chunk %llu, have %llu expect 
%llu",
+ devid, physical_offset, em->start, physical_len,
+ stripe_len);
+   ret = -EUCLEAN;
+   goto out;
+   }
+
+   for (i = 0; i < map->num_stripes; i++) {
+   if (map->stripes[i].dev->devid == devid &&
+   map->stripes[i].physical == physical_offset) {
+   found = true;
+   if (map->verified_stripes >= map->num_stripes) {
+   btrfs_err(fs_info,
+   "too many dev extent for chunk %llu is detected",
+ em->start);
+   ret = -EUCLEAN;
+   goto out;
+   }
+   map->verified_stripes++;
+   break;
+   }
+   }
+   if (!found) {
+   ret = -EUCLEAN;
+   btrfs_err(fs_info,
+   "dev extent (%llu, %llu) has no corresponding chunk",
+   devid, physical_offset);
+   }
+out:
+   free_extent_map(em);
+  

Re: [PATCH v2 6/6] btrfs: locking: Allow btrfs_tree_lock() to return error to avoid deadlock

2018-07-31 Thread Su Yue




On 08/01/2018 10:37 AM, Qu Wenruo wrote:

[BUG]
For certains crafted image, whose csum root leaf has missing backref, if
we try to trigger write with data csum, it could cause deadlock with the
following kernel WARN_ON():
--
WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400
CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8
Workqueue: btrfs-endio-write btrfs_endio_write_helper
RIP: 0010:btrfs_tree_lock+0x3e2/0x400
Call Trace:
  btrfs_alloc_tree_block+0x39f/0x770
  __btrfs_cow_block+0x285/0x9e0
  btrfs_cow_block+0x191/0x2e0
  btrfs_search_slot+0x492/0x1160
  btrfs_lookup_csum+0xec/0x280
  btrfs_csum_file_blocks+0x2be/0xa60
  add_pending_csums+0xaf/0xf0
  btrfs_finish_ordered_io+0x74b/0xc90
  finish_ordered_fn+0x15/0x20
  normal_work_helper+0xf6/0x500
  btrfs_endio_write_helper+0x12/0x20
  process_one_work+0x302/0x770
  worker_thread+0x81/0x6d0
  kthread+0x180/0x1d0
  ret_from_fork+0x35/0x40
---[ end trace 2e85051acb5f6dc1 ]---
--

[CAUSE]
That crafted image has missing backref for csum tree root leaf.
And when we try to allocate new tree block, since there is no
EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot
and use it.

However we have already locked csum tree root leaf by ourselves, thus we
will ourselves to release read/write lock, and causing deadlock.

[FIX]
This patch will allow btrfs_tree_lock() to return error number, so
most callers can exit gracefully.
(Some caller doesn't really expect btrfs_tree_lock() to return error,
and in that case, we use BUG_ON())

Since such problem can only happen in crafted image, we will still
trigger kernel warning, but with a little more meaningful warning
message.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405
Reported-by: Xu Wen 
Signed-off-by: Qu Wenruo 


Reviewed-by: Su Yue 


---
  fs/btrfs/ctree.c   | 57 +++---
  fs/btrfs/extent-tree.c | 28 +++
  fs/btrfs/extent_io.c   |  8 --
  fs/btrfs/free-space-tree.c |  4 ++-
  fs/btrfs/locking.c | 13 +++--
  fs/btrfs/locking.h |  2 +-
  fs/btrfs/qgroup.c  |  4 ++-
  fs/btrfs/relocation.c  | 13 +++--
  fs/btrfs/tree-log.c| 14 --
  9 files changed, 115 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc326df472e..6e695f4cd931 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -161,10 +161,12 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root 
*root)
  struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
  {
struct extent_buffer *eb;
+   int ret;
  
  	while (1) {

eb = btrfs_root_node(root);
-   btrfs_tree_lock(eb);
+   ret = btrfs_tree_lock(eb);
+   BUG_ON(ret < 0);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
@@ -1584,6 +1586,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
int close = 1;
+   int ret;
  
  		btrfs_node_key(parent, _key, i);

if (!progress_passed && comp_keys(_key, progress) < 0)
@@ -1637,7 +1640,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (search_start == 0)
search_start = last_block;
  
-		btrfs_tree_lock(cur);

+   ret = btrfs_tree_lock(cur);
+   if (ret < 0) {
+   free_extent_buffer(cur);
+   return ret;
+   }
btrfs_set_lock_blocking(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
, search_start,
@@ -1853,7 +1860,11 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
goto enospc;
}
  
-		btrfs_tree_lock(child);

+   ret = btrfs_tree_lock(child);
+   if (ret < 0) {
+   free_extent_buffer(child);
+   goto enospc;
+   }
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, );
if (ret) {
@@ -1891,7 +1902,12 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
left = NULL;
  
  	if (left) {

-   btrfs_tree_lock(left);
+   ret = btrfs_tree_lock(left);
+   if (ret < 0) {
+   free_extent_buffer(left);
+   left = NULL;
+   goto enospc;
+   }
btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
   parent, pslot - 1, );
@@ -1906,7 +1922,12 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
  

Re: [PATCH v2 1/6] btrfs: Check each block group has corresponding chunk at mount time

2018-07-31 Thread Su Yue




On 08/01/2018 10:37 AM, Qu Wenruo wrote:

A crafted btrfs with incorrect chunk<->block group mapping, it could leads
to a lot of unexpected behavior.

Although the crafted image can be catched by block group item checker
added in "[PATCH] btrfs: tree-checker: Verify block_group_item", if one
crafted a valid enough block group item which can pass above check but
still mismatch with existing chunk, it could cause a lot of undefined
behavior.

This patch will add extra block group -> chunk mapping check, to ensure
we have a completely matching (start, len, flags) chunk for each block
group at mount time.

Here we reuse the original find_first_block_group(), which is already
doing basic bg -> chunk check, adding more check on start/len and type
flags.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199837
Reported-by: Xu Wen 
Signed-off-by: Qu Wenruo 


Reviewed-by: Su Yue 


---
  fs/btrfs/extent-tree.c | 29 -
  1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d9fe58c0080..63a6b5d36ac1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9717,6 +9717,8 @@ static int find_first_block_group(struct btrfs_fs_info 
*fs_info,
int ret = 0;
struct btrfs_key found_key;
struct extent_buffer *leaf;
+   struct btrfs_block_group_item bg;
+   u64 flags;
int slot;
  
  	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);

@@ -9751,8 +9753,33 @@ static int find_first_block_group(struct btrfs_fs_info 
*fs_info,
"logical %llu len %llu found bg but no related chunk",
  found_key.objectid, found_key.offset);
ret = -ENOENT;
+   } else if (em->start != found_key.objectid ||
+  em->len != found_key.offset) {
+   btrfs_err(fs_info,
+   "block group %llu len %llu mismatch with chunk %llu len %llu",
+ found_key.objectid, found_key.offset,
+ em->start, em->len);
+   ret = -EUCLEAN;
} else {
-   ret = 0;
+   read_extent_buffer(leaf, ,
+   btrfs_item_ptr_offset(leaf, slot),
+   sizeof(bg));
+   flags = btrfs_block_group_flags() &
+   BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+   if (flags != (em->map_lookup->type &
+ BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+   btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 
0x%llx",
+   found_key.objectid,
+   found_key.offset,
+   flags,
+   (BTRFS_BLOCK_GROUP_TYPE_MASK &
+em->map_lookup->type));
+   ret = -EUCLEAN;
+   } else {
+   ret = 0;
+   }
}
free_extent_map(em);
goto out;




--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 6/6] btrfs: locking: Allow btrfs_tree_lock() to return error to avoid deadlock

2018-07-31 Thread Qu Wenruo
[BUG]
For certains crafted image, whose csum root leaf has missing backref, if
we try to trigger write with data csum, it could cause deadlock with the
following kernel WARN_ON():
--
WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400
CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8
Workqueue: btrfs-endio-write btrfs_endio_write_helper
RIP: 0010:btrfs_tree_lock+0x3e2/0x400
Call Trace:
 btrfs_alloc_tree_block+0x39f/0x770
 __btrfs_cow_block+0x285/0x9e0
 btrfs_cow_block+0x191/0x2e0
 btrfs_search_slot+0x492/0x1160
 btrfs_lookup_csum+0xec/0x280
 btrfs_csum_file_blocks+0x2be/0xa60
 add_pending_csums+0xaf/0xf0
 btrfs_finish_ordered_io+0x74b/0xc90
 finish_ordered_fn+0x15/0x20
 normal_work_helper+0xf6/0x500
 btrfs_endio_write_helper+0x12/0x20
 process_one_work+0x302/0x770
 worker_thread+0x81/0x6d0
 kthread+0x180/0x1d0
 ret_from_fork+0x35/0x40
---[ end trace 2e85051acb5f6dc1 ]---
--

[CAUSE]
That crafted image has missing backref for csum tree root leaf.
And when we try to allocate new tree block, since there is no
EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot
and use it.

However we have already locked csum tree root leaf by ourselves, thus we
will ourselves to release read/write lock, and causing deadlock.

[FIX]
This patch will allow btrfs_tree_lock() to return error number, so
most callers can exit gracefully.
(Some caller doesn't really expect btrfs_tree_lock() to return error,
and in that case, we use BUG_ON())

Since such problem can only happen in crafted image, we will still
trigger kernel warning, but with a little more meaningful warning
message.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405
Reported-by: Xu Wen 
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/ctree.c   | 57 +++---
 fs/btrfs/extent-tree.c | 28 +++
 fs/btrfs/extent_io.c   |  8 --
 fs/btrfs/free-space-tree.c |  4 ++-
 fs/btrfs/locking.c | 13 +++--
 fs/btrfs/locking.h |  2 +-
 fs/btrfs/qgroup.c  |  4 ++-
 fs/btrfs/relocation.c  | 13 +++--
 fs/btrfs/tree-log.c| 14 --
 9 files changed, 115 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc326df472e..6e695f4cd931 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -161,10 +161,12 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root 
*root)
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 {
struct extent_buffer *eb;
+   int ret;
 
while (1) {
eb = btrfs_root_node(root);
-   btrfs_tree_lock(eb);
+   ret = btrfs_tree_lock(eb);
+   BUG_ON(ret < 0);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
@@ -1584,6 +1586,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
int close = 1;
+   int ret;
 
btrfs_node_key(parent, _key, i);
if (!progress_passed && comp_keys(_key, progress) < 0)
@@ -1637,7 +1640,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (search_start == 0)
search_start = last_block;
 
-   btrfs_tree_lock(cur);
+   ret = btrfs_tree_lock(cur);
+   if (ret < 0) {
+   free_extent_buffer(cur);
+   return ret;
+   }
btrfs_set_lock_blocking(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
, search_start,
@@ -1853,7 +1860,11 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
goto enospc;
}
 
-   btrfs_tree_lock(child);
+   ret = btrfs_tree_lock(child);
+   if (ret < 0) {
+   free_extent_buffer(child);
+   goto enospc;
+   }
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, );
if (ret) {
@@ -1891,7 +1902,12 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
left = NULL;
 
if (left) {
-   btrfs_tree_lock(left);
+   ret = btrfs_tree_lock(left);
+   if (ret < 0) {
+   free_extent_buffer(left);
+   left = NULL;
+   goto enospc;
+   }
btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
   parent, pslot - 1, );
@@ -1906,7 +1922,12 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
right = NULL;
 
if (right) {
-   

[PATCH v2 5/6] btrfs: Exit gracefully when failed to add chunk map

2018-07-31 Thread Qu Wenruo
It's completely possible that a crafted btrfs image contains overlapping
chunks.

Although we can't detect such problem by tree-checker, but it's not a
catastrophic problem, current extent map can already detect such problem
and return -EEXIST.

We just only need to exit gracefully so btrfs can refuse to mount the
fs.

Reported-by: Xu Wen 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200409
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/volumes.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 467a589854fa..8c281c1e7f36 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6477,10 +6477,14 @@ static int read_one_chunk(struct btrfs_fs_info 
*fs_info, struct btrfs_key *key,
write_lock(_tree->map_tree.lock);
ret = add_extent_mapping(_tree->map_tree, em, 0);
write_unlock(_tree->map_tree.lock);
-   BUG_ON(ret); /* Tree corruption */
+   if (ret < 0) {
+   btrfs_err(fs_info,
+ "failed to add chunk map, start=%llu len=%llu: %d",
+ em->start, em->len, ret);
+   }
free_extent_map(em);
 
-   return 0;
+   return ret;
 }
 
 static void fill_device_from_item(struct extent_buffer *leaf,
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/6] btrfs: Introduce mount time chunk <-> dev extent mapping check

2018-07-31 Thread Qu Wenruo
This patch will introduce chunk <-> dev extent mapping check, to protect
us against invalid dev extents or chunks.

Since chunk mapping is the fundamental infrastructure of btrfs, extra
check at mount time could prevent a lot of unexpected behavior (BUG_ON).

Reported-by: Xu Wen 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200403
Link: https://bugzilla.kernel.org/show_bug.cgi?id=200407
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/disk-io.c |   7 ++
 fs/btrfs/volumes.c | 183 +
 fs/btrfs/volumes.h |   2 +
 3 files changed, 192 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 205092dc9390..068ca7498e94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3075,6 +3075,13 @@ int open_ctree(struct super_block *sb,
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
 
+   ret = btrfs_verify_dev_extents(fs_info);
+   if (ret) {
+   btrfs_err(fs_info,
+ "failed to verify dev extents against chunks: %d",
+ ret);
+   goto fail_block_groups;
+   }
ret = btrfs_recover_balance(fs_info);
if (ret) {
btrfs_err(fs_info, "failed to recover balance: %d", ret);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e6a8e4aabc66..467a589854fa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6440,6 +6440,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, 
struct btrfs_key *key,
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
map->type = btrfs_chunk_type(leaf, chunk);
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+   map->verified_stripes = 0;
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -7295,3 +7296,185 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info 
*fs_info)
fs_devices = fs_devices->seed;
}
 }
+
+static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
+{
+   int index = btrfs_bg_flags_to_raid_index(type);
+   int ncopies = btrfs_raid_array[index].ncopies;
+   int data_stripes;
+
+   switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+   case BTRFS_BLOCK_GROUP_RAID5:
+   data_stripes = num_stripes - 1;
+   break;
+   case BTRFS_BLOCK_GROUP_RAID6:
+   data_stripes = num_stripes - 2;
+   break;
+   default:
+   data_stripes = num_stripes / ncopies;
+   break;
+   }
+   return div_u64(chunk_len, data_stripes);
+}
+static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
+u64 chunk_offset, u64 devid,
+u64 physical_offset, u64 physical_len)
+{
+   struct extent_map_tree *em_tree = _info->mapping_tree.map_tree;
+   struct extent_map *em;
+   struct map_lookup *map;
+   u64 stripe_len;
+   bool found = false;
+   int ret = 0;
+   int i;
+
+   read_lock(_tree->lock);
+   em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+   read_unlock(_tree->lock);
+
+   if (!em) {
+   ret = -EUCLEAN;
+   btrfs_err(fs_info,
+   "dev extent (%llu, %llu) doesn't have corresponding chunk",
+ devid, physical_offset);
+   goto out;
+   }
+
+   map = em->map_lookup;
+   stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
+   if (physical_len != stripe_len) {
+   btrfs_err(fs_info,
+"dev extent (%llu, %llu) length doesn't match with chunk %llu, have %llu 
expect %llu",
+ devid, physical_offset, em->start, physical_len,
+ stripe_len);
+   ret = -EUCLEAN;
+   goto out;
+   }
+
+   for (i = 0; i < map->num_stripes; i++) {
+   if (map->stripes[i].dev->devid == devid &&
+   map->stripes[i].physical == physical_offset) {
+   found = true;
+   if (map->verified_stripes >= map->num_stripes) {
+   btrfs_err(fs_info,
+   "too many dev extent for chunk %llu is detected",
+ em->start);
+   ret = -EUCLEAN;
+   goto out;
+   }
+   map->verified_stripes++;
+   break;
+   }
+   }
+   if (!found) {
+   ret = -EUCLEAN;
+   btrfs_err(fs_info,
+   "dev extent (%llu, %llu) has no corresponding chunk",
+   devid, physical_offset);
+   }
+out:
+   free_extent_map(em);
+   return ret;
+}
+
+static int verify_chunk_dev_extent_mapping(struct 

[PATCH v2 1/6] btrfs: Check each block group has corresponding chunk at mount time

2018-07-31 Thread Qu Wenruo
A crafted btrfs with incorrect chunk<->block group mapping, it could leads
to a lot of unexpected behavior.

Although the crafted image can be catched by block group item checker
added in "[PATCH] btrfs: tree-checker: Verify block_group_item", if one
crafted a valid enough block group item which can pass above check but
still mismatch with existing chunk, it could cause a lot of undefined
behavior.

This patch will add extra block group -> chunk mapping check, to ensure
we have a completely matching (start, len, flags) chunk for each block
group at mount time.

Here we reuse the original find_first_block_group(), which is already
doing basic bg -> chunk check, adding more check on start/len and type
flags.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199837
Reported-by: Xu Wen 
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/extent-tree.c | 29 -
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d9fe58c0080..63a6b5d36ac1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9717,6 +9717,8 @@ static int find_first_block_group(struct btrfs_fs_info 
*fs_info,
int ret = 0;
struct btrfs_key found_key;
struct extent_buffer *leaf;
+   struct btrfs_block_group_item bg;
+   u64 flags;
int slot;
 
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
@@ -9751,8 +9753,33 @@ static int find_first_block_group(struct btrfs_fs_info 
*fs_info,
"logical %llu len %llu found bg but no related chunk",
  found_key.objectid, found_key.offset);
ret = -ENOENT;
+   } else if (em->start != found_key.objectid ||
+  em->len != found_key.offset) {
+   btrfs_err(fs_info,
+   "block group %llu len %llu mismatch with chunk %llu len %llu",
+ found_key.objectid, found_key.offset,
+ em->start, em->len);
+   ret = -EUCLEAN;
} else {
-   ret = 0;
+   read_extent_buffer(leaf, ,
+   btrfs_item_ptr_offset(leaf, slot),
+   sizeof(bg));
+   flags = btrfs_block_group_flags() &
+   BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+   if (flags != (em->map_lookup->type &
+ BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+   btrfs_err(fs_info,
+"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 
0x%llx",
+   found_key.objectid,
+   found_key.offset,
+   flags,
+   (BTRFS_BLOCK_GROUP_TYPE_MASK &
+em->map_lookup->type));
+   ret = -EUCLEAN;
+   } else {
+   ret = 0;
+   }
}
free_extent_map(em);
goto out;
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/6] btrfs: Enhanced validation check for fuzzed images

2018-07-31 Thread Qu Wenruo
The branch can be fetched from the following git repo:
https://github.com/adam900710/linux/tree/tree_checker_enhance

It's based on v4.18-rc1, with 3 patches already merged into misc-next.

This patchset introduced the following enhanced validation check:
1) chunk/block group/dev extent cross check
   Unlike extent tree, such cross check can be implemented pretty easy
   with minimal mount time impact.
   Now the kernel could do chunk/bg/dev extent check as good as btrfs
   check.

2) Locking test to avoid possible deadlock due to extent tree corruption
   Unfortunately, for extent tree we can't do really much cross check.
   Instead we use the selftest from btrfs_tree_lock() to detect and
   avoid deadlock caused by corrupted extent tree.

The 3rd patch "btrfs: Remove unused function btrfs_account_dev_extents_size()"
has also been merged into misc-next.

changelog:
v2:
  Added reviewed-by tags from Gu and Nikolay.
  Address comment from David for the 4th patch
  Address comment from Gu for the 2nd patch.

Qu Wenruo (6):
  btrfs: Check each block group has corresponding chunk at mount time
  btrfs: Verify every chunk has corresponding block group at mount time
  btrfs: Remove unused function btrfs_account_dev_extents_size()
  btrfs: Introduce mount time chunk <-> dev extent mapping check
  btrfs: Exit gracefully when failed to add chunk map
  btrfs: locking: Allow btrfs_tree_lock() to return error to avoid
deadlock

 fs/btrfs/ctree.c   |  57 ++--
 fs/btrfs/disk-io.c |   7 +
 fs/btrfs/extent-tree.c | 114 +--
 fs/btrfs/extent_io.c   |   8 +-
 fs/btrfs/free-space-tree.c |   4 +-
 fs/btrfs/locking.c |  13 +-
 fs/btrfs/locking.h |   2 +-
 fs/btrfs/qgroup.c  |   4 +-
 fs/btrfs/relocation.c  |  13 +-
 fs/btrfs/tree-log.c|  14 +-
 fs/btrfs/volumes.c | 276 +
 fs/btrfs/volumes.h |   4 +-
 12 files changed, 397 insertions(+), 119 deletions(-)

-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/6] btrfs: Remove unused function btrfs_account_dev_extents_size()

2018-07-31 Thread Qu Wenruo
This function is never used by any one in kernel, just remove it.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/volumes.c | 85 --
 fs/btrfs/volumes.h |  2 --
 2 files changed, 87 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b33bf29130b6..e6a8e4aabc66 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1259,91 +1259,6 @@ int btrfs_scan_one_device(const char *path, fmode_t 
flags, void *holder,
return ret;
 }
 
-/* helper to account the used device space in the range */
-int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
-  u64 end, u64 *length)
-{
-   struct btrfs_key key;
-   struct btrfs_root *root = device->fs_info->dev_root;
-   struct btrfs_dev_extent *dev_extent;
-   struct btrfs_path *path;
-   u64 extent_end;
-   int ret;
-   int slot;
-   struct extent_buffer *l;
-
-   *length = 0;
-
-   if (start >= device->total_bytes ||
-   test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state))
-   return 0;
-
-   path = btrfs_alloc_path();
-   if (!path)
-   return -ENOMEM;
-   path->reada = READA_FORWARD;
-
-   key.objectid = device->devid;
-   key.offset = start;
-   key.type = BTRFS_DEV_EXTENT_KEY;
-
-   ret = btrfs_search_slot(NULL, root, , path, 0, 0);
-   if (ret < 0)
-   goto out;
-   if (ret > 0) {
-   ret = btrfs_previous_item(root, path, key.objectid, key.type);
-   if (ret < 0)
-   goto out;
-   }
-
-   while (1) {
-   l = path->nodes[0];
-   slot = path->slots[0];
-   if (slot >= btrfs_header_nritems(l)) {
-   ret = btrfs_next_leaf(root, path);
-   if (ret == 0)
-   continue;
-   if (ret < 0)
-   goto out;
-
-   break;
-   }
-   btrfs_item_key_to_cpu(l, , slot);
-
-   if (key.objectid < device->devid)
-   goto next;
-
-   if (key.objectid > device->devid)
-   break;
-
-   if (key.type != BTRFS_DEV_EXTENT_KEY)
-   goto next;
-
-   dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
-   extent_end = key.offset + btrfs_dev_extent_length(l,
- dev_extent);
-   if (key.offset <= start && extent_end > end) {
-   *length = end - start + 1;
-   break;
-   } else if (key.offset <= start && extent_end > start)
-   *length += extent_end - start;
-   else if (key.offset > start && extent_end <= end)
-   *length += extent_end - key.offset;
-   else if (key.offset > start && key.offset <= end) {
-   *length += end - key.offset + 1;
-   break;
-   } else if (key.offset > end)
-   break;
-
-next:
-   path->slots[0]++;
-   }
-   ret = 0;
-out:
-   btrfs_free_path(path);
-   return ret;
-}
-
 static int contains_pending_extent(struct btrfs_transaction *transaction,
   struct btrfs_device *device,
   u64 *start, u64 len)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 77e6004b6cb9..6d4f38ad9f5c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -384,8 +384,6 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
}
 }
 
-int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
-  u64 end, u64 *length);
 void btrfs_get_bbio(struct btrfs_bio *bbio);
 void btrfs_put_bbio(struct btrfs_bio *bbio);
 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/6] btrfs: Verify every chunk has corresponding block group at mount time

2018-07-31 Thread Qu Wenruo
If a crafted btrfs has missing block group items, it could cause
unexpected behavior and breaks our expectation on 1:1
chunk<->block group mapping.

Although we added block group -> chunk mapping check, we still need
chunk -> block group mapping check.

This patch will do extra check to ensure each chunk has its
corresponding block group.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=199847
Reported-by: Xu Wen 
Signed-off-by: Qu Wenruo 
Reviewed-by: Gu Jinxiang 
---
 fs/btrfs/extent-tree.c | 57 +-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 63a6b5d36ac1..3578fa5b30ef 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10030,6 +10030,61 @@ btrfs_create_block_group_cache(struct btrfs_fs_info 
*fs_info,
return cache;
 }
 
+
+/*
+ * Iterate all chunks and verify each of them has corresponding block group
+ */
+static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_mapping_tree *map_tree = _info->mapping_tree;
+   struct extent_map *em;
+   struct btrfs_block_group_cache *bg;
+   u64 start = 0;
+   int ret = 0;
+
+   while (1) {
+   read_lock(_tree->map_tree.lock);
+   /*
+* lookup_extent_mapping will return the first extent map
+* intersects the range, so set @len to 1 is enough to get
+* the first chunk.
+*/
+   em = lookup_extent_mapping(_tree->map_tree, start, 1);
+   read_unlock(_tree->map_tree.lock);
+   if (!em)
+   break;
+
+   bg = btrfs_lookup_block_group(fs_info, em->start);
+   if (!bg) {
+   btrfs_err(fs_info,
+   "chunk start=%llu len=%llu doesn't have corresponding block group",
+em->start, em->len);
+   ret = -EUCLEAN;
+   free_extent_map(em);
+   break;
+   }
+   if (bg->key.objectid != em->start ||
+   bg->key.offset != em->len ||
+   (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+   (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+   btrfs_err(fs_info,
+"chunk start=%llu len=%llu flags=0x%llx doesn't match with block group 
start=%llu len=%llu flags=0x%llx",
+   em->start, em->len,
+   em->map_lookup->type & 
BTRFS_BLOCK_GROUP_TYPE_MASK,
+   bg->key.objectid, bg->key.offset,
+   bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
+   ret = -EUCLEAN;
+   free_extent_map(em);
+   btrfs_put_block_group(bg);
+   break;
+   }
+   start = em->start + em->len;
+   free_extent_map(em);
+   btrfs_put_block_group(bg);
+   }
+   return ret;
+}
+
 int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
struct btrfs_path *path;
@@ -10203,7 +10258,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 
btrfs_add_raid_kobjects(info);
init_global_block_rsv(info);
-   ret = 0;
+   ret = check_chunk_block_group_mappings(info);
 error:
btrfs_free_path(path);
return ret;
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Expected Received UUID

2018-07-31 Thread Hans van Kranenburg
Hi,

Can you please report the kernel versions you are using on every system
(uname -a)?

I would indeed expect the Received UUID value for C to have the same
uuid as the original UUID of A, so the b00e8ba1 one.

The send part, generating the send stream is done by the running kernel.
The receive part is done by the userspace btrfs progs. The btrfs progs
receive code just sets the Received UUID to whatever it reads from the
send stream information.

So, your sending kernel version is important here.

When looking up the lines that send the UUID, in fs/btrfs/send.c, I can
see there was a problem like this in the past:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b96b1db039ebc584d03a9933b279e0d3e704c528

I was introduced between linux 4.1 and 4.2 and solved in 2015 with that
commit, which ended up somewhere in 4.3 I think.

>From the btrfs-progs versions you list, I suspect this is a Debian
Jessie and 2x Debian Stretch, but the Debian 3.16 and 4.9 kernels would
not should not have this issue.

On 07/31/2018 07:42 PM, Gaurav Sanghavi wrote:
> Hello everyone, 
> 
> While researching BTRFS for a project that involves backing up snapshots
> from device A to device B 
> before consolidating backups from device B to device C, I noticed that
> received UUID on snapshot on 
> device C is not the same as received UUID for the same snapshot on
> device B. Here are my steps:
> 
> 1. Device A
> BTRFS version: v3.17
> 
> btrfs su sn -r /home/test/snapshot1 /home/test/snaps/snapshot1
> btrfs su show /home/test/snaps/snapshot1
> Name: snapshot1
> uuid: b00e8ba1-5aaa-3641-9c4c-e168eee5c296
> Parent uuid: cb570dec-e9fd-1f40-99d2-2070c8ee2466
>         Received UUID:      ---
> Creation time: 2018-07-30 18:32:37
> Flags: readonly
> 
> 2. Send to Device B
> btrfs send /home/test/snaps/snapshot1 | ssh  'btrfs receive
> /home/backups/'
> 
> After send completes, on Device B
> BTRFS version: v4.7.3
> btrfs su show /home/backups/snapshot1
> Name: snapshot1
> UUID: 7c13d189-7fee-584e-ac90-e68cb0012f5c
> Parent UUID: a2314f7c-4b11-ed40-901f-f1acb5ebf802
> Received UUID: b00e8ba1-5aaa-3641-9c4c-e168eee5c296
> Creation time: 2018-07-30 18:42:37 -0700
> Flags: readonly
> 
> 
> 3. Send to Device C
> btrfs send /home/backups/snapshot1 | ssh  'btrfs receive
> /home/backups2/'
> 
> After send completes, on Device C
> BTRFS version: v4.7.3
> btrfs su show /home/backups2/snapshot1
> Name: snapshot1
> UUID: 8a13aab5-8e44-2541-9082-bc583933b964
> Parent UUID: 54e9b4ff-46dc-534e-b70f-69eb7bb21028
> Received UUID: 7c13d189-7fee-584e-ac90-e68cb0012f5c
> Creation time: 2018-07-30 18:58:32 -0700
> Flags: readonly
> 
> 1. I have gone through some of the archived emails and have noticed
> people mentioning that
> if received UUID is set, btrfs send propogates this 'received UUID'. But
> in above case, 
> it's different for the same snapshot on all three devices. Is this the
> expected behavior ?
> 
> 2. We want to be able to start backing up from Device A to C, in case B
> goes down or needs 
> to be replaced. If received UUID is expected to differ for the snapshot
> on device B and C, incremental
> backups will not work from A to C without setting received UUID. I have
> seen python-btrfs
> mentioned in a couple of emails; but have anyone of you used it in a
> production environment ?
> 
> This is my first post to this email. Please let me know if I am missing
> any details.
> 
> Thanks,
> Gaurav
> 

-- 
Hans van Kranenburg
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] vfs: introduce function to map unique ino/dev pairs

2018-07-31 Thread Mark Fasheh


On Tue, Jul 31, 2018 at 02:10:42PM -0700, Mark Fasheh wrote:
> diff --git a/fs/stat.c b/fs/stat.c
> index f8e6fb2c3657..80ea42505219 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -84,6 +84,29 @@ int vfs_getattr_nosec(const struct path *path, struct 
> kstat *stat,
>  }
>  EXPORT_SYMBOL(vfs_getattr_nosec);
>  
> +void vfs_map_unique_ino_dev(struct dentry *dentry, u64 *ino, dev_t *dev)
> +{
> + int ret;
> + struct path path;
> + struct kstat stat;
> +
> + path.mnt = NULL;
> + path.dentry = dentry;
> + /* ->dev is always returned, so we only need to specify ino here */
> + ret = vfs_getattr_nosec(, , STATX_INO, 0);
> + if (ret) {
> + struct inode *inode = d_inode(dentry);
> + /* Fallback to old behavior in case of getattr error */
> + *ino = inode->i_ino;
> + *dev = inode->i_sb->s_dev;
> + return ret;

Ooof, attached is a version of this patch which doesn't try to return a value
from a void function. Apologies for the extra e-mail.

>From Mark Fasheh 

[PATCH 1/4] vfs: introduce function to map unique ino/dev pairs

There are places in the VFS where we export an ino/dev pair to userspace.
/proc/PID/maps is a good example - we directly expose inode->i_ino and
inode->i_sb->s_dev to userspace there.  Many filesystems don't put a unique
value in inode->i_ino and instead rely on ->getattr to provide the real
inode number to userspace.  super->s_dev is similar - some filesystems
expose a difference device from what's put in super->s_dev when queried via
stat/statx.

Ultimately this makes it impossible for a user (or software) to match one of
those reported pairs to the right inode.

We can fix this by adding a helper function, vfs_map_unique_ino_dev(), which
will query the owning filesystem (via ->getattr) to get the correct ino/dev
pair.  Later patches will update those places which simply dump inode->i_ino
and super->s_dev to use the helper.

Signed-off-by: Mark Fasheh 
---
 fs/stat.c  | 22 ++
 include/linux/fs.h |  2 ++
 2 files changed, 24 insertions(+)

diff --git a/fs/stat.c b/fs/stat.c
index f8e6fb2c3657..20c72d618ed5 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -84,6 +84,28 @@ int vfs_getattr_nosec(const struct path *path, struct kstat 
*stat,
 }
 EXPORT_SYMBOL(vfs_getattr_nosec);
 
+void vfs_map_unique_ino_dev(struct dentry *dentry, u64 *ino, dev_t *dev)
+{
+   int ret;
+   struct path path;
+   struct kstat stat;
+
+   path.mnt = NULL;
+   path.dentry = dentry;
+   /* ->dev is always returned, so we only need to specify ino here */
+   ret = vfs_getattr_nosec(, , STATX_INO, 0);
+   if (ret) {
+   struct inode *inode = d_inode(dentry);
+   /* Fallback to old behavior in case of getattr error */
+   *ino = inode->i_ino;
+   *dev = inode->i_sb->s_dev;
+   return;
+   }
+   *ino = stat.ino;
+   *dev = stat.dev;
+}
+EXPORT_SYMBOL(vfs_map_unique_ino_dev);
+
 /*
  * vfs_getattr - Get the enhanced basic attributes of a file
  * @path: The file of interest
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d78d146a98da..b80789472438 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3077,6 +3077,8 @@ extern void kfree_link(void *);
 extern void generic_fillattr(struct inode *, struct kstat *);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, 
unsigned int);
 extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern void vfs_map_unique_ino_dev(struct dentry *dentry, u64 *ino, dev_t 
*dev);
+
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
 void inode_add_bytes(struct inode *inode, loff_t bytes);
 void __inode_sub_bytes(struct inode *inode, loff_t bytes);
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Unmountable root partition

2018-07-31 Thread Hans van Kranenburg
Hi,

On 07/31/2018 08:03 PM, Cerem Cem ASLAN wrote:
> Hi, 
> 
> I'm having trouble with my server setup, which contains a BTRFS root
> partition on top of LVM on top of LUKS partition. 
> 
> Yesterday server was shut down unexpectedly. I booted the system with a
> pendrive which contains Debian 4.9.18 and tried to mount the BTRFS root
> partition manually. 
> 
> 1. cryptsetup luksOpen /dev/sda5 
> 
> Seems to decrypt the partition (there are no errors) 
> 
> 2. /dev/mapper/foo--vg-root and /dev/mapper/foo--vg-swap_1 is created
> automatically, so I suppose LVM works correctly. 
> 
> 3. mount -t btrfs /dev/mapper/foo--vg-root /mnt/foo 
> Gives the following error: 
> 
> mount: wrong fs type, bad option, bad superblock on ...
> 
> 4. dmesg | tail 
> Outputs the following: 
> 
> 
> [17755.840916] sd 3:0:0:0: [sda] tag#0 FAILED Result:
> hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
> [17755.840919] sd 3:0:0:0: [sda] tag#0 CDB: Read(10) 28 00 00 07 c0
> 02 00 00 02 00
> [17755.840921] blk_update_request: I/O error, dev sda, sector 507906
> [17755.840941] EXT4-fs (dm-4): unable to read superblock
> [18140.052300] sd 3:0:0:0: [sda] tag#0 FAILED Result:
> hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
> [18140.052305] sd 3:0:0:0: [sda] tag#0 CDB: ATA command pass
> through(16) 85 06 20 00 00 00 00 00 00 00 00 00 00 00 e5 00
> [18142.991851] sd 3:0:0:0: [sda] tag#0 FAILED Result:
> hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
> [18142.991856] sd 3:0:0:0: [sda] tag#0 CDB: Read(10) 28 00 00 07 c0
> 80 00 00 08 00
> [18142.991860] blk_update_request: I/O error, dev sda, sector 508032
> [18142.991869] Buffer I/O error on dev dm-4, logical block 16, async
> page read

This points at hardware level failure, regardless if the disk would hold
a btrfs or ext4 or whatever other kind of filesystem. The disk itself
cannot read data back when we ask for it.

> 4. 
> 
> # btrfs restore -i -D /dev/mapper/foo--vg-root /dev/null
> No valid Btrfs found on /dev/mapper/foo--vg-root
> Could not open root, trying backup super
> No valid Btrfs found on /dev/mapper/foo--vg-root
> Could not open root, trying backup super
> No valid Btrfs found on /dev/mapper/foo--vg-root
> Could not open root, trying backup super
> 
> We are pretty sure that no unexpected electric cuts has been happened. 
> 
> At this point I don't know what information I should supply. 
> 


-- 
Hans van Kranenburg
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] nfs: check for NULL vfsmount in nfs_getattr

2018-07-31 Thread Mark Fasheh
Hi Al,

On Tue, Jul 31, 2018 at 11:16:05PM +0100, Al Viro wrote:
> On Tue, Jul 31, 2018 at 02:10:43PM -0700, Mark Fasheh wrote:
> > ->getattr from inside the kernel won't always have a vfsmount, check for
> > this.
> 
> Really?  Where would that happen?

It happens in my first patch. FWIW, I'm not tied to that particular bit of
code, or even this (latest) approach. I would actually very much appreciate
your input into how we might fix the problem we have where we are sometimes
not exporting a correct ino/dev pair to user space.

I have a good break-down of the problem and possible solutions here:

https://www.spinics.net/lists/linux-fsdevel/msg128003.html

Thanks,
--Mark
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] nfs: check for NULL vfsmount in nfs_getattr

2018-07-31 Thread Al Viro
On Tue, Jul 31, 2018 at 02:10:43PM -0700, Mark Fasheh wrote:
> ->getattr from inside the kernel won't always have a vfsmount, check for
> this.

Really?  Where would that happen?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] vfs: introduce function to map unique ino/dev pairs

2018-07-31 Thread Mark Fasheh
There are places in the VFS where we export an ino/dev pair to userspace.
/proc/PID/maps is a good example - we directly expose inode->i_ino and
inode->i_sb->s_dev to userspace there.  Many filesystems don't put a unique
value in inode->i_ino and instead rely on ->getattr to provide the real
inode number to userspace.  super->s_dev is similar - some filesystems
expose a difference device from what's put in super->s_dev when queried via
stat/statx.

Ultimately this makes it impossible for a user (or software) to match one of
those reported pairs to the right inode.

We can fix this by adding a helper function, vfs_map_unique_ino_dev(), which
will query the owning filesystem (via ->getattr) to get the correct ino/dev
pair.  Later patches will update those places which simply dump inode->i_ino
and super->s_dev to use the helper.

Signed-off-by: Mark Fasheh 
---
 fs/stat.c  | 23 +++
 include/linux/fs.h |  2 ++
 2 files changed, 25 insertions(+)

diff --git a/fs/stat.c b/fs/stat.c
index f8e6fb2c3657..80ea42505219 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -84,6 +84,29 @@ int vfs_getattr_nosec(const struct path *path, struct kstat 
*stat,
 }
 EXPORT_SYMBOL(vfs_getattr_nosec);
 
+void vfs_map_unique_ino_dev(struct dentry *dentry, u64 *ino, dev_t *dev)
+{
+   int ret;
+   struct path path;
+   struct kstat stat;
+
+   path.mnt = NULL;
+   path.dentry = dentry;
+   /* ->dev is always returned, so we only need to specify ino here */
+   ret = vfs_getattr_nosec(, , STATX_INO, 0);
+   if (ret) {
+   struct inode *inode = d_inode(dentry);
+   /* Fallback to old behavior in case of getattr error */
+   *ino = inode->i_ino;
+   *dev = inode->i_sb->s_dev;
+   return ret;
+   }
+   *ino = stat.ino;
+   *dev = stat.dev;
+   return 0;
+}
+EXPORT_SYMBOL(vfs_map_unique_ino_dev);
+
 /*
  * vfs_getattr - Get the enhanced basic attributes of a file
  * @path: The file of interest
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d78d146a98da..b80789472438 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3077,6 +3077,8 @@ extern void kfree_link(void *);
 extern void generic_fillattr(struct inode *, struct kstat *);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, 
unsigned int);
 extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
+extern void vfs_map_unique_ino_dev(struct dentry *dentry, u64 *ino, dev_t 
*dev);
+
 void __inode_add_bytes(struct inode *inode, loff_t bytes);
 void inode_add_bytes(struct inode *inode, loff_t bytes);
 void __inode_sub_bytes(struct inode *inode, loff_t bytes);
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/4] nfs: check for NULL vfsmount in nfs_getattr

2018-07-31 Thread Mark Fasheh
->getattr from inside the kernel won't always have a vfsmount, check for
this.

Signed-off-by: Mark Fasheh 
---
 fs/nfs/inode.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b65aee481d13..7a3d90a7cc92 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -795,7 +795,9 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
}
 
/*
-* We may force a getattr if the user cares about atime.
+* We may force a getattr if the user cares about atime. A
+* NULL vfsmount means we are called from inside the kernel,
+* where no atime is required.
 *
 * Note that we only have to check the vfsmount flags here:
 *  - NFS always sets S_NOATIME by so checking it would give a
@@ -803,7 +805,7 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
 *  - NFS never sets SB_NOATIME or SB_NODIRATIME so there is
 *no point in checking those.
 */
-   if ((path->mnt->mnt_flags & MNT_NOATIME) ||
+   if (!path->mnt || (path->mnt->mnt_flags & MNT_NOATIME) ||
((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
request_mask &= ~STATX_ATIME;
 
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/4] locks: map correct ino/dev pairs when exporting to userspace

2018-07-31 Thread Mark Fasheh
/proc/locks does not always print the correct inode number/device pair.
Update lock_get_status() to use vfs_map_unique_ino_dev() to get the real,
unique values for userspace.

Signed-off-by: Mark Fasheh 
---
 fs/locks.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index db7b6917d9c5..3a012df87fd8 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2621,6 +2621,7 @@ static void lock_get_status(struct seq_file *f, struct 
file_lock *fl,
loff_t id, char *pfx)
 {
struct inode *inode = NULL;
+   struct dentry *dentry;
unsigned int fl_pid;
struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info;
 
@@ -2633,8 +2634,10 @@ static void lock_get_status(struct seq_file *f, struct 
file_lock *fl,
if (fl_pid == 0)
return;
 
-   if (fl->fl_file != NULL)
+   if (fl->fl_file != NULL) {
inode = locks_inode(fl->fl_file);
+   dentry = file_dentry(fl->fl_file);
+   }
 
seq_printf(f, "%lld:%s ", id, pfx);
if (IS_POSIX(fl)) {
@@ -2681,10 +2684,13 @@ static void lock_get_status(struct seq_file *f, struct 
file_lock *fl,
   : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ ");
}
if (inode) {
+   __u64 ino;
+   dev_t dev;
+
+   vfs_map_unique_ino_dev(dentry, , );
/* userspace relies on this representation of dev_t */
seq_printf(f, "%d %02x:%02x:%ld ", fl_pid,
-   MAJOR(inode->i_sb->s_dev),
-   MINOR(inode->i_sb->s_dev), inode->i_ino);
+   MAJOR(dev), MINOR(dev), inode->i_ino);
} else {
seq_printf(f, "%d :0 ", fl_pid);
}
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 0/4] vfs: map unique ino/dev pairs for user space

2018-07-31 Thread Mark Fasheh
Hi,

These patches are follow-up to this conversation on fsdevel which provides a
good break-down of the core problem:

https://www.spinics.net/lists/linux-fsdevel/msg128003.html

That e-mail was in turn a follow-up to the following patch series:

https://lwn.net/Articles/753917/

which tried to solve the dev portion of this problem with a bit of structure
re-routing but was never accepted.


We have an inconsistency in how the kernel is exporting inode number /
device pairs for user space.  I'm not talking about stat(2) or statx(2) here
but everywhere else where an ino/dev is exported into a buffer for user
space.

We typically do this by simply dumping the potentially 32 bit inode->i_ino
and single device from super->s_dev.  Sometimes these values differ from
what is returned via stat(2) or statx(2).  Some filesystems might even show
duplicate (but internally different!) pairs when the raw i_ino/s_dev is
used.

Aside from breaking software which expects these pairs to be unique, this
can put the user in a situation where they might not be able to find an
inode referenced from some kernel log (be it /proc/, printk buffer, etc). 
What's even worse - depending on how ino is exported, they might even find
an existing but /wrong/ file.

The following patches fix these inconsistencies by introducing a VFS helper
function which calls the underlying filesystem ->getattr to get our real
inode number / device pair.  The returned values can then be used at those
places in the kernel where we are incorrectly reporting our ino/dev pair. 
We then update fs/proc/ and fs/locks.c to use this helper when writing to
/proc/PID/maps and /proc/locks respectively.


For anyone who is watching the evolution of these patches, this would be one
implementation of the 'callback' method which I discussed in my last e-mail. 
This approach is very straight forward and easy to understand but has some
drawbacks:

- Not all of the call sites can tolerate errors.  Instead, we fallback to
  inode->i_ino and super->s_dev in case of getattr error.  That behavior is
  no worse than what we have today but what we have today is pretty bad so I
  don't particularly feel good about that. It's better than nothing though.

- There are places in the kernel where we could never call into ->getattr. 
  There are tons of tracepoints for example that are printing the the wrong
  ino/dev pair.

- The helper function has to fake a path struct because getting a vfsmount
  from inside these call sites is impossible.  This isn't actually a big
  deal as nobody except NFS cares and the NFS patch is very trivial.  It's
  ugly though, so if we had consensus on this approach I would happily
  rework our ->getattr function signature, perhaps pulling the vfsmount and
  dentry arguments back out.

- The dentry argument to ->getattr is potentially problematic as we have
  some places which _only_ have an inode. Even if we had a dentry I'm not
  sure what would keep it from going negative while in the middle of our
  ->getattr call.


Comments and review would be greatly appreciated. Thanks,
--Mark
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/4] proc: use vfs helper to get ino/dev pairs for maps file

2018-07-31 Thread Mark Fasheh
Proc writes ino/dev into /proc/PID/maps which it gets from i_ino and s_dev.
The problem with this is that i_ino and s_dev are not guaranteed to be
unique - we can have duplicates in the maps file for otherwise distinct
inodes.

This breaks any software expecting them to be unique, including lsof(8).  We
can fix this by using vfs_map_unique_ino_dev() to map the unique ino/dev
pair for us.

Signed-off-by: Mark Fasheh 
---
 fs/proc/nommu.c  | 11 ---
 fs/proc/task_mmu.c   |  8 +++-
 fs/proc/task_nommu.c |  8 +++-
 3 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3b63be64e436..56c12d02c5ad 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -36,7 +36,7 @@
  */
 static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 {
-   unsigned long ino = 0;
+u64 ino = 0;
struct file *file;
dev_t dev = 0;
int flags;
@@ -44,15 +44,12 @@ static int nommu_region_show(struct seq_file *m, struct 
vm_region *region)
flags = region->vm_flags;
file = region->vm_file;
 
-   if (file) {
-   struct inode *inode = file_inode(region->vm_file);
-   dev = inode->i_sb->s_dev;
-   ino = inode->i_ino;
-   }
+   if (file)
+   vfs_map_unique_ino_dev(file_dentry(file), , ); {
 
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m,
-  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %llu ",
   region->vm_start,
   region->vm_end,
   flags & VM_READ ? 'r' : '-',
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfd73a4616ce..e9cd33425191 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -276,7 +276,7 @@ static int is_stack(struct vm_area_struct *vma)
 static void show_vma_header_prefix(struct seq_file *m,
   unsigned long start, unsigned long end,
   vm_flags_t flags, unsigned long long pgoff,
-  dev_t dev, unsigned long ino)
+  dev_t dev, u64 ino)
 {
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_put_hex_ll(m, NULL, start, 8);
@@ -299,16 +299,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct 
*vma, int is_pid)
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
vm_flags_t flags = vma->vm_flags;
-   unsigned long ino = 0;
+   u64 ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
const char *name = NULL;
 
if (file) {
-   struct inode *inode = file_inode(vma->vm_file);
-   dev = inode->i_sb->s_dev;
-   ino = inode->i_ino;
+   vfs_map_unique_ino_dev(file_dentry(file), , );
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
}
 
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 5b62f57bd9bc..1198e979ed3f 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -146,7 +146,7 @@ static int nommu_vma_show(struct seq_file *m, struct 
vm_area_struct *vma,
  int is_pid)
 {
struct mm_struct *mm = vma->vm_mm;
-   unsigned long ino = 0;
+   u64 ino = 0;
struct file *file;
dev_t dev = 0;
int flags;
@@ -156,15 +156,13 @@ static int nommu_vma_show(struct seq_file *m, struct 
vm_area_struct *vma,
file = vma->vm_file;
 
if (file) {
-   struct inode *inode = file_inode(vma->vm_file);
-   dev = inode->i_sb->s_dev;
-   ino = inode->i_ino;
+   vfs_map_unique_inode_dev(file_dentry(file), , ));
pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
}
 
seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m,
-  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+  "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %llu ",
   vma->vm_start,
   vma->vm_end,
   flags & VM_READ ? 'r' : '-',
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Expected Received UUID

2018-07-31 Thread Cerem Cem ASLAN
Not an answer, but exactly same case:
https://unix.stackexchange.com/questions/377914/how-to-test-if-two-btrfs-snapshots-are-identical

2018-07-31 20:42 GMT+03:00 Gaurav Sanghavi :

> Hello everyone,
>
> While researching BTRFS for a project that involves backing up snapshots
> from device A to device B
> before consolidating backups from device B to device C, I noticed that
> received UUID on snapshot on
> device C is not the same as received UUID for the same snapshot on device
> B. Here are my steps:
>
> 1. Device A
> BTRFS version: v3.17
>
> btrfs su sn -r /home/test/snapshot1 /home/test/snaps/snapshot1
> btrfs su show /home/test/snaps/snapshot1
> Name: snapshot1
> uuid: b00e8ba1-5aaa-3641-9c4c-e168eee5c296
> Parent uuid: cb570dec-e9fd-1f40-99d2-2070c8ee2466
> Received UUID:  ---
> Creation time: 2018-07-30 18:32:37
> Flags: readonly
>
> 2. Send to Device B
> btrfs send /home/test/snaps/snapshot1 | ssh  'btrfs receive
> /home/backups/'
>
> After send completes, on Device B
> BTRFS version: v4.7.3
> btrfs su show /home/backups/snapshot1
> Name: snapshot1
> UUID: 7c13d189-7fee-584e-ac90-e68cb0012f5c
> Parent UUID: a2314f7c-4b11-ed40-901f-f1acb5ebf802
> Received UUID: b00e8ba1-5aaa-3641-9c4c-e168eee5c296
> Creation time: 2018-07-30 18:42:37 -0700
> Flags: readonly
>
>
> 3. Send to Device C
> btrfs send /home/backups/snapshot1 | ssh  'btrfs receive
> /home/backups2/'
>
> After send completes, on Device C
> BTRFS version: v4.7.3
> btrfs su show /home/backups2/snapshot1
> Name: snapshot1
> UUID: 8a13aab5-8e44-2541-9082-bc583933b964
> Parent UUID: 54e9b4ff-46dc-534e-b70f-69eb7bb21028
> Received UUID: 7c13d189-7fee-584e-ac90-e68cb0012f5c
> Creation time: 2018-07-30 18:58:32 -0700
> Flags: readonly
>
> 1. I have gone through some of the archived emails and have noticed people
> mentioning that
> if received UUID is set, btrfs send propogates this 'received UUID'. But
> in above case,
> it's different for the same snapshot on all three devices. Is this the
> expected behavior ?
>
> 2. We want to be able to start backing up from Device A to C, in case B
> goes down or needs
> to be replaced. If received UUID is expected to differ for the snapshot on
> device B and C, incremental
> backups will not work from A to C without setting received UUID. I have
> seen python-btrfs
> mentioned in a couple of emails; but have anyone of you used it in a
> production environment ?
>
> This is my first post to this email. Please let me know if I am missing
> any details.
>
> Thanks,
> Gaurav
>
>


Unmountable root partition

2018-07-31 Thread Cerem Cem ASLAN
Hi,

I'm having trouble with my server setup, which contains a BTRFS root
partition on top of LVM on top of LUKS partition.

Yesterday server was shut down unexpectedly. I booted the system with a
pendrive which contains Debian 4.9.18 and tried to mount the BTRFS root
partition manually.

1. cryptsetup luksOpen /dev/sda5

Seems to decrypt the partition (there are no errors)

2. /dev/mapper/foo--vg-root and /dev/mapper/foo--vg-swap_1 is created
automatically, so I suppose LVM works correctly.

3. mount -t btrfs /dev/mapper/foo--vg-root /mnt/foo
Gives the following error:

mount: wrong fs type, bad option, bad superblock on ...

4. dmesg | tail
Outputs the following:


[17755.840916] sd 3:0:0:0: [sda] tag#0 FAILED Result:
hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
[17755.840919] sd 3:0:0:0: [sda] tag#0 CDB: Read(10) 28 00 00 07 c0 02 00
00 02 00
[17755.840921] blk_update_request: I/O error, dev sda, sector 507906
[17755.840941] EXT4-fs (dm-4): unable to read superblock
[18140.052300] sd 3:0:0:0: [sda] tag#0 FAILED Result:
hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
[18140.052305] sd 3:0:0:0: [sda] tag#0 CDB: ATA command pass through(16) 85
06 20 00 00 00 00 00 00 00 00 00 00 00 e5 00
[18142.991851] sd 3:0:0:0: [sda] tag#0 FAILED Result:
hostbyte=DID_BAD_TARGET driverbyte=DRIVER_OK
[18142.991856] sd 3:0:0:0: [sda] tag#0 CDB: Read(10) 28 00 00 07 c0 80 00
00 08 00
[18142.991860] blk_update_request: I/O error, dev sda, sector 508032
[18142.991869] Buffer I/O error on dev dm-4, logical block 16, async page
read


4.

# btrfs restore -i -D /dev/mapper/foo--vg-root /dev/null
No valid Btrfs found on /dev/mapper/foo--vg-root
Could not open root, trying backup super
No valid Btrfs found on /dev/mapper/foo--vg-root
Could not open root, trying backup super
No valid Btrfs found on /dev/mapper/foo--vg-root
Could not open root, trying backup super

We are pretty sure that no unexpected electric cuts has been happened.

At this point I don't know what information I should supply.


Expected Received UUID

2018-07-31 Thread Gaurav Sanghavi
Hello everyone,

While researching BTRFS for a project that involves backing up snapshots
from device A to device B
before consolidating backups from device B to device C, I noticed that
received UUID on snapshot on
device C is not the same as received UUID for the same snapshot on device
B. Here are my steps:

1. Device A
BTRFS version: v3.17

btrfs su sn -r /home/test/snapshot1 /home/test/snaps/snapshot1
btrfs su show /home/test/snaps/snapshot1
Name: snapshot1
uuid: b00e8ba1-5aaa-3641-9c4c-e168eee5c296
Parent uuid: cb570dec-e9fd-1f40-99d2-2070c8ee2466
Received UUID:  ---
Creation time: 2018-07-30 18:32:37
Flags: readonly

2. Send to Device B
btrfs send /home/test/snaps/snapshot1 | ssh  'btrfs receive
/home/backups/'

After send completes, on Device B
BTRFS version: v4.7.3
btrfs su show /home/backups/snapshot1
Name: snapshot1
UUID: 7c13d189-7fee-584e-ac90-e68cb0012f5c
Parent UUID: a2314f7c-4b11-ed40-901f-f1acb5ebf802
Received UUID: b00e8ba1-5aaa-3641-9c4c-e168eee5c296
Creation time: 2018-07-30 18:42:37 -0700
Flags: readonly


3. Send to Device C
btrfs send /home/backups/snapshot1 | ssh  'btrfs receive
/home/backups2/'

After send completes, on Device C
BTRFS version: v4.7.3
btrfs su show /home/backups2/snapshot1
Name: snapshot1
UUID: 8a13aab5-8e44-2541-9082-bc583933b964
Parent UUID: 54e9b4ff-46dc-534e-b70f-69eb7bb21028
Received UUID: 7c13d189-7fee-584e-ac90-e68cb0012f5c
Creation time: 2018-07-30 18:58:32 -0700
Flags: readonly

1. I have gone through some of the archived emails and have noticed people
mentioning that
if received UUID is set, btrfs send propogates this 'received UUID'. But in
above case,
it's different for the same snapshot on all three devices. Is this the
expected behavior ?

2. We want to be able to start backing up from Device A to C, in case B
goes down or needs
to be replaced. If received UUID is expected to differ for the snapshot on
device B and C, incremental
backups will not work from A to C without setting received UUID. I have
seen python-btrfs
mentioned in a couple of emails; but have anyone of you used it in a
production environment ?

This is my first post to this email. Please let me know if I am missing any
details.

Thanks,
Gaurav


Re: [PATCH] btrfs: revert fs_devices state on error of btrfs_init_new_device()

2018-07-31 Thread Filipe Manana
On Tue, Jul 31, 2018 at 11:12 AM, Anand Jain  wrote:
>
>
> On 07/27/2018 08:04 AM, Naohiro Aota wrote:
>>
>> When btrfs hits error after modifying fs_devices in
>> btrfs_init_new_device() (such as btrfs_add_dev_item() returns error), it
>> leaves everything as is, but frees allocated btrfs_device. As a result,
>> fs_devices->devices and fs_devices->alloc_list contain already freed
>> btrfs_device, leading to later use-after-free bug.
>
>
>  the undo part of the btrfs_init_new_device() is broken for a while now.
>  Thanks for the fix, but..
>
>   - this patch does not fix the seed device context, its ok to fix that
> in a separate patch though.
>   - and does not undo the effect of
>
> -
> if (!blk_queue_nonrot(q))
> fs_info->fs_devices->rotating = 1
> ::
> btrfs_clear_space_info_full(fs_info);
> 
>  which I think should be handled as part of this patch.

Doesn't matter, the filesystem was turned to RO mode (transaction aborted).

>
> Thanks, Anand
>
>
>
>> Error path also messes the things like ->num_devices. While they go backs
>> to the original value by unscanning btrfs devices, it is safe to revert
>> them here.
>>
>> Fixes: 79787eaab461 ("btrfs: replace many BUG_ONs with proper error
>> handling")
>> Signed-off-by: Naohiro Aota 
>> ---
>>   fs/btrfs/volumes.c | 28 +++-
>>   1 file changed, 23 insertions(+), 5 deletions(-)
>>
>>   This patch applies on master, but not on kdave/for-next because of
>>   74b9f4e186eb ("btrfs: declare fs_devices in btrfs_init_new_device()")
>>
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 1da162928d1a..5f0512fffa52 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -2410,7 +2410,7 @@ int btrfs_init_new_device(struct btrfs_fs_info
>> *fs_info, const char *device_path
>> struct list_head *devices;
>> struct super_block *sb = fs_info->sb;
>> struct rcu_string *name;
>> -   u64 tmp;
>> +   u64 orig_super_total_bytes, orig_super_num_devices;
>> int seeding_dev = 0;
>> int ret = 0;
>> bool unlocked = false;
>> @@ -2509,12 +2509,14 @@ int btrfs_init_new_device(struct btrfs_fs_info
>> *fs_info, const char *device_path
>> if (!blk_queue_nonrot(q))
>> fs_info->fs_devices->rotating = 1;
>>   - tmp = btrfs_super_total_bytes(fs_info->super_copy);
>> +   orig_super_total_bytes =
>> btrfs_super_total_bytes(fs_info->super_copy);
>> btrfs_set_super_total_bytes(fs_info->super_copy,
>> -   round_down(tmp + device->total_bytes,
>> fs_info->sectorsize));
>> +   round_down(orig_super_total_bytes + device->total_bytes,
>> +  fs_info->sectorsize));
>>   - tmp = btrfs_super_num_devices(fs_info->super_copy);
>> -   btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
>> +   orig_super_num_devices =
>> btrfs_super_num_devices(fs_info->super_copy);
>> +   btrfs_set_super_num_devices(fs_info->super_copy,
>> +   orig_super_num_devices + 1);
>> /* add sysfs device entry */
>> btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
>> @@ -2594,6 +2596,22 @@ int btrfs_init_new_device(struct btrfs_fs_info
>> *fs_info, const char *device_path
>> error_sysfs:
>> btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
>> +   mutex_lock(_info->fs_devices->device_list_mutex);
>> +   mutex_lock(_info->chunk_mutex);
>> +   list_del_rcu(>dev_list);
>> +   list_del(>dev_alloc_list);
>> +   fs_info->fs_devices->num_devices--;
>> +   fs_info->fs_devices->open_devices--;
>> +   fs_info->fs_devices->rw_devices--;
>> +   fs_info->fs_devices->total_devices--;
>> +   fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
>> +   atomic64_sub(device->total_bytes, _info->free_chunk_space);
>> +   btrfs_set_super_total_bytes(fs_info->super_copy,
>> +   orig_super_total_bytes);
>> +   btrfs_set_super_num_devices(fs_info->super_copy,
>> +   orig_super_num_devices);
>> +   mutex_unlock(_info->chunk_mutex);
>> +   mutex_unlock(_info->fs_devices->device_list_mutex);
>>   error_trans:
>> if (seeding_dev)
>> sb->s_flags |= SB_RDONLY;
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: fix data lose with snapshot when nospace

2018-07-31 Thread Filipe Manana
On Tue, Jul 31, 2018 at 11:17 AM, robbieko  wrote:
> Filipe Manana 於 2018-07-30 20:34 寫到:
>
>> On Mon, Jul 30, 2018 at 12:28 PM, Filipe Manana 
>> wrote:
>>>
>>> On Mon, Jul 30, 2018 at 12:08 PM, Filipe Manana 
>>> wrote:

 On Mon, Jul 30, 2018 at 11:21 AM, robbieko 
 wrote:
>
> From: Robbie Ko 
>
> Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
> modified the nocow writeback mechanism, if you create a snapshot,
> it will always switch to cow writeback.
>
> This will cause data loss when there is no space, because
> when the space is full, the write will not reserve any space, only
> check if it can be nocow write.


 This is a bit vague.
 You need to mention where space reservation does not happen (at the
 time of the write syscall) and why,
 and that the snapshot happens before flushing IO (running dealloc).
 Then when running dealloc we fallback
 to COW and fail.

 You also need to tell that although the write syscall did not return
 an error, the writeback will
 fail but a subsequent fsync on the file will return an error (ENOSPC)
 because the writeback set the error
 on the inode's mapping, so it's not completely a silent data loss, as
 for buffered writes there's no guarantee
 that if write syscall returns 0 the data will be persisted
 successfully (that can only be guaranteed if a subsequent
 fsync call returns 0).

>
> So fix this by first flush the nocow data, and then switch to the
> cow write.
>>>
>>>
>>> I'm also not seeing how what you've done is better then we have now
>>> using the root->will_be_snapshotted atomic,
>>> which is essentially used the same way as the new atomic you are
>>> adding, and forces the writeback code no nocow
>>> writes as well.
>>
>>
>> So what you have done can be made much more simple by flushing
>> delalloc before incrementing root->will_be_snapshotted instead of
>> after incrementing it:
>>
>> https://friendpaste.com/2LY9eLAR9q0RoOtRK7VYmX
>
> There is no way to solve this problem in this modification.

It minimizes it. It only gives better guarantees that nocow buffered
writes that happened before calling the snapshot ioctl will not fall
back to cow,
not for the ones that happen while the call to the ioctl is happening.

>
> When writing and create snapshot at the same time, the write will not
> reserve space,
> and will not return to ENOSPC, because will_be_snapshotted is still 0.
> So when writeback flush data, there will still be problems with ENOSPC.

Which is precisely what I proposed does without adding a new atomic
and more changes.
It flushes delalloc before incrementing root->will_be_snapshotted, so
that previous buffered nocow writes will not fallback to cow mode (and
require data space allocation).

It only leaves a very tiny and very unlikely to hit (but not
impossible) time window where nocow writes will fallback
to cow mode - after calling start_delalloc_inodes() and before
incrementing root->will_be_snapshotted a new buffered write can comes
in and gets immediately flushed
because someone called fsync() on the file or the VM decided to
trigger writeback (due to memory pressure or some other reason).

>
> The behavior I changed was to increase will_be_snapshotted first,
> so the following write must have a reserve space,
> otherwise it must be returned to ENOSPC.
> And then go to flush data and flush the diry page with nocow,
> When all the dirty pages are written back, then switch to cow mode.

And why did you wrote such a vague changelog? It misses all those
important and subtle details of the change.

>
>>
>> Just checked the code and failure to allocate space during writeback
>> after falling back to COW mode does indeed set
>> AS_ENOSPC on the inode's mapping, which makes fsync return ENOSPC
>> (through file_check_and_advance_wb_err()
>> and filemap_check_wb_err()).
>>
>> Since fsync reports the error, I'm unsure to call it data loss but
>> rather an optimization to avoid ENOSPC for nocow writes when running
>> low on space.
>>
>
> If you do not use fsync, you will not find the data loss.

That's one of the reasons why fsync exists.

> I think that as long as the write returns successfully,
> the data must be written back to the disk at the end,
> otherwise it will be considered as data loss.

No filesystem gives you that guarantee, neither does POSIX or SUS.
Even for direct IO writes, you still need to call fsync to make sure
the writes will be seen after a power failure (metadata updates, super
block).
For buffered IO it's even more important. During writeback many things
can lead to failure, not just hardware problems, for example failure
to allocate memory or running out of space can happen.

> (Excluded, improper shutdown, IO error,)

Even with proper shutdown, you have no way to know if a write succeed
unless you call fsync, or you do something like evict page cache and
read from the 

Re: [PATCH] Btrfs: fix data lose with snapshot when nospace

2018-07-31 Thread robbieko

Filipe Manana 於 2018-07-30 20:34 寫到:
On Mon, Jul 30, 2018 at 12:28 PM, Filipe Manana  
wrote:
On Mon, Jul 30, 2018 at 12:08 PM, Filipe Manana  
wrote:
On Mon, Jul 30, 2018 at 11:21 AM, robbieko  
wrote:

From: Robbie Ko 

Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
modified the nocow writeback mechanism, if you create a snapshot,
it will always switch to cow writeback.

This will cause data loss when there is no space, because
when the space is full, the write will not reserve any space, only
check if it can be nocow write.


This is a bit vague.
You need to mention where space reservation does not happen (at the
time of the write syscall) and why,
and that the snapshot happens before flushing IO (running dealloc).
Then when running dealloc we fallback
to COW and fail.

You also need to tell that although the write syscall did not return
an error, the writeback will
fail but a subsequent fsync on the file will return an error (ENOSPC)
because the writeback set the error
on the inode's mapping, so it's not completely a silent data loss, as
for buffered writes there's no guarantee
that if write syscall returns 0 the data will be persisted
successfully (that can only be guaranteed if a subsequent
fsync call returns 0).



So fix this by first flush the nocow data, and then switch to the
cow write.


I'm also not seeing how what you've done is better then we have now
using the root->will_be_snapshotted atomic,
which is essentially used the same way as the new atomic you are
adding, and forces the writeback code no nocow
writes as well.


So what you have done can be made much more simple by flushing
delalloc before incrementing root->will_be_snapshotted instead of
after incrementing it:

https://friendpaste.com/2LY9eLAR9q0RoOtRK7VYmX

There is no way to solve this problem in this modification.

When writing and create snapshot at the same time, the write will not 
reserve space,

and will not return to ENOSPC, because will_be_snapshotted is still 0.
So when writeback flush data, there will still be problems with ENOSPC.

The behavior I changed was to increase will_be_snapshotted first,
so the following write must have a reserve space,
otherwise it must be returned to ENOSPC.
And then go to flush data and flush the diry page with nocow,
When all the dirty pages are written back, then switch to cow mode.



Just checked the code and failure to allocate space during writeback
after falling back to COW mode does indeed set
AS_ENOSPC on the inode's mapping, which makes fsync return ENOSPC
(through file_check_and_advance_wb_err()
and filemap_check_wb_err()).

Since fsync reports the error, I'm unsure to call it data loss but
rather an optimization to avoid ENOSPC for nocow writes when running
low on space.



If you do not use fsync, you will not find the data loss.
I think that as long as the write returns successfully,
the data must be written back to the disk at the end,
otherwise it will be considered as data loss.
(Excluded, improper shutdown, IO error,)








This seems easy to reproduce using deterministic steps.
Can you please write a test case for fstests?

Also the subject "Btrfs: fix data lose with snapshot when nospace",
besides the typo (lose -> loss), should be
more clear like for example "Btrfs: fix data loss for nocow writes
after snapshot when low on data space".


Also I'm not even sure if I would call it data loss.
If there was no error returned from a subsequent fsync, I would
definitely call it data loss.

So unless the fsync is not returning ENOSPC, I don't see anything that
needs to be fixed.



Thanks.


Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting")
Signed-off-by: Robbie Ko 
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/inode.c   | 26 +-
 fs/btrfs/ioctl.c   |  6 ++
 4 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 118346a..663ce05 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1277,6 +1277,7 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
+   atomic_t snapshot_force_cow;

/* For qgroup metadata reserved space */
spinlock_t qgroup_meta_rsv_lock;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 205092d..5573916 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1216,6 +1216,7 @@ static void __setup_root(struct btrfs_root 
*root, struct btrfs_fs_info *fs_info,

atomic_set(>log_batch, 0);
refcount_set(>refs, 1);
atomic_set(>will_be_snapshotted, 0);
+   atomic_set(>snapshot_force_cow, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index eba61bc..263b852 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1275,7 +1275,7 @@ static noinline int 

Re: [PATCH] btrfs: revert fs_devices state on error of btrfs_init_new_device()

2018-07-31 Thread Anand Jain




On 07/27/2018 08:04 AM, Naohiro Aota wrote:

When btrfs hits error after modifying fs_devices in
btrfs_init_new_device() (such as btrfs_add_dev_item() returns error), it
leaves everything as is, but frees allocated btrfs_device. As a result,
fs_devices->devices and fs_devices->alloc_list contain already freed
btrfs_device, leading to later use-after-free bug.


 the undo part of the btrfs_init_new_device() is broken for a while now.
 Thanks for the fix, but..

  - this patch does not fix the seed device context, its ok to fix that
in a separate patch though.
  - and does not undo the effect of

-
if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1
::
btrfs_clear_space_info_full(fs_info);

 which I think should be handled as part of this patch.

Thanks, Anand



Error path also messes the things like ->num_devices. While they go backs
to the original value by unscanning btrfs devices, it is safe to revert
them here.

Fixes: 79787eaab461 ("btrfs: replace many BUG_ONs with proper error handling")
Signed-off-by: Naohiro Aota 
---
  fs/btrfs/volumes.c | 28 +++-
  1 file changed, 23 insertions(+), 5 deletions(-)

  This patch applies on master, but not on kdave/for-next because of
  74b9f4e186eb ("btrfs: declare fs_devices in btrfs_init_new_device()")

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1da162928d1a..5f0512fffa52 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2410,7 +2410,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, 
const char *device_path
struct list_head *devices;
struct super_block *sb = fs_info->sb;
struct rcu_string *name;
-   u64 tmp;
+   u64 orig_super_total_bytes, orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
bool unlocked = false;
@@ -2509,12 +2509,14 @@ int btrfs_init_new_device(struct btrfs_fs_info 
*fs_info, const char *device_path
if (!blk_queue_nonrot(q))
fs_info->fs_devices->rotating = 1;
  
-	tmp = btrfs_super_total_bytes(fs_info->super_copy);

+   orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
btrfs_set_super_total_bytes(fs_info->super_copy,
-   round_down(tmp + device->total_bytes, fs_info->sectorsize));
+   round_down(orig_super_total_bytes + device->total_bytes,
+  fs_info->sectorsize));
  
-	tmp = btrfs_super_num_devices(fs_info->super_copy);

-   btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
+   orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
+   btrfs_set_super_num_devices(fs_info->super_copy,
+   orig_super_num_devices + 1);
  
  	/* add sysfs device entry */

btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
@@ -2594,6 +2596,22 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, 
const char *device_path
  
  error_sysfs:

btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
+   mutex_lock(_info->fs_devices->device_list_mutex);
+   mutex_lock(_info->chunk_mutex);
+   list_del_rcu(>dev_list);
+   list_del(>dev_alloc_list);
+   fs_info->fs_devices->num_devices--;
+   fs_info->fs_devices->open_devices--;
+   fs_info->fs_devices->rw_devices--;
+   fs_info->fs_devices->total_devices--;
+   fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
+   atomic64_sub(device->total_bytes, _info->free_chunk_space);
+   btrfs_set_super_total_bytes(fs_info->super_copy,
+   orig_super_total_bytes);
+   btrfs_set_super_num_devices(fs_info->super_copy,
+   orig_super_num_devices);
+   mutex_unlock(_info->chunk_mutex);
+   mutex_unlock(_info->fs_devices->device_list_mutex);
  error_trans:
if (seeding_dev)
sb->s_flags |= SB_RDONLY;


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs: Introduce mount time chunk <-> dev extent mapping check

2018-07-31 Thread Qu Wenruo


On 2018年07月16日 21:06, David Sterba wrote:
> On Mon, Jul 09, 2018 at 02:42:02PM +0800, Qu Wenruo wrote:
>> This patch will introduce chunk <-> dev extent mapping check, to protect
>> us against invalid dev extents or chunks.
>>
>> Since chunk mapping is the fundamental infrastructure of btrfs, extra
>> check at mount time could prevent a lot of unexpected behavior (BUG_ON).
>>
>> Reported-by: Xu Wen 
>> Links: https://bugzilla.kernel.org/show_bug.cgi?id=200403
> 
> Link:
> 
>> Links: https://bugzilla.kernel.org/show_bug.cgi?id=200407
>> Signed-off-by: Qu Wenruo 
>> ---
>>  fs/btrfs/disk-io.c |   7 ++
>>  fs/btrfs/volumes.c | 173 +
>>  fs/btrfs/volumes.h |   2 +
>>  3 files changed, 182 insertions(+)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 205092dc9390..068ca7498e94 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -3075,6 +3075,13 @@ int open_ctree(struct super_block *sb,
>>  fs_info->generation = generation;
>>  fs_info->last_trans_committed = generation;
>>  
>> +ret = btrfs_verify_dev_extents(fs_info);
>> +if (ret) {
>> +btrfs_err(fs_info,
>> +  "failed to verify dev extents against chunks: %d",
>> +  ret);
>> +goto fail_block_groups;
>> +}
>>  ret = btrfs_recover_balance(fs_info);
>>  if (ret) {
>>  btrfs_err(fs_info, "failed to recover balance: %d", ret);
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index e6a8e4aabc66..05e418cb37f3 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -6440,6 +6440,7 @@ static int read_one_chunk(struct btrfs_fs_info 
>> *fs_info, struct btrfs_key *key,
>>  map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
>>  map->type = btrfs_chunk_type(leaf, chunk);
>>  map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
>> +map->verified_stripes = 0;
>>  for (i = 0; i < num_stripes; i++) {
>>  map->stripes[i].physical =
>>  btrfs_stripe_offset_nr(leaf, chunk, i);
>> @@ -7295,3 +7296,175 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info 
>> *fs_info)
>>  fs_devices = fs_devices->seed;
>>  }
>>  }
>> +
>> +static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
>> +{
>> +switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
>> +case BTRFS_BLOCK_GROUP_RAID0:
>> +return div_u64(chunk_len, num_stripes);
>> +case BTRFS_BLOCK_GROUP_RAID10:
>> +return div_u64(chunk_len * 2, num_stripes);
>> +case BTRFS_BLOCK_GROUP_RAID5:
>> +return div_u64(chunk_len, num_stripes - 1);
>> +case BTRFS_BLOCK_GROUP_RAID6:
>> +return div_u64(chunk_len, num_stripes - 2);
>> +default:
>> +return chunk_len;
>> +}
>> +}
> 
> There are already too many hardcoded values for the raid profiles,
> please don't add another one unless really necessary and use existing
> predefined constants or helpers (eg. nr_data_stripes or
> btrfs_raid_array).

OK, I'll try to reuse btrfs_raid_array in next version.

Thanks,
Qu

> 
>> +static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
>> + u64 chunk_offset, u64 devid,
>> + u64 physical_offset, u64 physical_len)
>> +{
>> +struct extent_map_tree *em_tree = _info->mapping_tree.map_tree;
>> +struct extent_map *em;
>> +struct map_lookup *map;
>> +u64 stripe_len;
>> +bool found = false;
> 
> This variable is only set and never checked.
> 
>> +int ret = 0;
>> +int i;
>> +
>> +read_lock(_tree->lock);
>> +em = lookup_extent_mapping(em_tree, chunk_offset, 1);
>> +read_unlock(_tree->lock);
>> +
>> +if (!em) {
>> +ret = -EUCLEAN;
>> +btrfs_err(fs_info,
>> +"dev extent (%llu, %llu) doesn't have corresponding chunk",
>> +  devid, physical_offset);
>> +goto out;
>> +}
>> +
>> +map = em->map_lookup;
>> +stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
>> +if (physical_len != stripe_len) {
>> +btrfs_err(fs_info,
>> +"dev extent (%llu, %llu) length doesn't match with chunk %llu, have %llu 
>> expect %llu",
>> +  devid, physical_offset, em->start, physical_len,
>> +  stripe_len);
>> +ret = -EUCLEAN;
>> +goto out;
>> +}
>> +
>> +for (i = 0; i < map->num_stripes; i++) {
>> +if (map->stripes[i].dev->devid == devid &&
>> +map->stripes[i].physical == physical_offset) {
>> +found = true;
> 
> 2nd time set
> 
>> +if (map->verified_stripes >= map->num_stripes) {
>> +btrfs_err(fs_info,
>> +"too many dev extent for chunk %llu is detected",
>> +  em->start);
>> + 

[PATCH] btrfs: replace: Reset on-disk dev stats value after replace

2018-07-31 Thread Misono Tomohiro
on-disk devs stats value is updated in btrfs_run_dev_stats(),
which is called during commit transaction, if device->dev_stats_ccnt
is not zero.

Since current replace operation does not touch dev_stats_ccnt,
on-disk dev stats value is not updated. Therefore "btrfs device stats"
may return old device's value after umount/mount
(Example: See "btrfs ins dump-t -t DEV $DEV" after btrfs/100 finish).

Fix this by just increment dev_stats_ccnt in
btrfs_dev_replace_finishing() when replace is succeeded.

Signed-off-by: Misono Tomohiro 
---
 fs/btrfs/dev-replace.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e2ba0419297a..d20b244623f2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -676,6 +676,12 @@ static int btrfs_dev_replace_finishing(struct 
btrfs_fs_info *fs_info,
 
btrfs_rm_dev_replace_unblocked(fs_info);
 
+   /*
+* Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
+* update on-disk dev stats value during commit transaction
+*/
+   atomic_inc(_device->dev_stats_ccnt);
+
/*
 * this is again a consistent state where no dev_replace procedure
 * is running, the target device is part of the filesystem, the
-- 
2.14.4


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] btrfs: locking: Allow btrfs_tree_lock() to return error to avoid deadlock

2018-07-31 Thread Qu Wenruo
[BUG]
For certains crafted image, whose csum root leaf has missing backref, if
we try to trigger write with data csum, it could cause deadlock with the
following kernel WARN_ON():
--
WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400
CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8
Workqueue: btrfs-endio-write btrfs_endio_write_helper
RIP: 0010:btrfs_tree_lock+0x3e2/0x400
Call Trace:
 btrfs_alloc_tree_block+0x39f/0x770
 __btrfs_cow_block+0x285/0x9e0
 btrfs_cow_block+0x191/0x2e0
 btrfs_search_slot+0x492/0x1160
 btrfs_lookup_csum+0xec/0x280
 btrfs_csum_file_blocks+0x2be/0xa60
 add_pending_csums+0xaf/0xf0
 btrfs_finish_ordered_io+0x74b/0xc90
 finish_ordered_fn+0x15/0x20
 normal_work_helper+0xf6/0x500
 btrfs_endio_write_helper+0x12/0x20
 process_one_work+0x302/0x770
 worker_thread+0x81/0x6d0
 kthread+0x180/0x1d0
 ret_from_fork+0x35/0x40
---[ end trace 2e85051acb5f6dc1 ]---
--

[CAUSE]
That crafted image has missing backref for csum tree root leaf.
And when we try to allocate new tree block, since there is no
EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot
and use it.

However we have already locked csum tree root leaf by ourselves, thus we
will ourselves to release read/write lock, and causing deadlock.

[FIX]
This patch will allow btrfs_tree_lock() to return error number, so
most callers can exit gracefully.
(Some caller doesn't really expect btrfs_tree_lock() to return error,
and in that case, we use BUG_ON())

Since such problem can only happen in crafted image, we will still
trigger kernel warning, but with a little more meaningful warning
message.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405
Reported-by: Xu Wen 
Signed-off-by: Qu Wenruo 
Reviewed-by: Su Yue 
---
changelog:
v2:
  Fix pid_t output format from %llu to %d. Pointed by Su.
  Fix missing return 0 for btrfs_lock_tree().
  Update reviewed-by tags.
---
 fs/btrfs/ctree.c   | 57 +++---
 fs/btrfs/extent-tree.c | 28 +++
 fs/btrfs/extent_io.c   |  8 --
 fs/btrfs/free-space-tree.c |  4 ++-
 fs/btrfs/locking.c | 13 +++--
 fs/btrfs/locking.h |  2 +-
 fs/btrfs/qgroup.c  |  4 ++-
 fs/btrfs/relocation.c  | 13 +++--
 fs/btrfs/tree-log.c| 14 --
 9 files changed, 115 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc326df472e..6e695f4cd931 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -161,10 +161,12 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root 
*root)
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 {
struct extent_buffer *eb;
+   int ret;
 
while (1) {
eb = btrfs_root_node(root);
-   btrfs_tree_lock(eb);
+   ret = btrfs_tree_lock(eb);
+   BUG_ON(ret < 0);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
@@ -1584,6 +1586,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
int close = 1;
+   int ret;
 
btrfs_node_key(parent, _key, i);
if (!progress_passed && comp_keys(_key, progress) < 0)
@@ -1637,7 +1640,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (search_start == 0)
search_start = last_block;
 
-   btrfs_tree_lock(cur);
+   ret = btrfs_tree_lock(cur);
+   if (ret < 0) {
+   free_extent_buffer(cur);
+   return ret;
+   }
btrfs_set_lock_blocking(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
, search_start,
@@ -1853,7 +1860,11 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
goto enospc;
}
 
-   btrfs_tree_lock(child);
+   ret = btrfs_tree_lock(child);
+   if (ret < 0) {
+   free_extent_buffer(child);
+   goto enospc;
+   }
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, );
if (ret) {
@@ -1891,7 +1902,12 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
left = NULL;
 
if (left) {
-   btrfs_tree_lock(left);
+   ret = btrfs_tree_lock(left);
+   if (ret < 0) {
+   free_extent_buffer(left);
+   left = NULL;
+   goto enospc;
+   }
btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
   parent, 

Re: [PATCH 1/1] btrfs: locking: Allow btrfs_tree_lock() to return error to avoid deadlock

2018-07-31 Thread Qu Wenruo



On 2018年07月31日 14:44, Qu Wenruo wrote:
> 
> 
> On 2018年07月31日 14:48, Su Yue wrote:
>>
>>
>> On 07/30/2018 02:17 PM, Qu Wenruo wrote:
>>> [BUG]
>>> For certains crafted image, whose csum root leaf has missing backref, if
>>> we try to trigger write with data csum, it could cause deadlock with the
>>> following kernel WARN_ON():
>>> --
>>> WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230
>>> btrfs_tree_lock+0x3e2/0x400
>>> CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8
>>> Workqueue: btrfs-endio-write btrfs_endio_write_helper
>>> RIP: 0010:btrfs_tree_lock+0x3e2/0x400
>>> Call Trace:
>>>   btrfs_alloc_tree_block+0x39f/0x770
>>>   __btrfs_cow_block+0x285/0x9e0
>>>   btrfs_cow_block+0x191/0x2e0
>>>   btrfs_search_slot+0x492/0x1160
>>>   btrfs_lookup_csum+0xec/0x280
>>>   btrfs_csum_file_blocks+0x2be/0xa60
>>>   add_pending_csums+0xaf/0xf0
>>>   btrfs_finish_ordered_io+0x74b/0xc90
>>>   finish_ordered_fn+0x15/0x20
>>>   normal_work_helper+0xf6/0x500
>>>   btrfs_endio_write_helper+0x12/0x20
>>>   process_one_work+0x302/0x770
>>>   worker_thread+0x81/0x6d0
>>>   kthread+0x180/0x1d0
>>>   ret_from_fork+0x35/0x40
>>> ---[ end trace 2e85051acb5f6dc1 ]---
>>> --
>>>
>>> [CAUSE]
>>> That crafted image has missing backref for csum tree root leaf.
>>> And when we try to allocate new tree block, since there is no
>>> EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot
>>> and use it.
>>>
>>> However we have already locked csum tree root leaf by ourselves, thus we
>>> will ourselves to release read/write lock, and causing deadlock.
>>>
>>> [FIX]
>>> This patch will allow btrfs_tree_lock() to return error number, so
>>> most callers can exit gracefully.
>>> (Some caller doesn't really expect btrfs_tree_lock() to return error,
>>> and in that case, we use BUG_ON())
>>>
>>> Since such problem can only happen in crafted image, we will still
>>> trigger kernel warning, but with a little more meaningful warning
>>> message.
>>>
>>> Link: https://bugzilla.kernel.org/show_bug.cgi?id 0405
>>> Reported-by: Xu Wen 
>>> Signed-off-by: Qu Wenruo 
>>
>> OK.. I looked through every places where the callee is called.
>> Those errors are handled gracefully as my thought.
>> Except one nitpick, the %llu about pid_t in btrfs_tree_lock.
>> However, you can add the tag:
>>
>> Reviewed-by: Su Yue 
> 
> snip
> 
>>
>> Nit:
>> pid_t is as an signed integer, should be pid= .
> 
> BTW, how could pid be negative?
> And why my gcc doesn't give such warning?

My fault, gcc does give warning about this.
I'll fix it soon.

Thanks,
Qu

> 
> Thanks,
> Qu
> 
>>
>> Thanks,
>> Su
>>
>>> + eb->start, current->pid);
>>> +    return -EUCLEAN;
>>> +    }
>>>   again:
>>>   wait_event(eb->read_lock_wq, atomic_read(>blocking_readers)
>>> =0);
>>>   wait_event(eb->write_lock_wq, atomic_read(>blocking_writers)
>>> =0);
>>> diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
>>> index 29135def468e..7394d7ba2ff9 100644
>>> --- a/fs/btrfs/locking.h
>>> +++ b/fs/btrfs/locking.h
>>> @@ -11,7 +11,7 @@
>>>   #define BTRFS_WRITE_LOCK_BLOCKING 3
>>>   #define BTRFS_READ_LOCK_BLOCKING 4
>>>   -void btrfs_tree_lock(struct extent_buffer *eb);
>>> +int btrfs_tree_lock(struct extent_buffer *eb);
>>>   void btrfs_tree_unlock(struct extent_buffer *eb);
>>>     void btrfs_tree_read_lock(struct extent_buffer *eb);
>>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>>> index 1874a6d2e6f5..ec4351fd7537 100644
>>> --- a/fs/btrfs/qgroup.c
>>> +++ b/fs/btrfs/qgroup.c
>>> @@ -1040,7 +1040,9 @@ int btrfs_quota_disable(struct
>>> btrfs_trans_handle *trans,
>>>     list_del(_root->dirty_list);
>>>   -    btrfs_tree_lock(quota_root->node);
>>> +    ret =trfs_tree_lock(quota_root->node);
>>> +    if (ret < 0)
>>> +    goto out;
>>>   clean_tree_block(fs_info, quota_root->node);
>>>   btrfs_tree_unlock(quota_root->node);
>>>   btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
>>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>>> index be94c65bb4d2..a2fc0bd83a40 100644
>>> --- a/fs/btrfs/relocation.c
>>> +++ b/fs/btrfs/relocation.c
>>> @@ -1877,7 +1877,11 @@ int replace_path(struct btrfs_trans_handle *trans,
>>>   free_extent_buffer(eb);
>>>   break;
>>>   }
>>> -    btrfs_tree_lock(eb);
>>> +    ret =trfs_tree_lock(eb);
>>> +    if (ret < 0) {
>>> +    free_extent_buffer(eb);
>>> +    break;
>>> +    }
>>>   if (cow) {
>>>   ret =trfs_cow_block(trans, dest, eb, parent,
>>>     slot, );
>>> @@ -2788,7 +2792,12 @@ static int do_relocation(struct
>>> btrfs_trans_handle *trans,
>>>   err =EIO;
>>>   goto next;
>>>   }
>>> -    btrfs_tree_lock(eb);
>>> +    ret =trfs_tree_lock(eb);
>>> +    if (ret < 0) {
>>> +    free_extent_buffer(eb);
>>> +    err =et;
>>> +

Re: [PATCH 1/1] btrfs: locking: Allow btrfs_tree_lock() to return error to avoid deadlock

2018-07-31 Thread Qu Wenruo



On 2018年07月31日 14:48, Su Yue wrote:
> 
> 
> On 07/30/2018 02:17 PM, Qu Wenruo wrote:
>> [BUG]
>> For certains crafted image, whose csum root leaf has missing backref, if
>> we try to trigger write with data csum, it could cause deadlock with the
>> following kernel WARN_ON():
>> --
>> WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230
>> btrfs_tree_lock+0x3e2/0x400
>> CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8
>> Workqueue: btrfs-endio-write btrfs_endio_write_helper
>> RIP: 0010:btrfs_tree_lock+0x3e2/0x400
>> Call Trace:
>>   btrfs_alloc_tree_block+0x39f/0x770
>>   __btrfs_cow_block+0x285/0x9e0
>>   btrfs_cow_block+0x191/0x2e0
>>   btrfs_search_slot+0x492/0x1160
>>   btrfs_lookup_csum+0xec/0x280
>>   btrfs_csum_file_blocks+0x2be/0xa60
>>   add_pending_csums+0xaf/0xf0
>>   btrfs_finish_ordered_io+0x74b/0xc90
>>   finish_ordered_fn+0x15/0x20
>>   normal_work_helper+0xf6/0x500
>>   btrfs_endio_write_helper+0x12/0x20
>>   process_one_work+0x302/0x770
>>   worker_thread+0x81/0x6d0
>>   kthread+0x180/0x1d0
>>   ret_from_fork+0x35/0x40
>> ---[ end trace 2e85051acb5f6dc1 ]---
>> --
>>
>> [CAUSE]
>> That crafted image has missing backref for csum tree root leaf.
>> And when we try to allocate new tree block, since there is no
>> EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot
>> and use it.
>>
>> However we have already locked csum tree root leaf by ourselves, thus we
>> will ourselves to release read/write lock, and causing deadlock.
>>
>> [FIX]
>> This patch will allow btrfs_tree_lock() to return error number, so
>> most callers can exit gracefully.
>> (Some caller doesn't really expect btrfs_tree_lock() to return error,
>> and in that case, we use BUG_ON())
>>
>> Since such problem can only happen in crafted image, we will still
>> trigger kernel warning, but with a little more meaningful warning
>> message.
>>
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405
>> Reported-by: Xu Wen 
>> Signed-off-by: Qu Wenruo 
> 
> OK.. I looked through every places where the callee is called.
> Those errors are handled gracefully as my thought.
> Except one nitpick, the %llu about pid_t in btrfs_tree_lock.
> However, you can add the tag:
> 
> Reviewed-by: Su Yue 

snip

> 
> Nit:
> pid_t is as an signed integer, should be pid=%d .

BTW, how could pid be negative?
And why my gcc doesn't give such warning?

Thanks,
Qu

> 
> Thanks,
> Su
> 
>> + eb->start, current->pid);
>> +    return -EUCLEAN;
>> +    }
>>   again:
>>   wait_event(eb->read_lock_wq, atomic_read(>blocking_readers)
>> == 0);
>>   wait_event(eb->write_lock_wq, atomic_read(>blocking_writers)
>> == 0);
>> diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
>> index 29135def468e..7394d7ba2ff9 100644
>> --- a/fs/btrfs/locking.h
>> +++ b/fs/btrfs/locking.h
>> @@ -11,7 +11,7 @@
>>   #define BTRFS_WRITE_LOCK_BLOCKING 3
>>   #define BTRFS_READ_LOCK_BLOCKING 4
>>   -void btrfs_tree_lock(struct extent_buffer *eb);
>> +int btrfs_tree_lock(struct extent_buffer *eb);
>>   void btrfs_tree_unlock(struct extent_buffer *eb);
>>     void btrfs_tree_read_lock(struct extent_buffer *eb);
>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>> index 1874a6d2e6f5..ec4351fd7537 100644
>> --- a/fs/btrfs/qgroup.c
>> +++ b/fs/btrfs/qgroup.c
>> @@ -1040,7 +1040,9 @@ int btrfs_quota_disable(struct
>> btrfs_trans_handle *trans,
>>     list_del(_root->dirty_list);
>>   -    btrfs_tree_lock(quota_root->node);
>> +    ret = btrfs_tree_lock(quota_root->node);
>> +    if (ret < 0)
>> +    goto out;
>>   clean_tree_block(fs_info, quota_root->node);
>>   btrfs_tree_unlock(quota_root->node);
>>   btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>> index be94c65bb4d2..a2fc0bd83a40 100644
>> --- a/fs/btrfs/relocation.c
>> +++ b/fs/btrfs/relocation.c
>> @@ -1877,7 +1877,11 @@ int replace_path(struct btrfs_trans_handle *trans,
>>   free_extent_buffer(eb);
>>   break;
>>   }
>> -    btrfs_tree_lock(eb);
>> +    ret = btrfs_tree_lock(eb);
>> +    if (ret < 0) {
>> +    free_extent_buffer(eb);
>> +    break;
>> +    }
>>   if (cow) {
>>   ret = btrfs_cow_block(trans, dest, eb, parent,
>>     slot, );
>> @@ -2788,7 +2792,12 @@ static int do_relocation(struct
>> btrfs_trans_handle *trans,
>>   err = -EIO;
>>   goto next;
>>   }
>> -    btrfs_tree_lock(eb);
>> +    ret = btrfs_tree_lock(eb);
>> +    if (ret < 0) {
>> +    free_extent_buffer(eb);
>> +    err = ret;
>> +    goto next;
>> +    }
>>   btrfs_set_lock_blocking(eb);
>>     if (!node->eb) {
>> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
>> index f8220ec02036..173bc15a7cd1 100644
>> --- a/fs/btrfs/tree-log.c
>> +++ 

Re: [PATCH 1/1] btrfs: locking: Allow btrfs_tree_lock() to return error to avoid deadlock

2018-07-31 Thread Su Yue




On 07/30/2018 02:17 PM, Qu Wenruo wrote:

[BUG]
For certains crafted image, whose csum root leaf has missing backref, if
we try to trigger write with data csum, it could cause deadlock with the
following kernel WARN_ON():
--
WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400
CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8
Workqueue: btrfs-endio-write btrfs_endio_write_helper
RIP: 0010:btrfs_tree_lock+0x3e2/0x400
Call Trace:
  btrfs_alloc_tree_block+0x39f/0x770
  __btrfs_cow_block+0x285/0x9e0
  btrfs_cow_block+0x191/0x2e0
  btrfs_search_slot+0x492/0x1160
  btrfs_lookup_csum+0xec/0x280
  btrfs_csum_file_blocks+0x2be/0xa60
  add_pending_csums+0xaf/0xf0
  btrfs_finish_ordered_io+0x74b/0xc90
  finish_ordered_fn+0x15/0x20
  normal_work_helper+0xf6/0x500
  btrfs_endio_write_helper+0x12/0x20
  process_one_work+0x302/0x770
  worker_thread+0x81/0x6d0
  kthread+0x180/0x1d0
  ret_from_fork+0x35/0x40
---[ end trace 2e85051acb5f6dc1 ]---
--

[CAUSE]
That crafted image has missing backref for csum tree root leaf.
And when we try to allocate new tree block, since there is no
EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot
and use it.

However we have already locked csum tree root leaf by ourselves, thus we
will ourselves to release read/write lock, and causing deadlock.

[FIX]
This patch will allow btrfs_tree_lock() to return error number, so
most callers can exit gracefully.
(Some caller doesn't really expect btrfs_tree_lock() to return error,
and in that case, we use BUG_ON())

Since such problem can only happen in crafted image, we will still
trigger kernel warning, but with a little more meaningful warning
message.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405
Reported-by: Xu Wen 
Signed-off-by: Qu Wenruo 


OK.. I looked through every places where the callee is called.
Those errors are handled gracefully as my thought.
Except one nitpick, the %llu about pid_t in btrfs_tree_lock.
However, you can add the tag:

Reviewed-by: Su Yue 


---
  fs/btrfs/ctree.c   | 57 +++---
  fs/btrfs/extent-tree.c | 28 +++
  fs/btrfs/extent_io.c   |  8 --
  fs/btrfs/free-space-tree.c |  4 ++-
  fs/btrfs/locking.c | 12 ++--
  fs/btrfs/locking.h |  2 +-
  fs/btrfs/qgroup.c  |  4 ++-
  fs/btrfs/relocation.c  | 13 +++--
  fs/btrfs/tree-log.c| 14 --
  9 files changed, 114 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc326df472e..6e695f4cd931 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -161,10 +161,12 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root 
*root)
  struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
  {
struct extent_buffer *eb;
+   int ret;
  
  	while (1) {

eb = btrfs_root_node(root);
-   btrfs_tree_lock(eb);
+   ret = btrfs_tree_lock(eb);
+   BUG_ON(ret < 0);
if (eb == root->node)
break;
btrfs_tree_unlock(eb);
@@ -1584,6 +1586,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
for (i = start_slot; i <= end_slot; i++) {
struct btrfs_key first_key;
int close = 1;
+   int ret;
  
  		btrfs_node_key(parent, _key, i);

if (!progress_passed && comp_keys(_key, progress) < 0)
@@ -1637,7 +1640,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (search_start == 0)
search_start = last_block;
  
-		btrfs_tree_lock(cur);

+   ret = btrfs_tree_lock(cur);
+   if (ret < 0) {
+   free_extent_buffer(cur);
+   return ret;
+   }
btrfs_set_lock_blocking(cur);
err = __btrfs_cow_block(trans, root, cur, parent, i,
, search_start,
@@ -1853,7 +1860,11 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
goto enospc;
}
  
-		btrfs_tree_lock(child);

+   ret = btrfs_tree_lock(child);
+   if (ret < 0) {
+   free_extent_buffer(child);
+   goto enospc;
+   }
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, );
if (ret) {
@@ -1891,7 +1902,12 @@ static noinline int balance_level(struct 
btrfs_trans_handle *trans,
left = NULL;
  
  	if (left) {

-   btrfs_tree_lock(left);
+   ret = btrfs_tree_lock(left);
+   if (ret < 0) {
+   free_extent_buffer(left);
+   left = NULL;
+   goto enospc;
+   }
btrfs_set_lock_blocking(left);