[PATCH RESEND v12] btrfs-progs: add cli to forget one or all scanned devices

2018-11-30 Thread Anand Jain
This patch adds cli
  btrfs device forget [dev]
to remove the given device structure in the kernel if the device
is unmounted. If no argument is given it shall remove all stale
(device which are not mounted) from the kernel.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
v11->v12: none.
v1->v12: pls ref to the cover-letter.

 cmds-device.c | 72 ---
 ioctl.h   |  2 ++
 2 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/cmds-device.c b/cmds-device.c
index 2a05f70a76a9..280d6f555377 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv)
return _cmd_device_remove(argc, argv, cmd_device_delete_usage);
 }
 
+static int btrfs_forget_devices(char *path)
+{
+   struct btrfs_ioctl_vol_args args;
+   int ret;
+   int fd;
+
+   fd = open("/dev/btrfs-control", O_RDWR);
+   if (fd < 0)
+   return -errno;
+
+   memset(, 0, sizeof(args));
+   if (path)
+   strncpy_null(args.name, path);
+   ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, );
+   if (ret)
+   ret = -errno;
+   close(fd);
+   return ret;
+}
+
 static const char * const cmd_device_scan_usage[] = {
-   "btrfs device scan [(-d|--all-devices)| [...]]",
-   "Scan devices for a btrfs filesystem",
+   "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\
+   "[...]]",
+   "Scan or forget (deregister) devices for a btrfs filesystem",
" -d|--all-devices (deprecated)",
+   " -u|--forget [ ..]",
NULL
 };
 
@@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv)
int devstart;
int all = 0;
int ret = 0;
+   int forget = 0;
 
optind = 0;
while (1) {
int c;
static const struct option long_options[] = {
{ "all-devices", no_argument, NULL, 'd'},
+   { "forget", no_argument, NULL, 'u'},
{ NULL, 0, NULL, 0}
};
 
-   c = getopt_long(argc, argv, "d", long_options, NULL);
+   c = getopt_long(argc, argv, "du", long_options, NULL);
if (c < 0)
break;
switch (c) {
case 'd':
all = 1;
break;
+   case 'u':
+   forget = 1;
+   break;
default:
usage(cmd_device_scan_usage);
}
}
devstart = optind;
 
+   if (all && forget)
+   usage(cmd_device_scan_usage);
+
if (all && check_argc_max(argc - optind, 1))
usage(cmd_device_scan_usage);
 
if (all || argc - optind == 0) {
-   printf("Scanning for Btrfs filesystems\n");
-   ret = btrfs_scan_devices();
-   error_on(ret, "error %d while scanning", ret);
-   ret = btrfs_register_all_devices();
-   error_on(ret, "there are %d errors while registering devices", 
ret);
+   if (forget) {
+   ret = btrfs_forget_devices(NULL);
+   error_on(ret, "'%s', forget failed",
+strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems\n");
+   ret = btrfs_scan_devices();
+   error_on(ret, "error %d while scanning", ret);
+   ret = btrfs_register_all_devices();
+   error_on(ret,
+   "there are %d errors while registering devices",
+   ret);
+   }
goto out;
}
 
@@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv)
ret = 1;
goto out;
}
-   printf("Scanning for Btrfs filesystems in '%s'\n", path);
-   if (btrfs_register_one_device(path) != 0) {
-   ret = 1;
-   free(path);
-   goto out;
+   if (forget) {
+   ret = btrfs_forget_devices(path);
+   if (ret)
+   error("Can't forget '%s': %s",
+   path, strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems in '%s'\n",
+   

[PATCH RESEND] fstests: btrfs use forget if not reload

2018-11-30 Thread Anand Jain
[I will send this the xfstest ML after kernel and progs patch
 has been integrated].

btrfs reload was introduced to cleanup the device list inside the btrfs
kernel module.

The problem with the reload approach is that you can't run btrfs test
cases 124,125, 154 and 164 on the system with btrfs as root fs.

Now as we are introducing the btrfs forget feature as an btrfs device
scan option [1], so here are its pertaining changes in the fstests.

So these changes ensures we use the forget feature where available and
if its not then falls back to the reload approach.

[1]
btrfs-progs: add cli to forget one or all scanned devices
btrfs: introduce feature to forget a btrfs device

Signed-off-by: Anand Jain 
---
 common/btrfs| 20 
 tests/btrfs/124 |  6 +++---
 tests/btrfs/125 |  6 +++---
 tests/btrfs/154 |  6 +++---
 tests/btrfs/164 |  4 ++--
 5 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/common/btrfs b/common/btrfs
index 26dc0bb9600f..c7fbec11c8c1 100644
--- a/common/btrfs
+++ b/common/btrfs
@@ -374,3 +374,23 @@ _scratch_btrfs_sectorsize()
$BTRFS_UTIL_PROG inspect-internal dump-super $SCRATCH_DEV |\
grep sectorsize | awk '{print $2}'
 }
+
+_btrfs_supports_forget()
+{
+   $BTRFS_UTIL_PROG device scan --help | grep -wq forget && \
+   $BTRFS_UTIL_PROG device scan --forget > /dev/null 2>&1
+}
+
+_require_btrfs_forget_if_not_fs_loadable()
+{
+   _btrfs_supports_forget && return
+
+   _require_loadable_fs_module "btrfs"
+}
+
+_btrfs_forget_if_not_fs_reload()
+{
+   _btrfs_supports_forget && return
+
+   _reload_fs_module "btrfs"
+}
diff --git a/tests/btrfs/124 b/tests/btrfs/124
index ce3ad6aa3a58..edbeff1443f5 100755
--- a/tests/btrfs/124
+++ b/tests/btrfs/124
@@ -51,7 +51,7 @@ _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 2
 _test_unmount
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 2
 
@@ -86,7 +86,7 @@ echo "clean btrfs ko" >> $seqres.full
 _scratch_unmount
 
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 
 echo >> $seqres.full
 echo "-Write degraded mount fill upto $max_fs_sz bytes-" >> 
$seqres.full
@@ -125,7 +125,7 @@ echo
 echo "Mount degraded with the other dev"
 _scratch_unmount
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 _mount -o degraded $dev2 $SCRATCH_MNT >>$seqres.full 2>&1
 _run_btrfs_util_prog filesystem show
 checkpoint3=`md5sum $SCRATCH_MNT/tf2`
diff --git a/tests/btrfs/125 b/tests/btrfs/125
index e38de264b28e..9161a2ddeaad 100755
--- a/tests/btrfs/125
+++ b/tests/btrfs/125
@@ -50,7 +50,7 @@ _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 3
 _test_unmount
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 3
 
@@ -102,7 +102,7 @@ echo "unmount" >> $seqres.full
 _scratch_unmount
 echo "clean btrfs ko" >> $seqres.full
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 _mount -o degraded,device=$dev2 $dev1 $SCRATCH_MNT >>$seqres.full 2>&1
 dd if=/dev/zero of="$SCRATCH_MNT"/tf2 bs=$bs count=$count \
>>$seqres.full 2>&1
@@ -138,7 +138,7 @@ echo "Mount degraded but with other dev"
 
 _scratch_unmount
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 
 _mount -o degraded,device=${dev2} $dev3 $SCRATCH_MNT >>$seqres.full 2>&1
 
diff --git a/tests/btrfs/154 b/tests/btrfs/154
index 99ea232aba4c..01745d20e9fc 100755
--- a/tests/btrfs/154
+++ b/tests/btrfs/154
@@ -36,7 +36,7 @@ rm -f $seqres.full
 _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 2
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 2
 
@@ -90,7 +90,7 @@ degrade_mount_write()
 
echo "clean btrfs ko" >> $seqres.full
# un-scan the btrfs devices
-   _reload_fs_module "btrfs"
+   _btrfs_forget_if_not_fs_reload
_mount -o degraded $DEV1 $SCRATCH_MNT >>$seqres.full 2>&1
cnt=$(( $COUNT/10 ))
dd if=/dev/urandom of="$SCRATCH_MNT"/tf1 bs=$bs count=$cnt \
@@ -142,7 +142,7 @@ verify()
echo "unmount" >> $seqres.full
 
_scratch_unmount
-   _reload_fs_module "btrfs"
+   _btrfs_forget_if_not_fs_reload
_mount -o degraded $DEV2 $SCRATCH_MNT >>$seqres.full 2>&1
verify_checkpoint1=`md5sum $SCRATCH_MNT/tf1`
verify_checkpoint2=`md5sum $SCRATCH_MNT/tf2`
diff --git a/tests/btrfs/164 b/tests

[PATCH RESEND v12] Add cli and ioctl to forget scanned device(s)

2018-11-30 Thread Anand Jain
v12:
  Fixed coding style - leave space between " : ".

v11:
 btrfs-progs: Bring the code into the else part of if(forget).
  Use strerror to print the erorr instead of ret.

v10:
 Make btrfs-progs changes more readable.
 With an effort to keep the known bug [1] as it is..
  [1]
   The cli 'btrfs device scan --all /dev/sdb' which should have scanned
only one device, ends up scanning all the devices and I am not trying
to fix this bug in this patch because..
  . -d|--all is marked as deprecated, I hope -d option would go away
  . For now some script might be using this bug as a feature, and fixing
this bug might lead to mount failure.

v9:
 Make forget as a btrfs device scan option.
 Use forget in the fstests, now you can run fstests with btrfs as rootfs
  which helps to exercise the uuid_mutex lock.

v8:
 Change log update in the kernel patch.

v7:
 Use struct btrfs_ioctl_vol_args (instead of struct
  btrfs_ioctl_vol_args_v2) as its inline with other ioctl
  btrfs-control
 The CLI usage/features remains same. However internally the ioctl flag
  is not required to delete all the unmounted devices. Instead leave
  btrfs_ioctl_vol_args::name NULL.

v6:
 Use the changed fn name btrfs_free_stale_devices().

 Change in title:
 Old v5:
 Cover-letter:
  [PATCH v5] Add cli and ioctl to ignore a scanned device
 Kernel:
  [PATCH v5] btrfs: introduce feature to ignore a btrfs device
 Progs:
  [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli

v5:
  Adds feature to delete all stale devices
  Reuses btrfs_free_stale_devices() fn and so depends on the
patch-set [1] in the ML.
  Uses struct btrfs_ioctl_vol_args_v2 instead of
struct btrfs_ioctl_vol_args as arg
  Does the device path matching instead of btrfs_device matching
(we won't delete the mounted device as btrfs_free_stale_devices()
checks for it)
v4:
  No change. But as the ML thread may be confusing, so resend.
v3:
  No change. Send to correct ML.
v2:
  Accepts review from Nikolay, details are in the specific patch.
  Patch 1/2 is renamed from
[PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list 
delete
  to
[PATCH 1/2] btrfs: add function to device list delete

Adds cli and ioctl to forget a scanned device or forget all stale
devices in the kernel.

Anand Jain (1):
  btrfs: introduce feature to forget a btrfs device

 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

Anand Jain (1):
  btrfs-progs: add cli to forget one or all scanned devices

 cmds-device.c | 63 ++-
 ioctl.h   |  2 ++
 2 files changed, 56 insertions(+), 9 deletions(-)

[1]
Anand Jain (1):
  fstests: btrfs use forget if not reload

 common/btrfs| 20 
 tests/btrfs/124 |  6 +++---
 tests/btrfs/125 |  6 +++---
 tests/btrfs/154 |  6 +++---
 tests/btrfs/164 |  4 ++--
 5 files changed, 31 insertions(+), 11 deletions(-)

-- 
1.8.3.1


[PATCH RESEND] fstests: btrfs use forget if not reload

2018-11-30 Thread Anand Jain
[I will send this the xfstest ML after kernel and progs patch
 has been integrated].

btrfs reload was introduced to cleanup the device list inside the btrfs
kernel module.

The problem with the reload approach is that you can't run btrfs test
cases 124,125, 154 and 164 on the system with btrfs as root fs.

Now as we are introducing the btrfs forget feature as an btrfs device
scan option [1], so here are its pertaining changes in the fstests.

So these changes ensures we use the forget feature where available and
if its not then falls back to the reload approach.

[1]
btrfs-progs: add cli to forget one or all scanned devices
btrfs: introduce feature to forget a btrfs device

Signed-off-by: Anand Jain 
---
 common/btrfs| 20 
 tests/btrfs/124 |  6 +++---
 tests/btrfs/125 |  6 +++---
 tests/btrfs/154 |  6 +++---
 tests/btrfs/164 |  4 ++--
 5 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/common/btrfs b/common/btrfs
index 26dc0bb9600f..c7fbec11c8c1 100644
--- a/common/btrfs
+++ b/common/btrfs
@@ -374,3 +374,23 @@ _scratch_btrfs_sectorsize()
$BTRFS_UTIL_PROG inspect-internal dump-super $SCRATCH_DEV |\
grep sectorsize | awk '{print $2}'
 }
+
+_btrfs_supports_forget()
+{
+   $BTRFS_UTIL_PROG device scan --help | grep -wq forget && \
+   $BTRFS_UTIL_PROG device scan --forget > /dev/null 2>&1
+}
+
+_require_btrfs_forget_if_not_fs_loadable()
+{
+   _btrfs_supports_forget && return
+
+   _require_loadable_fs_module "btrfs"
+}
+
+_btrfs_forget_if_not_fs_reload()
+{
+   _btrfs_supports_forget && return
+
+   _reload_fs_module "btrfs"
+}
diff --git a/tests/btrfs/124 b/tests/btrfs/124
index ce3ad6aa3a58..edbeff1443f5 100755
--- a/tests/btrfs/124
+++ b/tests/btrfs/124
@@ -51,7 +51,7 @@ _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 2
 _test_unmount
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 2
 
@@ -86,7 +86,7 @@ echo "clean btrfs ko" >> $seqres.full
 _scratch_unmount
 
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 
 echo >> $seqres.full
 echo "-Write degraded mount fill upto $max_fs_sz bytes-" >> 
$seqres.full
@@ -125,7 +125,7 @@ echo
 echo "Mount degraded with the other dev"
 _scratch_unmount
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 _mount -o degraded $dev2 $SCRATCH_MNT >>$seqres.full 2>&1
 _run_btrfs_util_prog filesystem show
 checkpoint3=`md5sum $SCRATCH_MNT/tf2`
diff --git a/tests/btrfs/125 b/tests/btrfs/125
index e38de264b28e..9161a2ddeaad 100755
--- a/tests/btrfs/125
+++ b/tests/btrfs/125
@@ -50,7 +50,7 @@ _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 3
 _test_unmount
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 3
 
@@ -102,7 +102,7 @@ echo "unmount" >> $seqres.full
 _scratch_unmount
 echo "clean btrfs ko" >> $seqres.full
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 _mount -o degraded,device=$dev2 $dev1 $SCRATCH_MNT >>$seqres.full 2>&1
 dd if=/dev/zero of="$SCRATCH_MNT"/tf2 bs=$bs count=$count \
>>$seqres.full 2>&1
@@ -138,7 +138,7 @@ echo "Mount degraded but with other dev"
 
 _scratch_unmount
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 
 _mount -o degraded,device=${dev2} $dev3 $SCRATCH_MNT >>$seqres.full 2>&1
 
diff --git a/tests/btrfs/154 b/tests/btrfs/154
index 99ea232aba4c..01745d20e9fc 100755
--- a/tests/btrfs/154
+++ b/tests/btrfs/154
@@ -36,7 +36,7 @@ rm -f $seqres.full
 _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 2
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 2
 
@@ -90,7 +90,7 @@ degrade_mount_write()
 
echo "clean btrfs ko" >> $seqres.full
# un-scan the btrfs devices
-   _reload_fs_module "btrfs"
+   _btrfs_forget_if_not_fs_reload
_mount -o degraded $DEV1 $SCRATCH_MNT >>$seqres.full 2>&1
cnt=$(( $COUNT/10 ))
dd if=/dev/urandom of="$SCRATCH_MNT"/tf1 bs=$bs count=$cnt \
@@ -142,7 +142,7 @@ verify()
echo "unmount" >> $seqres.full
 
_scratch_unmount
-   _reload_fs_module "btrfs"
+   _btrfs_forget_if_not_fs_reload
_mount -o degraded $DEV2 $SCRATCH_MNT >>$seqres.full 2>&1
verify_checkpoint1=`md5sum $SCRATCH_MNT/tf1`
verify_checkpoint2=`md5sum $SCRATCH_MNT/tf2`
diff --git a/tests/btrfs/164 b/tests

[PATCH RESEND v12] btrfs: introduce feature to forget a btrfs device

2018-11-30 Thread Anand Jain
Support for a new command 'btrfs dev forget [dev]' is proposed here
to undo the effects of 'btrfs dev scan [dev]'. For this purpose
this patch proposes to use ioctl #5 as it was empty.
IOW(BTRFS_IOCTL_MAGIC, 5, ..)
This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from
the /dev/btrfs-control to forget one or all devices, (devices which are
not mounted) from the btrfs kernel.

The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be
set to specify the device path. And all unmounted devices can be removed
from the kernel if no device path is provided.

Again, the devices are removed only if the relevant fsid aren't mounted.

This new cli can provide..
 . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the
   device is not going to be mounted.
 . Ability to mount the device in degraded mode when one of the other
   device is corrupted like in split brain raid1.
 . Running test cases which requires btrfs.ko-reload if the rootfs
   is btrfs.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
v11->v12: fix coding style add spacing before after ":".
v1->v11: Pls ref to the cover-letter. (sorry about that).

 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d3c6bbc0aa3a..eba2966913ae 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2247,6 +2247,9 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(_mutex);
break;
+   case BTRFS_IOC_FORGET_DEV:
+   ret = btrfs_forget_devices(vol->name);
+   break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8e36cbb355df..48415a9edd46 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1397,6 +1397,15 @@ static int btrfs_read_disk_super(struct block_device 
*bdev, u64 bytenr,
return 0;
 }
 
+int btrfs_forget_devices(const char *path)
+{
+   mutex_lock(_mutex);
+   btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+   mutex_unlock(_mutex);
+
+   return 0;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount 
path
  * and we are not allowed to call set_blocksize during the scan. The superblock
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1d936ce282c3..a4a89d5050f5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -414,6 +414,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
   fmode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path,
   fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index e0763bc4158e..c195896d478f 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -837,6 +837,8 @@ enum btrfs_err_code {
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+  struct btrfs_ioctl_vol_args)
 /* trans start and trans end are dangerous, and only for
  * use by applications that know how to avoid the
  * resulting deadlocks
-- 
1.8.3.1



Re: [PATCH 2/2] btrfs: scrub: fix circular locking dependency warning

2018-11-29 Thread Anand Jain




On 11/30/2018 09:05 AM, Anand Jain wrote:



On 11/29/2018 10:31 PM, David Sterba wrote:

On Wed, Nov 28, 2018 at 04:47:27PM +0800, Anand Jain wrote:

2. scrub_workers_refcnt must eventually be converted to refcount_t type


   ok. Added in v2 patch set.


No such thing is in v2 and this would actually get rid of the need to
hold scrub_lock in scrub_workers_put. 


 Well we still need scrub_lock, because we spun scrub thread per device,
 and these threads can race as well. So the first scrub thread should
 make sure the scrub workers are allocated and keep the other threads
 waiting.
 As of now btrfs-progs serializes the scrub per device. But btrfs-progs
 is just one of the tool and there can be some other similar tool. We
 need hardening with in the kernel.

Thanks, Anand



Which in turn can be moved out of
the locked section in btrfs_scrub_dev and the warning is gone. Problem
solved.






  Right. When testing btrfs/011 it got hung and bisect pointed to the
  patch which was converting int to refcount_t.
  I had difficulties to get the logs out of the test machines, so I
  had to drop the patch.
  Will send refcount_t patch, patch 1/3 and possibly scrub concurrency
  patch (make scrub independent of the btrfs-progs locks) patches all
  together.

Thanks, Anand


[PATCH v3 0/2] btrfs: scrub: fix scrub_lock

2018-11-29 Thread Anand Jain
v3: Drops the patch [1]from this set.
[1]
btrfs: scrub: maintain the unlock order in scrub thread

Fixes the circular locking dependency warning as in patch 1/2,
and patch 2/2 adds lockdep_assert_held() to scrub_workers_get().

Anand Jain (2):
  btrfs: scrub: fix circular locking dependency warning
  btrfs: add lockdep check for scrub_lock in scrub_workers_get

 fs/btrfs/scrub.c   | 22 +++---
 3 files changed, 13 insertions(+), 13 deletions(-)

-- 
1.8.3.1



[PATCH v3 1/2] btrfs: scrub: fix circular locking dependency warning

2018-11-29 Thread Anand Jain
find_held_lock+0x2d/0x90
[   76.170450]  do_vfs_ioctl+0xa9/0x6d0
[   76.170690]  ? __fget+0x101/0x1f0
[   76.170910]  ? __fget+0x5/0x1f0
[   76.171157]  ksys_ioctl+0x60/0x90
[   76.171391]  __x64_sys_ioctl+0x16/0x20
[   76.171634]  do_syscall_64+0x50/0x180
[   76.171892]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.172186] RIP: 0033:0x7f61d422e567
[   76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00
48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48
[   76.172911] RSP: 002b:7f61d3936d68 EFLAGS: 0246 ORIG_RAX:
0010
[   76.173328] RAX: ffda RBX: 019026b0 RCX:
7f61d422e567
[   76.173649] RDX: 019026b0 RSI: c400941b RDI:
0003
[   76.173909] RBP:  R08: 7f61d3937700 R09:

[   76.174244] R10: 7f61d3937700 R11: 0246 R12:

[   76.174566] R13: 000000801000 R14:  R15:
7f61d3937700
[   76.175217] btrfs (4065) used greatest stack depth: 11424 bytes left

Signed-off-by: Anand Jain 
---
v2->v3: none
v1->v2: none
 fs/btrfs/scrub.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b5a19ba38ab7..9ade0659f017 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3757,10 +3757,13 @@ static noinline_for_stack int scrub_workers_get(struct 
btrfs_fs_info *fs_info,
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
+   lockdep_assert_held(_info->scrub_lock);
if (--fs_info->scrub_workers_refcnt == 0) {
+   mutex_unlock(_info->scrub_lock);
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
+   mutex_lock(_info->scrub_lock);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
 }
-- 
1.8.3.1



[PATCH v3 2/2] btrfs: scrub: add scrub_lock lockdep check in scrub_workers_get

2018-11-29 Thread Anand Jain
scrub_workers_refcnt is protected by scrub_lock, add lockdep_assert_held()
function in scrub_workers_get().

Signed-off-by: Anand Jain 
Suggested-by: Nikolay Borisov 
---
v3: none
v2: born
 fs/btrfs/scrub.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9ade0659f017..84ef1f0d371e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3726,6 +3726,8 @@ static noinline_for_stack int scrub_workers_get(struct 
btrfs_fs_info *fs_info,
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
 
+   lockdep_assert_held(_info->scrub_lock);
+
if (fs_info->scrub_workers_refcnt == 0) {
fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
flags, is_dev_replace ? 1 : max_active, 4);
-- 
1.8.3.1



Re: [PATCH 2/2] btrfs: scrub: fix circular locking dependency warning

2018-11-29 Thread Anand Jain




On 11/29/2018 10:31 PM, David Sterba wrote:

On Wed, Nov 28, 2018 at 04:47:27PM +0800, Anand Jain wrote:

2. scrub_workers_refcnt must eventually be converted to refcount_t type


   ok. Added in v2 patch set.


No such thing is in v2 and this would actually get rid of the need to
hold scrub_lock in scrub_workers_put. Which in turn can be moved out of
the locked section in btrfs_scrub_dev and the warning is gone. Problem
solved.



 Right. When testing btrfs/011 it got hung and bisect pointed to the
 patch which was converting int to refcount_t.
 I had difficulties to get the logs out of the test machines, so I
 had to drop the patch.
 Will send refcount_t patch, patch 1/3 and possibly scrub concurrency
 patch (make scrub independent of the btrfs-progs locks) patches all
 together.

Thanks, Anand


Re: [PATCH v2 1/3] btrfs: scrub: maintain the unlock order in scrub thread

2018-11-29 Thread Anand Jain




On 11/29/2018 06:36 PM, Filipe Manana wrote:

On Thu, Nov 29, 2018 at 9:27 AM Anand Jain  wrote:


The device_list_mutex and scrub_lock creates a nested locks in
btrfs_scrub_dev().

During lock the order is device_list_mutex and then scrub_lock, and during
unlock, the order is device_list_mutex and then scrub_lock.
Fix this to the lock order of scrub_lock and then device_list_mutex.

Signed-off-by: Anand Jain 
---
v1->v2: change the order of lock acquire first scrub_lock and then
 device_list_mutex, which matches with the order of unlock.
 The extra line which are now in the scrub_lock are ok to be
 under the scrub_lock.


I don't get it.
What problem does this patch fixes?
Doesn't seem any functional fix to me, nor performance gain (by the
contrary, the scrub_lock is now held for a longer time than needed),
nor makes anything more readable or "beautiful".


 btrfs_scrub_dev() isn't following the lock and unlock FILO order.
 Such as lock-a lock-b .. unlock-b unlock-a. So this patch is
 trying to fix it.

 This patch fixes the order but I think you mean to say as
 __scrub_blocked_if_needed() calls unlock scrub_lock. oops my
 bad this patch is wrong.

 Scrub concurrency needs overhaul including the dependency on the
 user land btrfs-progs, which I was trying to avoid. but looks like
 its better to fix that as well. As of now I am NACK this patch.

Thanks, Anand


  fs/btrfs/scrub.c | 13 +++--
  1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..a9d6fc3b01d4 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3813,28 +3813,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
 return -EINVAL;
 }

-
+   mutex_lock(_info->scrub_lock);
 mutex_lock(_info->fs_devices->device_list_mutex);
 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, >dev_state) &&
  !is_dev_replace)) {
 mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
 return -ENODEV;
 }

 if (!is_dev_replace && !readonly &&
 !test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state)) {
 mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
 btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
 rcu_str_deref(dev->name));
 return -EROFS;
 }

-   mutex_lock(_info->scrub_lock);
 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, >dev_state) ||
 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) {
-   mutex_unlock(_info->scrub_lock);
 mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
 return -EIO;
 }

@@ -3843,23 +3844,23 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
 (!is_dev_replace &&
  btrfs_dev_replace_is_ongoing(_info->dev_replace))) {
 btrfs_dev_replace_read_unlock(_info->dev_replace);
-   mutex_unlock(_info->scrub_lock);
 mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
 return -EINPROGRESS;
 }
 btrfs_dev_replace_read_unlock(_info->dev_replace);

 ret = scrub_workers_get(fs_info, is_dev_replace);
 if (ret) {
-   mutex_unlock(_info->scrub_lock);
 mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
 return ret;
 }

 sctx = scrub_setup_ctx(dev, is_dev_replace);
 if (IS_ERR(sctx)) {
-   mutex_unlock(_info->scrub_lock);
 mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
 scrub_workers_put(fs_info);
 return PTR_ERR(sctx);
 }
--
1.8.3.1






[PATCH v2 2/3] btrfs: scrub: fix circular locking dependency warning

2018-11-29 Thread Anand Jain
find_held_lock+0x2d/0x90
[   76.170450]  do_vfs_ioctl+0xa9/0x6d0
[   76.170690]  ? __fget+0x101/0x1f0
[   76.170910]  ? __fget+0x5/0x1f0
[   76.171157]  ksys_ioctl+0x60/0x90
[   76.171391]  __x64_sys_ioctl+0x16/0x20
[   76.171634]  do_syscall_64+0x50/0x180
[   76.171892]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.172186] RIP: 0033:0x7f61d422e567
[   76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00
48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48
[   76.172911] RSP: 002b:7f61d3936d68 EFLAGS: 0246 ORIG_RAX:
0010
[   76.173328] RAX: ffda RBX: 019026b0 RCX:
7f61d422e567
[   76.173649] RDX: 019026b0 RSI: c400941b RDI:
0003
[   76.173909] RBP:  R08: 7f61d3937700 R09:

[   76.174244] R10: 7f61d3937700 R11: 0246 R12:

[   76.174566] R13: 000000801000 R14:  R15:
7f61d3937700
[   76.175217] btrfs (4065) used greatest stack depth: 11424 bytes left

Signed-off-by: Anand Jain 
---
v1->v2: none
 fs/btrfs/scrub.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b5a19ba38ab7..9ade0659f017 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3757,10 +3757,13 @@ static noinline_for_stack int scrub_workers_get(struct 
btrfs_fs_info *fs_info,
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
+   lockdep_assert_held(_info->scrub_lock);
if (--fs_info->scrub_workers_refcnt == 0) {
+   mutex_unlock(_info->scrub_lock);
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
+   mutex_lock(_info->scrub_lock);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
 }
-- 
1.8.3.1



[PATCH v2 1/3] btrfs: scrub: maintain the unlock order in scrub thread

2018-11-29 Thread Anand Jain
The device_list_mutex and scrub_lock creates a nested locks in
btrfs_scrub_dev().

During lock the order is device_list_mutex and then scrub_lock, and during
unlock, the order is device_list_mutex and then scrub_lock.
Fix this to the lock order of scrub_lock and then device_list_mutex.

Signed-off-by: Anand Jain 
---
v1->v2: change the order of lock acquire first scrub_lock and then
device_list_mutex, which matches with the order of unlock.
The extra line which are now in the scrub_lock are ok to be
under the scrub_lock.
 fs/btrfs/scrub.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..a9d6fc3b01d4 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3813,28 +3813,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
return -EINVAL;
}
 
-
+   mutex_lock(_info->scrub_lock);
mutex_lock(_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, >dev_state) &&
 !is_dev_replace)) {
mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
return -ENODEV;
}
 
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state)) {
mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
return -EROFS;
}
 
-   mutex_lock(_info->scrub_lock);
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, >dev_state) ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) {
-   mutex_unlock(_info->scrub_lock);
mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
return -EIO;
}
 
@@ -3843,23 +3844,23 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
(!is_dev_replace &&
 btrfs_dev_replace_is_ongoing(_info->dev_replace))) {
btrfs_dev_replace_read_unlock(_info->dev_replace);
-   mutex_unlock(_info->scrub_lock);
mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
return -EINPROGRESS;
}
btrfs_dev_replace_read_unlock(_info->dev_replace);
 
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret) {
-   mutex_unlock(_info->scrub_lock);
mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
return ret;
}
 
sctx = scrub_setup_ctx(dev, is_dev_replace);
if (IS_ERR(sctx)) {
-   mutex_unlock(_info->scrub_lock);
mutex_unlock(_info->fs_devices->device_list_mutex);
+   mutex_unlock(_info->scrub_lock);
scrub_workers_put(fs_info);
return PTR_ERR(sctx);
}
-- 
1.8.3.1


[PATCH v2 0/3] btrfs: scrub: fix scrub_lock

2018-11-29 Thread Anand Jain
Idea was to fix the circular locking dependency warning as in patch 2/3,
and in the process also fixes the other identified cleanups patches 1/3,3/3
and they aren't dependent on 2ttch /3.

Anand Jain (3):
  btrfs: scrub: maintain the unlock order in scrub thread
  btrfs: scrub: fix circular locking dependency warning
  btrfs: add lockdep check for scrub_lock in scrub_workers_get

 fs/btrfs/scrub.c   | 22 +++---
 3 files changed, 13 insertions(+), 13 deletions(-)

-- 
1.8.3.1



[PATCH v2 3/3] btrfs: scrub: add scrub_lock lockdep check in scrub_workers_get

2018-11-29 Thread Anand Jain
scrub_workers_refcnt is protected by scrub_lock, add lockdep_assert_held()
function in scrub_workers_get().

Signed-off-by: Anand Jain 
Suggested-by: Nikolay Borisov 
---
v2: born
 fs/btrfs/scrub.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9ade0659f017..84ef1f0d371e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3726,6 +3726,8 @@ static noinline_for_stack int scrub_workers_get(struct 
btrfs_fs_info *fs_info,
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
 
+   lockdep_assert_held(_info->scrub_lock);
+
if (fs_info->scrub_workers_refcnt == 0) {
fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
flags, is_dev_replace ? 1 : max_active, 4);
-- 
1.8.3.1



Re: [PATCH 2/2] btrfs: scrub: fix circular locking dependency warning

2018-11-28 Thread Anand Jain




On 11/26/2018 05:59 PM, Nikolay Borisov wrote:



On 26.11.18 г. 11:07 ч., Anand Jain wrote:

Circular locking dependency check reports warning[1], that's because
the btrfs_scrub_dev() calls the stack #0 below with, the
fs_info::scrub_lock held. The test case leading to this warning..

   mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs
   btrfs scrub start -B /btrfs

In fact we have fs_info::scrub_workers_refcnt to tack if the init and
destroy of the scrub workers are needed. So once we have incremented
and decremented the fs_info::scrub_workers_refcnt value in the thread,
its ok to drop the scrub_lock, and then actually do the
btrfs_destroy_workqueue() part. So this patch drops the scrub_lock
before calling btrfs_destroy_workqueue().

[1]
[   76.146826] ==
[   76.147086] WARNING: possible circular locking dependency detected
[   76.147316] 4.20.0-rc3+ #41 Not tainted
[   76.147489] --
[   76.147722] btrfs/4065 is trying to acquire lock:
[   76.147984] 38593bc0 ((wq_completion)"%s-%s""btrfs",
name){+.+.}, at: flush_workqueue+0x70/0x4d0
[   76.148337]
but task is already holding lock:
[   76.148594] 62392ab7 (_info->scrub_lock){+.+.}, at:
btrfs_scrub_dev+0x316/0x5d0 [btrfs]
[   76.148909]
which lock already depends on the new lock.

[   76.149191]
the existing dependency chain (in reverse order) is:
[   76.149446]
-> #3 (_info->scrub_lock){+.+.}:
[   76.149707]btrfs_scrub_dev+0x11f/0x5d0 [btrfs]
[   76.149924]btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.150216]do_vfs_ioctl+0xa9/0x6d0
[   76.150468]ksys_ioctl+0x60/0x90
[   76.150716]__x64_sys_ioctl+0x16/0x20
[   76.150911]do_syscall_64+0x50/0x180
[   76.151182]entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.151469]
-> #2 (_devs->device_list_mutex){+.+.}:
[   76.151851]reada_start_machine_worker+0xca/0x3f0 [btrfs]
[   76.152195]normal_work_helper+0xf0/0x4c0 [btrfs]
[   76.152489]process_one_work+0x1f4/0x520
[   76.152751]worker_thread+0x46/0x3d0
[   76.153715]kthread+0xf8/0x130
[   76.153912]ret_from_fork+0x3a/0x50
[   76.154178]
-> #1 ((work_completion)(>normal_work)){+.+.}:
[   76.154575]worker_thread+0x46/0x3d0
[   76.154828]kthread+0xf8/0x130
[   76.155108]ret_from_fork+0x3a/0x50
[   76.155357]
-> #0 ((wq_completion)"%s-%s""btrfs", name){+.+.}:
[   76.155751]flush_workqueue+0x9a/0x4d0
[   76.155911]drain_workqueue+0xca/0x1a0
[   76.156182]destroy_workqueue+0x17/0x230
[   76.156455]btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
[   76.156756]scrub_workers_put+0x2e/0x60 [btrfs]
[   76.156931]btrfs_scrub_dev+0x329/0x5d0 [btrfs]
[   76.157219]btrfs_ioctl+0x1ac3/0x2d80 [btrfs]
[   76.157491]do_vfs_ioctl+0xa9/0x6d0
[   76.157742]ksys_ioctl+0x60/0x90
[   76.157910]__x64_sys_ioctl+0x16/0x20
[   76.158177]do_syscall_64+0x50/0x180
[   76.158429]entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.158716]
other info that might help us debug this:

[   76.158908] Chain exists of:
   (wq_completion)"%s-%s""btrfs", name --> _devs->device_list_mutex
--> _info->scrub_lock

[   76.159629]  Possible unsafe locking scenario:

[   76.160607]CPU0CPU1
[   76.160934]
[   76.161210]   lock(_info->scrub_lock);
[   76.161458]
lock(_devs->device_list_mutex);
[   76.161805]
lock(_info->scrub_lock);
[   76.161909]   lock((wq_completion)"%s-%s""btrfs", name);
[   76.162201]
  *** DEADLOCK ***

[   76.162627] 2 locks held by btrfs/4065:
[   76.162897]  #0: bef2775b (sb_writers#12){.+.+}, at:
mnt_want_write_file+0x24/0x50
[   76.163335]  #1: 62392ab7 (_info->scrub_lock){+.+.}, at:
btrfs_scrub_dev+0x316/0x5d0 [btrfs]
[   76.163796]
stack backtrace:
[   76.163911] CPU: 1 PID: 4065 Comm: btrfs Not tainted 4.20.0-rc3+ #41
[   76.164228] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS
VirtualBox 12/01/2006
[   76.164646] Call Trace:
[   76.164872]  dump_stack+0x5e/0x8b
[   76.165128]  print_circular_bug.isra.37+0x1f1/0x1fe
[   76.165398]  __lock_acquire+0x14aa/0x1620
[   76.165652]  lock_acquire+0xb0/0x190
[   76.165910]  ? flush_workqueue+0x70/0x4d0
[   76.166175]  flush_workqueue+0x9a/0x4d0
[   76.166420]  ? flush_workqueue+0x70/0x4d0
[   76.166671]  ? drain_workqueue+0x52/0x1a0
[   76.166911]  drain_workqueue+0xca/0x1a0
[   76.167167]  destroy_workqueue+0x17/0x230
[   76.167428]  btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs]
[   76.167720]  scrub_workers_put+0x2e/0x60 [btrfs]
[   76.168233]  btrfs_scrub_dev+0x329/0x5d0 [btrfs]
[   76.168504]  ? __sb_start_write+0x121/0x1b0
[   76.168759]  ? mnt_want_write_file+0x24/0x50
[   76.169654

Re: [PATCH 1/2] btrfs: scrub: maintain the unlock order in scrub thread

2018-11-27 Thread Anand Jain




On 11/26/2018 05:47 PM, Nikolay Borisov wrote:



On 26.11.18 г. 11:07 ч., Anand Jain wrote:

The fs_info::device_list_mutex and fs_info::scrub_lock creates a
nested locks in btrfs_scrub_dev(). During the lock acquire the
hierarchy is fs_info::device_list_mutex and then fs_info::scrub_lock,
so following the same reverse order during unlock, that is
fs_info::scrub_lock and then fs_info::device_list_mutex.

Signed-off-by: Anand Jain 
---
  fs/btrfs/scrub.c | 16 +++-
  1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..b1c2d1cdbd4b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3865,7 +3865,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
}
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
-   mutex_unlock(_info->fs_devices->device_list_mutex);
  
  	/*

 * checking @scrub_pause_req here, we can avoid
@@ -3875,15 +3874,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
atomic_inc(_info->scrubs_running);
mutex_unlock(_info->scrub_lock);
  
-	if (!is_dev_replace) {

-   /*
-* by holding device list mutex, we can
-* kick off writing super in log tree sync.
-*/
-   mutex_lock(_info->fs_devices->device_list_mutex);
+   /*
+* by holding device list mutex, we can kick off writing super in log
+* tree sync.
+*/
+   if (!is_dev_replace)
ret = scrub_supers(sctx, dev);
-   mutex_unlock(_info->fs_devices->device_list_mutex);
-   }
+
+   mutex_unlock(_info->fs_devices->device_list_mutex);


Have you considered whether this change will have any negative impact
due to the fact that __scrtub_blocked_if_needed can go to sleep for
arbitrary time with device_list_mutex held now ?


 You are right. I missed that point. The device_list_mutex must not be
 blocked. In fact here we don't need the nested device_list_mutex and
 scrub_lock at all. I have comeup with a new fix [1] below separating
 them.

[1]
-
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..db895ad23eda 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3830,42 +3830,37 @@ int btrfs_scrub_dev(struct btrfs_fs_info 
*fs_info, u64 devid, u64 start,

return -EROFS;
}

-   mutex_lock(_info->scrub_lock);
if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, >dev_state) ||
test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) {
-   mutex_unlock(_info->scrub_lock);
mutex_unlock(_info->fs_devices->device_list_mutex);
return -EIO;
}
+   mutex_unlock(_info->fs_devices->device_list_mutex);

btrfs_dev_replace_read_lock(_info->dev_replace);
if (dev->scrub_ctx ||
(!is_dev_replace &&
 btrfs_dev_replace_is_ongoing(_info->dev_replace))) {
btrfs_dev_replace_read_unlock(_info->dev_replace);
-   mutex_unlock(_info->scrub_lock);
-   mutex_unlock(_info->fs_devices->device_list_mutex);
return -EINPROGRESS;
}
btrfs_dev_replace_read_unlock(_info->dev_replace);

+   mutex_lock(_info->scrub_lock);
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret) {
mutex_unlock(_info->scrub_lock);
-   mutex_unlock(_info->fs_devices->device_list_mutex);
return ret;
}

sctx = scrub_setup_ctx(dev, is_dev_replace);
if (IS_ERR(sctx)) {
mutex_unlock(_info->scrub_lock);
-   mutex_unlock(_info->fs_devices->device_list_mutex);
scrub_workers_put(fs_info);
return PTR_ERR(sctx);
}
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
-   mutex_unlock(_info->fs_devices->device_list_mutex);

/*
 * checking @scrub_pause_req here, we can avoid


Will send v2.

Thanks, Anand





  
  	if (!ret)

ret = scrub_enumerate_chunks(sctx, dev, start, end);



[PATCH 2/2] btrfs: scrub: fix circular locking dependency warning

2018-11-26 Thread Anand Jain
find_held_lock+0x2d/0x90
[   76.170450]  do_vfs_ioctl+0xa9/0x6d0
[   76.170690]  ? __fget+0x101/0x1f0
[   76.170910]  ? __fget+0x5/0x1f0
[   76.171157]  ksys_ioctl+0x60/0x90
[   76.171391]  __x64_sys_ioctl+0x16/0x20
[   76.171634]  do_syscall_64+0x50/0x180
[   76.171892]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   76.172186] RIP: 0033:0x7f61d422e567
[   76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00
48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48
[   76.172911] RSP: 002b:7f61d3936d68 EFLAGS: 0246 ORIG_RAX:
0010
[   76.173328] RAX: ffda RBX: 019026b0 RCX:
7f61d422e567
[   76.173649] RDX: 019026b0 RSI: c400941b RDI:
0003
[   76.173909] RBP:  R08: 7f61d3937700 R09:

[   76.174244] R10: 7f61d3937700 R11: 0246 R12:

[   76.174566] R13: 000000801000 R14:  R15:
7f61d3937700
[   76.175217] btrfs (4065) used greatest stack depth: 11424 bytes left

Signed-off-by: Anand Jain 
---
 fs/btrfs/scrub.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b1c2d1cdbd4b..3fc31eeef2a7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3757,10 +3757,13 @@ static noinline_for_stack int scrub_workers_get(struct 
btrfs_fs_info *fs_info,
 
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
+   lockdep_assert_held(_info->scrub_lock);
if (--fs_info->scrub_workers_refcnt == 0) {
+   mutex_unlock(_info->scrub_lock);
btrfs_destroy_workqueue(fs_info->scrub_workers);
btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
+   mutex_lock(_info->scrub_lock);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
 }
-- 
1.8.3.1



[PATCH 1/2] btrfs: scrub: maintain the unlock order in scrub thread

2018-11-26 Thread Anand Jain
The fs_info::device_list_mutex and fs_info::scrub_lock creates a
nested locks in btrfs_scrub_dev(). During the lock acquire the
hierarchy is fs_info::device_list_mutex and then fs_info::scrub_lock,
so following the same reverse order during unlock, that is
fs_info::scrub_lock and then fs_info::device_list_mutex.

Signed-off-by: Anand Jain 
---
 fs/btrfs/scrub.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..b1c2d1cdbd4b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3865,7 +3865,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
}
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
-   mutex_unlock(_info->fs_devices->device_list_mutex);
 
/*
 * checking @scrub_pause_req here, we can avoid
@@ -3875,15 +3874,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 
devid, u64 start,
atomic_inc(_info->scrubs_running);
mutex_unlock(_info->scrub_lock);
 
-   if (!is_dev_replace) {
-   /*
-* by holding device list mutex, we can
-* kick off writing super in log tree sync.
-*/
-   mutex_lock(_info->fs_devices->device_list_mutex);
+   /*
+* by holding device list mutex, we can kick off writing super in log
+* tree sync.
+*/
+   if (!is_dev_replace)
ret = scrub_supers(sctx, dev);
-   mutex_unlock(_info->fs_devices->device_list_mutex);
-   }
+
+   mutex_unlock(_info->fs_devices->device_list_mutex);
 
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
-- 
1.8.3.1



[PATCH 0/2] btrfs: scrub: fix scrub_lock

2018-11-26 Thread Anand Jain
Anand Jain (2):
  btrfs: scrub: maintain the unlock order in scrub thread
  btrfs: scrub: fix circular locking dependency warning

 fs/btrfs/scrub.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

-- 
1.8.3.1



[PATCH v12] Add cli and ioctl to forget scanned device(s)

2018-11-21 Thread Anand Jain
v12:
  Fixed coding style - leave space between " : ".

v11:
 btrfs-progs: Bring the code into the else part of if(forget).
  Use strerror to print the erorr instead of ret.

v10:
 Make btrfs-progs changes more readable.
 With an effort to keep the known bug [1] as it is..
  [1]
   The cli 'btrfs device scan --all /dev/sdb' which should have scanned
only one device, ends up scanning all the devices and I am not trying
to fix this bug in this patch because..
  . -d|--all is marked as deprecated, I hope -d option would go away
  . For now some script might be using this bug as a feature, and fixing
this bug might lead to mount failure.

v9:
 Make forget as a btrfs device scan option.
 Use forget in the fstests, now you can run fstests with btrfs as rootfs
  which helps to exercise the uuid_mutex lock.

v8:
 Change log update in the kernel patch.

v7:
 Use struct btrfs_ioctl_vol_args (instead of struct
  btrfs_ioctl_vol_args_v2) as its inline with other ioctl
  btrfs-control
 The CLI usage/features remains same. However internally the ioctl flag
  is not required to delete all the unmounted devices. Instead leave
  btrfs_ioctl_vol_args::name NULL.

v6:
 Use the changed fn name btrfs_free_stale_devices().

 Change in title:
 Old v5:
 Cover-letter:
  [PATCH v5] Add cli and ioctl to ignore a scanned device
 Kernel:
  [PATCH v5] btrfs: introduce feature to ignore a btrfs device
 Progs:
  [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli

v5:
  Adds feature to delete all stale devices
  Reuses btrfs_free_stale_devices() fn and so depends on the
patch-set [1] in the ML.
  Uses struct btrfs_ioctl_vol_args_v2 instead of
struct btrfs_ioctl_vol_args as arg
  Does the device path matching instead of btrfs_device matching
(we won't delete the mounted device as btrfs_free_stale_devices()
checks for it)
v4:
  No change. But as the ML thread may be confusing, so resend.
v3:
  No change. Send to correct ML.
v2:
  Accepts review from Nikolay, details are in the specific patch.
  Patch 1/2 is renamed from
[PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list 
delete
  to
[PATCH 1/2] btrfs: add function to device list delete

Adds cli and ioctl to forget a scanned device or forget all stale
devices in the kernel.

Anand Jain (1):
  btrfs: introduce feature to forget a btrfs device

 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

Anand Jain (1):
  btrfs-progs: add cli to forget one or all scanned devices

 cmds-device.c | 63 ++-
 ioctl.h   |  2 ++
 2 files changed, 56 insertions(+), 9 deletions(-)

[1]
Anand Jain (1):
  fstests: btrfs use forget if not reload

 common/btrfs| 20 
 tests/btrfs/124 |  6 +++---
 tests/btrfs/125 |  6 +++---
 tests/btrfs/154 |  6 +++---
 tests/btrfs/164 |  4 ++--
 5 files changed, 31 insertions(+), 11 deletions(-)

-- 
1.8.3.1


[PATCH] fstests: btrfs use forget if not reload

2018-11-21 Thread Anand Jain
[I will send this the xfstest ML after kernel and progs patch
 has been integrated].

btrfs reload was introduced to cleanup the device list inside the btrfs
kernel module.

The problem with the reload approach is that you can't run btrfs test
cases 124,125, 154 and 164 on the system with btrfs as root fs.

Now as we are introducing the btrfs forget feature as an btrfs device
scan option [1], so here are its pertaining changes in the fstests.

So these changes ensures we use the forget feature where available and
if its not then falls back to the reload approach.

[1]
btrfs-progs: add cli to forget one or all scanned devices
btrfs: introduce feature to forget a btrfs device

Signed-off-by: Anand Jain 
---
 common/btrfs| 20 
 tests/btrfs/124 |  6 +++---
 tests/btrfs/125 |  6 +++---
 tests/btrfs/154 |  6 +++---
 tests/btrfs/164 |  4 ++--
 5 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/common/btrfs b/common/btrfs
index 26dc0bb9600f..c7fbec11c8c1 100644
--- a/common/btrfs
+++ b/common/btrfs
@@ -374,3 +374,23 @@ _scratch_btrfs_sectorsize()
$BTRFS_UTIL_PROG inspect-internal dump-super $SCRATCH_DEV |\
grep sectorsize | awk '{print $2}'
 }
+
+_btrfs_supports_forget()
+{
+   $BTRFS_UTIL_PROG device scan --help | grep -wq forget && \
+   $BTRFS_UTIL_PROG device scan --forget > /dev/null 2>&1
+}
+
+_require_btrfs_forget_if_not_fs_loadable()
+{
+   _btrfs_supports_forget && return
+
+   _require_loadable_fs_module "btrfs"
+}
+
+_btrfs_forget_if_not_fs_reload()
+{
+   _btrfs_supports_forget && return
+
+   _reload_fs_module "btrfs"
+}
diff --git a/tests/btrfs/124 b/tests/btrfs/124
index ce3ad6aa3a58..edbeff1443f5 100755
--- a/tests/btrfs/124
+++ b/tests/btrfs/124
@@ -51,7 +51,7 @@ _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 2
 _test_unmount
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 2
 
@@ -86,7 +86,7 @@ echo "clean btrfs ko" >> $seqres.full
 _scratch_unmount
 
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 
 echo >> $seqres.full
 echo "-Write degraded mount fill upto $max_fs_sz bytes-" >> 
$seqres.full
@@ -125,7 +125,7 @@ echo
 echo "Mount degraded with the other dev"
 _scratch_unmount
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 _mount -o degraded $dev2 $SCRATCH_MNT >>$seqres.full 2>&1
 _run_btrfs_util_prog filesystem show
 checkpoint3=`md5sum $SCRATCH_MNT/tf2`
diff --git a/tests/btrfs/125 b/tests/btrfs/125
index e38de264b28e..9161a2ddeaad 100755
--- a/tests/btrfs/125
+++ b/tests/btrfs/125
@@ -50,7 +50,7 @@ _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 3
 _test_unmount
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 3
 
@@ -102,7 +102,7 @@ echo "unmount" >> $seqres.full
 _scratch_unmount
 echo "clean btrfs ko" >> $seqres.full
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 _mount -o degraded,device=$dev2 $dev1 $SCRATCH_MNT >>$seqres.full 2>&1
 dd if=/dev/zero of="$SCRATCH_MNT"/tf2 bs=$bs count=$count \
>>$seqres.full 2>&1
@@ -138,7 +138,7 @@ echo "Mount degraded but with other dev"
 
 _scratch_unmount
 # un-scan the btrfs devices
-_reload_fs_module "btrfs"
+_btrfs_forget_if_not_fs_reload
 
 _mount -o degraded,device=${dev2} $dev3 $SCRATCH_MNT >>$seqres.full 2>&1
 
diff --git a/tests/btrfs/154 b/tests/btrfs/154
index 99ea232aba4c..01745d20e9fc 100755
--- a/tests/btrfs/154
+++ b/tests/btrfs/154
@@ -36,7 +36,7 @@ rm -f $seqres.full
 _supported_fs btrfs
 _supported_os Linux
 _require_scratch_dev_pool 2
-_require_loadable_fs_module "btrfs"
+_require_btrfs_forget_if_not_fs_loadable
 
 _scratch_dev_pool_get 2
 
@@ -90,7 +90,7 @@ degrade_mount_write()
 
echo "clean btrfs ko" >> $seqres.full
# un-scan the btrfs devices
-   _reload_fs_module "btrfs"
+   _btrfs_forget_if_not_fs_reload
_mount -o degraded $DEV1 $SCRATCH_MNT >>$seqres.full 2>&1
cnt=$(( $COUNT/10 ))
dd if=/dev/urandom of="$SCRATCH_MNT"/tf1 bs=$bs count=$cnt \
@@ -142,7 +142,7 @@ verify()
echo "unmount" >> $seqres.full
 
_scratch_unmount
-   _reload_fs_module "btrfs"
+   _btrfs_forget_if_not_fs_reload
_mount -o degraded $DEV2 $SCRATCH_MNT >>$seqres.full 2>&1
verify_checkpoint1=`md5sum $SCRATCH_MNT/tf1`
verify_checkpoint2=`md5sum $SCRATCH_MNT/tf2`
diff --git a/tests/btrfs/164 b/tests

[PATCH v12] btrfs: introduce feature to forget a btrfs device

2018-11-21 Thread Anand Jain
Support for a new command 'btrfs dev forget [dev]' is proposed here
to undo the effects of 'btrfs dev scan [dev]'. For this purpose
this patch proposes to use ioctl #5 as it was empty.
IOW(BTRFS_IOCTL_MAGIC, 5, ..)
This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from
the /dev/btrfs-control to forget one or all devices, (devices which are
not mounted) from the btrfs kernel.

The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be
set to specify the device path. And all unmounted devices can be removed
from the kernel if no device path is provided.

Again, the devices are removed only if the relevant fsid aren't mounted.

This new cli can provide..
 . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the
   device is not going to be mounted.
 . Ability to mount the device in degraded mode when one of the other
   device is corrupted like in split brain raid1.
 . Running test cases which requires btrfs.ko-reload if the rootfs
   is btrfs.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
v11->v12: fix coding style add spacing before after ":".
v1->v11: Pls ref to the cover-letter. (sorry about that).

 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d3c6bbc0aa3a..eba2966913ae 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2247,6 +2247,9 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(_mutex);
break;
+   case BTRFS_IOC_FORGET_DEV:
+   ret = btrfs_forget_devices(vol->name);
+   break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8e36cbb355df..48415a9edd46 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1397,6 +1397,15 @@ static int btrfs_read_disk_super(struct block_device 
*bdev, u64 bytenr,
return 0;
 }
 
+int btrfs_forget_devices(const char *path)
+{
+   mutex_lock(_mutex);
+   btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+   mutex_unlock(_mutex);
+
+   return 0;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount 
path
  * and we are not allowed to call set_blocksize during the scan. The superblock
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1d936ce282c3..a4a89d5050f5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -414,6 +414,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
   fmode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path,
   fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index e0763bc4158e..c195896d478f 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -837,6 +837,8 @@ enum btrfs_err_code {
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+  struct btrfs_ioctl_vol_args)
 /* trans start and trans end are dangerous, and only for
  * use by applications that know how to avoid the
  * resulting deadlocks
-- 
1.8.3.1



[PATCH v12] btrfs-progs: add cli to forget one or all scanned devices

2018-11-21 Thread Anand Jain
This patch adds cli
  btrfs device forget [dev]
to remove the given device structure in the kernel if the device
is unmounted. If no argument is given it shall remove all stale
(device which are not mounted) from the kernel.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
v11->v12: none.
v1->v12: pls ref to the cover-letter.

 cmds-device.c | 72 ---
 ioctl.h   |  2 ++
 2 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/cmds-device.c b/cmds-device.c
index 2a05f70a76a9..280d6f555377 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv)
return _cmd_device_remove(argc, argv, cmd_device_delete_usage);
 }
 
+static int btrfs_forget_devices(char *path)
+{
+   struct btrfs_ioctl_vol_args args;
+   int ret;
+   int fd;
+
+   fd = open("/dev/btrfs-control", O_RDWR);
+   if (fd < 0)
+   return -errno;
+
+   memset(, 0, sizeof(args));
+   if (path)
+   strncpy_null(args.name, path);
+   ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, );
+   if (ret)
+   ret = -errno;
+   close(fd);
+   return ret;
+}
+
 static const char * const cmd_device_scan_usage[] = {
-   "btrfs device scan [(-d|--all-devices)| [...]]",
-   "Scan devices for a btrfs filesystem",
+   "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\
+   "[...]]",
+   "Scan or forget (deregister) devices for a btrfs filesystem",
" -d|--all-devices (deprecated)",
+   " -u|--forget [ ..]",
NULL
 };
 
@@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv)
int devstart;
int all = 0;
int ret = 0;
+   int forget = 0;
 
optind = 0;
while (1) {
int c;
static const struct option long_options[] = {
{ "all-devices", no_argument, NULL, 'd'},
+   { "forget", no_argument, NULL, 'u'},
{ NULL, 0, NULL, 0}
};
 
-   c = getopt_long(argc, argv, "d", long_options, NULL);
+   c = getopt_long(argc, argv, "du", long_options, NULL);
if (c < 0)
break;
switch (c) {
case 'd':
all = 1;
break;
+   case 'u':
+   forget = 1;
+   break;
default:
usage(cmd_device_scan_usage);
}
}
devstart = optind;
 
+   if (all && forget)
+   usage(cmd_device_scan_usage);
+
if (all && check_argc_max(argc - optind, 1))
usage(cmd_device_scan_usage);
 
if (all || argc - optind == 0) {
-   printf("Scanning for Btrfs filesystems\n");
-   ret = btrfs_scan_devices();
-   error_on(ret, "error %d while scanning", ret);
-   ret = btrfs_register_all_devices();
-   error_on(ret, "there are %d errors while registering devices", 
ret);
+   if (forget) {
+   ret = btrfs_forget_devices(NULL);
+   error_on(ret, "'%s', forget failed",
+strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems\n");
+   ret = btrfs_scan_devices();
+   error_on(ret, "error %d while scanning", ret);
+   ret = btrfs_register_all_devices();
+   error_on(ret,
+   "there are %d errors while registering devices",
+   ret);
+   }
goto out;
}
 
@@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv)
ret = 1;
goto out;
}
-   printf("Scanning for Btrfs filesystems in '%s'\n", path);
-   if (btrfs_register_one_device(path) != 0) {
-   ret = 1;
-   free(path);
-   goto out;
+   if (forget) {
+   ret = btrfs_forget_devices(path);
+   if (ret)
+   error("Can't forget '%s': %s",
+   path, strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems in '%s'\n",
+   

Re: [RFC] BTRFS_DEV_REPLACE_ITEM_STATE_* doesn't match with on disk

2018-11-20 Thread Anand Jain



David, any comments on this please.

Thanks, Anand


On 11/13/2018 06:32 PM, Anand Jain wrote:


David, Gentle ping.

Thanks, Anand

On 11/12/2018 03:50 PM, Nikolay Borisov wrote:



On 12.11.18 г. 6:58 ч., Anand Jain wrote:


The dev_replace_state defines are miss matched between the
BTRFS_IOCTL_DEV_REPLACE_STATE_* and BTRFS_DEV_REPLACE_ITEM_STATE_* [1].

[1]
-
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED    2
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED    3
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED    4

btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED    2
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED    3
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED    4
-

The BTRFS_DEV_REPLACE_ITEM_STATE_* series is unused in both btrfs.ko and
btrfs-progs, the on-disk also follows BTRFS_IOCTL_DEV_REPLACE_STATE_*
(we set dev_replace->replace_state using the
BTRFS_IOCTL_DEV_REPLACE_STATE_* defines and write to the on-disk).

  359 btrfs_set_dev_replace_replace_state(eb, ptr,
  360 dev_replace->replace_state);

IMO it should be ok to delete the BTRFS_DEV_REPLACE_ITEM_STATE_*
altogether? But how about the userland progs other than btrfs-progs?
If not at least fix the miss match as in [2], any comments?


Unfortunately you are right. This seems to stem from sloppy job back in
the days of initial dev-replace support. BTRFS_DEV_REPLACE_ITEM_STATE_*
were added in e922e087a35c ("Btrfs: enhance btrfs structures for device
replace support"), yet they were never used. And the
IOCTL_DEV_REPLACE_STATE* were added in e93c89c1 ("Btrfs: add new
sources for device replace code").

It looks like the ITEM_STATE* definitions were stillborn so to speak and
personally I'm in favor of removing them. They shouldn't have been
merged in the first place and indeed the patch doesn't even have a
Reviewed-by tag. So it originated from the, I'd say, spartan days of
btrfs development...

David,  any code which is using BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED
is inherently broken, so how about we remove those definitions, then
when it's compilation is broken in the future the author will actually
have a chance to fix it, though it's highly unlikely anyone is relying
on those definitions.




[2]
--
diff --git a/include/uapi/linux/btrfs_tree.h
b/include/uapi/linux/btrfs_tree.h
index aff1356c2bb8..9ffa7534cadf 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -805,9 +805,9 @@ struct btrfs_dev_stats_item {
  #define 
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1

  #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
  #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED   1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
-#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  3
-#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  4
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 4

  struct btrfs_dev_replace_item {
 /*
--


Thanks, Anand





[PATCH RESEND v2 1/2] btrfs: quieten warn if replace is canceled at finish

2018-11-20 Thread Anand Jain
When we successfully cancel the replace its scrub returns -ECANCELED,
which then passed to btrfs_dev_replace_finishing(), it cleans up based
on the scrub returned status and propagates the same -ECANCELED back
the parent function. As of now only user can cancel the replace-scrub,
so its ok to quieten the warn here.

Signed-off-by: Anand Jain 
---
[fix: quieten spelling]
v1->v2: Use the condition within the WARN_ON()

 fs/btrfs/dev-replace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1dc8e86546db..9031a362921a 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info 
*fs_info,
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
-   } else {
+   } else if (ret != -ECANCELED) {
WARN_ON(ret);
}
 
@@ -954,7 +954,7 @@ static int btrfs_dev_replace_kthread(void *data)
  btrfs_device_get_total_bytes(dev_replace->srcdev),
  _replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
-   WARN_ON(ret);
+   WARN_ON(ret && ret != -ECANCELED);
 
clear_bit(BTRFS_FS_EXCL_OP, _info->flags);
return 0;
-- 
1.8.3.1



[PATCH RESEND v2 2/2] btrfs: user requsted replace cancel is not an error

2018-11-20 Thread Anand Jain
As of now only user requested replace cancel can cancel the replace-scrub
so no need to log error for it.

Signed-off-by: Anand Jain 
---
v1->v2: none.

 fs/btrfs/dev-replace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9031a362921a..40a0942b4659 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -622,7 +622,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info,
src_device,
tgt_device);
} else {
-   btrfs_err_in_rcu(fs_info,
+   if (scrub_ret != -ECANCELED)
+   btrfs_err_in_rcu(fs_info,
 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 btrfs_dev_name(src_device),
 src_device->devid,
-- 
1.8.3.1



[PATCH RESEND 0/9 v2] fix warn_on for replace cancel

2018-11-20 Thread Anand Jain
These two patches were sent as part of
   [PATCH 0/9 v2] fix replace-start and replace-cancel racing
before but as these aren't integrated so I am sending these again.

The patch [1] which is in misc-next, calls btrfs_dev_replace_finishing()
after replace is canceled, so the ret argument passed to the
btrfs_dev_replace_finishing() is -ECANCEL, which is not a real error.
So these patches quieten the warn and error log if its -ECANCEL.

These should be integrated otherwise we see the WARN_ON and btrfs_error()
after replace cancel.

[1] 
08bdfc3a0611 btrfs: fix use-after-free due to race between replace start and 
cancel

Anand Jain (2):
  btrfs: quieten warn if the replace is canceled at finish
  btrfs: user requsted replace cancel is not an error


[PATCH v6 1/3] btrfs: add helper function describe_block_group()

2018-11-20 Thread Anand Jain
Improve on describe_relocation() add a common helper function to describe
the block groups.

Signed-off-by: Anand Jain 
Reviewed-by: David Sterba 
---
v5->v6: Use () in the body for the args sent in defines
Use right indent to align '\'
Use goto to out_overflow instead of return
v4.1->v5: Initialize buf[128] to null.
v4->v4.1: Use strcpy(buf, "|NONE"); as in the original
v3->v4: Just pass full flag name in the define DESCRIBE_FLAG(flag,..),
 so that it can be used at couple of more places.
Rename describe_block_groups() to btrfs_describe_block_groups().
Drop useless return u32.
v3: Born.

 fs/btrfs/relocation.c | 30 +++---
 fs/btrfs/volumes.c| 46 ++
 fs/btrfs/volumes.h|  1 +
 3 files changed, 50 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 924116f654a1..0373b3cc1d36 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4184,37 +4184,13 @@ static struct reloc_control *alloc_reloc_control(void)
 static void describe_relocation(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
 {
-   char buf[128];  /* prefixed by a '|' that'll be dropped */
-   u64 flags = block_group->flags;
+   char buf[128] = {'\0'};
 
-   /* Shouldn't happen */
-   if (!flags) {
-   strcpy(buf, "|NONE");
-   } else {
-   char *bp = buf;
-
-#define DESCRIBE_FLAG(f, d) \
-   if (flags & BTRFS_BLOCK_GROUP_##f) { \
-   bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
-   flags &= ~BTRFS_BLOCK_GROUP_##f; \
-   }
-   DESCRIBE_FLAG(DATA, "data");
-   DESCRIBE_FLAG(SYSTEM,   "system");
-   DESCRIBE_FLAG(METADATA, "metadata");
-   DESCRIBE_FLAG(RAID0,"raid0");
-   DESCRIBE_FLAG(RAID1,"raid1");
-   DESCRIBE_FLAG(DUP,  "dup");
-   DESCRIBE_FLAG(RAID10,   "raid10");
-   DESCRIBE_FLAG(RAID5,"raid5");
-   DESCRIBE_FLAG(RAID6,"raid6");
-   if (flags)
-   snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
-#undef DESCRIBE_FLAG
-   }
+   btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
 
btrfs_info(fs_info,
   "relocating block group %llu flags %s",
-  block_group->key.objectid, buf + 1);
+  block_group->key.objectid, buf);
 }
 
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..3ab22e7e404e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -123,6 +123,52 @@ const char *get_raid_name(enum btrfs_raid_types type)
return btrfs_raid_array[type].raid_name;
 }
 
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+   int i;
+   int ret;
+   char *bp = buf;
+   u64 flags = bg_flags;
+   u32 size_bp = size_buf;
+
+   if (!flags) {
+   strcpy(bp, "NONE");
+   return;
+   }
+
+#define DESCRIBE_FLAG(f, d)\
+   do {\
+   if (flags & (f)) {  \
+   ret = snprintf(bp, size_bp, "%s|", (d));\
+   if (ret < 0 || ret >= size_bp)  \
+   goto out_overflow;  \
+   size_bp -= ret; \
+   bp += ret;  \
+   flags &= ~(f);  \
+   }   \
+   } while (0)
+
+   DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+   DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+   DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+   DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+   for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+   DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+ btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+   if (flags) {
+   ret = snprintf(bp, size_bp, "0x%llx|", flags);
+   size_bp -= ret;
+   }
+
+   if (size_bp < size_buf)
+   buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+
+out_overflow:
+   return;
+}
+
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
 static i

[PATCH v6 0/3] btrfs: balance: improve kernel logs

2018-11-20 Thread Anand Jain
v5->v6:
 Mostly the defines non functional changes.
 Use goto instead of return in middle of the define.
 Pls ref individual patches 1/3 and 2/3 for more info.

v4->v5:
 Mainly address David review comment [1].
 [1]
  https://patchwork.kernel.org/patch/10425987/
 pls ref to individual patch 2/3 for details.

v3->v4:
 Pls ref to individual patches.

Based on misc-next.

v2->v3:
  Inspried by describe_relocation(), improves it, makes it a helper
  function and use it to log the balance operations.

Kernel logs are very important for the forensic investigations of the
issues, these patchs make balance logs easy to review.

Anand Jain (3):
  btrfs: add helper function describe_block_group()
  btrfs: balance: add args info during start and resume
  btrfs: balance: add kernel log for end or paused

 fs/btrfs/relocation.c |  30 +--
 fs/btrfs/volumes.c| 216 +-
 fs/btrfs/volumes.h|   1 +
 3 files changed, 217 insertions(+), 30 deletions(-)

-- 
1.8.3.1


[PATCH v6 3/3] btrfs: balance: add kernel log for end or paused

2018-11-20 Thread Anand Jain
Add a kernel log when the balance ends, either for cancel or completed
or if it is paused.

Signed-off-by: Anand Jain 
---
v5->v6: Quite soul. nothing.
v4->v5: nothing.
v3->v4: nothing.
v2->v3: nothing.
v1->v2: Moved from 2/3 to 3/3
 fs/btrfs/volumes.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb18739f19ef..0684f14ff70d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4060,6 +4060,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ret = __btrfs_balance(fs_info);
 
mutex_lock(_info->balance_mutex);
+   if (ret == -ECANCELED && atomic_read(_info->balance_pause_req))
+   btrfs_info(fs_info, "balance: paused");
+   else if (ret == -ECANCELED && atomic_read(_info->balance_cancel_req))
+   btrfs_info(fs_info, "balance: canceled");
+   else
+   btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
clear_bit(BTRFS_FS_BALANCE_RUNNING, _info->flags);
 
if (bargs) {
-- 
1.8.3.1



[PATCH v6 2/3] btrfs: balance: add args info during start and resume

2018-11-20 Thread Anand Jain
Balance arg info is an important information to be reviewed for the
system audit. So this patch adds them to the kernel log.

Example:
->btrfs bal start -f -mprofiles=raid1,convert=single,soft 
-dlimit=10..20,usage=50 /btrfs

kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 
-msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single

Signed-off-by: Anand Jain 
---
v5->v6: Use () in the body for the args sent in defines
Use right indent to align '\'
Use goto to out_overflow instead of return (also fixes a mem leak in v5)
Rename CHECK_UPDATE_BP_XXX to CHECK_APPEND_XXX
v4.1->v5: Per David review comment the code..
   bp += snprintf(bp, buf - bp + size_buf, "soft,");
  is not safe if 'buf - bp + size_buf' becomes accidentally
  negative, so fix this by using following snprintf.

 #define CHECK_UPDATE_BP_1(a) \
   do { \
   ret = snprintf(bp, size_bp, a); \
   if (ret < 0 || ret >= size_bp) \
   goto out; \
   size_bp -= ret; \
   bp += ret; \
   } while (0)

And initialize the tmp_buf to null.

v4->v4.1: Rename last missed sz to size_buf.

v3->v4: Change log updated with new example.
Log format is changed to almost match with the cli.
snprintf drop %s for inline string.
Print range as =%u..%u instead of min=%umax=%u.
soft is under the args now.
force is printed as -f.

v2->v3: Change log updated.
Make use of describe_block_group() added in 1/4
Drop using strcat instead use snprintf.
Logs string format updated as shown in the example above.

v1->v2: Change log update.
Move adding the logs for balance complete and end to a new patch

 fs/btrfs/volumes.c | 163 -
 1 file changed, 160 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab22e7e404e..eb18739f19ef 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3757,6 +3757,164 @@ static inline int validate_convert_profile(struct 
btrfs_balance_args *bctl_arg,
 (bctl_arg->target & ~allowed)));
 }
 
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+u32 size_buf)
+{
+   int ret;
+   u32 size_bp = size_buf;
+   char *bp = buf;
+   u64 flags = bargs->flags;
+   char tmp_buf[128] = {'\0'};
+
+   if (!flags)
+   return;
+
+#define CHECK_APPEND_NOARG(a)  \
+   do {\
+   ret = snprintf(bp, size_bp, (a));   \
+   if (ret < 0 || ret >= size_bp)  \
+   goto out_overflow;  \
+   size_bp -= ret; \
+   bp += ret;  \
+   } while (0)
+
+#define CHECK_APPEND_1ARG(a, v1)   \
+   do {\
+   ret = snprintf(bp, size_bp, (a), (v1)); \
+   if (ret < 0 || ret >= size_bp)  \
+   goto out_overflow;  \
+   size_bp -= ret; \
+   bp += ret;  \
+   } while (0)
+
+#define CHECK_APPEND_2ARG(a, v1, v2)   \
+   do {\
+   ret = snprintf(bp, size_bp, (a), (v1), (v2));   \
+   if (ret < 0 || ret >= size_bp)  \
+   goto out_overflow;  \
+   size_bp -= ret; \
+   bp += ret;  \
+   } while (0)
+
+   if (flags & BTRFS_BALANCE_ARGS_SOFT)
+   CHECK_APPEND_NOARG("soft,");
+
+   if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+   btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+   sizeof(tmp_buf));
+   CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_USAGE)
+   CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
+
+   if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
+   CHECK_APPEND_2ARG("usage=%u..%u,",
+ bargs->usage_min, bargs->usage_max);
+
+   if (flags & BTRFS_BALANCE_ARGS_DEVID)
+   CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
+
+   if (flags & BTRFS_BALANCE_ARGS_DRANGE)
+   CHECK_APPEND_2ARG("drange=%llu..%llu,",
+ bargs->pstart, bargs->pend);
+
+   if (fl

Re: [PATCH v5 2/3] btrfs: balance: add args info during start and resume

2018-11-20 Thread Anand Jain




On 11/20/2018 01:07 AM, David Sterba wrote:

On Wed, Nov 14, 2018 at 09:17:11PM +0800, Anand Jain wrote:

Balance arg info is an important information to be reviewed for the
system audit. So this patch adds them to the kernel log.

Example:
->btrfs bal start -f -mprofiles=raid1,convert=single,soft 
-dlimit=10..20,usage=50 /btrfs

kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 
-msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single

Signed-off-by: Anand Jain 
---
v4.1->v5: Per David review comment the code..
bp += snprintf(bp, buf - bp + size_buf, "soft,");
  is not safe if 'buf - bp + size_buf' becomes accidentally
  negative, so fix this by using following snprintf.

  #define CHECK_UPDATE_BP_1(a) \
do { \
ret = snprintf(bp, size_bp, a); \
if (ret < 0 || ret >= size_bp) \
goto out; \
size_bp -= ret; \
bp += ret; \
} while (0)

And initialize the tmp_buf to null.

v4->v4.1: Rename last missed sz to size_buf.

v3->v4: Change log updated with new example.
Log format is changed to almost match with the cli.
snprintf drop %s for inline string.
Print range as =%u..%u instead of min=%umax=%u.
soft is under the args now.
force is printed as -f.

v2->v3: Change log updated.
 Make use of describe_block_group() added in 1/4
 Drop using strcat instead use snprintf.
 Logs string format updated as shown in the example above.

v1->v2: Change log update.
 Move adding the logs for balance complete and end to a new patch

  fs/btrfs/volumes.c | 172 -
  1 file changed, 169 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 12b3d0625c6a..8397f72bdac7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3754,6 +3754,173 @@ static inline int validate_convert_profile(struct 
btrfs_balance_args *bctl_arg,
 (bctl_arg->target & ~allowed)));
  }
  
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,

+u32 size_buf)
+{
+   int ret;
+   u32 size_bp = size_buf;
+   char *bp = buf;
+   u64 flags = bargs->flags;
+   char tmp_buf[128] = {'\0'};
+
+   if (!flags)
+   return;
+
+#define CHECK_UPDATE_BP_1(a) \


CHECK_UPDATE_BP_NOARG and the others CHECK_UPDATE_BP_1ARG, though I'm
thinking about picking another name for the macro like CHECK_APPEND_2ARG
etc.


 I liked CHECK_APPEND_XXX I have renamed
  CHECK_UPDATE_BP_1 to CHECK_APPEND_NOARG
  CHECK_UPDATE_BP_2 to CHECK_APPEND_1ARG
  CHECK_UPDATE_BP_3 to CHECK_APPEND_2ARG

Hope its better now.


+   do { \
+   ret = snprintf(bp, size_bp, a); \
+   if (ret < 0 || ret >= size_bp) \
+   goto out; \


out_overflow and align \ to the right, please.


 Yep done.


+   size_bp -= ret; \
+   bp += ret; \
+   } while (0)
+
+#define CHECK_UPDATE_BP_2(a, v1) \
+   do { \
+   ret = snprintf(bp, size_bp, a, v1); \
+   if (ret < 0 || ret >= size_bp) \
+   goto out; \
+   size_bp -= ret; \
+   bp += ret; \
+   } while (0)
+
+#define CHECK_UPDATE_BP_3(a, v1, v2) \
+   do { \
+   ret = snprintf(bp, size_bp, a, v1, v2); \
+   if (ret < 0 || ret >= size_bp) \
+   goto out; \
+   size_bp -= ret; \
+   bp += ret; \
+   } while (0)
+
+   if (flags & BTRFS_BALANCE_ARGS_SOFT) {
+   CHECK_UPDATE_BP_1("soft,");
+   }


no { } around single statement


 fixed.


+
+   if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+   btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+   sizeof(tmp_buf));
+   CHECK_UPDATE_BP_2("profiles=%s,", tmp_buf);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_USAGE) {
+   CHECK_UPDATE_BP_2("usage=%llu,", bargs->usage);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) {
+   CHECK_UPDATE_BP_3("usage=%u..%u,",
+ bargs->usage_min, bargs->usage_max);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_DEVID) {
+   CHECK_UPDATE_BP_2("devid=%llu,", bargs->devid);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_DRANGE) {
+   CHECK_UPDATE_BP_3("drange=%llu..%llu,",
+ bargs->pstart, bargs->pend);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_VRANGE) {
+   CHECK_UPDATE_BP_3("vrange%llu..%llu,",
+  

Re: [PATCH v5 1/3] btrfs: add helper function describe_block_group()

2018-11-20 Thread Anand Jain



Thanks for the review.. more below.

On 11/20/2018 01:02 AM, David Sterba wrote:

On Wed, Nov 14, 2018 at 09:17:10PM +0800, Anand Jain wrote:

Improve on describe_relocation() add a common helper function to describe
the block groups.

Signed-off-by: Anand Jain 
Reviewed-by: David Sterba 
---
v4.1->v5: Initialize buf[128] to null.
v4->v4.1: Use strcpy(buf, "|NONE"); as in the original
v3->v4: Just pass full flag name in the define DESCRIBE_FLAG(flag,..),
 so that it can be used at couple of more places.
Rename describe_block_groups() to btrfs_describe_block_groups().
Drop useless return u32.
v3: Born.

  fs/btrfs/relocation.c | 30 +++---
  fs/btrfs/volumes.c| 43 +++
  fs/btrfs/volumes.h|  1 +
  3 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 924116f654a1..0373b3cc1d36 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4184,37 +4184,13 @@ static struct reloc_control *alloc_reloc_control(void)
  static void describe_relocation(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
  {
-   char buf[128];  /* prefixed by a '|' that'll be dropped */
-   u64 flags = block_group->flags;
+   char buf[128] = {'\0'};
  
-	/* Shouldn't happen */

-   if (!flags) {
-   strcpy(buf, "|NONE");
-   } else {
-   char *bp = buf;
-
-#define DESCRIBE_FLAG(f, d) \
-   if (flags & BTRFS_BLOCK_GROUP_##f) { \
-   bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
-   flags &= ~BTRFS_BLOCK_GROUP_##f; \
-   }
-   DESCRIBE_FLAG(DATA, "data");
-   DESCRIBE_FLAG(SYSTEM,   "system");
-   DESCRIBE_FLAG(METADATA, "metadata");
-   DESCRIBE_FLAG(RAID0,"raid0");
-   DESCRIBE_FLAG(RAID1,"raid1");
-   DESCRIBE_FLAG(DUP,  "dup");
-   DESCRIBE_FLAG(RAID10,   "raid10");
-   DESCRIBE_FLAG(RAID5,"raid5");
-   DESCRIBE_FLAG(RAID6,"raid6");
-   if (flags)
-   snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
-#undef DESCRIBE_FLAG
-   }
+   btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
  
  	btrfs_info(fs_info,

   "relocating block group %llu flags %s",
-  block_group->key.objectid, buf + 1);
+  block_group->key.objectid, buf);
  }
  
  /*

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..12b3d0625c6a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -123,6 +123,49 @@ const char *get_raid_name(enum btrfs_raid_types type)
return btrfs_raid_array[type].raid_name;
  }
  
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)

+{
+   int i;
+   int ret;
+   char *bp = buf;
+   u64 flags = bg_flags;
+   u32 size_bp = size_buf;
+
+   if (!flags) {
+   strcpy(bp, "NONE");
+   return;
+   }
+
+#define DESCRIBE_FLAG(f, d) \


Macro arguments should be in ( ) in the body


 Fixed in v6.


+   do { \
+   if (flags & f) { \
+   ret = snprintf(bp, size_bp, "%s|", d); \
+   if (ret < 0 || ret >= size_bp) \
+   return; \


Ok, this seems to be safe. snprintf will not write more than size_bp and
this will be caught if it overflows.

The return is obscured by the macro, I'd rather make it more visible
that there is a potential change in the code flow. The proposed change:

goto out_overflow;
...

and define out_overflow at the end of function. Same for the other
helper macros.


 makes sense. Thanks for explaining. Fixed in v6.


Also please align the backslashes of the macro a few tabs to the right.


 Yep done in v6.

Thanks, Anand



+   size_bp -= ret; \
+   bp += ret; \
+   flags &= ~f; \
+   } \
+   } while (0)


Re: bad tree block start, want 705757184 have 82362368

2018-11-18 Thread Anand Jain




On 11/18/2018 03:56 PM, Stephan Olbrich wrote:

Am Sonntag, 18. November 2018, 01:30:14 CET schrieb Qu Wenruo:

Late on I got the same errors for my /home partition (on the same drive)
as well. I have snapshots of all partitions on another drive made by
btrbk. To get a working system, I made new (rw) snapshots of the most
recent backup and setup grub and fstab, so my system would boot from the
other drive. Unfortunately now I got the "bad tree block start" error
again at least once in dmesg but I didn't save it and it's not in syslog
:-( What I remember is, that it was followed by other btrfs error
messages saying something about correcting something. And the filesystem
was still read/write this time.
At the moment I can't reproduce it.


Today it happend again (sdb is my backup drive, which is my main drive at the 
moment):
[  286.325857] BTRFS error (device sdb1): bad tree block start, want 
787719208960 have 11268016545161247416
[  286.363245] BTRFS info (device sdb1): read error corrected: ino 0 off 
787719208960 (dev /dev/sdb1 sector 1243815072)
[  286.364087] BTRFS info (device sdb1): read error corrected: ino 0 off 
787719213056 (dev /dev/sdb1 sector 1243815080)
[  286.425946] BTRFS info (device sdb1): read error corrected: ino 0 off 
787719217152 (dev /dev/sdb1 sector 1243815088)
[  286.427530] BTRFS info (device sdb1): read error corrected: ino 0 off 
787719221248 (dev /dev/sdb1 sector 1243815096)


 Was there any hardware issues? How about the following data from the 
system..


  btrfs fi df 
  btrfs dev stat 

Thanks, Anand




Is there any way to find out, which files are affected by the errors
above?


No files are affected, but an essential tree, extent tree, is corrupted.

Normally this may prevent RW mount, and even it mounts it can still
cause problem when doing any write.
It could even prevent RO mount if the corrupted leaf contains block
group item.

But your data should be OK if there is no other corruption, and in that
case btrfs-restore should work well.


I don't really trust the data on the drive I'm using at the
moment, as it has shown errors as well, but I have a less current backup
on yet another drive but at it is a few weeks old, I don't want to use
it
to setup the system on the SSD again, but just copy the relevant files
if
possible. Or is it possible to repair the original file system?


At least we need "btrfs check" output.


I updated btrfs-progs and run btrfs check for / and /home
No errors are found on / (sda2), but there are errors on /home ??

$ btrfs --version
btrfs-progs v4.19

$ btrfs check /dev/sda2
Opening filesystem to check...
Checking filesystem on /dev/sda2
UUID: 80368989-ffa8-463c-98fb-fe2e28ca7bf3
[1/7] checking root items
[2/7] checking extents
[3/7] checking free space cache
[4/7] checking fs roots
[5/7] checking only csums items (without verifying data)
[6/7] checking root refs
[7/7] checking quota groups skipped (not enabled on this FS)
found 64816218112 bytes used, no error found
total csum bytes: 59518732
total tree bytes: 2180268032
total fs tree bytes: 1965965312
total extent tree bytes: 123289600
btree space waste bytes: 478665777
file data blocks allocated: 151083261952

  referenced 76879990784


This fs is completely fine, including your data.


$ btrfs check /dev/sda4
Opening filesystem to check...
Checking filesystem on /dev/sda4
UUID: 81c38df8-b7f9-412c-8c88-cfde8db68eb1
[1/7] checking root items
[2/7] checking extents
[3/7] checking free space cache
[4/7] checking fs roots
root 257 inode 7970563 errors 100, file extent discount

Found file extent holes:
 start: 0, len: 20480

root 257 inode 7970564 errors 100, file extent discount

Found file extent holes:
 start: 0, len: 77824

ERROR: errors found in fs roots


These are just minor errors, won't even causing any data mismatch.

So all your fses should be mostly OK.

Would you please try to use v4.19-rc* to see if it changes anything?

v4.19-rc1 is the only rc I found, but that is older than v4.19, right?
Anyway, here is the output:

$ btrfs --version
btrfs-progs v4.19-rc1

$ btrfs check /dev/sda2
Opening filesystem to check...
Checking filesystem on /dev/sda2
UUID: 80368989-ffa8-463c-98fb-fe2e28ca7bf3
[1/7] checking root items
[2/7] checking extents
[3/7] checking free space cache
[4/7] checking fs roots
[5/7] checking only csums items (without verifying data)
[6/7] checking root refs
[7/7] checking quota groups skipped (not enabled on this FS)
found 64816218112 bytes used, no error found
total csum bytes: 59518732
total tree bytes: 2180268032
total fs tree bytes: 1965965312
total extent tree bytes: 123289600
btree space waste bytes: 478665777
file data blocks allocated: 151083261952
  referenced 76879990784

$ btrfs check /dev/sda4
Opening filesystem to check...
Checking filesystem on /dev/sda4
UUID: 81c38df8-b7f9-412c-8c88-cfde8db68eb1
[1/7] checking root items
[2/7] checking extents
[3/7] checking free space cache
[4/7] checking fs roots
root 257 inode 7970563 errors 

Re: [PATCH 2/5] btrfs: Refactor btrfs_can_relocate

2018-11-16 Thread Anand Jain




On 10/26/2018 07:43 PM, Nikolay Borisov wrote:

btrfs_can_relocate returns 0 when it concludes the given chunk can be
relocated and -1 otherwise. Since this function is used as a predicated
and it return a binary value it makes no sense to have it's return
value as an int so change it to bool. Furthermore, remove a stale
leftover comment from e6ec716f0ddb ("Btrfs: make raid attr array more
readable").

Finally make the logic in the list_for_each_entry loop a bit more obvious. Up
until now the fact that we are returning success (0) was a bit masked by the
fact that the 0 is re-used from the return value of find_free_dev_extent.
Instead, now just increment dev_nr if we find a suitable extent and explicitly
set can_reloc to true if enough devices with unallocated space are present.
No functional changes.

Signed-off-by: Nikolay Borisov 
---
  fs/btrfs/ctree.h   |  2 +-
  fs/btrfs/extent-tree.c | 39 ++-
  fs/btrfs/volumes.c |  3 +--
  3 files changed, 16 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68ca41dbbef3..06edc4f9ceb2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2680,7 +2680,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle 
*trans,
  int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
  int btrfs_free_block_groups(struct btrfs_fs_info *info);
  int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
+bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
  int btrfs_make_block_group(struct btrfs_trans_handle *trans,
   u64 bytes_used, u64 type, u64 chunk_offset,
   u64 size);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a1febf155747..816bca482358 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9423,10 +9423,10 @@ void btrfs_dec_block_group_ro(struct 
btrfs_block_group_cache *cache)
  /*
   * checks to see if its even possible to relocate this block group.
   *
- * @return - -1 if it's not a good idea to relocate this block group, 0 if its
- * ok to go ahead and try.
+ * @return - false if not enough space can be found for relocation, true
+ * otherwise
   */
-int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
+bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
  {
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_block_group_cache *block_group;
@@ -9441,7 +9441,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 
bytenr)
int debug;
int index;
int full = 0;
-   int ret = 0;
+   bool can_reloc = true;
  
  	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
  
@@ -9453,7 +9453,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)

btrfs_warn(fs_info,
   "can't find block group for bytenr %llu",
   bytenr);
-   return -1;
+   return false;
}
  
  	min_free = btrfs_block_group_used(_group->item);

@@ -9489,16 +9489,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
u64 bytenr)
 * group is going to be restriped, run checks against the target
 * profile instead of the current one.
 */
-   ret = -1;
+   can_reloc = false;
  
-	/*

-* index:
-*  0: raid10
-*  1: raid1
-*  2: dup
-*  3: raid0
-*  4: single
-*/
target = get_restripe_target(fs_info, block_group->flags);
if (target) {
index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
@@ -9534,10 +9526,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
u64 bytenr)
  
  	/* We need to do this so that we can look at pending chunks */

trans = btrfs_join_transaction(root);
-   if (IS_ERR(trans)) {
-   ret = PTR_ERR(trans);
+   if (IS_ERR(trans))
goto out;
-   }
  
  	mutex_lock(_info->chunk_mutex);

list_for_each_entry(device, _devices->alloc_list, dev_alloc_list) {
@@ -9549,18 +9539,17 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, 
u64 bytenr)
 */
if (device->total_bytes > device->bytes_used + min_free &&
!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) 
{
-   ret = find_free_dev_extent(trans, device, min_free,
-  _offset, NULL);
-   if (!ret)
+   if (!find_free_dev_extent(trans, device, min_free,
+  _offset, NULL))


 This can return -ENOMEM;


dev_nr++;
  
-			if (dev_nr >= dev_min)

+   if (dev_nr >= dev_min) {
+   

Re: [PATCH 5/5] btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range

2018-11-16 Thread Anand Jain




On 10/26/2018 07:43 PM, Nikolay Borisov wrote:

lock_delalloc_pages should only return 2 values - 0 in case of success
and -EAGAIN if the range of pages to be locked should be shrunk due to
some of gone. Manual inspections confirms that this is
indeed the case since __process_pages_contig is where lock_delalloc_pages
gets its return value. The latter always returns 0  or -EAGAIN so the
invariant holds. No functional changes.

Signed-off-by: Nikolay Borisov 


Reviewed-by: Anand Jain 

Thanks, Anand


---
  fs/btrfs/extent_io.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1a9a521aefe5..94bc53472031 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1606,6 +1606,7 @@ static noinline_for_stack u64 
find_lock_delalloc_range(struct inode *inode,
/* step two, lock all the pages after the page that has start */
ret = lock_delalloc_pages(inode, locked_page,
  delalloc_start, delalloc_end);
+   ASSERT(!ret || ret == -EAGAIN);
if (ret == -EAGAIN) {
/* some of the pages are gone, lets avoid looping by
 * shortening the size of the delalloc range we're searching
@@ -1621,7 +1622,6 @@ static noinline_for_stack u64 
find_lock_delalloc_range(struct inode *inode,
goto out_failed;
}
}
-   BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
  
  	/* step three, lock the state bits for the whole range */

lock_extent_bits(tree, delalloc_start, delalloc_end, _state);



Re: [PATCH 4/5] btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument

2018-11-16 Thread Anand Jain




On 10/26/2018 07:43 PM, Nikolay Borisov wrote:

All callers of this function pass BTRFS_MAX_EXTENT_SIZE (128M) so let's
reduce the argument count and make that a local variable. No functional
changes.

Signed-off-by: Nikolay Borisov 


  Reviewed-by: Anand Jain 

Thanks, Anand


---
  fs/btrfs/extent_io.c | 11 +--
  fs/btrfs/extent_io.h |  2 +-
  fs/btrfs/tests/extent-io-tests.c | 10 +-
  3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6877a74c7469..1a9a521aefe5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1566,8 +1566,9 @@ static noinline int lock_delalloc_pages(struct inode 
*inode,
  static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
-   u64 *end, u64 max_bytes)
+   u64 *end)
  {
+   u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 delalloc_start;
u64 delalloc_end;
u64 found;
@@ -1647,10 +1648,9 @@ static noinline_for_stack u64 
find_lock_delalloc_range(struct inode *inode,
  u64 btrfs_find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
-   u64 *end, u64 max_bytes)
+   u64 *end)
  {
-   return find_lock_delalloc_range(inode, tree, locked_page, start, end,
-   max_bytes);
+   return find_lock_delalloc_range(inode, tree, locked_page, start, end);
  }
  #endif
  
@@ -3233,8 +3233,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,

nr_delalloc = find_lock_delalloc_range(inode, tree,
   page,
   _start,
-  _end,
-  BTRFS_MAX_EXTENT_SIZE);
+  _end);
if (nr_delalloc == 0) {
delalloc_start = delalloc_end + 1;
continue;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 369daa5d4f73..7ec4f93caf78 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
  u64 btrfs_find_lock_delalloc_range(struct inode *inode,
  struct extent_io_tree *tree,
  struct page *locked_page, u64 *start,
- u64 *end, u64 max_bytes);
+ u64 *end);
  #endif
  struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
   u64 start);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e0f4a01be14..8ea8c2aa6696 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -107,7 +107,7 @@ static int test_find_delalloc(u32 sectorsize)
start = 0;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (!found) {
test_err("should have found at least one delalloc");
goto out_bits;
@@ -138,7 +138,7 @@ static int test_find_delalloc(u32 sectorsize)
start = test_start;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (!found) {
test_err("couldn't find delalloc in our range");
goto out_bits;
@@ -172,7 +172,7 @@ static int test_find_delalloc(u32 sectorsize)
start = test_start;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (found) {
test_err("found range when we shouldn't have");
goto out_bits;
@@ -193,7 +193,7 @@ static int test_find_delalloc(u32 sectorsize)
start = test_start;
end = 0;
found = btrfs_find_lock_delalloc_range(inode, , locked_page, ,
-, max_bytes);
+);
if (!found) {
test_err("didn't find our range");
goto out_bits;
@@ -234,7 +234,7 @@ static int test_find_delalloc(u32 sectorsize)
 * tests expected behavior.
 */
foun

Re: [PATCH 3/5] btrfs: Remove superfluous check form btrfs_remove_chunk

2018-11-16 Thread Anand Jain




On 10/26/2018 07:43 PM, Nikolay Borisov wrote:

It's unnecessary to check map->stripes[i].dev for NULL given its value
is already set and dereferenced above the the check. No functional changes.

Signed-off-by: Nikolay Borisov 


Reviewed-by: Anand Jain 

Thanks, Anand


---
  fs/btrfs/volumes.c | 12 +---
  1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dc53d94a62aa..f0db43d08456 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2797,13 +2797,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle 
*trans, u64 chunk_offset)
mutex_unlock(_info->chunk_mutex);
}
  
-		if (map->stripes[i].dev) {

-   ret = btrfs_update_device(trans, map->stripes[i].dev);
-   if (ret) {
-   mutex_unlock(_devices->device_list_mutex);
-   btrfs_abort_transaction(trans, ret);
-   goto out;
-   }
+   ret = btrfs_update_device(trans, device);
+   if (ret) {
+   mutex_unlock(_devices->device_list_mutex);
+   btrfs_abort_transaction(trans, ret);
+   goto out;
}
}
mutex_unlock(_devices->device_list_mutex);



Re: BTRFS on production: NVR 16+ IP Cameras

2018-11-16 Thread Anand Jain




On 11/16/2018 03:51 AM, Nikolay Borisov wrote:



On 15.11.18 г. 20:39 ч., Juan Alberto Cirez wrote:

Is BTRFS mature enough to be deployed on a production system to underpin
the storage layer of a 16+ ipcameras-based NVR (or VMS if you prefer)?

Based on our limited experience with BTRFS (1+ year) under the above
scenario the answer seems to be no; but I wanted you ask the community
at large for their experience before making a final decision to hold off
on deploying BTRFS on production systems.

Let us be clear: We think BTRFS has great potential, and as it matures
we will continue to watch its progress, so that at some future point we
can return to using it.

The issue has been the myriad of problems we have encountered when
deploying BTRFS as the storage fs for the NVR/VMS in cases were the
camera count exceeds 10: Corrupted file systems, sudden read-only file
system, re-balance kernel panics, broken partitions, etc.


What version of the file system, how many of those issues (if any) have
you reported to the upstream mailing list? I don't remember seeing any
reports from you.



One specific case saw the btrfs drive pool mounted under the /var
partition so that upon installation the btrfs pool contained all files
under /var; /lib/mysql as well as the video storage. Needless to say
this was a catastrophe...


BTRFS is not suitable for random workloads, such as databases for
example. In those cases it's recommended, at the very least, to disable
COW on the database files. Note, disabling CoW doesn't preclude you from
creating snapshots, it just means that not every 4k write (for example)
will result in a CoW operation.



At the other end of the spectrum is a use case where ONLY the video
storage was on the btrfs pool; but in that case, the btrfs pool became
read-only suddenly and would not re-mount as rw despite all the recovery
trick btrfs-tools could throw at it. This, of course prevented the
NVR/VMS from recording any footage.


Again, you give absolutely no information about how you have configured
the filesystem or versions of the software used.



So, again, the question is: is BTRFS mature enough to be used in such
use case and if so, what approach can be used to mitigate such issues.


 And..

  What is the blocksize used by the application to write into btrfs?
  And what type of write IO? async, directio or fsync?

Thanks, Anand




Thank you all for your assistance

-Ryan-


Re: [PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish

2018-11-16 Thread Anand Jain




On 11/15/2018 11:35 PM, David Sterba wrote:

On Sun, Nov 11, 2018 at 10:22:22PM +0800, Anand Jain wrote:

When we successfully cancel the replace its scrub returns -ECANCELED,
which then passed to btrfs_dev_replace_finishing(), it cleans up based
on the scrub returned status and propagates the same -ECANCELED back
the parent function. As of now only user can cancel the replace-scrub,
so its ok to quieten the warn here.

Signed-off-by: Anand Jain 
---
  fs/btrfs/dev-replace.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1dc8e86546db..9031a362921a 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info 
*fs_info,
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
-   } else {
+   } else if (ret != -ECANCELED) {
WARN_ON(ret);


While this looks ok, can you please rework it so there are no WARN_ON at
random places in device-replace, poorly substituting error handling?

The code flow in this case could be changed to make explicit checks for
the know codes and then a catch-all branch like:

if (ret == -EINPROGRESS) {
...
} else (if == -ESOMETHINGELSE) {
...
} else {
unknown error, print error and do a proper cleanup
}



As below..


}
  
@@ -954,7 +954,7 @@ static int btrfs_dev_replace_kthread(void *data)

  btrfs_device_get_total_bytes(dev_replace->srcdev),
  _replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
-   WARN_ON(ret);
+   WARN_ON(ret && ret != -ECANCELED);


This one too, thanks.



 btrfs_dev_scrub() can return quite a lot of errno, which is passed
 here through the btrfs_dev_replace_finishing(), so it won't be
 possible to code them all.

 (we use -ECANCELED only in replace and balance).

Thanks, Anand



Re: [PATCH 8/9] btrfs: user requsted replace cancel is not an error

2018-11-16 Thread Anand Jain




On 11/16/2018 06:29 PM, Anand Jain wrote:



On 11/15/2018 11:31 PM, David Sterba wrote:

On Sun, Nov 11, 2018 at 10:22:23PM +0800, Anand Jain wrote:
As of now only user requested replace cancel can cancel the 
replace-scrub

so no need to log error for it.


This has probably some user visible effect or threre are steps to
reproduce the message even if it's not expected (ie. the case that this
patch fixes). Please update the changelog, thanks.





before the patch [1]
   [1]
     btrfs: fix use-after-free due to race between replace start and cancel

  We used to set the replace_state to CANCELED [2] and then call
  btrfs_scrub_cancel(), but the problem with this approach is
  if the scrub isn't running yet, then things get messier.

[2]
-   dev_replace->replace_state = 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;


  So to fix, [1] shall set replace_state to CANCELED only after the
  replace cancel thread has successfully canceled the replace using
  btrfs_scrub_cancel().

  And about the cleanup process for the replace target..
  we call
    btrfs_dev_replace_finishing(.. ret)
  after
   ret= btrfs_scrub_dev()

  now ret is -ECANCELED due to replace cancel request by the user.
  (its not set to -ECANCELED for any other reason).

  btrfs_dev_replace_finishing() has the following code [3] which it
  earlier blocked processing of the cleanup process after the cancel,
  because the replace_state was already updated to
  BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED.


[3]
--
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
    int scrub_ret)
{

::
     /* was the operation canceled, or is it finished? */
     if (dev_replace->replace_state !=
     BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
     btrfs_dev_replace_read_unlock(dev_replace);
     mutex_unlock(_replace->lock_finishing_cancel_unmount);
     return 0;
     }
---

  In fact before this patch [1] the code wasn't sync-ing the IO
  when replace was canceled. Which this patch also fixes by using
  the  btrfs_dev_replace_finishing()


---
btrfs_dev_replace_finishing
::
     /*
  * flush all outstanding I/O and inode extent mappings before the
  * copy operation is declared as being finished
  */
     ret = btrfs_start_delalloc_roots(fs_info, -1);
     if (ret) {
     mutex_unlock(_replace->lock_finishing_cancel_unmount);
     return ret;
     }
     btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
---

  So to answer above question... these warn and error log wasn't
  reported before when replace was canceled and now since I am
  using the btrfs_dev_replace_finishing() to finish the job
  of cancel.. it shall be reported which is ok to quite.

  Do you think we still need to update the change log?


 OR I think more appropriately this patch should be merged with

 [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

-Anand

HTH.

Thanks, Anand


Re: [PATCH 8/9] btrfs: user requsted replace cancel is not an error

2018-11-16 Thread Anand Jain




On 11/15/2018 11:31 PM, David Sterba wrote:

On Sun, Nov 11, 2018 at 10:22:23PM +0800, Anand Jain wrote:

As of now only user requested replace cancel can cancel the replace-scrub
so no need to log error for it.


This has probably some user visible effect or threre are steps to
reproduce the message even if it's not expected (ie. the case that this
patch fixes). Please update the changelog, thanks.





before the patch [1]
  [1]
btrfs: fix use-after-free due to race between replace start and cancel

 We used to set the replace_state to CANCELED [2] and then call
 btrfs_scrub_cancel(), but the problem with this approach is
 if the scrub isn't running yet, then things get messier.

[2]
-   dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;

 So to fix, [1] shall set replace_state to CANCELED only after the
 replace cancel thread has successfully canceled the replace using
 btrfs_scrub_cancel().

 And about the cleanup process for the replace target..
 we call
   btrfs_dev_replace_finishing(.. ret)
 after
  ret= btrfs_scrub_dev()

 now ret is -ECANCELED due to replace cancel request by the user.
 (its not set to -ECANCELED for any other reason).

 btrfs_dev_replace_finishing() has the following code [3] which it
 earlier blocked processing of the cleanup process after the cancel,
 because the replace_state was already updated to
 BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED.


[3]
--
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
   int scrub_ret)
{

::
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
btrfs_dev_replace_read_unlock(dev_replace);
mutex_unlock(_replace->lock_finishing_cancel_unmount);
return 0;
}
---

 In fact before this patch [1] the code wasn't sync-ing the IO
 when replace was canceled. Which this patch also fixes by using
 the  btrfs_dev_replace_finishing()


---
btrfs_dev_replace_finishing
::
/*
 * flush all outstanding I/O and inode extent mappings before the
 * copy operation is declared as being finished
 */
ret = btrfs_start_delalloc_roots(fs_info, -1);
if (ret) {
mutex_unlock(_replace->lock_finishing_cancel_unmount);
return ret;
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
---

 So to answer above question... these warn and error log wasn't
 reported before when replace was canceled and now since I am
 using the btrfs_dev_replace_finishing() to finish the job
 of cancel.. it shall be reported which is ok to quite.

 Do you think we still need to update the change log?

HTH.

Thanks, Anand


Re: [PATCH 0/9 v2] fix replace-start and replace-cancel racing

2018-11-16 Thread Anand Jain




On 11/15/2018 11:41 PM, David Sterba wrote:

On Sun, Nov 11, 2018 at 10:22:15PM +0800, Anand Jain wrote:

v1->v2:
   2/9: Drop writeback required
   3/9: Drop writeback required
   7/9: Use the condition within the WARN_ON()
   6/9: Use the condition within the ASSERT()

Replace-start and replace-cancel threads can race to create a messy
situation leading to UAF. We use the scrub code to write
the blocks on the replace target. So if we haven't have set the
replace-scrub-running yet, without this patch we just ignore the error
and free the target device. When this happens the system panics with
UAF error.

Its nice to see that btrfs_dev_replace_finishing() already handles
the ECANCELED (replace canceled) situation, but for an unknown reason
we aren't using it to cleanup the replace cancel situation, instead
we just let the replace cancel ioctl thread to cleanup the target
device and return and out of synchronous with the scrub code.

This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel()
to check if the scrub was really running. And if its not then shall
return an error to the user (replace not started error) so that user
can retry replace cancel. And uses btrfs_dev_replace_finishing() code
to cleanup after successful cancel of the replace scrub.

Further, a suspended replace, when tries to restart, and if it fails
(for example target device missing, or excl ops running) it goes to the
started state, and so the cli 'btrfs replace status /mnt' hangs with no
progress. So patches 2/9 and 3/9 fixes that.

As the originals code idea of ECANCELED was limited to the situation of
the error only and not user requested, there are unnecessary error log
and warn log which 7/9 and 8/9 patches fixes.

Patches 1/9 and 9/9 are good to have fixes. Makes a function static and
code readability good.

Testing: (I did some attempt to convert these into xfstests but need a
mechanism where kernel thread can wait for user land script. I thought
I could do it using ebfp, but needs more digging on how).
As of now hand tested with using procfs to hold kernel thread at
(wait_for_user(..)) until user land issues go.


This could be tricky to get implemented but would be of course useful. I
saw the crash about once a week so will watch if this still happens.


 That will be nice.


Anand Jain (9):
   btrfs: mark btrfs_dev_replace_start() as static
   btrfs: replace go back to suspended if target missing
   btrfs: replace back to suspend state if EXCL OP is running
   btrfs: fix UAF due to race between replace start and cancel
   btrfs: replace cancel is successful if scrub cancel is successful
   btrfs: replace's scrub must not be running in replace suspended state
   btrfs: quiten warn if the replace is canceled at finish
   btrfs: user requsted replace cancel is not an error
   btrfs: add explicit check for replace result no error


The above is merged to misc-next, except:

btrfs: quiten warn if the replace is canceled at finish
btrfs: user requsted replace cancel is not an error

with replies under the patches what could be improved. The changes can
be sent independently if you need to do that in several patches. Thanks.


 We need these patch otherwise you will see WARN_ON and btrfs_err
 after a successful replace cancel. Will send revised patch.

Thanks, Anand


Re: [PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful

2018-11-15 Thread Anand Jain




On 11/15/2018 11:27 PM, David Sterba wrote:

On Sun, Nov 11, 2018 at 10:22:20PM +0800, Anand Jain wrote:

In btrfs_dev_replace_cancel() we should check if the
btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return
BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to
cancel the replace again.


I've updated the subject and changelog as this was not easy to
understand what's going on.


Signed-off-by: Anand Jain 
---
  fs/btrfs/dev-replace.c | 22 +-
  1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d32d696d931c..9fc3cb8d3918 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
btrfs_dev_replace_write_unlock(dev_replace);
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
-   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
btrfs_dev_replace_write_unlock(dev_replace);
-   btrfs_scrub_cancel(fs_info);
-   /*
-* btrfs_dev_replace_finishing() will handle the cleanup part
-*/
-   btrfs_info_in_rcu(fs_info,
-   "dev_replace from %s (devid %llu) to %s canceled",
-   btrfs_dev_name(src_device), src_device->devid,
-   btrfs_dev_name(tgt_device));
+   ret = btrfs_scrub_cancel(fs_info);
+   if (ret) {


This should be ret < 0, fixed.


 Ok with me. Thanks for fixing. Anand




+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+   } else {
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   /*
+* btrfs_dev_replace_finishing() will handle the 
cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s 
canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   }
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
--
1.8.3.1


Re: [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

2018-11-15 Thread Anand Jain




On 11/15/2018 11:25 PM, David Sterba wrote:

On Thu, Nov 15, 2018 at 03:00:21PM +0100, David Sterba wrote:

On Wed, Nov 14, 2018 at 09:28:34AM +0800, Anand Jain wrote:

mutex_unlock(_replace->lock_finishing_cancel_unmount);
return result;


There's a compiler warning:

fs/btrfs/dev-replace.c: In function ‘btrfs_dev_replace_cancel’:
fs/btrfs/dev-replace.c:865:9: warning: ‘result’ may be used uninitialized in 
this function [-Wmaybe-uninitialized]
return result;
   ^~



I haven't looked closer though it looks valid.


int result; is assigned within switch(), so there isn't actual problem.


The warning is there because switch (dev_replace->replace_state) does
not have a default: case that would catch the values outside of what's
defined by the enum. So in that case result would have undefined value.


But will initialize the result to -EINVAL to quite the compiler.
Sending v3.


I don't see any change in the followup version.
https://patchwork.kernel.org/patch/10681939/


 Looks like I didn't run git commit --amend, now pulled your changes
 from misc-next.


I've added

default:
result = -EINVAL;

to the end of the switch.


 Ok. misc-next looks good to me.

Thanks, Anand



Re: [PATCH RFC] btrfs: harden agaist duplicate fsid

2018-11-14 Thread Anand Jain




On 11/13/2018 11:47 PM, Anand Jain wrote:



On 11/13/2018 11:31 PM, David Sterba wrote:

On Mon, Oct 01, 2018 at 09:31:04PM +0800, Anand Jain wrote:

+    /*
+ * we are going to replace the device path, make sure its the
+ * same device if the device mounted
+ */
+    if (device->bdev) {
+    struct block_device *path_bdev;
+
+    path_bdev = lookup_bdev(path);
+    if (IS_ERR(path_bdev)) {
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_CAST(path_bdev);
+    }
+
+    if (device->bdev != path_bdev) {
+    bdput(path_bdev);
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_PTR(-EEXIST);


 I have installed ubuntu with btrfs as root and with a subvol to be
 mounted at boot. Its working fine.

Thanks, Anand


Re: [PATCH] btrfs: introduce feature to forget a btrfs device

2018-11-14 Thread Anand Jain




On 11/14/2018 07:28 PM, Filipe Manana wrote:

On Wed, Nov 14, 2018 at 11:15 AM Filipe Manana  wrote:


On Wed, Nov 14, 2018 at 9:14 AM Anand Jain  wrote:


Support for a new command 'btrfs dev forget [dev]' is proposed here
to undo the effects of 'btrfs dev scan [dev]'. For this purpose
this patch proposes to use ioctl #5 as it was empty.
 IOW(BTRFS_IOCTL_MAGIC, 5, ..)
This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from
the /dev/btrfs-control to forget one or all devices, (devices which are
not mounted) from the btrfs kernel.

The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be
set to specify the device path. And all unmounted devices can be removed
from the kernel if no device path is provided.

Again, the devices are removed only if the relevant fsid aren't mounted.

This new cli can provide..
  . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the
device is not going to be mounted.
  . Ability to mount the device in degraded mode when one of the other
device is corrupted like in split brain raid1.
  . Running test cases which requires btrfs.ko-reload if the rootfs
is btrfs.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
  fs/btrfs/super.c   | 3 +++
  fs/btrfs/volumes.c | 9 +
  fs/btrfs/volumes.h | 1 +
  include/uapi/linux/btrfs.h | 2 ++
  4 files changed, 15 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 345c64d810d4..f99db6899004 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
 ret = PTR_ERR_OR_ZERO(device);
 mutex_unlock(_mutex);
 break;
+   case BTRFS_IOC_FORGET_DEV:
+   ret = btrfs_forget_devices(vol->name);
+   break;
 case BTRFS_IOC_DEVICES_READY:
 mutex_lock(_mutex);
 device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..e1365a122657 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device 
*bdev, u64 bytenr,
 return 0;
  }

+int btrfs_forget_devices(const char *path)
+{
+   mutex_lock(_mutex);
+   btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL);


One space before : and another one after it please.

Now the more important: don't use strlen, use strnlen. Some malicious
or sloppy user might have passed a non-null terminated string, you
don't want strlen to go past the limits of btrfs_ioctl_vol_args for
obvious reasons.


In fact that's a problem for the entire use of vol->name in
btrfs_control_ioctl. The name's last byte should be set to '\0' to
avoid issues.
I'll send a fix for that, so if David fixes the white spaces on commit
there's no need for a v12.


  Ok. Thanks.

  David, wonder if white spaces can be fixed when integrating?

Thanks, Anand



Also, please, not just to make a maintainer's life easier, but current
and future reviewers, add the patch version to each patch's subject
and not just the cover letter. Also list (after ---) what changes
between each patch version in the patch itself and not the cover
letter.

V12, here we go.


+   mutex_unlock(_mutex);
+
+   return 0;
+}
+
  /*
   * Look for a btrfs signature on a device. This may be called out of the 
mount path
   * and we are not allowed to call set_blocksize during the scan. The 
superblock
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aefce895e994..180297d04938 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
  struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
  int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
  void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
  void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ca1d21fc4a7..b1be7f828cb4 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -836,6 +836,8 @@ enum btrfs_err_code {
struct btrfs_ioctl_vol_args)
  #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+  struct btrfs_ioctl_vol_args)
  /* trans start and trans end are dangerous, and only for
   * use by applications that know how to avoid the
   * resulting deadlocks
--
1.8.3.1




--
Filipe David Manana,

“Whether you think you can, or you think 

Re: [PATCH] btrfs: introduce feature to forget a btrfs device

2018-11-14 Thread Anand Jain




On 11/14/2018 07:15 PM, Filipe Manana wrote:

On Wed, Nov 14, 2018 at 9:14 AM Anand Jain  wrote:


Support for a new command 'btrfs dev forget [dev]' is proposed here
to undo the effects of 'btrfs dev scan [dev]'. For this purpose
this patch proposes to use ioctl #5 as it was empty.
 IOW(BTRFS_IOCTL_MAGIC, 5, ..)
This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from
the /dev/btrfs-control to forget one or all devices, (devices which are
not mounted) from the btrfs kernel.

The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be
set to specify the device path. And all unmounted devices can be removed
from the kernel if no device path is provided.

Again, the devices are removed only if the relevant fsid aren't mounted.

This new cli can provide..
  . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the
device is not going to be mounted.
  . Ability to mount the device in degraded mode when one of the other
device is corrupted like in split brain raid1.
  . Running test cases which requires btrfs.ko-reload if the rootfs
is btrfs.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
  fs/btrfs/super.c   | 3 +++
  fs/btrfs/volumes.c | 9 +
  fs/btrfs/volumes.h | 1 +
  include/uapi/linux/btrfs.h | 2 ++
  4 files changed, 15 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 345c64d810d4..f99db6899004 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
 ret = PTR_ERR_OR_ZERO(device);
 mutex_unlock(_mutex);
 break;
+   case BTRFS_IOC_FORGET_DEV:
+   ret = btrfs_forget_devices(vol->name);
+   break;
 case BTRFS_IOC_DEVICES_READY:
 mutex_lock(_mutex);
 device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..e1365a122657 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device 
*bdev, u64 bytenr,
 return 0;
  }

+int btrfs_forget_devices(const char *path)
+{
+   mutex_lock(_mutex);
+   btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL);


One space before : and another one after it please.


 will fix.


Now the more important: don't use strlen, use strnlen. Some malicious
or sloppy user might have passed a non-null terminated string, you
don't want strlen to go past the limits of btrfs_ioctl_vol_args for
obvious reasons.


 Makes sense. Will fix.


Also, please, not just to make a maintainer's life easier, but current
and future reviewers, add the patch version to each patch's subject
and not just the cover letter. Also list (after ---) what changes
between each patch version in the patch itself and not the cover
letter.


 Sure. Thanks for the feedback.

-Anand


V12, here we go.


+   mutex_unlock(_mutex);
+
+   return 0;
+}
+
  /*
   * Look for a btrfs signature on a device. This may be called out of the 
mount path
   * and we are not allowed to call set_blocksize during the scan. The 
superblock
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aefce895e994..180297d04938 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
  struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
  int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
  void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
  void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ca1d21fc4a7..b1be7f828cb4 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -836,6 +836,8 @@ enum btrfs_err_code {
struct btrfs_ioctl_vol_args)
  #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+  struct btrfs_ioctl_vol_args)
  /* trans start and trans end are dangerous, and only for
   * use by applications that know how to avoid the
   * resulting deadlocks
--
1.8.3.1






Re: [PATCH] Btrfs: ensure path name is null terminated at btrfs_control_ioctl

2018-11-14 Thread Anand Jain




On 11/14/2018 07:35 PM, fdman...@kernel.org wrote:

From: Filipe Manana 

We were using the path name received from user space without checking that
it is null terminated. While btrfs-progs is well behaved and does proper
validation and null termination, someone could call the ioctl and pass
a non-null terminated patch, leading to buffer overrun problems in the
kernel.

So just set the last byte of the path to a null character, similar to what
we do in other ioctls (add/remove/resize device, snapshot creation, etc).

Signed-off-by: Filipe Manana 


 Reviewed-by: Anand Jain 

Thanks, Anand


---
  fs/btrfs/super.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6601c9aa5e35..8ad145820ea8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2235,6 +2235,7 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
vol = memdup_user((void __user *)arg, sizeof(*vol));
if (IS_ERR(vol))
return PTR_ERR(vol);
+   vol->name[BTRFS_PATH_NAME_MAX] = '\0';
  
  	switch (cmd) {

case BTRFS_IOC_SCAN_DEV:



Re: [PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add

2018-11-14 Thread Anand Jain




On 11/14/2018 03:24 PM, Lu Fengqi wrote:

=
WARNING: suspicious RCU usage
4.20.0-rc2+ #23 Tainted: G   O
-
fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage!

Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of
RCU string.

Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned devices")
Signed-off-by: Lu Fengqi 


 Thanks Lu. I missed it.
 Reviewed-by: Anand Jain 



---
  fs/btrfs/volumes.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2186300bab91..6039ae5c549e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -873,15 +873,15 @@ static noinline struct btrfs_device 
*device_list_add(const char *path,
if (device->bdev != path_bdev) {
bdput(path_bdev);
mutex_unlock(_devices->device_list_mutex);
-   pr_warn(
-   "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s 
new:%s\n",
+   btrfs_warn_in_rcu(device->fs_info,
+   "duplicate device fsid:devid for %pU:%llu old:%s 
new:%s\n",
disk_super->fsid, devid,
rcu_str_deref(device->name), path);
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
-   pr_info(
-   "BTRFS: device fsid %pU devid %llu moved old:%s 
new:%s\n",
+   btrfs_info_in_rcu(device->fs_info,
+   "device fsid %pU devid %llu moved old:%s new:%s\n",
disk_super->fsid, devid,
rcu_str_deref(device->name), path);
}



[PATCH v5 3/3] btrfs: balance: add kernel log for end or paused

2018-11-14 Thread Anand Jain
Add a kernel log when the balance ends, either for cancel or completed
or if it is paused.

Signed-off-by: Anand Jain 
---
v4->v5: nothing.
v3->v4: nothing.
v2->v3: nothing.
v1->v2: Moved from 2/3 to 3/3
 fs/btrfs/volumes.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8397f72bdac7..93f1683f8f5c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4066,6 +4066,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
ret = __btrfs_balance(fs_info);
 
mutex_lock(_info->balance_mutex);
+   if (ret == -ECANCELED && atomic_read(_info->balance_pause_req))
+   btrfs_info(fs_info, "balance: paused");
+   else if (ret == -ECANCELED && atomic_read(_info->balance_cancel_req))
+   btrfs_info(fs_info, "balance: canceled");
+   else
+   btrfs_info(fs_info, "balance: ended with status: %d", ret);
+
clear_bit(BTRFS_FS_BALANCE_RUNNING, _info->flags);
 
if (bargs) {
-- 
1.8.3.1



[PATCH v5 2/3] btrfs: balance: add args info during start and resume

2018-11-14 Thread Anand Jain
Balance arg info is an important information to be reviewed for the
system audit. So this patch adds them to the kernel log.

Example:
->btrfs bal start -f -mprofiles=raid1,convert=single,soft 
-dlimit=10..20,usage=50 /btrfs

kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 
-msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single

Signed-off-by: Anand Jain 
---
v4.1->v5: Per David review comment the code..
   bp += snprintf(bp, buf - bp + size_buf, "soft,");
  is not safe if 'buf - bp + size_buf' becomes accidentally
  negative, so fix this by using following snprintf.

 #define CHECK_UPDATE_BP_1(a) \
   do { \
   ret = snprintf(bp, size_bp, a); \
   if (ret < 0 || ret >= size_bp) \
   goto out; \
   size_bp -= ret; \
   bp += ret; \
   } while (0)

And initialize the tmp_buf to null.

v4->v4.1: Rename last missed sz to size_buf.

v3->v4: Change log updated with new example.
Log format is changed to almost match with the cli.
snprintf drop %s for inline string.
Print range as =%u..%u instead of min=%umax=%u.
soft is under the args now.
force is printed as -f.

v2->v3: Change log updated.
Make use of describe_block_group() added in 1/4
Drop using strcat instead use snprintf.
Logs string format updated as shown in the example above.

v1->v2: Change log update.
Move adding the logs for balance complete and end to a new patch

 fs/btrfs/volumes.c | 172 -
 1 file changed, 169 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 12b3d0625c6a..8397f72bdac7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3754,6 +3754,173 @@ static inline int validate_convert_profile(struct 
btrfs_balance_args *bctl_arg,
 (bctl_arg->target & ~allowed)));
 }
 
+static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+u32 size_buf)
+{
+   int ret;
+   u32 size_bp = size_buf;
+   char *bp = buf;
+   u64 flags = bargs->flags;
+   char tmp_buf[128] = {'\0'};
+
+   if (!flags)
+   return;
+
+#define CHECK_UPDATE_BP_1(a) \
+   do { \
+   ret = snprintf(bp, size_bp, a); \
+   if (ret < 0 || ret >= size_bp) \
+   goto out; \
+   size_bp -= ret; \
+   bp += ret; \
+   } while (0)
+
+#define CHECK_UPDATE_BP_2(a, v1) \
+   do { \
+   ret = snprintf(bp, size_bp, a, v1); \
+   if (ret < 0 || ret >= size_bp) \
+   goto out; \
+   size_bp -= ret; \
+   bp += ret; \
+   } while (0)
+
+#define CHECK_UPDATE_BP_3(a, v1, v2) \
+   do { \
+   ret = snprintf(bp, size_bp, a, v1, v2); \
+   if (ret < 0 || ret >= size_bp) \
+   goto out; \
+   size_bp -= ret; \
+   bp += ret; \
+   } while (0)
+
+   if (flags & BTRFS_BALANCE_ARGS_SOFT) {
+   CHECK_UPDATE_BP_1("soft,");
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
+   btrfs_describe_block_groups(bargs->profiles, tmp_buf,
+   sizeof(tmp_buf));
+   CHECK_UPDATE_BP_2("profiles=%s,", tmp_buf);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_USAGE) {
+   CHECK_UPDATE_BP_2("usage=%llu,", bargs->usage);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) {
+   CHECK_UPDATE_BP_3("usage=%u..%u,",
+ bargs->usage_min, bargs->usage_max);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_DEVID) {
+   CHECK_UPDATE_BP_2("devid=%llu,", bargs->devid);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_DRANGE) {
+   CHECK_UPDATE_BP_3("drange=%llu..%llu,",
+ bargs->pstart, bargs->pend);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_VRANGE) {
+   CHECK_UPDATE_BP_3("vrange%llu..%llu,",
+ bargs->vstart, bargs->vend);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_LIMIT) {
+   CHECK_UPDATE_BP_2("limit=%llu,", bargs->limit);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) {
+   CHECK_UPDATE_BP_3("limit=%u..%u,",
+   bargs->limit_min, bargs->limit_max);
+   }
+
+   if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) {
+   CHECK_UPDATE_BP_3("stripes=%u..%u,",
+ ba

[PATCH v5 1/3] btrfs: add helper function describe_block_group()

2018-11-14 Thread Anand Jain
Improve on describe_relocation() add a common helper function to describe
the block groups.

Signed-off-by: Anand Jain 
Reviewed-by: David Sterba 
---
v4.1->v5: Initialize buf[128] to null.
v4->v4.1: Use strcpy(buf, "|NONE"); as in the original
v3->v4: Just pass full flag name in the define DESCRIBE_FLAG(flag,..),
 so that it can be used at couple of more places.
Rename describe_block_groups() to btrfs_describe_block_groups().
Drop useless return u32.
v3: Born.

 fs/btrfs/relocation.c | 30 +++---
 fs/btrfs/volumes.c| 43 +++
 fs/btrfs/volumes.h|  1 +
 3 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 924116f654a1..0373b3cc1d36 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4184,37 +4184,13 @@ static struct reloc_control *alloc_reloc_control(void)
 static void describe_relocation(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group)
 {
-   char buf[128];  /* prefixed by a '|' that'll be dropped */
-   u64 flags = block_group->flags;
+   char buf[128] = {'\0'};
 
-   /* Shouldn't happen */
-   if (!flags) {
-   strcpy(buf, "|NONE");
-   } else {
-   char *bp = buf;
-
-#define DESCRIBE_FLAG(f, d) \
-   if (flags & BTRFS_BLOCK_GROUP_##f) { \
-   bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
-   flags &= ~BTRFS_BLOCK_GROUP_##f; \
-   }
-   DESCRIBE_FLAG(DATA, "data");
-   DESCRIBE_FLAG(SYSTEM,   "system");
-   DESCRIBE_FLAG(METADATA, "metadata");
-   DESCRIBE_FLAG(RAID0,"raid0");
-   DESCRIBE_FLAG(RAID1,"raid1");
-   DESCRIBE_FLAG(DUP,  "dup");
-   DESCRIBE_FLAG(RAID10,   "raid10");
-   DESCRIBE_FLAG(RAID5,"raid5");
-   DESCRIBE_FLAG(RAID6,"raid6");
-   if (flags)
-   snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
-#undef DESCRIBE_FLAG
-   }
+   btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
 
btrfs_info(fs_info,
   "relocating block group %llu flags %s",
-  block_group->key.objectid, buf + 1);
+  block_group->key.objectid, buf);
 }
 
 /*
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..12b3d0625c6a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -123,6 +123,49 @@ const char *get_raid_name(enum btrfs_raid_types type)
return btrfs_raid_array[type].raid_name;
 }
 
+void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
+{
+   int i;
+   int ret;
+   char *bp = buf;
+   u64 flags = bg_flags;
+   u32 size_bp = size_buf;
+
+   if (!flags) {
+   strcpy(bp, "NONE");
+   return;
+   }
+
+#define DESCRIBE_FLAG(f, d) \
+   do { \
+   if (flags & f) { \
+   ret = snprintf(bp, size_bp, "%s|", d); \
+   if (ret < 0 || ret >= size_bp) \
+   return; \
+   size_bp -= ret; \
+   bp += ret; \
+   flags &= ~f; \
+   } \
+   } while (0)
+
+   DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
+   DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
+   DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+   DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
+   for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+   DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
+ btrfs_raid_array[i].raid_name);
+#undef DESCRIBE_FLAG
+
+   if (flags) {
+   ret = snprintf(bp, size_bp, "0x%llx|", flags);
+   size_bp -= ret;
+   }
+
+   if (size_bp < size_buf)
+   buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
+}
+
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aefce895e994..3e914effcdf6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -430,6 +430,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info 
*fs_info, u64 devid,
 int btrfs_balance(struct btrfs_fs_info *fs_info,
  struct btrfs_balance_control *bctl,
  struct btrfs_ioctl_balance_arg

[PATCH v5 0/3] btrfs: balance: improve kernel logs

2018-11-14 Thread Anand Jain
v4->v5:
 Mainly address David review comment [1].
 [1]
  https://patchwork.kernel.org/patch/10425987/
 pls ref to individual patch 2/3 for details.

v3->v4:
 Pls ref to individual patches.

Based on misc-next.

v2->v3:
  Inspried by describe_relocation(), improves it, makes it a helper
  function and use it to log the balance operations.

Kernel logs are very important for the forensic investigations of the
issues, these patchs make balance logs easy to review.

Anand Jain (3):
  btrfs: add helper function describe_block_group()
  btrfs: balance: add args info during start and resume
  btrfs: balance: add kernel log for end or paused

 fs/btrfs/relocation.c |  30 +--
 fs/btrfs/volumes.c| 222 +-
 fs/btrfs/volumes.h|   1 +
 3 files changed, 223 insertions(+), 30 deletions(-)

-- 
1.8.3.1


Re: [PATCH v4.1b 2/3] btrfs: balance: add args info during start and resume

2018-11-14 Thread Anand Jain




On 05/31/2018 05:47 PM, David Sterba wrote:

On Fri, May 25, 2018 at 11:05:47AM +0800, Anand Jain wrote:

Balance arg info is an important information to be reviewed for the
system audit. So this patch adds them to the kernel log.

Example:
->btrfs bal start -f -mprofiles=raid1,convert=single,soft 
-dlimit=10..20,usage=50 /btrfs

kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 
-msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single

Signed-off-by: Anand Jain 
---
v4->v4.1b: Rename last missed sz to size_buf.


Please send a full version instead of the incremental bumps to
individual patches. I try to take the last version but am never sure
what exactly gets merged.


+static u32 describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
+u32 size_buf)
+{
+   char *bp = buf;
+   u64 flags = bargs->flags;
+   char tmp_buf[128];
+
+   if (!flags)
+   return 0;
+
+   if (flags & BTRFS_BALANCE_ARGS_SOFT)
+   bp += snprintf(bp, buf - bp + size_buf, "soft,");


I have some suspicion that this snprintf conscturct is not safe when the
value of 'buf - bp + size_buf' becomes accidentally negative. A quick
test showed that a sequence of snprintf and incremented bp does not stop
writing when the size_buf limit is hit and happily overwrites the
memory.

We'd have to check that the negative value is never passed to snprintf.
The patches are potponed until the issue is resolved, either to verify
that it's a false alert or the bounds are correctly checked.

The bug should not be in the original code as the buffer is known to be
large enough for all the printed bg flags.


 Agreed, possibilities of accidental -ve value for snprintf.
 It followed the original code, but as you said the data length
 is more deterministic there.
 I am fixing this in both new and old original code. Pls find v5.
 Sorry for the delay, I took some time to get a fresh look at it.

Thanks, Anand



I'm sorry to delay the patches, the log messages are useful, but I want
to be sure that there are no random memory overwrites introduced.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[PATCH] btrfs-progs: add cli to forget one or all scanned devices

2018-11-14 Thread Anand Jain
This patch adds cli
  btrfs device forget [dev]
to remove the given device structure in the kernel if the device
is unmounted. If no argument is given it shall remove all stale
(device which are not mounted) from the kernel.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
 cmds-device.c | 72 ---
 ioctl.h   |  2 ++
 2 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/cmds-device.c b/cmds-device.c
index 2a05f70a76a9..280d6f555377 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv)
return _cmd_device_remove(argc, argv, cmd_device_delete_usage);
 }
 
+static int btrfs_forget_devices(char *path)
+{
+   struct btrfs_ioctl_vol_args args;
+   int ret;
+   int fd;
+
+   fd = open("/dev/btrfs-control", O_RDWR);
+   if (fd < 0)
+   return -errno;
+
+   memset(, 0, sizeof(args));
+   if (path)
+   strncpy_null(args.name, path);
+   ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, );
+   if (ret)
+   ret = -errno;
+   close(fd);
+   return ret;
+}
+
 static const char * const cmd_device_scan_usage[] = {
-   "btrfs device scan [(-d|--all-devices)| [...]]",
-   "Scan devices for a btrfs filesystem",
+   "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\
+   "[...]]",
+   "Scan or forget (deregister) devices for a btrfs filesystem",
" -d|--all-devices (deprecated)",
+   " -u|--forget [ ..]",
NULL
 };
 
@@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv)
int devstart;
int all = 0;
int ret = 0;
+   int forget = 0;
 
optind = 0;
while (1) {
int c;
static const struct option long_options[] = {
{ "all-devices", no_argument, NULL, 'd'},
+   { "forget", no_argument, NULL, 'u'},
{ NULL, 0, NULL, 0}
};
 
-   c = getopt_long(argc, argv, "d", long_options, NULL);
+   c = getopt_long(argc, argv, "du", long_options, NULL);
if (c < 0)
break;
switch (c) {
case 'd':
all = 1;
break;
+   case 'u':
+   forget = 1;
+   break;
default:
usage(cmd_device_scan_usage);
}
}
devstart = optind;
 
+   if (all && forget)
+   usage(cmd_device_scan_usage);
+
if (all && check_argc_max(argc - optind, 1))
usage(cmd_device_scan_usage);
 
if (all || argc - optind == 0) {
-   printf("Scanning for Btrfs filesystems\n");
-   ret = btrfs_scan_devices();
-   error_on(ret, "error %d while scanning", ret);
-   ret = btrfs_register_all_devices();
-   error_on(ret, "there are %d errors while registering devices", 
ret);
+   if (forget) {
+   ret = btrfs_forget_devices(NULL);
+   error_on(ret, "'%s', forget failed",
+strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems\n");
+   ret = btrfs_scan_devices();
+   error_on(ret, "error %d while scanning", ret);
+   ret = btrfs_register_all_devices();
+   error_on(ret,
+   "there are %d errors while registering devices",
+   ret);
+   }
goto out;
}
 
@@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv)
ret = 1;
goto out;
}
-   printf("Scanning for Btrfs filesystems in '%s'\n", path);
-   if (btrfs_register_one_device(path) != 0) {
-   ret = 1;
-   free(path);
-   goto out;
+   if (forget) {
+   ret = btrfs_forget_devices(path);
+   if (ret)
+   error("Can't forget '%s': %s",
+   path, strerror(-ret));
+   } else {
+   printf("Scanning for Btrfs filesystems in '%s'\n",
+   path);
+   if (btrfs_register_on

[PATCH RESEND v11] Add cli and ioctl to forget scanned device(s)

2018-11-14 Thread Anand Jain
v11:
 btrfs-progs: Bring the code into the else part of if(forget).
  Use strerror to print the erorr instead of ret.

v10:
 Make btrfs-progs changes more readable.
 With an effort to keep the known bug [1] as it is..
  [1]
   The cli 'btrfs device scan --all /dev/sdb' which should have scanned
only one device, ends up scanning all the devices and I am not trying
to fix this bug in this patch because..
  . -d|--all is marked as deprecated, I hope -d option would go away
  . For now some script might be using this bug as a feature, and fixing
this bug might lead to mount failure.

v9:
 Make forget as a btrfs device scan option.
 Use forget in the fstests, now you can run fstests with btrfs as rootfs
  which helps to exercise the uuid_mutex lock.

v8:
 Change log update in the kernel patch.

v7:
 Use struct btrfs_ioctl_vol_args (instead of struct
  btrfs_ioctl_vol_args_v2) as its inline with other ioctl
  btrfs-control
 The CLI usage/features remains same. However internally the ioctl flag
  is not required to delete all the unmounted devices. Instead leave
  btrfs_ioctl_vol_args::name NULL.

v6:
 Use the changed fn name btrfs_free_stale_devices().

 Change in title:
 Old v5:
 Cover-letter:
  [PATCH v5] Add cli and ioctl to ignore a scanned device
 Kernel:
  [PATCH v5] btrfs: introduce feature to ignore a btrfs device
 Progs:
  [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli

v5:
  Adds feature to delete all stale devices
  Reuses btrfs_free_stale_devices() fn and so depends on the
patch-set [1] in the ML.
  Uses struct btrfs_ioctl_vol_args_v2 instead of
struct btrfs_ioctl_vol_args as arg
  Does the device path matching instead of btrfs_device matching
(we won't delete the mounted device as btrfs_free_stale_devices()
checks for it)
v4:
  No change. But as the ML thread may be confusing, so resend.
v3:
  No change. Send to correct ML.
v2:
  Accepts review from Nikolay, details are in the specific patch.
  Patch 1/2 is renamed from
[PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list 
delete
  to
[PATCH 1/2] btrfs: add function to device list delete

Adds cli and ioctl to forget a scanned device or forget all stale
devices in the kernel.


Anand Jain (1):
  btrfs: introduce feature to forget a btrfs device

 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

Anand Jain (1):
  btrfs-progs: add cli to forget one or all scanned devices

 cmds-device.c | 63 ++-
 ioctl.h   |  2 ++
 2 files changed, 56 insertions(+), 9 deletions(-)

[1]
Anand Jain (1):
  fstests: btrfs use forget if not reload

 common/btrfs| 20 
 tests/btrfs/124 |  6 +++---
 tests/btrfs/125 |  6 +++---
 tests/btrfs/154 |  6 +++---
 tests/btrfs/164 |  4 ++--
 5 files changed, 31 insertions(+), 11 deletions(-)

-- 
1.8.3.1



[PATCH] btrfs: introduce feature to forget a btrfs device

2018-11-14 Thread Anand Jain
Support for a new command 'btrfs dev forget [dev]' is proposed here
to undo the effects of 'btrfs dev scan [dev]'. For this purpose
this patch proposes to use ioctl #5 as it was empty.
IOW(BTRFS_IOCTL_MAGIC, 5, ..)
This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from
the /dev/btrfs-control to forget one or all devices, (devices which are
not mounted) from the btrfs kernel.

The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be
set to specify the device path. And all unmounted devices can be removed
from the kernel if no device path is provided.

Again, the devices are removed only if the relevant fsid aren't mounted.

This new cli can provide..
 . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the
   device is not going to be mounted.
 . Ability to mount the device in degraded mode when one of the other
   device is corrupted like in split brain raid1.
 . Running test cases which requires btrfs.ko-reload if the rootfs
   is btrfs.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 9 +
 fs/btrfs/volumes.h | 1 +
 include/uapi/linux/btrfs.h | 2 ++
 4 files changed, 15 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 345c64d810d4..f99db6899004 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, 
unsigned int cmd,
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(_mutex);
break;
+   case BTRFS_IOC_FORGET_DEV:
+   ret = btrfs_forget_devices(vol->name);
+   break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(_mutex);
device = btrfs_scan_one_device(vol->name, FMODE_READ,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f435d397019e..e1365a122657 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device 
*bdev, u64 bytenr,
return 0;
 }
 
+int btrfs_forget_devices(const char *path)
+{
+   mutex_lock(_mutex);
+   btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL);
+   mutex_unlock(_mutex);
+
+   return 0;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount 
path
  * and we are not allowed to call set_blocksize during the scan. The superblock
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aefce895e994..180297d04938 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
   fmode_t flags, void *holder);
 struct btrfs_device *btrfs_scan_one_device(const char *path,
   fmode_t flags, void *holder);
+int btrfs_forget_devices(const char *path);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 5ca1d21fc4a7..b1be7f828cb4 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -836,6 +836,8 @@ enum btrfs_err_code {
   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \
+  struct btrfs_ioctl_vol_args)
 /* trans start and trans end are dangerous, and only for
  * use by applications that know how to avoid the
  * resulting deadlocks
-- 
1.8.3.1



[PATCH 4/9 v2.1] btrfs: fix UAF due to race between replace start and cancel

2018-11-13 Thread Anand Jain
replace cancel thread can race with the replace start thread and if
fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
to stop the scrub thread, so the scrub thread continue with the scrub for
replace which then shall try to write to the target device and which is
already freed by the cancel thread. Please ref to the logs below.

So scrub_setup_ctx() warns as tgtdev is null.

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
is_dev_replace)
{
::
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);  <===
sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}

[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc started
[ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc canceled
[ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.432970] Call Trace:
[ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
[ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
[ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
[ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
[ 6852.434365]  ? do_sigaction+0x7d/0x1e0
[ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
[ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435387]  ksys_ioctl+0x60/0x90
[ 6852.435663]  __x64_sys_ioctl+0x16/0x20
[ 6852.435907]  do_syscall_64+0x50/0x180
[ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters the
scrub_write_page_to_dev_replace() without the target device it panics

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
::
  bio_set_dev(bio, sbio->dev->bdev); <==

[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
00a0
::
[ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
[ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
[btrfs]
::
[ 6929.721430] Call Trace:
[ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
[ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
[ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
[ 6929.722552]  process_one_work+0x1f4/0x520
[ 6929.722805]  ? process_one_work+0x16e/0x520
[ 6929.723063]  worker_thread+0x46/0x3d0
[ 6929.723313]  kthread+0xf8/0x130
[ 6929.723544]  ? process_one_work+0x520/0x520
[ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
[ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain 
---
v2->v2.1: Fix compiler warning. (I couldn't reproduce on gcc 4.8.5)

fs/btrfs/dev-replace.c: In function ‘btrfs_dev_replace_cancel’:
fs/btrfs/dev-replace.c:865:9: warning: ‘result’ may be used uninitialized in 
this function [-Wmaybe-uninitialized]
  return result;
 ^~

v1->v2: pls ref to the cover letter

 fs/btrfs/dev-replace.c | 61 --
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 35ce10f18607..d32d696d931c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
btrfs_dev_replace_write_unlock(dev_replace);
-   goto leave;
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   tgt_device = dev_replace->tgtdev;
+   src_device = dev_replace->srcdev;
+   btrfs_dev_replace_write_unlock(dev_replace);
+   btrfs_scrub_cancel(fs_info);
+   /*
+* btrfs_dev_replace_finishing() will handle the cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+   /*
+* scrub doing the replace isn't running so do the cleanup here
+*/

Re: [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

2018-11-13 Thread Anand Jain




On 11/14/2018 01:24 AM, David Sterba wrote:

On Sun, Nov 11, 2018 at 10:22:19PM +0800, Anand Jain wrote:

replace cancel thread can race with the replace start thread and if
fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
to stop the scrub thread, so the scrub thread continue with the scrub for
replace which then shall try to write to the target device and which is
already freed by the cancel thread. Please ref to the logs below.

So scrub_setup_ctx() warns as tgtdev is null.

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
is_dev_replace)
{
::
 if (is_dev_replace) {
 WARN_ON(!fs_info->dev_replace.tgtdev);  <===
 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
 sctx->flush_all_writes = false;
 }

[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc started
[ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc canceled
[ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.432970] Call Trace:
[ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
[ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
[ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
[ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
[ 6852.434365]  ? do_sigaction+0x7d/0x1e0
[ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
[ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435387]  ksys_ioctl+0x60/0x90
[ 6852.435663]  __x64_sys_ioctl+0x16/0x20
[ 6852.435907]  do_syscall_64+0x50/0x180
[ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters the
scrub_write_page_to_dev_replace() without the target device it panics

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 struct scrub_page *spage)
{
::
   bio_set_dev(bio, sbio->dev->bdev); <==

[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
00a0
::
[ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
[ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
[btrfs]
::
[ 6929.721430] Call Trace:
[ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
[ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
[ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
[ 6929.722552]  process_one_work+0x1f4/0x520
[ 6929.722805]  ? process_one_work+0x16e/0x520
[ 6929.723063]  worker_thread+0x46/0x3d0
[ 6929.723313]  kthread+0xf8/0x130
[ 6929.723544]  ? process_one_work+0x520/0x520
[ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
[ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain 
---
  fs/btrfs/dev-replace.c | 61 --
  1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 35ce10f18607..d32d696d931c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
btrfs_dev_replace_write_unlock(dev_replace);
-   goto leave;
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   tgt_device = dev_replace->tgtdev;
+   src_device = dev_replace->srcdev;
+   btrfs_dev_replace_write_unlock(dev_replace);
+   btrfs_scrub_cancel(fs_info);
+   /*
+* btrfs_dev_replace_finishing() will handle the cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+   /*
+* scrub doing the replace isn't running so do the cleanup here
+*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
  

Re: [PATCH V7] Btrfs: enhance raid1/10 balance heuristic

2018-11-13 Thread Anand Jain




I am ok with the least used path approach here for the IO routing
that's probably most reasonable in generic configurations. It can
be default read mirror policy as well.

But as I mentioned. Not all configurations would agree to the heuristic
approach here. For example: To make use of the SAN storage cache to
get high IO throughput read must access based on the LBA, And this
heuristic would make matter worst. There are plans to add more
options read_mirror_policy [1].

[1]
https://patchwork.kernel.org/patch/10403299/

I would rather provide the configuration tune-ables to the use
cases rather than fixing it using heuristic. Heuristic are good
only with the known set of IO pattern for which heuristic is
designed for.

This is not the first time you are assuming heuristic would provide
the best possible performance in all use cases. As I mentioned
in the compression heuristic there was no problem statement that
you wanted to address using heuristic, theoretically the integrated
compression heuristic would have to do a lot more computation when
all the file-extents are compressible, its not convenience to me
how compression heuristic would help on a desktop machine where
most of the files are compressible.

IMO heuristic are good only for a set of types of workload. Giving
an option to move away from it for the manual tuning is desired.

Thanks, Anand


On 11/12/2018 07:58 PM, Timofey Titovets wrote:

From: Timofey Titovets 

Currently btrfs raid1/10 balancer bаlance requests to mirrors,
based on pid % num of mirrors.

Make logic understood:
  - if one of underline devices are non rotational
  - Queue length to underline devices

By default try use pid % num_mirrors guessing, but:
  - If one of mirrors are non rotational, repick optimal to it
  - If underline mirror have less queue length then optimal,
repick to that mirror

For avoid round-robin request balancing,
lets round down queue length:
  - By 8 for rotational devs
  - By 2 for all non rotational devs

Some bench results from mail list
(Dmitrii Tcvetkov ):
Benchmark summary (arithmetic mean of 3 runs):
  Mainline Patch

RAID1  | 18.9 MiB/s | 26.5 MiB/s
RAID10 | 30.7 MiB/s | 30.7 MiB/s

mainline, fio got lucky to read from first HDD (quite slow HDD):
Jobs: 1 (f=1): [r(1)][100.0%][r=8456KiB/s,w=0KiB/s][r=264,w=0 IOPS]
   read: IOPS=265, BW=8508KiB/s (8712kB/s)(499MiB/60070msec)
   lat (msec): min=2, max=825, avg=60.17, stdev=65.06

mainline, fio got lucky to read from second HDD (much more modern):
Jobs: 1 (f=1): [r(1)][8.7%][r=11.9MiB/s,w=0KiB/s][r=380,w=0 IOPS]
   read: IOPS=378, BW=11.8MiB/s (12.4MB/s)(710MiB/60051msec)
   lat (usec): min=416, max=644286, avg=42312.74, stdev=48518.56

mainline, fio got lucky to read from an SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=436MiB/s,w=0KiB/s][r=13.9k,w=0 IOPS]
   read: IOPS=13.9k, BW=433MiB/s (454MB/s)(25.4GiB/60002msec)
   lat (usec): min=343, max=16319, avg=1152.52, stdev=245.36

With the patch, 2 HDDs:
Jobs: 1 (f=1): [r(1)][100.0%][r=17.5MiB/s,w=0KiB/s][r=560,w=0 IOPS]
   read: IOPS=560, BW=17.5MiB/s (18.4MB/s)(1053MiB/60052msec)
   lat (usec): min=435, max=341037, avg=28511.64, stdev=3.14

With the patch, HDD(old one)+SSD:
Jobs: 1 (f=1): [r(1)][100.0%][r=371MiB/s,w=0KiB/s][r=11.9k,w=0 IOPS]
   read: IOPS=11.6k, BW=361MiB/s (379MB/s)(21.2GiB/60084msec)
   lat  (usec): min=363, max=346752, avg=1381.73, stdev=6948.32

Changes:
   v1 -> v2:
 - Use helper part_in_flight() from genhd.c
   to get queue length
 - Move guess code to guess_optimal()
 - Change balancer logic, try use pid % mirror by default
   Make balancing on spinning rust if one of underline devices
   are overloaded
   v2 -> v3:
 - Fix arg for RAID10 - use sub_stripes, instead of num_stripes
   v3 -> v4:
 - Rebased on latest misc-next
   v4 -> v5:
 - Rebased on latest misc-next
   v5 -> v6:
 - Fix spelling
 - Include bench results
   v6 -> v7:
 - Fixes based on Nikolay Borisov review:
   * Assume num == 2
   * Remove "for" loop based on that assumption, where possible
   * No functional changes

Signed-off-by: Timofey Titovets 
Tested-by: Dmitrii Tcvetkov 
Reviewed-by: Dmitrii Tcvetkov 
---
  block/genhd.c  |   1 +
  fs/btrfs/volumes.c | 100 -
  2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index be5bab20b2ab..939f0c6a2d79 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct 
*part,
 

Re: [PATCH RFC] btrfs: harden agaist duplicate fsid

2018-11-13 Thread Anand Jain




On 11/13/2018 11:31 PM, David Sterba wrote:

On Mon, Oct 01, 2018 at 09:31:04PM +0800, Anand Jain wrote:

+    /*
+ * we are going to replace the device path, make sure its the
+ * same device if the device mounted
+ */
+    if (device->bdev) {
+    struct block_device *path_bdev;
+
+    path_bdev = lookup_bdev(path);
+    if (IS_ERR(path_bdev)) {
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_CAST(path_bdev);
+    }
+
+    if (device->bdev != path_bdev) {
+    bdput(path_bdev);
+    mutex_unlock(_devices->device_list_mutex);
+    return ERR_PTR(-EEXIST);

It would be _really_ nice to have an informative error message printed
here.  Aside from the possibility of an admin accidentally making a
block-level copy of the volume,



this code triggering could represent an
attempted attack against the system, so it's arguably something that
should be reported as happening.



   Personally, I think a WARN_ON_ONCE for
this would make sense, ideally per-volume if possible.


   Ah. Will add an warn. Thanks, Anand


The requested error message is not in the patch you posted or I have
missed that (https://patchwork.kernel.org/patch/10641041/) .


 No you didn't miss. I missed it. When you integrated this into
 for-next, I should have sent out v2. Sorry. Thanks for taking
 care of it.


Austin, is the following ok for you?

   "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n"

   BTRFS: duplicate device fsid:devid 7c667b96-59eb-43ad-9ae9-c878f6ad51d8:2 
old:/dev/sda6 new:/dev/sdb6

As the UUID and paths are long I tried to squeeeze the rest so it's
still comprehensible but this would be better confirmed. Thanks.


Thanks, Anand




Re: [PATCH RFC RESEND] btrfs: harden agaist duplicate fsid

2018-11-13 Thread Anand Jain




On 11/13/2018 11:21 PM, David Sterba wrote:

On Mon, Oct 15, 2018 at 10:45:17AM +0800, Anand Jain wrote:

(Thanks for the comments on requiring to warn_on if we fail the device change.)
(This fixes an ugly bug, I appreciate if you have any further comments).

Its not that impossible to imagine that a device OR a btrfs image is
been copied just by using the dd or the cp command. Which in case both
the copies of the btrfs will have the same fsid. If on the system with
automount enabled, the copied FS gets scanned.

We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part /
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
Total devices 1 FS bytes used 1.40GiB
devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:01 14.9G  0 disk
|-sda48:414G  0 part /
|-sda28:21  500M  0 part
|-sda38:31  256M  0 part
`-sda18:11  256M  0 part
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show /
  Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
  Total devices 1 FS bytes used 1.40GiB
  devid1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
sda out of the system on to another system to change its fsid
using the 'btrfstune -u' command.

Signed-off-by: Anand Jain 


I'm adding the patch to misc-next now, with an update message that
matches the format when a device is scanned.

"BTRFS: device fsid %pU devid %llu moved old:%s new:%s\n",

That way it should be possible to grep for all messages that are related
to the scanning ioctl.


 Right. Looks fine to me. Thanks, Anand


Re: [RFC] BTRFS_DEV_REPLACE_ITEM_STATE_* doesn't match with on disk

2018-11-13 Thread Anand Jain



David, Gentle ping.

Thanks, Anand

On 11/12/2018 03:50 PM, Nikolay Borisov wrote:



On 12.11.18 г. 6:58 ч., Anand Jain wrote:


The dev_replace_state defines are miss matched between the
BTRFS_IOCTL_DEV_REPLACE_STATE_* and BTRFS_DEV_REPLACE_ITEM_STATE_* [1].

[1]
-
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED    2
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED    3
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED    4

btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED    2
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED    3
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED    4
-

The BTRFS_DEV_REPLACE_ITEM_STATE_* series is unused in both btrfs.ko and
btrfs-progs, the on-disk also follows BTRFS_IOCTL_DEV_REPLACE_STATE_*
(we set dev_replace->replace_state using the
BTRFS_IOCTL_DEV_REPLACE_STATE_* defines and write to the on-disk).

  359 btrfs_set_dev_replace_replace_state(eb, ptr,
  360 dev_replace->replace_state);

IMO it should be ok to delete the BTRFS_DEV_REPLACE_ITEM_STATE_*
altogether? But how about the userland progs other than btrfs-progs?
If not at least fix the miss match as in [2], any comments?


Unfortunately you are right. This seems to stem from sloppy job back in
the days of initial dev-replace support. BTRFS_DEV_REPLACE_ITEM_STATE_*
were added in e922e087a35c ("Btrfs: enhance btrfs structures for device
replace support"), yet they were never used. And the
IOCTL_DEV_REPLACE_STATE* were added in e93c89c1 ("Btrfs: add new
sources for device replace code").

It looks like the ITEM_STATE* definitions were stillborn so to speak and
personally I'm in favor of removing them. They shouldn't have been
merged in the first place and indeed the patch doesn't even have a
Reviewed-by tag. So it originated from the, I'd say, spartan days of
btrfs development...

David,  any code which is using BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED
is inherently broken, so how about we remove those definitions, then
when it's compilation is broken in the future the author will actually
have a chance to fix it, though it's highly unlikely anyone is relying
on those definitions.




[2]
--
diff --git a/include/uapi/linux/btrfs_tree.h
b/include/uapi/linux/btrfs_tree.h
index aff1356c2bb8..9ffa7534cadf 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -805,9 +805,9 @@ struct btrfs_dev_stats_item {
  #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
  #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
  #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED   1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
-#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  3
-#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  4
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 4

  struct btrfs_dev_replace_item {
     /*
--


Thanks, Anand



Re: [PATCH] btrfs: fix computation of max fs size for multiple device fs tests

2018-11-13 Thread Anand Jain




On 11/06/2018 11:34 PM, fdman...@kernel.org wrote:

From: Filipe Manana 

We were sorting numerical values with the 'sort' tool without telling it
that we are sorting numbers, giving us unexpected ordering. So just pass
the '-n' option to the 'sort' tool.

Example:

$ echo -e "11\n9\n20" | sort
11
20
9

$ echo -e "11\n9\n20" | sort -n
9
11
20

Signed-off-by: Filipe Manana 


Thanks for the fix.

Reviewed-by: Anand Jain 




---
  tests/btrfs/124 | 2 +-
  tests/btrfs/125 | 2 +-
  tests/btrfs/154 | 2 +-
  3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/btrfs/124 b/tests/btrfs/124
index ce3ad6aa..a52c65f6 100755
--- a/tests/btrfs/124
+++ b/tests/btrfs/124
@@ -61,7 +61,7 @@ dev2=`echo $SCRATCH_DEV_POOL | awk '{print $2}'`
  dev1_sz=`blockdev --getsize64 $dev1`
  dev2_sz=`blockdev --getsize64 $dev2`
  # get min of both
-max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort | head -1`
+max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort -n | head -1`
  # Need disks with more than 2G.
  if [ $max_fs_sz -lt 20 ]; then
_scratch_dev_pool_put
diff --git a/tests/btrfs/125 b/tests/btrfs/125
index e38de264..5ac68b67 100755
--- a/tests/btrfs/125
+++ b/tests/btrfs/125
@@ -68,7 +68,7 @@ dev2_sz=`blockdev --getsize64 $dev2`
  dev3_sz=`blockdev --getsize64 $dev3`
  
  # get min of both.

-max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort | head -1`
+max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort -n | head -1`
  # Need disks with more than 2G
  if [ $max_fs_sz -lt 20 ]; then
_scratch_dev_pool_put
diff --git a/tests/btrfs/154 b/tests/btrfs/154
index 99ea232a..cd6c688f 100755
--- a/tests/btrfs/154
+++ b/tests/btrfs/154
@@ -51,7 +51,7 @@ DEV1_SZ=`blockdev --getsize64 $DEV1`
  DEV2_SZ=`blockdev --getsize64 $DEV2`
  
  # get min

-MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort | head -1`
+MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort -n | head -1`
  # Need disks with more than 2G
  if [ $MAX_FS_SZ -lt 20 ]; then
_scratch_dev_pool_put



Re: Full filesystem btrfs rebalance kernel panic to read-only lock

2018-11-11 Thread Anand Jain




On 11/12/2018 10:12 AM, Qu Wenruo wrote:



On 2018/11/12 上午9:35, Anand Jain wrote:



On 11/09/2018 09:21 AM, Qu Wenruo wrote:



On 2018/11/9 上午6:40, Pieter Maes wrote:

Hello,

So, I've had the full disk issue, so when I tried re-balancing,
I got a panic, that pushed filesystem read-only and I'm unable to
balance or grow the filesystem now.

fs info:
btrfs fi show /
Label: none  uuid: 9b591b6b-6040-437e-9398-6883ca3bf1bb
  Total devices 1 FS bytes used 614.94GiB
  devid    1 size 750.00GiB used 750.00GiB path /dev/mapper/vg0-root

btrfs fi df /
Data, single: total=740.94GiB, used=610.75GiB
System, DUP: total=32.00MiB, used=112.00KiB
Metadata, DUP: total=4.50GiB, used=3.94GiB


Metadata usage the the biggest problem.
It's already used up.


GlobalReserve, single: total=512.00MiB, used=255.06MiB


And the reserved space is also been used, that's a pretty bad news.



btrfs sub list -ta /
ID    gen    top level    path
--    ---    -    

btrfs --version
btrfs-progs v4.4

Log when booting machine now from root:



[   54.746700] [ cut here ]
[   54.746701] BTRFS: Transaction aborted (error -28)


Transaction can't even be done due to lack of space.

[snip]




When booting to a net/livecd rescue
First I run a check with repair:



enabling repair mode
Checking filesystem on /dev/vg0/root
UUID: 9b591b6b-6040-437e-9398-6883ca3bf1bb
checking extents
Fixed 0 roots.
checking free space cache
cache and super generation don't match, space cache will be invalidated
checking fs roots
reset nbytes for ino 6228034 root 5


It's a minor problem.
So the fs itself is still pretty health.


checking csums
checking root refs
found 664259596288 bytes used err is 0
total csum bytes: 619404608
total tree bytes: 4237737984
total fs tree bytes: 1692581888
total extent tree bytes: 1461665792
btree space waste bytes: 945044758
file data blocks allocated: 1568329531392
   referenced 537131163648


But then when I try to mount the fs:



[snip]


rescue kernel: 4.9.120



I've grown the blockdevice, but there is no way I can grow the fs,
it doesn't want to mount in my rescue system, and it only mounts
read-only when booting from it, so I can't do it from there either


Btrfs-progs could do it with some extra dirty work.
(I purposed offline device resize idea, but didn't implement it yet)

You could use this branch:
https://github.com/adam900710/btrfs-progs/tree/dirty_fix


Qu,

  The online resize should work here right?


Nope, the user reported unable to mount RW, due to exhausted metadata space.

And due to the failure of RW mount, reported can't do online resize,
thus we need to do offline one.


 Its nice tool fixed the issue here, but in the long term we need
 a way to free some space IMO.

 Source of the problem is unable to mount RW when metadata space is
 full. A serious issue.

 Adding more disk space was viable workaround at this use case, which
 might not be true in all use cases. Like user may just want to mount
 and free some space.

 I think we need to fine tune the reserve space usage like distinguish
 the reserve space allocation between the new metadata item VS
 modification of the old metadata items. And reserve a space for
 the modification of the metadata, so that mount and freeing of
 some files will work.

Thanks, Anand



Thanks,
Qu



Thanks, Anand



It's a quick and dirty fix to allow "btrfs-corrupt-block -X " to
extent device size to max.

Please try above command to see if it solves your problem.

Thanks,
Qu



I hope someone can help me out with this.
Thanks!







[PATCH] btrfs: remove redundant replace_state init

2018-11-11 Thread Anand Jain
dev_replace::replace_state has been set to
BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED (0) in the same function,
So delete the line which sets replace_state = 0;

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cc25a34f87b0..cdc4313446a9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -59,7 +59,6 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
-   dev_replace->replace_state = 0;
dev_replace->time_started = 0;
dev_replace->time_stopped = 0;
atomic64_set(_replace->num_write_errors, 0);
-- 
1.8.3.1



[RFC] BTRFS_DEV_REPLACE_ITEM_STATE_* doesn't match with on disk

2018-11-11 Thread Anand Jain



The dev_replace_state defines are miss matched between the
BTRFS_IOCTL_DEV_REPLACE_STATE_* and BTRFS_DEV_REPLACE_ITEM_STATE_* [1].

[1]
-
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED  2
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED  3
btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4

btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  3
btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  4
-

The BTRFS_DEV_REPLACE_ITEM_STATE_* series is unused in both btrfs.ko and 
btrfs-progs, the on-disk also follows BTRFS_IOCTL_DEV_REPLACE_STATE_* 
(we set dev_replace->replace_state using the

BTRFS_IOCTL_DEV_REPLACE_STATE_* defines and write to the on-disk).

 359 btrfs_set_dev_replace_replace_state(eb, ptr,
 360 dev_replace->replace_state);

IMO it should be ok to delete the BTRFS_DEV_REPLACE_ITEM_STATE_*
altogether? But how about the userland progs other than btrfs-progs?
If not at least fix the miss match as in [2], any comments?

[2]
--
diff --git a/include/uapi/linux/btrfs_tree.h 
b/include/uapi/linux/btrfs_tree.h

index aff1356c2bb8..9ffa7534cadf 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -805,9 +805,9 @@ struct btrfs_dev_stats_item {
 #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
 #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
 #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED   1
-#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
-#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  3
-#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  4
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED  2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED  3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 4

 struct btrfs_dev_replace_item {
/*
--


Thanks, Anand


Re: Full filesystem btrfs rebalance kernel panic to read-only lock

2018-11-11 Thread Anand Jain




On 11/09/2018 09:21 AM, Qu Wenruo wrote:



On 2018/11/9 上午6:40, Pieter Maes wrote:

Hello,

So, I've had the full disk issue, so when I tried re-balancing,
I got a panic, that pushed filesystem read-only and I'm unable to
balance or grow the filesystem now.

fs info:
btrfs fi show /
Label: none  uuid: 9b591b6b-6040-437e-9398-6883ca3bf1bb
     Total devices 1 FS bytes used 614.94GiB
     devid    1 size 750.00GiB used 750.00GiB path /dev/mapper/vg0-root

btrfs fi df /
Data, single: total=740.94GiB, used=610.75GiB
System, DUP: total=32.00MiB, used=112.00KiB
Metadata, DUP: total=4.50GiB, used=3.94GiB


Metadata usage the the biggest problem.
It's already used up.


GlobalReserve, single: total=512.00MiB, used=255.06MiB


And the reserved space is also been used, that's a pretty bad news.



btrfs sub list -ta /
ID    gen    top level    path
--    ---    -    

btrfs --version
btrfs-progs v4.4

Log when booting machine now from root:



[   54.746700] [ cut here ]
[   54.746701] BTRFS: Transaction aborted (error -28)


Transaction can't even be done due to lack of space.

[snip]




When booting to a net/livecd rescue
First I run a check with repair:



enabling repair mode
Checking filesystem on /dev/vg0/root
UUID: 9b591b6b-6040-437e-9398-6883ca3bf1bb
checking extents
Fixed 0 roots.
checking free space cache
cache and super generation don't match, space cache will be invalidated
checking fs roots
reset nbytes for ino 6228034 root 5


It's a minor problem.
So the fs itself is still pretty health.


checking csums
checking root refs
found 664259596288 bytes used err is 0
total csum bytes: 619404608
total tree bytes: 4237737984
total fs tree bytes: 1692581888
total extent tree bytes: 1461665792
btree space waste bytes: 945044758
file data blocks allocated: 1568329531392
  referenced 537131163648


But then when I try to mount the fs:



[snip]


rescue kernel: 4.9.120



I've grown the blockdevice, but there is no way I can grow the fs,
it doesn't want to mount in my rescue system, and it only mounts
read-only when booting from it, so I can't do it from there either


Btrfs-progs could do it with some extra dirty work.
(I purposed offline device resize idea, but didn't implement it yet)

You could use this branch:
https://github.com/adam900710/btrfs-progs/tree/dirty_fix


Qu,

 The online resize should work here right?

Thanks, Anand



It's a quick and dirty fix to allow "btrfs-corrupt-block -X " to
extent device size to max.

Please try above command to see if it solves your problem.

Thanks,
Qu



I hope someone can help me out with this.
Thanks!





[PATCH 9/9] btrfs: add explicit check for replace result no error

2018-11-11 Thread Anand Jain
We recast the replace return status
BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no
error.
And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0,
which is also declared as 0, so we just return. Instead add it to the if
statement so that there is enough clarity while reading the code.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 40a0942b4659..cc25a34f87b0 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info 
*fs_info,

args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
-   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
-   ret = 0;
+   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+   ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
+   return 0;
 
return ret;
 }
-- 
1.8.3.1



[PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful

2018-11-11 Thread Anand Jain
In btrfs_dev_replace_cancel() we should check if the
btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return
BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to
cancel the replace again.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d32d696d931c..9fc3cb8d3918 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
btrfs_dev_replace_write_unlock(dev_replace);
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
-   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
btrfs_dev_replace_write_unlock(dev_replace);
-   btrfs_scrub_cancel(fs_info);
-   /*
-* btrfs_dev_replace_finishing() will handle the cleanup part
-*/
-   btrfs_info_in_rcu(fs_info,
-   "dev_replace from %s (devid %llu) to %s canceled",
-   btrfs_dev_name(src_device), src_device->devid,
-   btrfs_dev_name(tgt_device));
+   ret = btrfs_scrub_cancel(fs_info);
+   if (ret) {
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+   } else {
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   /*
+* btrfs_dev_replace_finishing() will handle the 
cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s 
canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   }
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
-- 
1.8.3.1



[PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state

2018-11-11 Thread Anand Jain
When the replace state is placed in the suspended state,
btrfs_scrub_cancel() should fail with -ENOTCONN as there is no
scrub running, as a safety catch check if btrfs_scrub_cancel()
returns -ENOTCONN and assert if it doesn't.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9fc3cb8d3918..1dc8e86546db 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -831,7 +831,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 
btrfs_dev_replace_write_unlock(dev_replace);
 
-   btrfs_scrub_cancel(fs_info);
+   /* scrub for replace must not be running in suspended state */
+   ret = btrfs_scrub_cancel(fs_info);
+   ASSERT(ret != -ENOTCONN);
 
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
-- 
1.8.3.1



[PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish

2018-11-11 Thread Anand Jain
When we successfully cancel the replace its scrub returns -ECANCELED,
which then passed to btrfs_dev_replace_finishing(), it cleans up based
on the scrub returned status and propagates the same -ECANCELED back
the parent function. As of now only user can cancel the replace-scrub,
so its ok to quieten the warn here.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1dc8e86546db..9031a362921a 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info 
*fs_info,
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
-   } else {
+   } else if (ret != -ECANCELED) {
WARN_ON(ret);
}
 
@@ -954,7 +954,7 @@ static int btrfs_dev_replace_kthread(void *data)
  btrfs_device_get_total_bytes(dev_replace->srcdev),
  _replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
-   WARN_ON(ret);
+   WARN_ON(ret && ret != -ECANCELED);
 
clear_bit(BTRFS_FS_EXCL_OP, _info->flags);
return 0;
-- 
1.8.3.1



[PATCH 8/9] btrfs: user requsted replace cancel is not an error

2018-11-11 Thread Anand Jain
As of now only user requested replace cancel can cancel the replace-scrub
so no need to log error for it.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9031a362921a..40a0942b4659 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -622,7 +622,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info,
src_device,
tgt_device);
} else {
-   btrfs_err_in_rcu(fs_info,
+   if (scrub_ret != -ECANCELED)
+   btrfs_err_in_rcu(fs_info,
 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 btrfs_dev_name(src_device),
 src_device->devid,
-- 
1.8.3.1



[PATCH 3/9] btrfs: replace back to suspend state if EXCL OP is running

2018-11-11 Thread Anand Jain
In a secnario where balance and replace co-exists as below,
  start balance; pause balance; start replace; reboot
and when system restarts balance restarts first and the
replace restart will fail as EXCL OP lock is already held by
the balance. If so place the replace state back to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3c29b0976087..35ce10f18607 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -897,6 +897,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
*fs_info)
 * dev-replace to start anyway.
 */
if (test_and_set_bit(BTRFS_FS_EXCL_OP, _info->flags)) {
+   btrfs_dev_replace_write_lock(dev_replace);
+   dev_replace->replace_state =
+   BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+   btrfs_dev_replace_write_unlock(dev_replace);
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
-- 
1.8.3.1



[PATCH 2/9] btrfs: replace go back to suspended if target missing

2018-11-11 Thread Anand Jain
At the time of forced unmount we place the running replace to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
back and suppose the target device is missing, then let the replace
state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state
instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any
matching scrub running as part of replace.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 59991165e126..3c29b0976087 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -884,6 +884,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
*fs_info)
   "cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
   "you may cancel the operation after 'mount -o 
degraded'");
+   dev_replace->replace_state =
+   BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
-- 
1.8.3.1



[PATCH 1/9] btrfs: mark btrfs_dev_replace_start() as static

2018-11-11 Thread Anand Jain
There isn't any other consumer other than in its own file dev-replace.c.

Signed-off-by: Anand Jain 
Reviewed-by: Nikolay Borisov 
---
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/dev-replace.h | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2aa48aecc52b..59991165e126 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -390,7 +390,7 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
 }
 
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
+static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
 {
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 795c551f5b5e..27e3bb0cca11 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -13,9 +13,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
  struct btrfs_fs_info *fs_info);
 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
-   const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
-   int read_src);
 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dev_replace_args *args);
 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
-- 
1.8.3.1



[PATCH 0/9 v2] fix replace-start and replace-cancel racing

2018-11-11 Thread Anand Jain
v1->v2:
  2/9: Drop writeback required
  3/9: Drop writeback required
  7/9: Use the condition within the WARN_ON()
  6/9: Use the condition within the ASSERT()

Replace-start and replace-cancel threads can race to create a messy
situation leading to UAF. We use the scrub code to write
the blocks on the replace target. So if we haven't have set the
replace-scrub-running yet, without this patch we just ignore the error
and free the target device. When this happens the system panics with
UAF error.

Its nice to see that btrfs_dev_replace_finishing() already handles
the ECANCELED (replace canceled) situation, but for an unknown reason
we aren't using it to cleanup the replace cancel situation, instead
we just let the replace cancel ioctl thread to cleanup the target
device and return and out of synchronous with the scrub code.

This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel()
to check if the scrub was really running. And if its not then shall
return an error to the user (replace not started error) so that user
can retry replace cancel. And uses btrfs_dev_replace_finishing() code
to cleanup after successful cancel of the replace scrub.

Further, a suspended replace, when tries to restart, and if it fails
(for example target device missing, or excl ops running) it goes to the
started state, and so the cli 'btrfs replace status /mnt' hangs with no
progress. So patches 2/9 and 3/9 fixes that.

As the originals code idea of ECANCELED was limited to the situation of
the error only and not user requested, there are unnecessary error log
and warn log which 7/9 and 8/9 patches fixes.

Patches 1/9 and 9/9 are good to have fixes. Makes a function static and
code readability good.

Testing: (I did some attempt to convert these into xfstests but need a
mechanism where kernel thread can wait for user land script. I thought
I could do it using ebfp, but needs more digging on how).
As of now hand tested with using procfs to hold kernel thread at
(wait_for_user(..)) until user land issues go.


1.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
btrfs replace cancel /btrfs
  wait_for_user_go();
btrfs replace status /btrfs

2.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
reboot
mount -o device=/dev/sdc /dev/sdb /btrfs
  wait_for_user_go();
btrfs replace status /btrfs
btrfs replace cancel /btrfs
btrfs replace status /btrfs

3.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
reboot
mount -o degraded /dev/sdb /btrfs
btrfs replace status /btrfs
btrfs replace cancel /btrfs
btrfs replace status /btrfs
umount /btrfs
mount /dev/sdb /btrfs

Anand Jain (9):
  btrfs: mark btrfs_dev_replace_start() as static
  btrfs: replace go back to suspended if target missing
  btrfs: replace back to suspend state if EXCL OP is running
  btrfs: fix UAF due to race between replace start and cancel
  btrfs: replace cancel is successful if scrub cancel is successful
  btrfs: replace's scrub must not be running in replace suspended state
  btrfs: quiten warn if the replace is canceled at finish
  btrfs: user requsted replace cancel is not an error
  btrfs: add explicit check for replace result no error

 fs/btrfs/dev-replace.c | 87 ++
 fs/btrfs/dev-replace.h |  3 --
 2 files changed, 59 insertions(+), 31 deletions(-)

-- 
1.8.3.1

>From e5a84ccf4f73ec405bc7f5ad812b04762f287e2d Mon Sep 17 00:00:00 2001
From: Anand Jain 
Date: Wed, 7 Nov 2018 18:51:19 +0800


Anand Jain (9):
  btrfs: mark btrfs_dev_replace_start() as static
  btrfs: replace go back to suspended if target missing
  btrfs: replace back to suspend state if EXCL OP is running
  btrfs: fix UAF due to race between replace start and cancel
  btrfs: replace cancel is successful if scrub cancel is successful
  btrfs: replace's scrub must not be running in replace suspended state
  btrfs: quiten warn if the replace is canceled at finish
  btrfs: user requsted replace cancel is not an error
  btrfs: add explicit check for replace result no error

 fs/btrfs/dev-replace.c | 90 ++
 fs/btrfs/dev-replace.h |  3 --
 2 files changed, 62 insertions(+), 31 deletions(-)

-- 
1.8.3.1



[PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

2018-11-11 Thread Anand Jain
replace cancel thread can race with the replace start thread and if
fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
to stop the scrub thread, so the scrub thread continue with the scrub for
replace which then shall try to write to the target device and which is
already freed by the cancel thread. Please ref to the logs below.

So scrub_setup_ctx() warns as tgtdev is null.

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
is_dev_replace)
{
::
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);  <===
sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}

[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc started
[ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc canceled
[ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.432970] Call Trace:
[ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
[ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
[ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
[ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
[ 6852.434365]  ? do_sigaction+0x7d/0x1e0
[ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
[ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435387]  ksys_ioctl+0x60/0x90
[ 6852.435663]  __x64_sys_ioctl+0x16/0x20
[ 6852.435907]  do_syscall_64+0x50/0x180
[ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters the
scrub_write_page_to_dev_replace() without the target device it panics

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
::
  bio_set_dev(bio, sbio->dev->bdev); <==

[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
00a0
::
[ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
[ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
[btrfs]
::
[ 6929.721430] Call Trace:
[ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
[ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
[ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
[ 6929.722552]  process_one_work+0x1f4/0x520
[ 6929.722805]  ? process_one_work+0x16e/0x520
[ 6929.723063]  worker_thread+0x46/0x3d0
[ 6929.723313]  kthread+0xf8/0x130
[ 6929.723544]  ? process_one_work+0x520/0x520
[ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
[ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 61 --
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 35ce10f18607..d32d696d931c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
btrfs_dev_replace_write_unlock(dev_replace);
-   goto leave;
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   tgt_device = dev_replace->tgtdev;
+   src_device = dev_replace->srcdev;
+   btrfs_dev_replace_write_unlock(dev_replace);
+   btrfs_scrub_cancel(fs_info);
+   /*
+* btrfs_dev_replace_finishing() will handle the cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+   /*
+* scrub doing the replace isn't running so do the cleanup here
+*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
-   break;
-   }
-   dev_replace->replace_state = BTRFS_IOCTL_

Re: [PATCH 2/9] btrfs: replace go back to suspended if target missing

2018-11-11 Thread Anand Jain




On 11/07/2018 08:35 PM, Nikolay Borisov wrote:



On 7.11.18 г. 13:43 ч., Anand Jain wrote:

At the time of forced unmount we place the running replace to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
back and suppose the target device is missing, then let the replace
state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state
instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any
matching scrub running as part of replace.

Signed-off-by: Anand Jain 
---
  fs/btrfs/dev-replace.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 59991165e126..47d6768a9cde 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -884,6 +884,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
*fs_info)
   "cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
   "you may cancel the operation after 'mount -o 
degraded'");
+   dev_replace->replace_state =
+   BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+   dev_replace->item_needs_writeback = 1;


Why do we need items_needs_writeback = 1 here, nothing is changed w.r.t
on-disk data?


 You are right. We don't need writeback. Will fix.

Thanks, Anand



btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}



Re: [PATCH] btrfs: Check for missing device before bio submission in btrfs_map_bio

2018-11-10 Thread Anand Jain




On 11/08/2018 10:16 PM, Nikolay Borisov wrote:

Before btrfs_map_bio submits all stripe bio it does a number of checks
to ensure the device for every stripe is present. However, it doesn't
do a DEV_STATE_MISSING check, instead this is relegated to the lower
level btrfs_schedule_bio (in the async submission case, sync submission
doesn't check DEV_STATE_MISSING at all). Additionally
btrfs_schedule_bios does the duplicate device->bdev check which has
already been performed in btrfs_map_bio.

This patch moves the DEV_STATE_MISSING check in btrfs_map_bio and
removes the duplicate device->bdev check. Doing so ensures that no bio
cloning/submission happens for both async/sync requests in the face of
missing device. This makes the async io submission path slightly shorter
in terms of instruction count. No functional changes.

Signed-off-by: Nikolay Borisov 


Reviewed-by: Anand Jain 

Thanks, Anand


---
  fs/btrfs/volumes.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 44c5e8ccb644..3312cad65209 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6106,12 +6106,6 @@ static noinline void btrfs_schedule_bio(struct 
btrfs_device *device,
int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
  
-	if (test_bit(BTRFS_DEV_STATE_MISSING, >dev_state) ||

-   !device->bdev) {
-   bio_io_error(bio);
-   return;
-   }
-
/* don't bother with additional async steps for reads, right now */
if (bio_op(bio) == REQ_OP_READ) {
btrfsic_submit_bio(bio);
@@ -6240,7 +6234,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, 
struct bio *bio,
  
  	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {

dev = bbio->stripes[dev_nr].dev;
-   if (!dev || !dev->bdev ||
+   if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
+  >dev_state) ||
(bio_op(first_bio) == REQ_OP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state))) {
bbio_error(bbio, first_bio, logical);



Re: [PATCH v15.1 00/13] Btrfs In-band De-duplication

2018-11-09 Thread Anand Jain




De-duplication must also let use cases to enable de-duplication on per 
subvolume level, using the subvolume properties. Similar to compression 
and future-encryption.


Thanks, Anand


Re: [PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state

2018-11-08 Thread Anand Jain




On 11/08/2018 04:52 PM, Nikolay Borisov wrote:



On 8.11.18 г. 10:33 ч., Anand Jain wrote:



On 11/07/2018 08:19 PM, Nikolay Borisov wrote:



On 7.11.18 г. 13:43 ч., Anand Jain wrote:

+    /* scrub for replace must not be running in suspended state */
+    if (btrfs_scrub_cancel(fs_info) != -ENOTCONN)
+    ASSERT(0);


ASSERT(btrfs_scrub_cancel(fs_info) == -ENOTCONN)



There will be substantial difference in code when compiled with and
without CONFIG_BTRFS_ASSERT [1]. That is, btrfs_scrub_cancel(fs_info)
won't be run at all,  I would like to keep it as it is.


Fair point, in that case do:

ret = btrfs_scrub_cancel(fs_info);
ASSERT(ret != -ENOTCONN);


Fixed.

Thanks, Anand


result




[1]
--
./fs/btrfs/ctree.h
#ifdef CONFIG_BTRFS_ASSERT

__cold
static inline void assfail(const char *expr, const char *file, int line)
{
     pr_err("assertion failed: %s, file: %s, line: %d\n",
    expr, file, line);
     BUG();
}

#define ASSERT(expr)    \
     (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#else
#define ASSERT(expr)    ((void)0)
#endif
---

Thanks, Anand




Re: [PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state

2018-11-08 Thread Anand Jain




On 11/07/2018 08:19 PM, Nikolay Borisov wrote:



On 7.11.18 г. 13:43 ч., Anand Jain wrote:

+   /* scrub for replace must not be running in suspended state */
+   if (btrfs_scrub_cancel(fs_info) != -ENOTCONN)
+   ASSERT(0);


ASSERT(btrfs_scrub_cancel(fs_info) == -ENOTCONN)



There will be substantial difference in code when compiled with and 
without CONFIG_BTRFS_ASSERT [1]. That is, btrfs_scrub_cancel(fs_info)

won't be run at all,  I would like to keep it as it is.

[1]
--
./fs/btrfs/ctree.h
#ifdef CONFIG_BTRFS_ASSERT

__cold
static inline void assfail(const char *expr, const char *file, int line)
{
pr_err("assertion failed: %s, file: %s, line: %d\n",
   expr, file, line);
BUG();
}

#define ASSERT(expr)\
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
#else
#define ASSERT(expr)((void)0)
#endif
---

Thanks, Anand




Re: [PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish

2018-11-08 Thread Anand Jain




On 11/07/2018 08:17 PM, Nikolay Borisov wrote:



On 7.11.18 г. 13:43 ч., Anand Jain wrote:

-   WARN_ON(ret);
+   if (ret != -ECANCELED)
+   WARN_ON(ret);


WARN_ON(ret && ret != -ECANCELED)



Will fix.
Thanks, Anand


Re: [PATCH 9/9] btrfs: add explicit check for replace result no error

2018-11-07 Thread Anand Jain




On 11/07/2018 08:15 PM, Nikolay Borisov wrote:



On 7.11.18 г. 13:43 ч., Anand Jain wrote:

We recast the replace return status
BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no
error.
And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0,
which is also declared as 0, so we just return. Instead add it to the if
statement so that there is enough clarity while reading the code.

Signed-off-by: Anand Jain 
---
  fs/btrfs/dev-replace.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cf3554554616..ca44998189c7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info 
*fs_info,

args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
-   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
-   ret = 0;
+   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+   ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)


BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR can only be returned from
btrfs_dev_replace_cancel,


 It can return 0 and which is also
 BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR which this patch makes it
 explicit.

 Looking at this again tells me that, as btrfs_dev_replace_start()
 is internal helper function, its better if it free from all usage
 of BTRFS_IOCTL_DEV_REPLACE_RESULT*.

Thanks, Anand


so what you are doing with checking ret ==
BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is redundant. So using
BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is at the very least misleading
here.



I suggest you drop this patch.


+   return 0;
  
  	return ret;

  }



[PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish

2018-11-07 Thread Anand Jain
When we successfully cancel the replace its scrub returns -ECANCELED,
which then passed to btrfs_dev_replace_finishing(), it cleans up based
on the scrub returned status and propagates the same -ECANCELED back
the parent function. So skip the -ECANCELED error to log the WARN.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index dae2b920f1a9..c14c41b70287 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info 
*fs_info,
ret = btrfs_dev_replace_finishing(fs_info, ret);
if (ret == -EINPROGRESS) {
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
-   } else {
+   } else if (ret != -ECANCELED) {
WARN_ON(ret);
}
 
@@ -956,7 +956,8 @@ static int btrfs_dev_replace_kthread(void *data)
  btrfs_device_get_total_bytes(dev_replace->srcdev),
  _replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
-   WARN_ON(ret);
+   if (ret != -ECANCELED)
+   WARN_ON(ret);
 
clear_bit(BTRFS_FS_EXCL_OP, _info->flags);
return 0;
-- 
1.8.3.1



[PATCH 8/9] btrfs: user requsted replace cancel is not an error

2018-11-07 Thread Anand Jain
As of now only user requested replace cancel can cancel the replace-scrub
so no need to log error for it.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c14c41b70287..cf3554554616 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -622,7 +622,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info,
src_device,
tgt_device);
} else {
-   btrfs_err_in_rcu(fs_info,
+   if (scrub_ret != -ECANCELED)
+   btrfs_err_in_rcu(fs_info,
 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 btrfs_dev_name(src_device),
 src_device->devid,
-- 
1.8.3.1



[PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state

2018-11-07 Thread Anand Jain
When the replace state is placed in the suspended state,
btrfs_scrub_cancel() should fail with -ENOTCONN as there is no
scrub running, as a safety catch check if btrfs_scrub_cancel()
returns -ENOTCONN and assert if it doesn't.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index c092ed559714..dae2b920f1a9 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -831,7 +831,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 
btrfs_dev_replace_write_unlock(dev_replace);
 
-   btrfs_scrub_cancel(fs_info);
+   /* scrub for replace must not be running in suspended state */
+   if (btrfs_scrub_cancel(fs_info) != -ENOTCONN)
+   ASSERT(0);
 
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
-- 
1.8.3.1



[PATCH 9/9] btrfs: add explicit check for replace result no error

2018-11-07 Thread Anand Jain
We recast the replace return status
BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no
error.
And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0,
which is also declared as 0, so we just return. Instead add it to the if
statement so that there is enough clarity while reading the code.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index cf3554554616..ca44998189c7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info 
*fs_info,

args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
-   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
-   ret = 0;
+   if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
+   ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
+   return 0;
 
return ret;
 }
-- 
1.8.3.1



[PATCH 0/9] fix replace-start and replace-cancel racing

2018-11-07 Thread Anand Jain
Replace-start and replace-cancel threads can race to create a messy
situation leading to UAF. We use the scrub code to write
the blocks on the replace target. So if we haven't have set the
replace-scrub-running yet, without this patch we just ignore the error
and free the target device. When this happens the system panics with
UAF error.

Its nice to see that btrfs_dev_replace_finishing() already handles
the ECANCELED (replace canceled) situation, but for an unknown reason
we aren't using it to cleanup the replace cancel situation, instead
we just let the replace cancel ioctl thread to cleanup the target
device and return and out of synchronous with the scrub code.

This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel()
to check if the scrub was really running. And if its not then shall
return an error to the user (replace not started error) so that user
can retry replace cancel. And uses btrfs_dev_replace_finishing() code
to cleanup after successful cancel of the replace scrub.

Further, a suspended replace, when tries to restart, and if it fails
(for example target device missing, or excl ops running) it goes to the
started state, and so the cli 'btrfs replace status /mnt' hangs with no
progress. So patches 2/9 and 3/9 fixes that.

As the originals code idea of ECANCELED was limited to the situation of
the error only and not user requested, there are unnecessary error log
and warn log which 7/9 and 8/9 patches fixes.

Patches 1/9 and 9/9 are good to have fixes. Makes a function static and
code readability good.

Testing: (I did some attempt to convert these into xfstests but need a
mechanism where kernel thread can wait for user land script. I thought
I could do it using ebfp, but needs more digging on how).
As of now hand tested with using procfs to hold kernel thread at
(wait_for_user(..)) until user land issues go.


1.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
btrfs replace cancel /btrfs
  wait_for_user_go();
btrfs replace status /btrfs

2.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
reboot
mount -o device=/dev/sdc /dev/sdb /btrfs
  wait_for_user_go();
btrfs replace status /btrfs
btrfs replace cancel /btrfs
btrfs replace status /btrfs

3.
umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount 
/dev/sdb /btrfs && fillfs /btrfs 1
btrfs replace start /dev/sdb /dev/sdc /btrfs
  wait_for_user("scrub running is set..waiting"); AND OR
  wait_for_user("scrub running is NOT set..waiting");
reboot
mount -o degraded /dev/sdb /btrfs
btrfs replace status /btrfs
btrfs replace cancel /btrfs
btrfs replace status /btrfs
umount /btrfs
mount /dev/sdb /btrfs


Anand Jain (9):
  btrfs: mark btrfs_dev_replace_start() as static
  btrfs: replace go back to suspended if target missing
  btrfs: replace back to suspend state if EXCL OP is running
  btrfs: fix UAF due to race between replace start and cancel
  btrfs: replace cancel is successful if scrub cancel is successful
  btrfs: replace's scrub must not be running in replace suspended state
  btrfs: quiten warn if the replace is canceled at finish
  btrfs: user requsted replace cancel is not an error
  btrfs: add explicit check for replace result no error

 fs/btrfs/dev-replace.c | 90 ++
 fs/btrfs/dev-replace.h |  3 --
 2 files changed, 62 insertions(+), 31 deletions(-)

-- 
1.8.3.1



[PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful

2018-11-07 Thread Anand Jain
In btrfs_dev_replace_cancel() we should check if the
btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return
BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to
cancel the replace again.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 90c124edec76..c092ed559714 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
btrfs_dev_replace_write_unlock(dev_replace);
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
-   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
btrfs_dev_replace_write_unlock(dev_replace);
-   btrfs_scrub_cancel(fs_info);
-   /*
-* btrfs_dev_replace_finishing() will handle the cleanup part
-*/
-   btrfs_info_in_rcu(fs_info,
-   "dev_replace from %s (devid %llu) to %s canceled",
-   btrfs_dev_name(src_device), src_device->devid,
-   btrfs_dev_name(tgt_device));
+   ret = btrfs_scrub_cancel(fs_info);
+   if (ret) {
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+   } else {
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   /*
+* btrfs_dev_replace_finishing() will handle the 
cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s 
canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   }
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
-- 
1.8.3.1



[PATCH 2/9] btrfs: replace go back to suspended if target missing

2018-11-07 Thread Anand Jain
At the time of forced unmount we place the running replace to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
back and suppose the target device is missing, then let the replace
state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state
instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any
matching scrub running as part of replace.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 59991165e126..47d6768a9cde 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -884,6 +884,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
*fs_info)
   "cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
   "you may cancel the operation after 'mount -o 
degraded'");
+   dev_replace->replace_state =
+   BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+   dev_replace->item_needs_writeback = 1;
btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
-- 
1.8.3.1



[PATCH 1/9] btrfs: mark btrfs_dev_replace_start() as static

2018-11-07 Thread Anand Jain
There isn't any other consumer other than in its own file dev-replace.c.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/dev-replace.h | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 2aa48aecc52b..59991165e126 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -390,7 +390,7 @@ static char* btrfs_dev_name(struct btrfs_device *device)
return rcu_str_deref(device->name);
 }
 
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
+static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
 {
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 795c551f5b5e..27e3bb0cca11 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -13,9 +13,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
  struct btrfs_fs_info *fs_info);
 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
-   const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
-   int read_src);
 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dev_replace_args *args);
 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
-- 
1.8.3.1



[PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel

2018-11-07 Thread Anand Jain
replace cancel thread can race with the replace start thread and if
fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail
to stop the scrub thread, so the scrub thread continue with the scrub for
replace which then shall try to write to the target device and which is
already freed by the cancel thread. Please ref to the logs below.

So scrub_setup_ctx() warns as tgtdev is null.

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int
is_dev_replace)
{
::
if (is_dev_replace) {
WARN_ON(!fs_info->dev_replace.tgtdev);  <===
sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
sctx->flush_all_writes = false;
}

[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc started
[ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to 
/dev/sdc canceled
[ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 
scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
::
[ 6852.432970] Call Trace:
[ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
[ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
[ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
[ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
[ 6852.434365]  ? do_sigaction+0x7d/0x1e0
[ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
[ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
[ 6852.435387]  ksys_ioctl+0x60/0x90
[ 6852.435663]  __x64_sys_ioctl+0x16/0x20
[ 6852.435907]  do_syscall_64+0x50/0x180
[ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters the
scrub_write_page_to_dev_replace() without the target device it panics

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
struct scrub_page *spage)
{
::
  bio_set_dev(bio, sbio->dev->bdev); <==

[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 
00a0
::
[ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
[ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
[btrfs]
::
[ 6929.721430] Call Trace:
[ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
[ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
[ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
[ 6929.722552]  process_one_work+0x1f4/0x520
[ 6929.722805]  ? process_one_work+0x16e/0x520
[ 6929.723063]  worker_thread+0x46/0x3d0
[ 6929.723313]  kthread+0xf8/0x130
[ 6929.723544]  ? process_one_work+0x520/0x520
[ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
[ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 61 --
 1 file changed, 39 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e001c2418940..90c124edec76 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info 
*fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
btrfs_dev_replace_write_unlock(dev_replace);
-   goto leave;
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+   result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+   tgt_device = dev_replace->tgtdev;
+   src_device = dev_replace->srcdev;
+   btrfs_dev_replace_write_unlock(dev_replace);
+   btrfs_scrub_cancel(fs_info);
+   /*
+* btrfs_dev_replace_finishing() will handle the cleanup part
+*/
+   btrfs_info_in_rcu(fs_info,
+   "dev_replace from %s (devid %llu) to %s canceled",
+   btrfs_dev_name(src_device), src_device->devid,
+   btrfs_dev_name(tgt_device));
+   break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+   /*
+* scrub doing the replace isn't running so do the cleanup here
+*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
-   break;
-   }
-   dev_replace->replace_state = BTRFS_IOCTL_

[PATCH 3/9] btrfs: replace back to suspend state if EXCL OP is running

2018-11-07 Thread Anand Jain
In a secnario where balance and replace co-exists as below,
  start balance; pause balance; start replace; reboot
and when system restarts balance restarts first and the
replace restart will fail as EXCL OP lock is already held by
the balance. If so place the replace state back to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state.

Signed-off-by: Anand Jain 
---
 fs/btrfs/dev-replace.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 47d6768a9cde..e001c2418940 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -898,6 +898,11 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info 
*fs_info)
 * dev-replace to start anyway.
 */
if (test_and_set_bit(BTRFS_FS_EXCL_OP, _info->flags)) {
+   btrfs_dev_replace_write_lock(dev_replace);
+   dev_replace->replace_state =
+   BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+   dev_replace->item_needs_writeback = 1;
+   btrfs_dev_replace_write_unlock(dev_replace);
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
-- 
1.8.3.1



[PATCH v3] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Anand Jain
We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied. So this test case reproduces this issue.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part /
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
Total devices 1 FS bytes used 1.40GiB
devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:01 14.9G  0 disk
|-sda48:414G  0 part /
|-sda28:21  500M  0 part
|-sda38:31  256M  0 part
`-sda18:11  256M  0 part
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi
btrfs fi show /
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
Total devices 1 FS bytes used 1.40GiB
devid1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
the sda out of the system on to another system to change its fsid using
the 'btrfstune -u' command.

Signed-off-by: Anand Jain 
---
v2->v3:
  Check the return code and use _fail to verify and accordingly fix golden
output.
  Rename dev_foo(bar) to device_1(2)
  Don't log dd retun to $seqres.full
 
v1->v2: 
  dont play around with dev patch use it as it is.
  do not use SCRATCH_MNT instead create it at the TEST_DIR and its related
   changes.
  golden out changes

 tests/btrfs/173 | 82 +
 tests/btrfs/173.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 85 insertions(+)
 create mode 100755 tests/btrfs/173
 create mode 100644 tests/btrfs/173.out

diff --git a/tests/btrfs/173 b/tests/btrfs/173
new file mode 100755
index ..342ae92b4781
--- /dev/null
+++ b/tests/btrfs/173
@@ -0,0 +1,82 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Oracle. All Rights Reserved.
+#
+# FS QA Test 173
+#
+# Fuzzy test for FS image duplication.
+#  Could be fixed by
+#[patch] btrfs: harden agaist duplicate fsid
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+mnt=$TEST_DIR/$seq.mnt
+_cleanup()
+{
+   rm -rf $mnt > /dev/null 2>&1
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_dev_pool 2
+_scratch_dev_pool_get 2
+
+device_1=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
+device_2=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')
+
+echo device_1=$device_1 device_2=$device_2 >> $seqres.full
+
+rm -rf $mnt > /dev/null 2>&1
+mkdir $mnt
+_mkfs_dev $device_1
+_mount $device_1 $mnt
+
+[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \
+   _fail "mounted device changed"
+
+for sb_bytenr in 65536 67108864
+do
+   echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\
+   "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full
+   dd status=none if=$device_1 of=$device_2 bs=1 seek=$sb_bytenr \
+   skip=$sb_bytenr count=4096 > /dev/null 2>&1
+   echo ..:$? >> $seqres.full
+done
+
+#Original device is mounted, scan of its clone should fail
+$BTRFS_UTIL_PROG device scan $device_2 >> $seqres.full 2>&1
+[[ $? != 1 ]] && _fail "cloned device scan should fail"
+
+[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \
+   _fail "mounted device changed"
+
+#Original device scan should be successful
+$BTRFS_UTIL_PROG device scan $device_1 >> $seqres.full 2>&1
+[[ $? != 0 ]] && \
+   _fail "if it fails here, then it means subvolume mount at boot may fail 
"\
+ &

Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Anand Jain




On 10/26/2018 11:52 PM, Nikolay Borisov wrote:



On 26.10.18 г. 18:34 ч., Anand Jain wrote:



On 10/26/2018 11:02 PM, Nikolay Borisov wrote:



On 8.10.18 г. 21:28 ч., Anand Jain wrote:

We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied. So this test case reproduces this issue.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:0    0 29.2G  0 disk
|-mmcblk0p4 179:4    0    4G  0 part /
|-mmcblk0p2 179:2    0  500M  0 part /boot
|-mmcblk0p3 179:3    0  256M  0 part [SWAP]
`-mmcblk0p1 179:1    0  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
  Total devices 1 FS bytes used 1.40GiB
  devid    1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:0    1 14.9G  0 disk
|-sda4    8:4    1    4G  0 part /
|-sda2    8:2    1  500M  0 part
|-sda3    8:3    1  256M  0 part
`-sda1    8:1    1  256M  0 part
mmcblk0 179:0    0 29.2G  0 disk
|-mmcblk0p4 179:4    0    4G  0 part
|-mmcblk0p2 179:2    0  500M  0 part /boot
|-mmcblk0p3 179:3    0  256M  0 part [SWAP]
`-mmcblk0p1 179:1    0  256M  0 part /boot/efi
btrfs fi show /
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
  Total devices 1 FS bytes used 1.40GiB
  devid    1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
the sda out of the system on to another system to change its fsid using
the 'btrfstune -u' command.


Is there a pending fix for this?


Yes.
https://patchwork.kernel.org/patch/10641041/

Test case header mentioned it.





Signed-off-by: Anand Jain 
---
v1->v2:
    dont play around with dev patch use it as it is.
    do not use SCRATCH_MNT instead create it at the TEST_DIR and its
related
     changes.
    golden out changes
       tests/btrfs/173 | 88
+
   tests/btrfs/173.out |  6 
   tests/btrfs/group   |  1 +
   3 files changed, 95 insertions(+)
   create mode 100755 tests/btrfs/173
   create mode 100644 tests/btrfs/173.out

diff --git a/tests/btrfs/173 b/tests/btrfs/173
new file mode 100755
index ..b466ae921e19
--- /dev/null
+++ b/tests/btrfs/173
@@ -0,0 +1,88 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Oracle. All Rights Reserved.
+#
+# FS QA Test 173
+#
+# Fuzzy test for FS image duplication.
+#  Could be fixed by
+#    [patch] btrfs: harden agaist duplicate fsid
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1    # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+mnt=$TEST_DIR/$seq.mnt
+_cleanup()
+{
+    rm -rf $mnt > /dev/null 2>&1
+    cd /
+    rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_dev_pool 2
+_scratch_dev_pool_get 2
+
+dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
+dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')


Naming devices _foo and _bar just shows you could not care less about
the test.


   Hmm. Will make it device_1 and device_2.


Either use sane names - primary_dev/secondary dev or device_1
and device_2.



+
+echo dev_foo=$dev_foo >> $seqres.full
+echo dev_bar=$dev_bar >> $seqres.full
+echo | tee -a $seqres.full
+
+rm -rf $mnt > /dev/null 2>&1


So what is $mnt? I can see it's used by very few tests and it's not
obvious? Generally you should either define it as a local variable or
use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked
with Eric re. the use of $mnt in tests and his conclusion is :

" looks like a bug"


  No its not. As few lines above, I have assigned it as..
     mnt=$TEST_DIR/$seq.mnt


I missed that, I will recommend moving the assignment near where you set
the devices


 cleanup() is using $mnt. cleanup() may get called for any trap before 
the actual test.


Thanks, Anand






+mkdir $mnt
+_mkfs_dev $dev_foo
+_mount $dev_foo $mnt
+
+check_btrfs_mount()
+{
+    local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}')
+    [[ $x == $dev_foo ]] && echo DEV_FOO
+    [[ $x == $dev_bar ]] && echo 

Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid

2018-10-26 Thread Anand Jain




On 10/26/2018 11:02 PM, Nikolay Borisov wrote:



On 8.10.18 г. 21:28 ч., Anand Jain wrote:

We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after its
been copied. So this test case reproduces this issue.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part /
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi

btrfs fi show
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
 Total devices 1 FS bytes used 1.40GiB
 devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda
dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using
btrfs device scan and the new device sda becomes the mounted root
device.

lsblk
NAMEMAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
sda   8:01 14.9G  0 disk
|-sda48:414G  0 part /
|-sda28:21  500M  0 part
|-sda38:31  256M  0 part
`-sda18:11  256M  0 part
mmcblk0 179:00 29.2G  0 disk
|-mmcblk0p4 179:404G  0 part
|-mmcblk0p2 179:20  500M  0 part /boot
|-mmcblk0p3 179:30  256M  0 part [SWAP]
`-mmcblk0p1 179:10  256M  0 part /boot/efi
btrfs fi show /
Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
 Total devices 1 FS bytes used 1.40GiB
 devid1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take
the sda out of the system on to another system to change its fsid using
the 'btrfstune -u' command.


Is there a pending fix for this?


Yes.
https://patchwork.kernel.org/patch/10641041/

Test case header mentioned it.





Signed-off-by: Anand Jain 
---
v1->v2:
   dont play around with dev patch use it as it is.
   do not use SCRATCH_MNT instead create it at the TEST_DIR and its related
changes.
   golden out changes

  tests/btrfs/173 | 88 +

  tests/btrfs/173.out |  6 
  tests/btrfs/group   |  1 +
  3 files changed, 95 insertions(+)
  create mode 100755 tests/btrfs/173
  create mode 100644 tests/btrfs/173.out

diff --git a/tests/btrfs/173 b/tests/btrfs/173
new file mode 100755
index ..b466ae921e19
--- /dev/null
+++ b/tests/btrfs/173
@@ -0,0 +1,88 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Oracle. All Rights Reserved.
+#
+# FS QA Test 173
+#
+# Fuzzy test for FS image duplication.
+#  Could be fixed by
+#[patch] btrfs: harden agaist duplicate fsid
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+mnt=$TEST_DIR/$seq.mnt
+_cleanup()
+{
+   rm -rf $mnt > /dev/null 2>&1
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch_dev_pool 2
+_scratch_dev_pool_get 2
+
+dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}')
+dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}')


Naming devices _foo and _bar just shows you could not care less about
the test.


  Hmm. Will make it device_1 and device_2.


Either use sane names - primary_dev/secondary dev or device_1
and device_2.



+
+echo dev_foo=$dev_foo >> $seqres.full
+echo dev_bar=$dev_bar >> $seqres.full
+echo | tee -a $seqres.full
+
+rm -rf $mnt > /dev/null 2>&1


So what is $mnt? I can see it's used by very few tests and it's not
obvious? Generally you should either define it as a local variable or
use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked
with Eric re. the use of $mnt in tests and his conclusion is :

" looks like a bug"


 No its not. As few lines above, I have assigned it as..
mnt=$TEST_DIR/$seq.mnt



+mkdir $mnt
+_mkfs_dev $dev_foo
+_mount $dev_foo $mnt
+
+check_btrfs_mount()
+{
+   local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}')
+   [[ $x == $dev_foo ]] && echo DEV_FOO
+   [[ $x == $dev_bar ]] && echo DEV_BAR
+}


Same thing here re. DEV_(FOO|BAR).


+
+echo MNT $(check_btrfs_mount)
+
+for sb_bytenr in 65536 67108864
+do
+   echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\
+   "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full

  1   2   3   4   5   6   7   8   9   10   >