[PATCH RESEND v12] btrfs-progs: add cli to forget one or all scanned devices
This patch adds cli btrfs device forget [dev] to remove the given device structure in the kernel if the device is unmounted. If no argument is given it shall remove all stale (device which are not mounted) from the kernel. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- v11->v12: none. v1->v12: pls ref to the cover-letter. cmds-device.c | 72 --- ioctl.h | 2 ++ 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/cmds-device.c b/cmds-device.c index 2a05f70a76a9..280d6f555377 100644 --- a/cmds-device.c +++ b/cmds-device.c @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv) return _cmd_device_remove(argc, argv, cmd_device_delete_usage); } +static int btrfs_forget_devices(char *path) +{ + struct btrfs_ioctl_vol_args args; + int ret; + int fd; + + fd = open("/dev/btrfs-control", O_RDWR); + if (fd < 0) + return -errno; + + memset(, 0, sizeof(args)); + if (path) + strncpy_null(args.name, path); + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, ); + if (ret) + ret = -errno; + close(fd); + return ret; +} + static const char * const cmd_device_scan_usage[] = { - "btrfs device scan [(-d|--all-devices)| [...]]", - "Scan devices for a btrfs filesystem", + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\ + "[...]]", + "Scan or forget (deregister) devices for a btrfs filesystem", " -d|--all-devices (deprecated)", + " -u|--forget [ ..]", NULL }; @@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv) int devstart; int all = 0; int ret = 0; + int forget = 0; optind = 0; while (1) { int c; static const struct option long_options[] = { { "all-devices", no_argument, NULL, 'd'}, + { "forget", no_argument, NULL, 'u'}, { NULL, 0, NULL, 0} }; - c = getopt_long(argc, argv, "d", long_options, NULL); + c = getopt_long(argc, argv, "du", long_options, NULL); if (c < 0) break; switch (c) { case 'd': all = 1; break; + case 'u': + forget = 1; + break; default: usage(cmd_device_scan_usage); } } devstart = optind; + if (all && forget) + usage(cmd_device_scan_usage); + if (all && check_argc_max(argc - optind, 1)) usage(cmd_device_scan_usage); if (all || argc - optind == 0) { - printf("Scanning for Btrfs filesystems\n"); - ret = btrfs_scan_devices(); - error_on(ret, "error %d while scanning", ret); - ret = btrfs_register_all_devices(); - error_on(ret, "there are %d errors while registering devices", ret); + if (forget) { + ret = btrfs_forget_devices(NULL); + error_on(ret, "'%s', forget failed", +strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems\n"); + ret = btrfs_scan_devices(); + error_on(ret, "error %d while scanning", ret); + ret = btrfs_register_all_devices(); + error_on(ret, + "there are %d errors while registering devices", + ret); + } goto out; } @@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv) ret = 1; goto out; } - printf("Scanning for Btrfs filesystems in '%s'\n", path); - if (btrfs_register_one_device(path) != 0) { - ret = 1; - free(path); - goto out; + if (forget) { + ret = btrfs_forget_devices(path); + if (ret) + error("Can't forget '%s': %s", + path, strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems in '%s'\n", +
[PATCH RESEND] fstests: btrfs use forget if not reload
[I will send this the xfstest ML after kernel and progs patch has been integrated]. btrfs reload was introduced to cleanup the device list inside the btrfs kernel module. The problem with the reload approach is that you can't run btrfs test cases 124,125, 154 and 164 on the system with btrfs as root fs. Now as we are introducing the btrfs forget feature as an btrfs device scan option [1], so here are its pertaining changes in the fstests. So these changes ensures we use the forget feature where available and if its not then falls back to the reload approach. [1] btrfs-progs: add cli to forget one or all scanned devices btrfs: introduce feature to forget a btrfs device Signed-off-by: Anand Jain --- common/btrfs| 20 tests/btrfs/124 | 6 +++--- tests/btrfs/125 | 6 +++--- tests/btrfs/154 | 6 +++--- tests/btrfs/164 | 4 ++-- 5 files changed, 31 insertions(+), 11 deletions(-) diff --git a/common/btrfs b/common/btrfs index 26dc0bb9600f..c7fbec11c8c1 100644 --- a/common/btrfs +++ b/common/btrfs @@ -374,3 +374,23 @@ _scratch_btrfs_sectorsize() $BTRFS_UTIL_PROG inspect-internal dump-super $SCRATCH_DEV |\ grep sectorsize | awk '{print $2}' } + +_btrfs_supports_forget() +{ + $BTRFS_UTIL_PROG device scan --help | grep -wq forget && \ + $BTRFS_UTIL_PROG device scan --forget > /dev/null 2>&1 +} + +_require_btrfs_forget_if_not_fs_loadable() +{ + _btrfs_supports_forget && return + + _require_loadable_fs_module "btrfs" +} + +_btrfs_forget_if_not_fs_reload() +{ + _btrfs_supports_forget && return + + _reload_fs_module "btrfs" +} diff --git a/tests/btrfs/124 b/tests/btrfs/124 index ce3ad6aa3a58..edbeff1443f5 100755 --- a/tests/btrfs/124 +++ b/tests/btrfs/124 @@ -51,7 +51,7 @@ _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 2 _test_unmount -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 2 @@ -86,7 +86,7 @@ echo "clean btrfs ko" >> $seqres.full _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload echo >> $seqres.full echo "-Write degraded mount fill upto $max_fs_sz bytes-" >> $seqres.full @@ -125,7 +125,7 @@ echo echo "Mount degraded with the other dev" _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded $dev2 $SCRATCH_MNT >>$seqres.full 2>&1 _run_btrfs_util_prog filesystem show checkpoint3=`md5sum $SCRATCH_MNT/tf2` diff --git a/tests/btrfs/125 b/tests/btrfs/125 index e38de264b28e..9161a2ddeaad 100755 --- a/tests/btrfs/125 +++ b/tests/btrfs/125 @@ -50,7 +50,7 @@ _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 3 _test_unmount -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 3 @@ -102,7 +102,7 @@ echo "unmount" >> $seqres.full _scratch_unmount echo "clean btrfs ko" >> $seqres.full # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded,device=$dev2 $dev1 $SCRATCH_MNT >>$seqres.full 2>&1 dd if=/dev/zero of="$SCRATCH_MNT"/tf2 bs=$bs count=$count \ >>$seqres.full 2>&1 @@ -138,7 +138,7 @@ echo "Mount degraded but with other dev" _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded,device=${dev2} $dev3 $SCRATCH_MNT >>$seqres.full 2>&1 diff --git a/tests/btrfs/154 b/tests/btrfs/154 index 99ea232aba4c..01745d20e9fc 100755 --- a/tests/btrfs/154 +++ b/tests/btrfs/154 @@ -36,7 +36,7 @@ rm -f $seqres.full _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 2 -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 2 @@ -90,7 +90,7 @@ degrade_mount_write() echo "clean btrfs ko" >> $seqres.full # un-scan the btrfs devices - _reload_fs_module "btrfs" + _btrfs_forget_if_not_fs_reload _mount -o degraded $DEV1 $SCRATCH_MNT >>$seqres.full 2>&1 cnt=$(( $COUNT/10 )) dd if=/dev/urandom of="$SCRATCH_MNT"/tf1 bs=$bs count=$cnt \ @@ -142,7 +142,7 @@ verify() echo "unmount" >> $seqres.full _scratch_unmount - _reload_fs_module "btrfs" + _btrfs_forget_if_not_fs_reload _mount -o degraded $DEV2 $SCRATCH_MNT >>$seqres.full 2>&1 verify_checkpoint1=`md5sum $SCRATCH_MNT/tf1` verify_checkpoint2=`md5sum $SCRATCH_MNT/tf2` diff --git a/tests/btrfs/164 b/tests
[PATCH RESEND v12] Add cli and ioctl to forget scanned device(s)
v12: Fixed coding style - leave space between " : ". v11: btrfs-progs: Bring the code into the else part of if(forget). Use strerror to print the erorr instead of ret. v10: Make btrfs-progs changes more readable. With an effort to keep the known bug [1] as it is.. [1] The cli 'btrfs device scan --all /dev/sdb' which should have scanned only one device, ends up scanning all the devices and I am not trying to fix this bug in this patch because.. . -d|--all is marked as deprecated, I hope -d option would go away . For now some script might be using this bug as a feature, and fixing this bug might lead to mount failure. v9: Make forget as a btrfs device scan option. Use forget in the fstests, now you can run fstests with btrfs as rootfs which helps to exercise the uuid_mutex lock. v8: Change log update in the kernel patch. v7: Use struct btrfs_ioctl_vol_args (instead of struct btrfs_ioctl_vol_args_v2) as its inline with other ioctl btrfs-control The CLI usage/features remains same. However internally the ioctl flag is not required to delete all the unmounted devices. Instead leave btrfs_ioctl_vol_args::name NULL. v6: Use the changed fn name btrfs_free_stale_devices(). Change in title: Old v5: Cover-letter: [PATCH v5] Add cli and ioctl to ignore a scanned device Kernel: [PATCH v5] btrfs: introduce feature to ignore a btrfs device Progs: [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli v5: Adds feature to delete all stale devices Reuses btrfs_free_stale_devices() fn and so depends on the patch-set [1] in the ML. Uses struct btrfs_ioctl_vol_args_v2 instead of struct btrfs_ioctl_vol_args as arg Does the device path matching instead of btrfs_device matching (we won't delete the mounted device as btrfs_free_stale_devices() checks for it) v4: No change. But as the ML thread may be confusing, so resend. v3: No change. Send to correct ML. v2: Accepts review from Nikolay, details are in the specific patch. Patch 1/2 is renamed from [PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list delete to [PATCH 1/2] btrfs: add function to device list delete Adds cli and ioctl to forget a scanned device or forget all stale devices in the kernel. Anand Jain (1): btrfs: introduce feature to forget a btrfs device fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) Anand Jain (1): btrfs-progs: add cli to forget one or all scanned devices cmds-device.c | 63 ++- ioctl.h | 2 ++ 2 files changed, 56 insertions(+), 9 deletions(-) [1] Anand Jain (1): fstests: btrfs use forget if not reload common/btrfs| 20 tests/btrfs/124 | 6 +++--- tests/btrfs/125 | 6 +++--- tests/btrfs/154 | 6 +++--- tests/btrfs/164 | 4 ++-- 5 files changed, 31 insertions(+), 11 deletions(-) -- 1.8.3.1
[PATCH RESEND] fstests: btrfs use forget if not reload
[I will send this the xfstest ML after kernel and progs patch has been integrated]. btrfs reload was introduced to cleanup the device list inside the btrfs kernel module. The problem with the reload approach is that you can't run btrfs test cases 124,125, 154 and 164 on the system with btrfs as root fs. Now as we are introducing the btrfs forget feature as an btrfs device scan option [1], so here are its pertaining changes in the fstests. So these changes ensures we use the forget feature where available and if its not then falls back to the reload approach. [1] btrfs-progs: add cli to forget one or all scanned devices btrfs: introduce feature to forget a btrfs device Signed-off-by: Anand Jain --- common/btrfs| 20 tests/btrfs/124 | 6 +++--- tests/btrfs/125 | 6 +++--- tests/btrfs/154 | 6 +++--- tests/btrfs/164 | 4 ++-- 5 files changed, 31 insertions(+), 11 deletions(-) diff --git a/common/btrfs b/common/btrfs index 26dc0bb9600f..c7fbec11c8c1 100644 --- a/common/btrfs +++ b/common/btrfs @@ -374,3 +374,23 @@ _scratch_btrfs_sectorsize() $BTRFS_UTIL_PROG inspect-internal dump-super $SCRATCH_DEV |\ grep sectorsize | awk '{print $2}' } + +_btrfs_supports_forget() +{ + $BTRFS_UTIL_PROG device scan --help | grep -wq forget && \ + $BTRFS_UTIL_PROG device scan --forget > /dev/null 2>&1 +} + +_require_btrfs_forget_if_not_fs_loadable() +{ + _btrfs_supports_forget && return + + _require_loadable_fs_module "btrfs" +} + +_btrfs_forget_if_not_fs_reload() +{ + _btrfs_supports_forget && return + + _reload_fs_module "btrfs" +} diff --git a/tests/btrfs/124 b/tests/btrfs/124 index ce3ad6aa3a58..edbeff1443f5 100755 --- a/tests/btrfs/124 +++ b/tests/btrfs/124 @@ -51,7 +51,7 @@ _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 2 _test_unmount -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 2 @@ -86,7 +86,7 @@ echo "clean btrfs ko" >> $seqres.full _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload echo >> $seqres.full echo "-Write degraded mount fill upto $max_fs_sz bytes-" >> $seqres.full @@ -125,7 +125,7 @@ echo echo "Mount degraded with the other dev" _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded $dev2 $SCRATCH_MNT >>$seqres.full 2>&1 _run_btrfs_util_prog filesystem show checkpoint3=`md5sum $SCRATCH_MNT/tf2` diff --git a/tests/btrfs/125 b/tests/btrfs/125 index e38de264b28e..9161a2ddeaad 100755 --- a/tests/btrfs/125 +++ b/tests/btrfs/125 @@ -50,7 +50,7 @@ _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 3 _test_unmount -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 3 @@ -102,7 +102,7 @@ echo "unmount" >> $seqres.full _scratch_unmount echo "clean btrfs ko" >> $seqres.full # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded,device=$dev2 $dev1 $SCRATCH_MNT >>$seqres.full 2>&1 dd if=/dev/zero of="$SCRATCH_MNT"/tf2 bs=$bs count=$count \ >>$seqres.full 2>&1 @@ -138,7 +138,7 @@ echo "Mount degraded but with other dev" _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded,device=${dev2} $dev3 $SCRATCH_MNT >>$seqres.full 2>&1 diff --git a/tests/btrfs/154 b/tests/btrfs/154 index 99ea232aba4c..01745d20e9fc 100755 --- a/tests/btrfs/154 +++ b/tests/btrfs/154 @@ -36,7 +36,7 @@ rm -f $seqres.full _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 2 -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 2 @@ -90,7 +90,7 @@ degrade_mount_write() echo "clean btrfs ko" >> $seqres.full # un-scan the btrfs devices - _reload_fs_module "btrfs" + _btrfs_forget_if_not_fs_reload _mount -o degraded $DEV1 $SCRATCH_MNT >>$seqres.full 2>&1 cnt=$(( $COUNT/10 )) dd if=/dev/urandom of="$SCRATCH_MNT"/tf1 bs=$bs count=$cnt \ @@ -142,7 +142,7 @@ verify() echo "unmount" >> $seqres.full _scratch_unmount - _reload_fs_module "btrfs" + _btrfs_forget_if_not_fs_reload _mount -o degraded $DEV2 $SCRATCH_MNT >>$seqres.full 2>&1 verify_checkpoint1=`md5sum $SCRATCH_MNT/tf1` verify_checkpoint2=`md5sum $SCRATCH_MNT/tf2` diff --git a/tests/btrfs/164 b/tests
[PATCH RESEND v12] btrfs: introduce feature to forget a btrfs device
Support for a new command 'btrfs dev forget [dev]' is proposed here to undo the effects of 'btrfs dev scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty. IOW(BTRFS_IOCTL_MAGIC, 5, ..) This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from the /dev/btrfs-control to forget one or all devices, (devices which are not mounted) from the btrfs kernel. The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be set to specify the device path. And all unmounted devices can be removed from the kernel if no device path is provided. Again, the devices are removed only if the relevant fsid aren't mounted. This new cli can provide.. . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the device is not going to be mounted. . Ability to mount the device in degraded mode when one of the other device is corrupted like in split brain raid1. . Running test cases which requires btrfs.ko-reload if the rootfs is btrfs. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- v11->v12: fix coding style add spacing before after ":". v1->v11: Pls ref to the cover-letter. (sorry about that). fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d3c6bbc0aa3a..eba2966913ae 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2247,6 +2247,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8e36cbb355df..48415a9edd46 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1397,6 +1397,15 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + mutex_lock(_mutex); + btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + mutex_unlock(_mutex); + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1d936ce282c3..a4a89d5050f5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -414,6 +414,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e0763bc4158e..c195896d478f 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -837,6 +837,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- 1.8.3.1
Re: [PATCH 2/2] btrfs: scrub: fix circular locking dependency warning
On 11/30/2018 09:05 AM, Anand Jain wrote: On 11/29/2018 10:31 PM, David Sterba wrote: On Wed, Nov 28, 2018 at 04:47:27PM +0800, Anand Jain wrote: 2. scrub_workers_refcnt must eventually be converted to refcount_t type ok. Added in v2 patch set. No such thing is in v2 and this would actually get rid of the need to hold scrub_lock in scrub_workers_put. Well we still need scrub_lock, because we spun scrub thread per device, and these threads can race as well. So the first scrub thread should make sure the scrub workers are allocated and keep the other threads waiting. As of now btrfs-progs serializes the scrub per device. But btrfs-progs is just one of the tool and there can be some other similar tool. We need hardening with in the kernel. Thanks, Anand Which in turn can be moved out of the locked section in btrfs_scrub_dev and the warning is gone. Problem solved. Right. When testing btrfs/011 it got hung and bisect pointed to the patch which was converting int to refcount_t. I had difficulties to get the logs out of the test machines, so I had to drop the patch. Will send refcount_t patch, patch 1/3 and possibly scrub concurrency patch (make scrub independent of the btrfs-progs locks) patches all together. Thanks, Anand
[PATCH v3 0/2] btrfs: scrub: fix scrub_lock
v3: Drops the patch [1]from this set. [1] btrfs: scrub: maintain the unlock order in scrub thread Fixes the circular locking dependency warning as in patch 1/2, and patch 2/2 adds lockdep_assert_held() to scrub_workers_get(). Anand Jain (2): btrfs: scrub: fix circular locking dependency warning btrfs: add lockdep check for scrub_lock in scrub_workers_get fs/btrfs/scrub.c | 22 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) -- 1.8.3.1
[PATCH v3 1/2] btrfs: scrub: fix circular locking dependency warning
find_held_lock+0x2d/0x90 [ 76.170450] do_vfs_ioctl+0xa9/0x6d0 [ 76.170690] ? __fget+0x101/0x1f0 [ 76.170910] ? __fget+0x5/0x1f0 [ 76.171157] ksys_ioctl+0x60/0x90 [ 76.171391] __x64_sys_ioctl+0x16/0x20 [ 76.171634] do_syscall_64+0x50/0x180 [ 76.171892] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 76.172186] RIP: 0033:0x7f61d422e567 [ 76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48 [ 76.172911] RSP: 002b:7f61d3936d68 EFLAGS: 0246 ORIG_RAX: 0010 [ 76.173328] RAX: ffda RBX: 019026b0 RCX: 7f61d422e567 [ 76.173649] RDX: 019026b0 RSI: c400941b RDI: 0003 [ 76.173909] RBP: R08: 7f61d3937700 R09: [ 76.174244] R10: 7f61d3937700 R11: 0246 R12: [ 76.174566] R13: 000000801000 R14: R15: 7f61d3937700 [ 76.175217] btrfs (4065) used greatest stack depth: 11424 bytes left Signed-off-by: Anand Jain --- v2->v3: none v1->v2: none fs/btrfs/scrub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b5a19ba38ab7..9ade0659f017 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3757,10 +3757,13 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { + lockdep_assert_held(_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) { + mutex_unlock(_info->scrub_lock); btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_parity_workers); + mutex_lock(_info->scrub_lock); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } -- 1.8.3.1
[PATCH v3 2/2] btrfs: scrub: add scrub_lock lockdep check in scrub_workers_get
scrub_workers_refcnt is protected by scrub_lock, add lockdep_assert_held() function in scrub_workers_get(). Signed-off-by: Anand Jain Suggested-by: Nikolay Borisov --- v3: none v2: born fs/btrfs/scrub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9ade0659f017..84ef1f0d371e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3726,6 +3726,8 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; + lockdep_assert_held(_info->scrub_lock); + if (fs_info->scrub_workers_refcnt == 0) { fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, is_dev_replace ? 1 : max_active, 4); -- 1.8.3.1
Re: [PATCH 2/2] btrfs: scrub: fix circular locking dependency warning
On 11/29/2018 10:31 PM, David Sterba wrote: On Wed, Nov 28, 2018 at 04:47:27PM +0800, Anand Jain wrote: 2. scrub_workers_refcnt must eventually be converted to refcount_t type ok. Added in v2 patch set. No such thing is in v2 and this would actually get rid of the need to hold scrub_lock in scrub_workers_put. Which in turn can be moved out of the locked section in btrfs_scrub_dev and the warning is gone. Problem solved. Right. When testing btrfs/011 it got hung and bisect pointed to the patch which was converting int to refcount_t. I had difficulties to get the logs out of the test machines, so I had to drop the patch. Will send refcount_t patch, patch 1/3 and possibly scrub concurrency patch (make scrub independent of the btrfs-progs locks) patches all together. Thanks, Anand
Re: [PATCH v2 1/3] btrfs: scrub: maintain the unlock order in scrub thread
On 11/29/2018 06:36 PM, Filipe Manana wrote: On Thu, Nov 29, 2018 at 9:27 AM Anand Jain wrote: The device_list_mutex and scrub_lock creates a nested locks in btrfs_scrub_dev(). During lock the order is device_list_mutex and then scrub_lock, and during unlock, the order is device_list_mutex and then scrub_lock. Fix this to the lock order of scrub_lock and then device_list_mutex. Signed-off-by: Anand Jain --- v1->v2: change the order of lock acquire first scrub_lock and then device_list_mutex, which matches with the order of unlock. The extra line which are now in the scrub_lock are ok to be under the scrub_lock. I don't get it. What problem does this patch fixes? Doesn't seem any functional fix to me, nor performance gain (by the contrary, the scrub_lock is now held for a longer time than needed), nor makes anything more readable or "beautiful". btrfs_scrub_dev() isn't following the lock and unlock FILO order. Such as lock-a lock-b .. unlock-b unlock-a. So this patch is trying to fix it. This patch fixes the order but I think you mean to say as __scrub_blocked_if_needed() calls unlock scrub_lock. oops my bad this patch is wrong. Scrub concurrency needs overhaul including the dependency on the user land btrfs-progs, which I was trying to avoid. but looks like its better to fix that as well. As of now I am NACK this patch. Thanks, Anand fs/btrfs/scrub.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..a9d6fc3b01d4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3813,28 +3813,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - + mutex_lock(_info->scrub_lock); mutex_lock(_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, >dev_state) && !is_dev_replace)) { mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return -ENODEV; } if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state)) { mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); return -EROFS; } - mutex_lock(_info->scrub_lock); if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, >dev_state) || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) { - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return -EIO; } @@ -3843,23 +3844,23 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, (!is_dev_replace && btrfs_dev_replace_is_ongoing(_info->dev_replace))) { btrfs_dev_replace_read_unlock(_info->dev_replace); - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return -EINPROGRESS; } btrfs_dev_replace_read_unlock(_info->dev_replace); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return ret; } sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); scrub_workers_put(fs_info); return PTR_ERR(sctx); } -- 1.8.3.1
[PATCH v2 2/3] btrfs: scrub: fix circular locking dependency warning
find_held_lock+0x2d/0x90 [ 76.170450] do_vfs_ioctl+0xa9/0x6d0 [ 76.170690] ? __fget+0x101/0x1f0 [ 76.170910] ? __fget+0x5/0x1f0 [ 76.171157] ksys_ioctl+0x60/0x90 [ 76.171391] __x64_sys_ioctl+0x16/0x20 [ 76.171634] do_syscall_64+0x50/0x180 [ 76.171892] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 76.172186] RIP: 0033:0x7f61d422e567 [ 76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48 [ 76.172911] RSP: 002b:7f61d3936d68 EFLAGS: 0246 ORIG_RAX: 0010 [ 76.173328] RAX: ffda RBX: 019026b0 RCX: 7f61d422e567 [ 76.173649] RDX: 019026b0 RSI: c400941b RDI: 0003 [ 76.173909] RBP: R08: 7f61d3937700 R09: [ 76.174244] R10: 7f61d3937700 R11: 0246 R12: [ 76.174566] R13: 000000801000 R14: R15: 7f61d3937700 [ 76.175217] btrfs (4065) used greatest stack depth: 11424 bytes left Signed-off-by: Anand Jain --- v1->v2: none fs/btrfs/scrub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b5a19ba38ab7..9ade0659f017 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3757,10 +3757,13 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { + lockdep_assert_held(_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) { + mutex_unlock(_info->scrub_lock); btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_parity_workers); + mutex_lock(_info->scrub_lock); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } -- 1.8.3.1
[PATCH v2 1/3] btrfs: scrub: maintain the unlock order in scrub thread
The device_list_mutex and scrub_lock creates a nested locks in btrfs_scrub_dev(). During lock the order is device_list_mutex and then scrub_lock, and during unlock, the order is device_list_mutex and then scrub_lock. Fix this to the lock order of scrub_lock and then device_list_mutex. Signed-off-by: Anand Jain --- v1->v2: change the order of lock acquire first scrub_lock and then device_list_mutex, which matches with the order of unlock. The extra line which are now in the scrub_lock are ok to be under the scrub_lock. fs/btrfs/scrub.c | 13 +++-- 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..a9d6fc3b01d4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3813,28 +3813,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } - + mutex_lock(_info->scrub_lock); mutex_lock(_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, >dev_state) && !is_dev_replace)) { mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return -ENODEV; } if (!is_dev_replace && !readonly && !test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state)) { mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); return -EROFS; } - mutex_lock(_info->scrub_lock); if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, >dev_state) || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) { - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return -EIO; } @@ -3843,23 +3844,23 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, (!is_dev_replace && btrfs_dev_replace_is_ongoing(_info->dev_replace))) { btrfs_dev_replace_read_unlock(_info->dev_replace); - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return -EINPROGRESS; } btrfs_dev_replace_read_unlock(_info->dev_replace); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); return ret; } sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); + mutex_unlock(_info->scrub_lock); scrub_workers_put(fs_info); return PTR_ERR(sctx); } -- 1.8.3.1
[PATCH v2 0/3] btrfs: scrub: fix scrub_lock
Idea was to fix the circular locking dependency warning as in patch 2/3, and in the process also fixes the other identified cleanups patches 1/3,3/3 and they aren't dependent on 2ttch /3. Anand Jain (3): btrfs: scrub: maintain the unlock order in scrub thread btrfs: scrub: fix circular locking dependency warning btrfs: add lockdep check for scrub_lock in scrub_workers_get fs/btrfs/scrub.c | 22 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) -- 1.8.3.1
[PATCH v2 3/3] btrfs: scrub: add scrub_lock lockdep check in scrub_workers_get
scrub_workers_refcnt is protected by scrub_lock, add lockdep_assert_held() function in scrub_workers_get(). Signed-off-by: Anand Jain Suggested-by: Nikolay Borisov --- v2: born fs/btrfs/scrub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9ade0659f017..84ef1f0d371e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3726,6 +3726,8 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; + lockdep_assert_held(_info->scrub_lock); + if (fs_info->scrub_workers_refcnt == 0) { fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, is_dev_replace ? 1 : max_active, 4); -- 1.8.3.1
Re: [PATCH 2/2] btrfs: scrub: fix circular locking dependency warning
On 11/26/2018 05:59 PM, Nikolay Borisov wrote: On 26.11.18 г. 11:07 ч., Anand Jain wrote: Circular locking dependency check reports warning[1], that's because the btrfs_scrub_dev() calls the stack #0 below with, the fs_info::scrub_lock held. The test case leading to this warning.. mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs btrfs scrub start -B /btrfs In fact we have fs_info::scrub_workers_refcnt to tack if the init and destroy of the scrub workers are needed. So once we have incremented and decremented the fs_info::scrub_workers_refcnt value in the thread, its ok to drop the scrub_lock, and then actually do the btrfs_destroy_workqueue() part. So this patch drops the scrub_lock before calling btrfs_destroy_workqueue(). [1] [ 76.146826] == [ 76.147086] WARNING: possible circular locking dependency detected [ 76.147316] 4.20.0-rc3+ #41 Not tainted [ 76.147489] -- [ 76.147722] btrfs/4065 is trying to acquire lock: [ 76.147984] 38593bc0 ((wq_completion)"%s-%s""btrfs", name){+.+.}, at: flush_workqueue+0x70/0x4d0 [ 76.148337] but task is already holding lock: [ 76.148594] 62392ab7 (_info->scrub_lock){+.+.}, at: btrfs_scrub_dev+0x316/0x5d0 [btrfs] [ 76.148909] which lock already depends on the new lock. [ 76.149191] the existing dependency chain (in reverse order) is: [ 76.149446] -> #3 (_info->scrub_lock){+.+.}: [ 76.149707]btrfs_scrub_dev+0x11f/0x5d0 [btrfs] [ 76.149924]btrfs_ioctl+0x1ac3/0x2d80 [btrfs] [ 76.150216]do_vfs_ioctl+0xa9/0x6d0 [ 76.150468]ksys_ioctl+0x60/0x90 [ 76.150716]__x64_sys_ioctl+0x16/0x20 [ 76.150911]do_syscall_64+0x50/0x180 [ 76.151182]entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 76.151469] -> #2 (_devs->device_list_mutex){+.+.}: [ 76.151851]reada_start_machine_worker+0xca/0x3f0 [btrfs] [ 76.152195]normal_work_helper+0xf0/0x4c0 [btrfs] [ 76.152489]process_one_work+0x1f4/0x520 [ 76.152751]worker_thread+0x46/0x3d0 [ 76.153715]kthread+0xf8/0x130 [ 76.153912]ret_from_fork+0x3a/0x50 [ 76.154178] -> #1 ((work_completion)(>normal_work)){+.+.}: [ 76.154575]worker_thread+0x46/0x3d0 [ 76.154828]kthread+0xf8/0x130 [ 76.155108]ret_from_fork+0x3a/0x50 [ 76.155357] -> #0 ((wq_completion)"%s-%s""btrfs", name){+.+.}: [ 76.155751]flush_workqueue+0x9a/0x4d0 [ 76.155911]drain_workqueue+0xca/0x1a0 [ 76.156182]destroy_workqueue+0x17/0x230 [ 76.156455]btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs] [ 76.156756]scrub_workers_put+0x2e/0x60 [btrfs] [ 76.156931]btrfs_scrub_dev+0x329/0x5d0 [btrfs] [ 76.157219]btrfs_ioctl+0x1ac3/0x2d80 [btrfs] [ 76.157491]do_vfs_ioctl+0xa9/0x6d0 [ 76.157742]ksys_ioctl+0x60/0x90 [ 76.157910]__x64_sys_ioctl+0x16/0x20 [ 76.158177]do_syscall_64+0x50/0x180 [ 76.158429]entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 76.158716] other info that might help us debug this: [ 76.158908] Chain exists of: (wq_completion)"%s-%s""btrfs", name --> _devs->device_list_mutex --> _info->scrub_lock [ 76.159629] Possible unsafe locking scenario: [ 76.160607]CPU0CPU1 [ 76.160934] [ 76.161210] lock(_info->scrub_lock); [ 76.161458] lock(_devs->device_list_mutex); [ 76.161805] lock(_info->scrub_lock); [ 76.161909] lock((wq_completion)"%s-%s""btrfs", name); [ 76.162201] *** DEADLOCK *** [ 76.162627] 2 locks held by btrfs/4065: [ 76.162897] #0: bef2775b (sb_writers#12){.+.+}, at: mnt_want_write_file+0x24/0x50 [ 76.163335] #1: 62392ab7 (_info->scrub_lock){+.+.}, at: btrfs_scrub_dev+0x316/0x5d0 [btrfs] [ 76.163796] stack backtrace: [ 76.163911] CPU: 1 PID: 4065 Comm: btrfs Not tainted 4.20.0-rc3+ #41 [ 76.164228] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 76.164646] Call Trace: [ 76.164872] dump_stack+0x5e/0x8b [ 76.165128] print_circular_bug.isra.37+0x1f1/0x1fe [ 76.165398] __lock_acquire+0x14aa/0x1620 [ 76.165652] lock_acquire+0xb0/0x190 [ 76.165910] ? flush_workqueue+0x70/0x4d0 [ 76.166175] flush_workqueue+0x9a/0x4d0 [ 76.166420] ? flush_workqueue+0x70/0x4d0 [ 76.166671] ? drain_workqueue+0x52/0x1a0 [ 76.166911] drain_workqueue+0xca/0x1a0 [ 76.167167] destroy_workqueue+0x17/0x230 [ 76.167428] btrfs_destroy_workqueue+0x5d/0x1c0 [btrfs] [ 76.167720] scrub_workers_put+0x2e/0x60 [btrfs] [ 76.168233] btrfs_scrub_dev+0x329/0x5d0 [btrfs] [ 76.168504] ? __sb_start_write+0x121/0x1b0 [ 76.168759] ? mnt_want_write_file+0x24/0x50 [ 76.169654
Re: [PATCH 1/2] btrfs: scrub: maintain the unlock order in scrub thread
On 11/26/2018 05:47 PM, Nikolay Borisov wrote: On 26.11.18 г. 11:07 ч., Anand Jain wrote: The fs_info::device_list_mutex and fs_info::scrub_lock creates a nested locks in btrfs_scrub_dev(). During the lock acquire the hierarchy is fs_info::device_list_mutex and then fs_info::scrub_lock, so following the same reverse order during unlock, that is fs_info::scrub_lock and then fs_info::device_list_mutex. Signed-off-by: Anand Jain --- fs/btrfs/scrub.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..b1c2d1cdbd4b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3865,7 +3865,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } sctx->readonly = readonly; dev->scrub_ctx = sctx; - mutex_unlock(_info->fs_devices->device_list_mutex); /* * checking @scrub_pause_req here, we can avoid @@ -3875,15 +3874,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, atomic_inc(_info->scrubs_running); mutex_unlock(_info->scrub_lock); - if (!is_dev_replace) { - /* -* by holding device list mutex, we can -* kick off writing super in log tree sync. -*/ - mutex_lock(_info->fs_devices->device_list_mutex); + /* +* by holding device list mutex, we can kick off writing super in log +* tree sync. +*/ + if (!is_dev_replace) ret = scrub_supers(sctx, dev); - mutex_unlock(_info->fs_devices->device_list_mutex); - } + + mutex_unlock(_info->fs_devices->device_list_mutex); Have you considered whether this change will have any negative impact due to the fact that __scrtub_blocked_if_needed can go to sleep for arbitrary time with device_list_mutex held now ? You are right. I missed that point. The device_list_mutex must not be blocked. In fact here we don't need the nested device_list_mutex and scrub_lock at all. I have comeup with a new fix [1] below separating them. [1] - diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..db895ad23eda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3830,42 +3830,37 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EROFS; } - mutex_lock(_info->scrub_lock); if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, >dev_state) || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) { - mutex_unlock(_info->scrub_lock); mutex_unlock(_info->fs_devices->device_list_mutex); return -EIO; } + mutex_unlock(_info->fs_devices->device_list_mutex); btrfs_dev_replace_read_lock(_info->dev_replace); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(_info->dev_replace))) { btrfs_dev_replace_read_unlock(_info->dev_replace); - mutex_unlock(_info->scrub_lock); - mutex_unlock(_info->fs_devices->device_list_mutex); return -EINPROGRESS; } btrfs_dev_replace_read_unlock(_info->dev_replace); + mutex_lock(_info->scrub_lock); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { mutex_unlock(_info->scrub_lock); - mutex_unlock(_info->fs_devices->device_list_mutex); return ret; } sctx = scrub_setup_ctx(dev, is_dev_replace); if (IS_ERR(sctx)) { mutex_unlock(_info->scrub_lock); - mutex_unlock(_info->fs_devices->device_list_mutex); scrub_workers_put(fs_info); return PTR_ERR(sctx); } sctx->readonly = readonly; dev->scrub_ctx = sctx; - mutex_unlock(_info->fs_devices->device_list_mutex); /* * checking @scrub_pause_req here, we can avoid Will send v2. Thanks, Anand if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end);
[PATCH 2/2] btrfs: scrub: fix circular locking dependency warning
find_held_lock+0x2d/0x90 [ 76.170450] do_vfs_ioctl+0xa9/0x6d0 [ 76.170690] ? __fget+0x101/0x1f0 [ 76.170910] ? __fget+0x5/0x1f0 [ 76.171157] ksys_ioctl+0x60/0x90 [ 76.171391] __x64_sys_ioctl+0x16/0x20 [ 76.171634] do_syscall_64+0x50/0x180 [ 76.171892] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 76.172186] RIP: 0033:0x7f61d422e567 [ 76.172425] Code: 44 00 00 48 8b 05 29 09 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f9 08 2d 00 f7 d8 64 89 01 48 [ 76.172911] RSP: 002b:7f61d3936d68 EFLAGS: 0246 ORIG_RAX: 0010 [ 76.173328] RAX: ffda RBX: 019026b0 RCX: 7f61d422e567 [ 76.173649] RDX: 019026b0 RSI: c400941b RDI: 0003 [ 76.173909] RBP: R08: 7f61d3937700 R09: [ 76.174244] R10: 7f61d3937700 R11: 0246 R12: [ 76.174566] R13: 000000801000 R14: R15: 7f61d3937700 [ 76.175217] btrfs (4065) used greatest stack depth: 11424 bytes left Signed-off-by: Anand Jain --- fs/btrfs/scrub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b1c2d1cdbd4b..3fc31eeef2a7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3757,10 +3757,13 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { + lockdep_assert_held(_info->scrub_lock); if (--fs_info->scrub_workers_refcnt == 0) { + mutex_unlock(_info->scrub_lock); btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_parity_workers); + mutex_lock(_info->scrub_lock); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } -- 1.8.3.1
[PATCH 1/2] btrfs: scrub: maintain the unlock order in scrub thread
The fs_info::device_list_mutex and fs_info::scrub_lock creates a nested locks in btrfs_scrub_dev(). During the lock acquire the hierarchy is fs_info::device_list_mutex and then fs_info::scrub_lock, so following the same reverse order during unlock, that is fs_info::scrub_lock and then fs_info::device_list_mutex. Signed-off-by: Anand Jain --- fs/btrfs/scrub.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..b1c2d1cdbd4b 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3865,7 +3865,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } sctx->readonly = readonly; dev->scrub_ctx = sctx; - mutex_unlock(_info->fs_devices->device_list_mutex); /* * checking @scrub_pause_req here, we can avoid @@ -3875,15 +3874,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, atomic_inc(_info->scrubs_running); mutex_unlock(_info->scrub_lock); - if (!is_dev_replace) { - /* -* by holding device list mutex, we can -* kick off writing super in log tree sync. -*/ - mutex_lock(_info->fs_devices->device_list_mutex); + /* +* by holding device list mutex, we can kick off writing super in log +* tree sync. +*/ + if (!is_dev_replace) ret = scrub_supers(sctx, dev); - mutex_unlock(_info->fs_devices->device_list_mutex); - } + + mutex_unlock(_info->fs_devices->device_list_mutex); if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); -- 1.8.3.1
[PATCH 0/2] btrfs: scrub: fix scrub_lock
Anand Jain (2): btrfs: scrub: maintain the unlock order in scrub thread btrfs: scrub: fix circular locking dependency warning fs/btrfs/scrub.c | 19 ++- 1 file changed, 10 insertions(+), 9 deletions(-) -- 1.8.3.1
[PATCH v12] Add cli and ioctl to forget scanned device(s)
v12: Fixed coding style - leave space between " : ". v11: btrfs-progs: Bring the code into the else part of if(forget). Use strerror to print the erorr instead of ret. v10: Make btrfs-progs changes more readable. With an effort to keep the known bug [1] as it is.. [1] The cli 'btrfs device scan --all /dev/sdb' which should have scanned only one device, ends up scanning all the devices and I am not trying to fix this bug in this patch because.. . -d|--all is marked as deprecated, I hope -d option would go away . For now some script might be using this bug as a feature, and fixing this bug might lead to mount failure. v9: Make forget as a btrfs device scan option. Use forget in the fstests, now you can run fstests with btrfs as rootfs which helps to exercise the uuid_mutex lock. v8: Change log update in the kernel patch. v7: Use struct btrfs_ioctl_vol_args (instead of struct btrfs_ioctl_vol_args_v2) as its inline with other ioctl btrfs-control The CLI usage/features remains same. However internally the ioctl flag is not required to delete all the unmounted devices. Instead leave btrfs_ioctl_vol_args::name NULL. v6: Use the changed fn name btrfs_free_stale_devices(). Change in title: Old v5: Cover-letter: [PATCH v5] Add cli and ioctl to ignore a scanned device Kernel: [PATCH v5] btrfs: introduce feature to ignore a btrfs device Progs: [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli v5: Adds feature to delete all stale devices Reuses btrfs_free_stale_devices() fn and so depends on the patch-set [1] in the ML. Uses struct btrfs_ioctl_vol_args_v2 instead of struct btrfs_ioctl_vol_args as arg Does the device path matching instead of btrfs_device matching (we won't delete the mounted device as btrfs_free_stale_devices() checks for it) v4: No change. But as the ML thread may be confusing, so resend. v3: No change. Send to correct ML. v2: Accepts review from Nikolay, details are in the specific patch. Patch 1/2 is renamed from [PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list delete to [PATCH 1/2] btrfs: add function to device list delete Adds cli and ioctl to forget a scanned device or forget all stale devices in the kernel. Anand Jain (1): btrfs: introduce feature to forget a btrfs device fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) Anand Jain (1): btrfs-progs: add cli to forget one or all scanned devices cmds-device.c | 63 ++- ioctl.h | 2 ++ 2 files changed, 56 insertions(+), 9 deletions(-) [1] Anand Jain (1): fstests: btrfs use forget if not reload common/btrfs| 20 tests/btrfs/124 | 6 +++--- tests/btrfs/125 | 6 +++--- tests/btrfs/154 | 6 +++--- tests/btrfs/164 | 4 ++-- 5 files changed, 31 insertions(+), 11 deletions(-) -- 1.8.3.1
[PATCH] fstests: btrfs use forget if not reload
[I will send this the xfstest ML after kernel and progs patch has been integrated]. btrfs reload was introduced to cleanup the device list inside the btrfs kernel module. The problem with the reload approach is that you can't run btrfs test cases 124,125, 154 and 164 on the system with btrfs as root fs. Now as we are introducing the btrfs forget feature as an btrfs device scan option [1], so here are its pertaining changes in the fstests. So these changes ensures we use the forget feature where available and if its not then falls back to the reload approach. [1] btrfs-progs: add cli to forget one or all scanned devices btrfs: introduce feature to forget a btrfs device Signed-off-by: Anand Jain --- common/btrfs| 20 tests/btrfs/124 | 6 +++--- tests/btrfs/125 | 6 +++--- tests/btrfs/154 | 6 +++--- tests/btrfs/164 | 4 ++-- 5 files changed, 31 insertions(+), 11 deletions(-) diff --git a/common/btrfs b/common/btrfs index 26dc0bb9600f..c7fbec11c8c1 100644 --- a/common/btrfs +++ b/common/btrfs @@ -374,3 +374,23 @@ _scratch_btrfs_sectorsize() $BTRFS_UTIL_PROG inspect-internal dump-super $SCRATCH_DEV |\ grep sectorsize | awk '{print $2}' } + +_btrfs_supports_forget() +{ + $BTRFS_UTIL_PROG device scan --help | grep -wq forget && \ + $BTRFS_UTIL_PROG device scan --forget > /dev/null 2>&1 +} + +_require_btrfs_forget_if_not_fs_loadable() +{ + _btrfs_supports_forget && return + + _require_loadable_fs_module "btrfs" +} + +_btrfs_forget_if_not_fs_reload() +{ + _btrfs_supports_forget && return + + _reload_fs_module "btrfs" +} diff --git a/tests/btrfs/124 b/tests/btrfs/124 index ce3ad6aa3a58..edbeff1443f5 100755 --- a/tests/btrfs/124 +++ b/tests/btrfs/124 @@ -51,7 +51,7 @@ _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 2 _test_unmount -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 2 @@ -86,7 +86,7 @@ echo "clean btrfs ko" >> $seqres.full _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload echo >> $seqres.full echo "-Write degraded mount fill upto $max_fs_sz bytes-" >> $seqres.full @@ -125,7 +125,7 @@ echo echo "Mount degraded with the other dev" _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded $dev2 $SCRATCH_MNT >>$seqres.full 2>&1 _run_btrfs_util_prog filesystem show checkpoint3=`md5sum $SCRATCH_MNT/tf2` diff --git a/tests/btrfs/125 b/tests/btrfs/125 index e38de264b28e..9161a2ddeaad 100755 --- a/tests/btrfs/125 +++ b/tests/btrfs/125 @@ -50,7 +50,7 @@ _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 3 _test_unmount -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 3 @@ -102,7 +102,7 @@ echo "unmount" >> $seqres.full _scratch_unmount echo "clean btrfs ko" >> $seqres.full # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded,device=$dev2 $dev1 $SCRATCH_MNT >>$seqres.full 2>&1 dd if=/dev/zero of="$SCRATCH_MNT"/tf2 bs=$bs count=$count \ >>$seqres.full 2>&1 @@ -138,7 +138,7 @@ echo "Mount degraded but with other dev" _scratch_unmount # un-scan the btrfs devices -_reload_fs_module "btrfs" +_btrfs_forget_if_not_fs_reload _mount -o degraded,device=${dev2} $dev3 $SCRATCH_MNT >>$seqres.full 2>&1 diff --git a/tests/btrfs/154 b/tests/btrfs/154 index 99ea232aba4c..01745d20e9fc 100755 --- a/tests/btrfs/154 +++ b/tests/btrfs/154 @@ -36,7 +36,7 @@ rm -f $seqres.full _supported_fs btrfs _supported_os Linux _require_scratch_dev_pool 2 -_require_loadable_fs_module "btrfs" +_require_btrfs_forget_if_not_fs_loadable _scratch_dev_pool_get 2 @@ -90,7 +90,7 @@ degrade_mount_write() echo "clean btrfs ko" >> $seqres.full # un-scan the btrfs devices - _reload_fs_module "btrfs" + _btrfs_forget_if_not_fs_reload _mount -o degraded $DEV1 $SCRATCH_MNT >>$seqres.full 2>&1 cnt=$(( $COUNT/10 )) dd if=/dev/urandom of="$SCRATCH_MNT"/tf1 bs=$bs count=$cnt \ @@ -142,7 +142,7 @@ verify() echo "unmount" >> $seqres.full _scratch_unmount - _reload_fs_module "btrfs" + _btrfs_forget_if_not_fs_reload _mount -o degraded $DEV2 $SCRATCH_MNT >>$seqres.full 2>&1 verify_checkpoint1=`md5sum $SCRATCH_MNT/tf1` verify_checkpoint2=`md5sum $SCRATCH_MNT/tf2` diff --git a/tests/btrfs/164 b/tests
[PATCH v12] btrfs: introduce feature to forget a btrfs device
Support for a new command 'btrfs dev forget [dev]' is proposed here to undo the effects of 'btrfs dev scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty. IOW(BTRFS_IOCTL_MAGIC, 5, ..) This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from the /dev/btrfs-control to forget one or all devices, (devices which are not mounted) from the btrfs kernel. The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be set to specify the device path. And all unmounted devices can be removed from the kernel if no device path is provided. Again, the devices are removed only if the relevant fsid aren't mounted. This new cli can provide.. . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the device is not going to be mounted. . Ability to mount the device in degraded mode when one of the other device is corrupted like in split brain raid1. . Running test cases which requires btrfs.ko-reload if the rootfs is btrfs. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- v11->v12: fix coding style add spacing before after ":". v1->v11: Pls ref to the cover-letter. (sorry about that). fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d3c6bbc0aa3a..eba2966913ae 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2247,6 +2247,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8e36cbb355df..48415a9edd46 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1397,6 +1397,15 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + mutex_lock(_mutex); + btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + mutex_unlock(_mutex); + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1d936ce282c3..a4a89d5050f5 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -414,6 +414,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e0763bc4158e..c195896d478f 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -837,6 +837,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- 1.8.3.1
[PATCH v12] btrfs-progs: add cli to forget one or all scanned devices
This patch adds cli btrfs device forget [dev] to remove the given device structure in the kernel if the device is unmounted. If no argument is given it shall remove all stale (device which are not mounted) from the kernel. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- v11->v12: none. v1->v12: pls ref to the cover-letter. cmds-device.c | 72 --- ioctl.h | 2 ++ 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/cmds-device.c b/cmds-device.c index 2a05f70a76a9..280d6f555377 100644 --- a/cmds-device.c +++ b/cmds-device.c @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv) return _cmd_device_remove(argc, argv, cmd_device_delete_usage); } +static int btrfs_forget_devices(char *path) +{ + struct btrfs_ioctl_vol_args args; + int ret; + int fd; + + fd = open("/dev/btrfs-control", O_RDWR); + if (fd < 0) + return -errno; + + memset(, 0, sizeof(args)); + if (path) + strncpy_null(args.name, path); + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, ); + if (ret) + ret = -errno; + close(fd); + return ret; +} + static const char * const cmd_device_scan_usage[] = { - "btrfs device scan [(-d|--all-devices)| [...]]", - "Scan devices for a btrfs filesystem", + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\ + "[...]]", + "Scan or forget (deregister) devices for a btrfs filesystem", " -d|--all-devices (deprecated)", + " -u|--forget [ ..]", NULL }; @@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv) int devstart; int all = 0; int ret = 0; + int forget = 0; optind = 0; while (1) { int c; static const struct option long_options[] = { { "all-devices", no_argument, NULL, 'd'}, + { "forget", no_argument, NULL, 'u'}, { NULL, 0, NULL, 0} }; - c = getopt_long(argc, argv, "d", long_options, NULL); + c = getopt_long(argc, argv, "du", long_options, NULL); if (c < 0) break; switch (c) { case 'd': all = 1; break; + case 'u': + forget = 1; + break; default: usage(cmd_device_scan_usage); } } devstart = optind; + if (all && forget) + usage(cmd_device_scan_usage); + if (all && check_argc_max(argc - optind, 1)) usage(cmd_device_scan_usage); if (all || argc - optind == 0) { - printf("Scanning for Btrfs filesystems\n"); - ret = btrfs_scan_devices(); - error_on(ret, "error %d while scanning", ret); - ret = btrfs_register_all_devices(); - error_on(ret, "there are %d errors while registering devices", ret); + if (forget) { + ret = btrfs_forget_devices(NULL); + error_on(ret, "'%s', forget failed", +strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems\n"); + ret = btrfs_scan_devices(); + error_on(ret, "error %d while scanning", ret); + ret = btrfs_register_all_devices(); + error_on(ret, + "there are %d errors while registering devices", + ret); + } goto out; } @@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv) ret = 1; goto out; } - printf("Scanning for Btrfs filesystems in '%s'\n", path); - if (btrfs_register_one_device(path) != 0) { - ret = 1; - free(path); - goto out; + if (forget) { + ret = btrfs_forget_devices(path); + if (ret) + error("Can't forget '%s': %s", + path, strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems in '%s'\n", +
Re: [RFC] BTRFS_DEV_REPLACE_ITEM_STATE_* doesn't match with on disk
David, any comments on this please. Thanks, Anand On 11/13/2018 06:32 PM, Anand Jain wrote: David, Gentle ping. Thanks, Anand On 11/12/2018 03:50 PM, Nikolay Borisov wrote: On 12.11.18 г. 6:58 ч., Anand Jain wrote: The dev_replace_state defines are miss matched between the BTRFS_IOCTL_DEV_REPLACE_STATE_* and BTRFS_DEV_REPLACE_ITEM_STATE_* [1]. [1] - btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - The BTRFS_DEV_REPLACE_ITEM_STATE_* series is unused in both btrfs.ko and btrfs-progs, the on-disk also follows BTRFS_IOCTL_DEV_REPLACE_STATE_* (we set dev_replace->replace_state using the BTRFS_IOCTL_DEV_REPLACE_STATE_* defines and write to the on-disk). 359 btrfs_set_dev_replace_replace_state(eb, ptr, 360 dev_replace->replace_state); IMO it should be ok to delete the BTRFS_DEV_REPLACE_ITEM_STATE_* altogether? But how about the userland progs other than btrfs-progs? If not at least fix the miss match as in [2], any comments? Unfortunately you are right. This seems to stem from sloppy job back in the days of initial dev-replace support. BTRFS_DEV_REPLACE_ITEM_STATE_* were added in e922e087a35c ("Btrfs: enhance btrfs structures for device replace support"), yet they were never used. And the IOCTL_DEV_REPLACE_STATE* were added in e93c89c1 ("Btrfs: add new sources for device replace code"). It looks like the ITEM_STATE* definitions were stillborn so to speak and personally I'm in favor of removing them. They shouldn't have been merged in the first place and indeed the patch doesn't even have a Reviewed-by tag. So it originated from the, I'd say, spartan days of btrfs development... David, any code which is using BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED is inherently broken, so how about we remove those definitions, then when it's compilation is broken in the future the author will actually have a chance to fix it, though it's highly unlikely anyone is relying on those definitions. [2] -- diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index aff1356c2bb8..9ffa7534cadf 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -805,9 +805,9 @@ struct btrfs_dev_stats_item { #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 4 struct btrfs_dev_replace_item { /* -- Thanks, Anand
[PATCH RESEND v2 1/2] btrfs: quieten warn if replace is canceled at finish
When we successfully cancel the replace its scrub returns -ECANCELED, which then passed to btrfs_dev_replace_finishing(), it cleans up based on the scrub returned status and propagates the same -ECANCELED back the parent function. As of now only user can cancel the replace-scrub, so its ok to quieten the warn here. Signed-off-by: Anand Jain --- [fix: quieten spelling] v1->v2: Use the condition within the WARN_ON() fs/btrfs/dev-replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1dc8e86546db..9031a362921a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); } @@ -954,7 +954,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), _replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + WARN_ON(ret && ret != -ECANCELED); clear_bit(BTRFS_FS_EXCL_OP, _info->flags); return 0; -- 1.8.3.1
[PATCH RESEND v2 2/2] btrfs: user requsted replace cancel is not an error
As of now only user requested replace cancel can cancel the replace-scrub so no need to log error for it. Signed-off-by: Anand Jain --- v1->v2: none. fs/btrfs/dev-replace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9031a362921a..40a0942b4659 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -622,7 +622,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(fs_info, + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, -- 1.8.3.1
[PATCH RESEND 0/9 v2] fix warn_on for replace cancel
These two patches were sent as part of [PATCH 0/9 v2] fix replace-start and replace-cancel racing before but as these aren't integrated so I am sending these again. The patch [1] which is in misc-next, calls btrfs_dev_replace_finishing() after replace is canceled, so the ret argument passed to the btrfs_dev_replace_finishing() is -ECANCEL, which is not a real error. So these patches quieten the warn and error log if its -ECANCEL. These should be integrated otherwise we see the WARN_ON and btrfs_error() after replace cancel. [1] 08bdfc3a0611 btrfs: fix use-after-free due to race between replace start and cancel Anand Jain (2): btrfs: quieten warn if the replace is canceled at finish btrfs: user requsted replace cancel is not an error
[PATCH v6 1/3] btrfs: add helper function describe_block_group()
Improve on describe_relocation() add a common helper function to describe the block groups. Signed-off-by: Anand Jain Reviewed-by: David Sterba --- v5->v6: Use () in the body for the args sent in defines Use right indent to align '\' Use goto to out_overflow instead of return v4.1->v5: Initialize buf[128] to null. v4->v4.1: Use strcpy(buf, "|NONE"); as in the original v3->v4: Just pass full flag name in the define DESCRIBE_FLAG(flag,..), so that it can be used at couple of more places. Rename describe_block_groups() to btrfs_describe_block_groups(). Drop useless return u32. v3: Born. fs/btrfs/relocation.c | 30 +++--- fs/btrfs/volumes.c| 46 ++ fs/btrfs/volumes.h| 1 + 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 924116f654a1..0373b3cc1d36 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4184,37 +4184,13 @@ static struct reloc_control *alloc_reloc_control(void) static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - char buf[128]; /* prefixed by a '|' that'll be dropped */ - u64 flags = block_group->flags; + char buf[128] = {'\0'}; - /* Shouldn't happen */ - if (!flags) { - strcpy(buf, "|NONE"); - } else { - char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ - if (flags & BTRFS_BLOCK_GROUP_##f) { \ - bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ - flags &= ~BTRFS_BLOCK_GROUP_##f; \ - } - DESCRIBE_FLAG(DATA, "data"); - DESCRIBE_FLAG(SYSTEM, "system"); - DESCRIBE_FLAG(METADATA, "metadata"); - DESCRIBE_FLAG(RAID0,"raid0"); - DESCRIBE_FLAG(RAID1,"raid1"); - DESCRIBE_FLAG(DUP, "dup"); - DESCRIBE_FLAG(RAID10, "raid10"); - DESCRIBE_FLAG(RAID5,"raid5"); - DESCRIBE_FLAG(RAID6,"raid6"); - if (flags) - snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG - } + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf + 1); + block_group->key.objectid, buf); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..3ab22e7e404e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -123,6 +123,52 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(f, d)\ + do {\ + if (flags & (f)) { \ + ret = snprintf(bp, size_bp, "%s|", (d));\ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~(f); \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + +out_overflow: + return; +} + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); static i
[PATCH v6 0/3] btrfs: balance: improve kernel logs
v5->v6: Mostly the defines non functional changes. Use goto instead of return in middle of the define. Pls ref individual patches 1/3 and 2/3 for more info. v4->v5: Mainly address David review comment [1]. [1] https://patchwork.kernel.org/patch/10425987/ pls ref to individual patch 2/3 for details. v3->v4: Pls ref to individual patches. Based on misc-next. v2->v3: Inspried by describe_relocation(), improves it, makes it a helper function and use it to log the balance operations. Kernel logs are very important for the forensic investigations of the issues, these patchs make balance logs easy to review. Anand Jain (3): btrfs: add helper function describe_block_group() btrfs: balance: add args info during start and resume btrfs: balance: add kernel log for end or paused fs/btrfs/relocation.c | 30 +-- fs/btrfs/volumes.c| 216 +- fs/btrfs/volumes.h| 1 + 3 files changed, 217 insertions(+), 30 deletions(-) -- 1.8.3.1
[PATCH v6 3/3] btrfs: balance: add kernel log for end or paused
Add a kernel log when the balance ends, either for cancel or completed or if it is paused. Signed-off-by: Anand Jain --- v5->v6: Quite soul. nothing. v4->v5: nothing. v3->v4: nothing. v2->v3: nothing. v1->v2: Moved from 2/3 to 3/3 fs/btrfs/volumes.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eb18739f19ef..0684f14ff70d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4060,6 +4060,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ret = __btrfs_balance(fs_info); mutex_lock(_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); + else if (ret == -ECANCELED && atomic_read(_info->balance_cancel_req)) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + clear_bit(BTRFS_FS_BALANCE_RUNNING, _info->flags); if (bargs) { -- 1.8.3.1
[PATCH v6 2/3] btrfs: balance: add args info during start and resume
Balance arg info is an important information to be reviewed for the system audit. So this patch adds them to the kernel log. Example: ->btrfs bal start -f -mprofiles=raid1,convert=single,soft -dlimit=10..20,usage=50 /btrfs kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 -msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single Signed-off-by: Anand Jain --- v5->v6: Use () in the body for the args sent in defines Use right indent to align '\' Use goto to out_overflow instead of return (also fixes a mem leak in v5) Rename CHECK_UPDATE_BP_XXX to CHECK_APPEND_XXX v4.1->v5: Per David review comment the code.. bp += snprintf(bp, buf - bp + size_buf, "soft,"); is not safe if 'buf - bp + size_buf' becomes accidentally negative, so fix this by using following snprintf. #define CHECK_UPDATE_BP_1(a) \ do { \ ret = snprintf(bp, size_bp, a); \ if (ret < 0 || ret >= size_bp) \ goto out; \ size_bp -= ret; \ bp += ret; \ } while (0) And initialize the tmp_buf to null. v4->v4.1: Rename last missed sz to size_buf. v3->v4: Change log updated with new example. Log format is changed to almost match with the cli. snprintf drop %s for inline string. Print range as =%u..%u instead of min=%umax=%u. soft is under the args now. force is printed as -f. v2->v3: Change log updated. Make use of describe_block_group() added in 1/4 Drop using strcat instead use snprintf. Logs string format updated as shown in the example above. v1->v2: Change log update. Move adding the logs for balance complete and end to a new patch fs/btrfs/volumes.c | 163 - 1 file changed, 160 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab22e7e404e..eb18739f19ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3757,6 +3757,164 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, (bctl_arg->target & ~allowed))); } +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, +u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_APPEND_NOARG(a) \ + do {\ + ret = snprintf(bp, size_bp, (a)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_1ARG(a, v1) \ + do {\ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2) \ + do {\ + ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + CHECK_APPEND_NOARG("soft,"); + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) + CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) + CHECK_APPEND_2ARG("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + + if (flags & BTRFS_BALANCE_ARGS_DEVID) + CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) + CHECK_APPEND_2ARG("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + + if (fl
Re: [PATCH v5 2/3] btrfs: balance: add args info during start and resume
On 11/20/2018 01:07 AM, David Sterba wrote: On Wed, Nov 14, 2018 at 09:17:11PM +0800, Anand Jain wrote: Balance arg info is an important information to be reviewed for the system audit. So this patch adds them to the kernel log. Example: ->btrfs bal start -f -mprofiles=raid1,convert=single,soft -dlimit=10..20,usage=50 /btrfs kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 -msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single Signed-off-by: Anand Jain --- v4.1->v5: Per David review comment the code.. bp += snprintf(bp, buf - bp + size_buf, "soft,"); is not safe if 'buf - bp + size_buf' becomes accidentally negative, so fix this by using following snprintf. #define CHECK_UPDATE_BP_1(a) \ do { \ ret = snprintf(bp, size_bp, a); \ if (ret < 0 || ret >= size_bp) \ goto out; \ size_bp -= ret; \ bp += ret; \ } while (0) And initialize the tmp_buf to null. v4->v4.1: Rename last missed sz to size_buf. v3->v4: Change log updated with new example. Log format is changed to almost match with the cli. snprintf drop %s for inline string. Print range as =%u..%u instead of min=%umax=%u. soft is under the args now. force is printed as -f. v2->v3: Change log updated. Make use of describe_block_group() added in 1/4 Drop using strcat instead use snprintf. Logs string format updated as shown in the example above. v1->v2: Change log update. Move adding the logs for balance complete and end to a new patch fs/btrfs/volumes.c | 172 - 1 file changed, 169 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 12b3d0625c6a..8397f72bdac7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3754,6 +3754,173 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, (bctl_arg->target & ~allowed))); } +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, +u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_UPDATE_BP_1(a) \ CHECK_UPDATE_BP_NOARG and the others CHECK_UPDATE_BP_1ARG, though I'm thinking about picking another name for the macro like CHECK_APPEND_2ARG etc. I liked CHECK_APPEND_XXX I have renamed CHECK_UPDATE_BP_1 to CHECK_APPEND_NOARG CHECK_UPDATE_BP_2 to CHECK_APPEND_1ARG CHECK_UPDATE_BP_3 to CHECK_APPEND_2ARG Hope its better now. + do { \ + ret = snprintf(bp, size_bp, a); \ + if (ret < 0 || ret >= size_bp) \ + goto out; \ out_overflow and align \ to the right, please. Yep done. + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_UPDATE_BP_2(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, a, v1); \ + if (ret < 0 || ret >= size_bp) \ + goto out; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_UPDATE_BP_3(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, a, v1, v2); \ + if (ret < 0 || ret >= size_bp) \ + goto out; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_SOFT) { + CHECK_UPDATE_BP_1("soft,"); + } no { } around single statement fixed. + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_UPDATE_BP_2("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) { + CHECK_UPDATE_BP_2("usage=%llu,", bargs->usage); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) { + CHECK_UPDATE_BP_3("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + } + + if (flags & BTRFS_BALANCE_ARGS_DEVID) { + CHECK_UPDATE_BP_2("devid=%llu,", bargs->devid); + } + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) { + CHECK_UPDATE_BP_3("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + } + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) { + CHECK_UPDATE_BP_3("vrange%llu..%llu,", +
Re: [PATCH v5 1/3] btrfs: add helper function describe_block_group()
Thanks for the review.. more below. On 11/20/2018 01:02 AM, David Sterba wrote: On Wed, Nov 14, 2018 at 09:17:10PM +0800, Anand Jain wrote: Improve on describe_relocation() add a common helper function to describe the block groups. Signed-off-by: Anand Jain Reviewed-by: David Sterba --- v4.1->v5: Initialize buf[128] to null. v4->v4.1: Use strcpy(buf, "|NONE"); as in the original v3->v4: Just pass full flag name in the define DESCRIBE_FLAG(flag,..), so that it can be used at couple of more places. Rename describe_block_groups() to btrfs_describe_block_groups(). Drop useless return u32. v3: Born. fs/btrfs/relocation.c | 30 +++--- fs/btrfs/volumes.c| 43 +++ fs/btrfs/volumes.h| 1 + 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 924116f654a1..0373b3cc1d36 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4184,37 +4184,13 @@ static struct reloc_control *alloc_reloc_control(void) static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - char buf[128]; /* prefixed by a '|' that'll be dropped */ - u64 flags = block_group->flags; + char buf[128] = {'\0'}; - /* Shouldn't happen */ - if (!flags) { - strcpy(buf, "|NONE"); - } else { - char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ - if (flags & BTRFS_BLOCK_GROUP_##f) { \ - bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ - flags &= ~BTRFS_BLOCK_GROUP_##f; \ - } - DESCRIBE_FLAG(DATA, "data"); - DESCRIBE_FLAG(SYSTEM, "system"); - DESCRIBE_FLAG(METADATA, "metadata"); - DESCRIBE_FLAG(RAID0,"raid0"); - DESCRIBE_FLAG(RAID1,"raid1"); - DESCRIBE_FLAG(DUP, "dup"); - DESCRIBE_FLAG(RAID10, "raid10"); - DESCRIBE_FLAG(RAID5,"raid5"); - DESCRIBE_FLAG(RAID6,"raid6"); - if (flags) - snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG - } + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf + 1); + block_group->key.objectid, buf); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..12b3d0625c6a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -123,6 +123,49 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(f, d) \ Macro arguments should be in ( ) in the body Fixed in v6. + do { \ + if (flags & f) { \ + ret = snprintf(bp, size_bp, "%s|", d); \ + if (ret < 0 || ret >= size_bp) \ + return; \ Ok, this seems to be safe. snprintf will not write more than size_bp and this will be caught if it overflows. The return is obscured by the macro, I'd rather make it more visible that there is a potential change in the code flow. The proposed change: goto out_overflow; ... and define out_overflow at the end of function. Same for the other helper macros. makes sense. Thanks for explaining. Fixed in v6. Also please align the backslashes of the macro a few tabs to the right. Yep done in v6. Thanks, Anand + size_bp -= ret; \ + bp += ret; \ + flags &= ~f; \ + } \ + } while (0)
Re: bad tree block start, want 705757184 have 82362368
On 11/18/2018 03:56 PM, Stephan Olbrich wrote: Am Sonntag, 18. November 2018, 01:30:14 CET schrieb Qu Wenruo: Late on I got the same errors for my /home partition (on the same drive) as well. I have snapshots of all partitions on another drive made by btrbk. To get a working system, I made new (rw) snapshots of the most recent backup and setup grub and fstab, so my system would boot from the other drive. Unfortunately now I got the "bad tree block start" error again at least once in dmesg but I didn't save it and it's not in syslog :-( What I remember is, that it was followed by other btrfs error messages saying something about correcting something. And the filesystem was still read/write this time. At the moment I can't reproduce it. Today it happend again (sdb is my backup drive, which is my main drive at the moment): [ 286.325857] BTRFS error (device sdb1): bad tree block start, want 787719208960 have 11268016545161247416 [ 286.363245] BTRFS info (device sdb1): read error corrected: ino 0 off 787719208960 (dev /dev/sdb1 sector 1243815072) [ 286.364087] BTRFS info (device sdb1): read error corrected: ino 0 off 787719213056 (dev /dev/sdb1 sector 1243815080) [ 286.425946] BTRFS info (device sdb1): read error corrected: ino 0 off 787719217152 (dev /dev/sdb1 sector 1243815088) [ 286.427530] BTRFS info (device sdb1): read error corrected: ino 0 off 787719221248 (dev /dev/sdb1 sector 1243815096) Was there any hardware issues? How about the following data from the system.. btrfs fi df btrfs dev stat Thanks, Anand Is there any way to find out, which files are affected by the errors above? No files are affected, but an essential tree, extent tree, is corrupted. Normally this may prevent RW mount, and even it mounts it can still cause problem when doing any write. It could even prevent RO mount if the corrupted leaf contains block group item. But your data should be OK if there is no other corruption, and in that case btrfs-restore should work well. I don't really trust the data on the drive I'm using at the moment, as it has shown errors as well, but I have a less current backup on yet another drive but at it is a few weeks old, I don't want to use it to setup the system on the SSD again, but just copy the relevant files if possible. Or is it possible to repair the original file system? At least we need "btrfs check" output. I updated btrfs-progs and run btrfs check for / and /home No errors are found on / (sda2), but there are errors on /home ?? $ btrfs --version btrfs-progs v4.19 $ btrfs check /dev/sda2 Opening filesystem to check... Checking filesystem on /dev/sda2 UUID: 80368989-ffa8-463c-98fb-fe2e28ca7bf3 [1/7] checking root items [2/7] checking extents [3/7] checking free space cache [4/7] checking fs roots [5/7] checking only csums items (without verifying data) [6/7] checking root refs [7/7] checking quota groups skipped (not enabled on this FS) found 64816218112 bytes used, no error found total csum bytes: 59518732 total tree bytes: 2180268032 total fs tree bytes: 1965965312 total extent tree bytes: 123289600 btree space waste bytes: 478665777 file data blocks allocated: 151083261952 referenced 76879990784 This fs is completely fine, including your data. $ btrfs check /dev/sda4 Opening filesystem to check... Checking filesystem on /dev/sda4 UUID: 81c38df8-b7f9-412c-8c88-cfde8db68eb1 [1/7] checking root items [2/7] checking extents [3/7] checking free space cache [4/7] checking fs roots root 257 inode 7970563 errors 100, file extent discount Found file extent holes: start: 0, len: 20480 root 257 inode 7970564 errors 100, file extent discount Found file extent holes: start: 0, len: 77824 ERROR: errors found in fs roots These are just minor errors, won't even causing any data mismatch. So all your fses should be mostly OK. Would you please try to use v4.19-rc* to see if it changes anything? v4.19-rc1 is the only rc I found, but that is older than v4.19, right? Anyway, here is the output: $ btrfs --version btrfs-progs v4.19-rc1 $ btrfs check /dev/sda2 Opening filesystem to check... Checking filesystem on /dev/sda2 UUID: 80368989-ffa8-463c-98fb-fe2e28ca7bf3 [1/7] checking root items [2/7] checking extents [3/7] checking free space cache [4/7] checking fs roots [5/7] checking only csums items (without verifying data) [6/7] checking root refs [7/7] checking quota groups skipped (not enabled on this FS) found 64816218112 bytes used, no error found total csum bytes: 59518732 total tree bytes: 2180268032 total fs tree bytes: 1965965312 total extent tree bytes: 123289600 btree space waste bytes: 478665777 file data blocks allocated: 151083261952 referenced 76879990784 $ btrfs check /dev/sda4 Opening filesystem to check... Checking filesystem on /dev/sda4 UUID: 81c38df8-b7f9-412c-8c88-cfde8db68eb1 [1/7] checking root items [2/7] checking extents [3/7] checking free space cache [4/7] checking fs roots root 257 inode 7970563 errors
Re: [PATCH 2/5] btrfs: Refactor btrfs_can_relocate
On 10/26/2018 07:43 PM, Nikolay Borisov wrote: btrfs_can_relocate returns 0 when it concludes the given chunk can be relocated and -1 otherwise. Since this function is used as a predicated and it return a binary value it makes no sense to have it's return value as an int so change it to bool. Furthermore, remove a stale leftover comment from e6ec716f0ddb ("Btrfs: make raid attr array more readable"). Finally make the logic in the list_for_each_entry loop a bit more obvious. Up until now the fact that we are returning success (0) was a bit masked by the fact that the 0 is re-used from the return value of find_free_dev_extent. Instead, now just increment dev_nr if we find a suitable extent and explicitly set can_reloc to true if enough devices with unallocated space are present. No functional changes. Signed-off-by: Nikolay Borisov --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 39 ++- fs/btrfs/volumes.c | 3 +-- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68ca41dbbef3..06edc4f9ceb2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2680,7 +2680,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_fs_info *info); -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1febf155747..816bca482358 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9423,10 +9423,10 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) /* * checks to see if its even possible to relocate this block group. * - * @return - -1 if it's not a good idea to relocate this block group, 0 if its - * ok to go ahead and try. + * @return - false if not enough space can be found for relocation, true + * otherwise */ -int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) +bool btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_block_group_cache *block_group; @@ -9441,7 +9441,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) int debug; int index; int full = 0; - int ret = 0; + bool can_reloc = true; debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); @@ -9453,7 +9453,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) btrfs_warn(fs_info, "can't find block group for bytenr %llu", bytenr); - return -1; + return false; } min_free = btrfs_block_group_used(_group->item); @@ -9489,16 +9489,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) * group is going to be restriped, run checks against the target * profile instead of the current one. */ - ret = -1; + can_reloc = false; - /* -* index: -* 0: raid10 -* 1: raid1 -* 2: dup -* 3: raid0 -* 4: single -*/ target = get_restripe_target(fs_info, block_group->flags); if (target) { index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); @@ -9534,10 +9526,8 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) /* We need to do this so that we can look at pending chunks */ trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); + if (IS_ERR(trans)) goto out; - } mutex_lock(_info->chunk_mutex); list_for_each_entry(device, _devices->alloc_list, dev_alloc_list) { @@ -9549,18 +9539,17 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free && !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, >dev_state)) { - ret = find_free_dev_extent(trans, device, min_free, - _offset, NULL); - if (!ret) + if (!find_free_dev_extent(trans, device, min_free, + _offset, NULL)) This can return -ENOMEM; dev_nr++; - if (dev_nr >= dev_min) + if (dev_nr >= dev_min) { +
Re: [PATCH 5/5] btrfs: Replace BUG_ON with ASSERT in find_lock_delalloc_range
On 10/26/2018 07:43 PM, Nikolay Borisov wrote: lock_delalloc_pages should only return 2 values - 0 in case of success and -EAGAIN if the range of pages to be locked should be shrunk due to some of gone. Manual inspections confirms that this is indeed the case since __process_pages_contig is where lock_delalloc_pages gets its return value. The latter always returns 0 or -EAGAIN so the invariant holds. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: Anand Jain Thanks, Anand --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1a9a521aefe5..94bc53472031 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1606,6 +1606,7 @@ static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching @@ -1621,7 +1622,6 @@ static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, goto out_failed; } } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, _state);
Re: [PATCH 4/5] btrfs: Sink find_lock_delalloc_range's 'max_bytes' argument
On 10/26/2018 07:43 PM, Nikolay Borisov wrote: All callers of this function pass BTRFS_MAX_EXTENT_SIZE (128M) so let's reduce the argument count and make that a local variable. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: Anand Jain Thanks, Anand --- fs/btrfs/extent_io.c | 11 +-- fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 10 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6877a74c7469..1a9a521aefe5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1566,8 +1566,9 @@ static noinline int lock_delalloc_pages(struct inode *inode, static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; u64 found; @@ -1647,10 +1648,9 @@ static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, u64 btrfs_find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { - return find_lock_delalloc_range(inode, tree, locked_page, start, end, - max_bytes); + return find_lock_delalloc_range(inode, tree, locked_page, start, end); } #endif @@ -3233,8 +3233,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, nr_delalloc = find_lock_delalloc_range(inode, tree, page, _start, - _end, - BTRFS_MAX_EXTENT_SIZE); + _end); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 369daa5d4f73..7ec4f93caf78 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, u64 btrfs_find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes); + u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e0f4a01be14..8ea8c2aa6696 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -107,7 +107,7 @@ static int test_find_delalloc(u32 sectorsize) start = 0; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (!found) { test_err("should have found at least one delalloc"); goto out_bits; @@ -138,7 +138,7 @@ static int test_find_delalloc(u32 sectorsize) start = test_start; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (!found) { test_err("couldn't find delalloc in our range"); goto out_bits; @@ -172,7 +172,7 @@ static int test_find_delalloc(u32 sectorsize) start = test_start; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (found) { test_err("found range when we shouldn't have"); goto out_bits; @@ -193,7 +193,7 @@ static int test_find_delalloc(u32 sectorsize) start = test_start; end = 0; found = btrfs_find_lock_delalloc_range(inode, , locked_page, , -, max_bytes); +); if (!found) { test_err("didn't find our range"); goto out_bits; @@ -234,7 +234,7 @@ static int test_find_delalloc(u32 sectorsize) * tests expected behavior. */ foun
Re: [PATCH 3/5] btrfs: Remove superfluous check form btrfs_remove_chunk
On 10/26/2018 07:43 PM, Nikolay Borisov wrote: It's unnecessary to check map->stripes[i].dev for NULL given its value is already set and dereferenced above the the check. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: Anand Jain Thanks, Anand --- fs/btrfs/volumes.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dc53d94a62aa..f0db43d08456 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2797,13 +2797,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) mutex_unlock(_info->chunk_mutex); } - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - if (ret) { - mutex_unlock(_devices->device_list_mutex); - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = btrfs_update_device(trans, device); + if (ret) { + mutex_unlock(_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; } } mutex_unlock(_devices->device_list_mutex);
Re: BTRFS on production: NVR 16+ IP Cameras
On 11/16/2018 03:51 AM, Nikolay Borisov wrote: On 15.11.18 г. 20:39 ч., Juan Alberto Cirez wrote: Is BTRFS mature enough to be deployed on a production system to underpin the storage layer of a 16+ ipcameras-based NVR (or VMS if you prefer)? Based on our limited experience with BTRFS (1+ year) under the above scenario the answer seems to be no; but I wanted you ask the community at large for their experience before making a final decision to hold off on deploying BTRFS on production systems. Let us be clear: We think BTRFS has great potential, and as it matures we will continue to watch its progress, so that at some future point we can return to using it. The issue has been the myriad of problems we have encountered when deploying BTRFS as the storage fs for the NVR/VMS in cases were the camera count exceeds 10: Corrupted file systems, sudden read-only file system, re-balance kernel panics, broken partitions, etc. What version of the file system, how many of those issues (if any) have you reported to the upstream mailing list? I don't remember seeing any reports from you. One specific case saw the btrfs drive pool mounted under the /var partition so that upon installation the btrfs pool contained all files under /var; /lib/mysql as well as the video storage. Needless to say this was a catastrophe... BTRFS is not suitable for random workloads, such as databases for example. In those cases it's recommended, at the very least, to disable COW on the database files. Note, disabling CoW doesn't preclude you from creating snapshots, it just means that not every 4k write (for example) will result in a CoW operation. At the other end of the spectrum is a use case where ONLY the video storage was on the btrfs pool; but in that case, the btrfs pool became read-only suddenly and would not re-mount as rw despite all the recovery trick btrfs-tools could throw at it. This, of course prevented the NVR/VMS from recording any footage. Again, you give absolutely no information about how you have configured the filesystem or versions of the software used. So, again, the question is: is BTRFS mature enough to be used in such use case and if so, what approach can be used to mitigate such issues. And.. What is the blocksize used by the application to write into btrfs? And what type of write IO? async, directio or fsync? Thanks, Anand Thank you all for your assistance -Ryan-
Re: [PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish
On 11/15/2018 11:35 PM, David Sterba wrote: On Sun, Nov 11, 2018 at 10:22:22PM +0800, Anand Jain wrote: When we successfully cancel the replace its scrub returns -ECANCELED, which then passed to btrfs_dev_replace_finishing(), it cleans up based on the scrub returned status and propagates the same -ECANCELED back the parent function. As of now only user can cancel the replace-scrub, so its ok to quieten the warn here. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1dc8e86546db..9031a362921a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); While this looks ok, can you please rework it so there are no WARN_ON at random places in device-replace, poorly substituting error handling? The code flow in this case could be changed to make explicit checks for the know codes and then a catch-all branch like: if (ret == -EINPROGRESS) { ... } else (if == -ESOMETHINGELSE) { ... } else { unknown error, print error and do a proper cleanup } As below.. } @@ -954,7 +954,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), _replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + WARN_ON(ret && ret != -ECANCELED); This one too, thanks. btrfs_dev_scrub() can return quite a lot of errno, which is passed here through the btrfs_dev_replace_finishing(), so it won't be possible to code them all. (we use -ECANCELED only in replace and balance). Thanks, Anand
Re: [PATCH 8/9] btrfs: user requsted replace cancel is not an error
On 11/16/2018 06:29 PM, Anand Jain wrote: On 11/15/2018 11:31 PM, David Sterba wrote: On Sun, Nov 11, 2018 at 10:22:23PM +0800, Anand Jain wrote: As of now only user requested replace cancel can cancel the replace-scrub so no need to log error for it. This has probably some user visible effect or threre are steps to reproduce the message even if it's not expected (ie. the case that this patch fixes). Please update the changelog, thanks. before the patch [1] [1] btrfs: fix use-after-free due to race between replace start and cancel We used to set the replace_state to CANCELED [2] and then call btrfs_scrub_cancel(), but the problem with this approach is if the scrub isn't running yet, then things get messier. [2] - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; So to fix, [1] shall set replace_state to CANCELED only after the replace cancel thread has successfully canceled the replace using btrfs_scrub_cancel(). And about the cleanup process for the replace target.. we call btrfs_dev_replace_finishing(.. ret) after ret= btrfs_scrub_dev() now ret is -ECANCELED due to replace cancel request by the user. (its not set to -ECANCELED for any other reason). btrfs_dev_replace_finishing() has the following code [3] which it earlier blocked processing of the cleanup process after the cancel, because the replace_state was already updated to BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED. [3] -- static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { :: /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { btrfs_dev_replace_read_unlock(dev_replace); mutex_unlock(_replace->lock_finishing_cancel_unmount); return 0; } --- In fact before this patch [1] the code wasn't sync-ing the IO when replace was canceled. Which this patch also fixes by using the btrfs_dev_replace_finishing() --- btrfs_dev_replace_finishing :: /* * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ ret = btrfs_start_delalloc_roots(fs_info, -1); if (ret) { mutex_unlock(_replace->lock_finishing_cancel_unmount); return ret; } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); --- So to answer above question... these warn and error log wasn't reported before when replace was canceled and now since I am using the btrfs_dev_replace_finishing() to finish the job of cancel.. it shall be reported which is ok to quite. Do you think we still need to update the change log? OR I think more appropriately this patch should be merged with [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel -Anand HTH. Thanks, Anand
Re: [PATCH 8/9] btrfs: user requsted replace cancel is not an error
On 11/15/2018 11:31 PM, David Sterba wrote: On Sun, Nov 11, 2018 at 10:22:23PM +0800, Anand Jain wrote: As of now only user requested replace cancel can cancel the replace-scrub so no need to log error for it. This has probably some user visible effect or threre are steps to reproduce the message even if it's not expected (ie. the case that this patch fixes). Please update the changelog, thanks. before the patch [1] [1] btrfs: fix use-after-free due to race between replace start and cancel We used to set the replace_state to CANCELED [2] and then call btrfs_scrub_cancel(), but the problem with this approach is if the scrub isn't running yet, then things get messier. [2] - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; So to fix, [1] shall set replace_state to CANCELED only after the replace cancel thread has successfully canceled the replace using btrfs_scrub_cancel(). And about the cleanup process for the replace target.. we call btrfs_dev_replace_finishing(.. ret) after ret= btrfs_scrub_dev() now ret is -ECANCELED due to replace cancel request by the user. (its not set to -ECANCELED for any other reason). btrfs_dev_replace_finishing() has the following code [3] which it earlier blocked processing of the cleanup process after the cancel, because the replace_state was already updated to BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED. [3] -- static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { :: /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { btrfs_dev_replace_read_unlock(dev_replace); mutex_unlock(_replace->lock_finishing_cancel_unmount); return 0; } --- In fact before this patch [1] the code wasn't sync-ing the IO when replace was canceled. Which this patch also fixes by using the btrfs_dev_replace_finishing() --- btrfs_dev_replace_finishing :: /* * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ ret = btrfs_start_delalloc_roots(fs_info, -1); if (ret) { mutex_unlock(_replace->lock_finishing_cancel_unmount); return ret; } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); --- So to answer above question... these warn and error log wasn't reported before when replace was canceled and now since I am using the btrfs_dev_replace_finishing() to finish the job of cancel.. it shall be reported which is ok to quite. Do you think we still need to update the change log? HTH. Thanks, Anand
Re: [PATCH 0/9 v2] fix replace-start and replace-cancel racing
On 11/15/2018 11:41 PM, David Sterba wrote: On Sun, Nov 11, 2018 at 10:22:15PM +0800, Anand Jain wrote: v1->v2: 2/9: Drop writeback required 3/9: Drop writeback required 7/9: Use the condition within the WARN_ON() 6/9: Use the condition within the ASSERT() Replace-start and replace-cancel threads can race to create a messy situation leading to UAF. We use the scrub code to write the blocks on the replace target. So if we haven't have set the replace-scrub-running yet, without this patch we just ignore the error and free the target device. When this happens the system panics with UAF error. Its nice to see that btrfs_dev_replace_finishing() already handles the ECANCELED (replace canceled) situation, but for an unknown reason we aren't using it to cleanup the replace cancel situation, instead we just let the replace cancel ioctl thread to cleanup the target device and return and out of synchronous with the scrub code. This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel() to check if the scrub was really running. And if its not then shall return an error to the user (replace not started error) so that user can retry replace cancel. And uses btrfs_dev_replace_finishing() code to cleanup after successful cancel of the replace scrub. Further, a suspended replace, when tries to restart, and if it fails (for example target device missing, or excl ops running) it goes to the started state, and so the cli 'btrfs replace status /mnt' hangs with no progress. So patches 2/9 and 3/9 fixes that. As the originals code idea of ECANCELED was limited to the situation of the error only and not user requested, there are unnecessary error log and warn log which 7/9 and 8/9 patches fixes. Patches 1/9 and 9/9 are good to have fixes. Makes a function static and code readability good. Testing: (I did some attempt to convert these into xfstests but need a mechanism where kernel thread can wait for user land script. I thought I could do it using ebfp, but needs more digging on how). As of now hand tested with using procfs to hold kernel thread at (wait_for_user(..)) until user land issues go. This could be tricky to get implemented but would be of course useful. I saw the crash about once a week so will watch if this still happens. That will be nice. Anand Jain (9): btrfs: mark btrfs_dev_replace_start() as static btrfs: replace go back to suspended if target missing btrfs: replace back to suspend state if EXCL OP is running btrfs: fix UAF due to race between replace start and cancel btrfs: replace cancel is successful if scrub cancel is successful btrfs: replace's scrub must not be running in replace suspended state btrfs: quiten warn if the replace is canceled at finish btrfs: user requsted replace cancel is not an error btrfs: add explicit check for replace result no error The above is merged to misc-next, except: btrfs: quiten warn if the replace is canceled at finish btrfs: user requsted replace cancel is not an error with replies under the patches what could be improved. The changes can be sent independently if you need to do that in several patches. Thanks. We need these patch otherwise you will see WARN_ON and btrfs_err after a successful replace cancel. Will send revised patch. Thanks, Anand
Re: [PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful
On 11/15/2018 11:27 PM, David Sterba wrote: On Sun, Nov 11, 2018 at 10:22:20PM +0800, Anand Jain wrote: In btrfs_dev_replace_cancel() we should check if the btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to cancel the replace again. I've updated the subject and changelog as this was not easy to understand what's going on. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 22 +- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d32d696d931c..9fc3cb8d3918 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_replace_write_unlock(dev_replace); break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: - result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); - /* -* btrfs_dev_replace_finishing() will handle the cleanup part -*/ - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + ret = btrfs_scrub_cancel(fs_info); + if (ret) { This should be ret < 0, fixed. Ok with me. Thanks for fixing. Anand + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* +* btrfs_dev_replace_finishing() will handle the cleanup part +*/ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: /* -- 1.8.3.1
Re: [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel
On 11/15/2018 11:25 PM, David Sterba wrote: On Thu, Nov 15, 2018 at 03:00:21PM +0100, David Sterba wrote: On Wed, Nov 14, 2018 at 09:28:34AM +0800, Anand Jain wrote: mutex_unlock(_replace->lock_finishing_cancel_unmount); return result; There's a compiler warning: fs/btrfs/dev-replace.c: In function ‘btrfs_dev_replace_cancel’: fs/btrfs/dev-replace.c:865:9: warning: ‘result’ may be used uninitialized in this function [-Wmaybe-uninitialized] return result; ^~ I haven't looked closer though it looks valid. int result; is assigned within switch(), so there isn't actual problem. The warning is there because switch (dev_replace->replace_state) does not have a default: case that would catch the values outside of what's defined by the enum. So in that case result would have undefined value. But will initialize the result to -EINVAL to quite the compiler. Sending v3. I don't see any change in the followup version. https://patchwork.kernel.org/patch/10681939/ Looks like I didn't run git commit --amend, now pulled your changes from misc-next. I've added default: result = -EINVAL; to the end of the switch. Ok. misc-next looks good to me. Thanks, Anand
Re: [PATCH RFC] btrfs: harden agaist duplicate fsid
On 11/13/2018 11:47 PM, Anand Jain wrote: On 11/13/2018 11:31 PM, David Sterba wrote: On Mon, Oct 01, 2018 at 09:31:04PM +0800, Anand Jain wrote: + /* + * we are going to replace the device path, make sure its the + * same device if the device mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(_devices->device_list_mutex); + return ERR_PTR(-EEXIST); I have installed ubuntu with btrfs as root and with a subvol to be mounted at boot. Its working fine. Thanks, Anand
Re: [PATCH] btrfs: introduce feature to forget a btrfs device
On 11/14/2018 07:28 PM, Filipe Manana wrote: On Wed, Nov 14, 2018 at 11:15 AM Filipe Manana wrote: On Wed, Nov 14, 2018 at 9:14 AM Anand Jain wrote: Support for a new command 'btrfs dev forget [dev]' is proposed here to undo the effects of 'btrfs dev scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty. IOW(BTRFS_IOCTL_MAGIC, 5, ..) This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from the /dev/btrfs-control to forget one or all devices, (devices which are not mounted) from the btrfs kernel. The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be set to specify the device path. And all unmounted devices can be removed from the kernel if no device path is provided. Again, the devices are removed only if the relevant fsid aren't mounted. This new cli can provide.. . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the device is not going to be mounted. . Ability to mount the device in degraded mode when one of the other device is corrupted like in split brain raid1. . Running test cases which requires btrfs.ko-reload if the rootfs is btrfs. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 345c64d810d4..f99db6899004 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..e1365a122657 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + mutex_lock(_mutex); + btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL); One space before : and another one after it please. Now the more important: don't use strlen, use strnlen. Some malicious or sloppy user might have passed a non-null terminated string, you don't want strlen to go past the limits of btrfs_ioctl_vol_args for obvious reasons. In fact that's a problem for the entire use of vol->name in btrfs_control_ioctl. The name's last byte should be set to '\0' to avoid issues. I'll send a fix for that, so if David fixes the white spaces on commit there's no need for a v12. Ok. Thanks. David, wonder if white spaces can be fixed when integrating? Thanks, Anand Also, please, not just to make a maintainer's life easier, but current and future reviewers, add the patch version to each patch's subject and not just the cover letter. Also list (after ---) what changes between each patch version in the patch itself and not the cover letter. V12, here we go. + mutex_unlock(_mutex); + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..180297d04938 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5ca1d21fc4a7..b1be7f828cb4 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -836,6 +836,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- 1.8.3.1 -- Filipe David Manana, “Whether you think you can, or you think
Re: [PATCH] btrfs: introduce feature to forget a btrfs device
On 11/14/2018 07:15 PM, Filipe Manana wrote: On Wed, Nov 14, 2018 at 9:14 AM Anand Jain wrote: Support for a new command 'btrfs dev forget [dev]' is proposed here to undo the effects of 'btrfs dev scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty. IOW(BTRFS_IOCTL_MAGIC, 5, ..) This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from the /dev/btrfs-control to forget one or all devices, (devices which are not mounted) from the btrfs kernel. The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be set to specify the device path. And all unmounted devices can be removed from the kernel if no device path is provided. Again, the devices are removed only if the relevant fsid aren't mounted. This new cli can provide.. . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the device is not going to be mounted. . Ability to mount the device in degraded mode when one of the other device is corrupted like in split brain raid1. . Running test cases which requires btrfs.ko-reload if the rootfs is btrfs. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 345c64d810d4..f99db6899004 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..e1365a122657 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + mutex_lock(_mutex); + btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL); One space before : and another one after it please. will fix. Now the more important: don't use strlen, use strnlen. Some malicious or sloppy user might have passed a non-null terminated string, you don't want strlen to go past the limits of btrfs_ioctl_vol_args for obvious reasons. Makes sense. Will fix. Also, please, not just to make a maintainer's life easier, but current and future reviewers, add the patch version to each patch's subject and not just the cover letter. Also list (after ---) what changes between each patch version in the patch itself and not the cover letter. Sure. Thanks for the feedback. -Anand V12, here we go. + mutex_unlock(_mutex); + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..180297d04938 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5ca1d21fc4a7..b1be7f828cb4 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -836,6 +836,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- 1.8.3.1
Re: [PATCH] Btrfs: ensure path name is null terminated at btrfs_control_ioctl
On 11/14/2018 07:35 PM, fdman...@kernel.org wrote: From: Filipe Manana We were using the path name received from user space without checking that it is null terminated. While btrfs-progs is well behaved and does proper validation and null termination, someone could call the ioctl and pass a non-null terminated patch, leading to buffer overrun problems in the kernel. So just set the last byte of the path to a null character, similar to what we do in other ioctls (add/remove/resize device, snapshot creation, etc). Signed-off-by: Filipe Manana Reviewed-by: Anand Jain Thanks, Anand --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6601c9aa5e35..8ad145820ea8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2235,6 +2235,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, vol = memdup_user((void __user *)arg, sizeof(*vol)); if (IS_ERR(vol)) return PTR_ERR(vol); + vol->name[BTRFS_PATH_NAME_MAX] = '\0'; switch (cmd) { case BTRFS_IOC_SCAN_DEV:
Re: [PATCH] btrfs: Fix suspicious RCU usage warning in device_list_add
On 11/14/2018 03:24 PM, Lu Fengqi wrote: = WARNING: suspicious RCU usage 4.20.0-rc2+ #23 Tainted: G O - fs/btrfs/volumes.c:886 suspicious rcu_dereference_check() usage! Use btrfs_info_in_rcu instead of pr_info for the required lock/unlock of RCU string. Fixes: 1f265fc6f58b ("btrfs: harden agaist duplicate fsid on scanned devices") Signed-off-by: Lu Fengqi Thanks Lu. I missed it. Reviewed-by: Anand Jain --- fs/btrfs/volumes.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2186300bab91..6039ae5c549e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -873,15 +873,15 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (device->bdev != path_bdev) { bdput(path_bdev); mutex_unlock(_devices->device_list_mutex); - pr_warn( - "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n", + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s\n", disk_super->fsid, devid, rcu_str_deref(device->name), path); return ERR_PTR(-EEXIST); } bdput(path_bdev); - pr_info( - "BTRFS: device fsid %pU devid %llu moved old:%s new:%s\n", + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s\n", disk_super->fsid, devid, rcu_str_deref(device->name), path); }
[PATCH v5 3/3] btrfs: balance: add kernel log for end or paused
Add a kernel log when the balance ends, either for cancel or completed or if it is paused. Signed-off-by: Anand Jain --- v4->v5: nothing. v3->v4: nothing. v2->v3: nothing. v1->v2: Moved from 2/3 to 3/3 fs/btrfs/volumes.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8397f72bdac7..93f1683f8f5c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4066,6 +4066,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ret = __btrfs_balance(fs_info); mutex_lock(_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); + else if (ret == -ECANCELED && atomic_read(_info->balance_cancel_req)) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + clear_bit(BTRFS_FS_BALANCE_RUNNING, _info->flags); if (bargs) { -- 1.8.3.1
[PATCH v5 2/3] btrfs: balance: add args info during start and resume
Balance arg info is an important information to be reviewed for the system audit. So this patch adds them to the kernel log. Example: ->btrfs bal start -f -mprofiles=raid1,convert=single,soft -dlimit=10..20,usage=50 /btrfs kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 -msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single Signed-off-by: Anand Jain --- v4.1->v5: Per David review comment the code.. bp += snprintf(bp, buf - bp + size_buf, "soft,"); is not safe if 'buf - bp + size_buf' becomes accidentally negative, so fix this by using following snprintf. #define CHECK_UPDATE_BP_1(a) \ do { \ ret = snprintf(bp, size_bp, a); \ if (ret < 0 || ret >= size_bp) \ goto out; \ size_bp -= ret; \ bp += ret; \ } while (0) And initialize the tmp_buf to null. v4->v4.1: Rename last missed sz to size_buf. v3->v4: Change log updated with new example. Log format is changed to almost match with the cli. snprintf drop %s for inline string. Print range as =%u..%u instead of min=%umax=%u. soft is under the args now. force is printed as -f. v2->v3: Change log updated. Make use of describe_block_group() added in 1/4 Drop using strcat instead use snprintf. Logs string format updated as shown in the example above. v1->v2: Change log update. Move adding the logs for balance complete and end to a new patch fs/btrfs/volumes.c | 172 - 1 file changed, 169 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 12b3d0625c6a..8397f72bdac7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3754,6 +3754,173 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, (bctl_arg->target & ~allowed))); } +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, +u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_UPDATE_BP_1(a) \ + do { \ + ret = snprintf(bp, size_bp, a); \ + if (ret < 0 || ret >= size_bp) \ + goto out; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_UPDATE_BP_2(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, a, v1); \ + if (ret < 0 || ret >= size_bp) \ + goto out; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_UPDATE_BP_3(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, a, v1, v2); \ + if (ret < 0 || ret >= size_bp) \ + goto out; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_SOFT) { + CHECK_UPDATE_BP_1("soft,"); + } + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_UPDATE_BP_2("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) { + CHECK_UPDATE_BP_2("usage=%llu,", bargs->usage); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) { + CHECK_UPDATE_BP_3("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + } + + if (flags & BTRFS_BALANCE_ARGS_DEVID) { + CHECK_UPDATE_BP_2("devid=%llu,", bargs->devid); + } + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) { + CHECK_UPDATE_BP_3("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + } + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) { + CHECK_UPDATE_BP_3("vrange%llu..%llu,", + bargs->vstart, bargs->vend); + } + + if (flags & BTRFS_BALANCE_ARGS_LIMIT) { + CHECK_UPDATE_BP_2("limit=%llu,", bargs->limit); + } + + if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) { + CHECK_UPDATE_BP_3("limit=%u..%u,", + bargs->limit_min, bargs->limit_max); + } + + if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) { + CHECK_UPDATE_BP_3("stripes=%u..%u,", + ba
[PATCH v5 1/3] btrfs: add helper function describe_block_group()
Improve on describe_relocation() add a common helper function to describe the block groups. Signed-off-by: Anand Jain Reviewed-by: David Sterba --- v4.1->v5: Initialize buf[128] to null. v4->v4.1: Use strcpy(buf, "|NONE"); as in the original v3->v4: Just pass full flag name in the define DESCRIBE_FLAG(flag,..), so that it can be used at couple of more places. Rename describe_block_groups() to btrfs_describe_block_groups(). Drop useless return u32. v3: Born. fs/btrfs/relocation.c | 30 +++--- fs/btrfs/volumes.c| 43 +++ fs/btrfs/volumes.h| 1 + 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 924116f654a1..0373b3cc1d36 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4184,37 +4184,13 @@ static struct reloc_control *alloc_reloc_control(void) static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - char buf[128]; /* prefixed by a '|' that'll be dropped */ - u64 flags = block_group->flags; + char buf[128] = {'\0'}; - /* Shouldn't happen */ - if (!flags) { - strcpy(buf, "|NONE"); - } else { - char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ - if (flags & BTRFS_BLOCK_GROUP_##f) { \ - bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ - flags &= ~BTRFS_BLOCK_GROUP_##f; \ - } - DESCRIBE_FLAG(DATA, "data"); - DESCRIBE_FLAG(SYSTEM, "system"); - DESCRIBE_FLAG(METADATA, "metadata"); - DESCRIBE_FLAG(RAID0,"raid0"); - DESCRIBE_FLAG(RAID1,"raid1"); - DESCRIBE_FLAG(DUP, "dup"); - DESCRIBE_FLAG(RAID10, "raid10"); - DESCRIBE_FLAG(RAID5,"raid5"); - DESCRIBE_FLAG(RAID6,"raid6"); - if (flags) - snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG - } + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf + 1); + block_group->key.objectid, buf); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..12b3d0625c6a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -123,6 +123,49 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(f, d) \ + do { \ + if (flags & f) { \ + ret = snprintf(bp, size_bp, "%s|", d); \ + if (ret < 0 || ret >= size_bp) \ + return; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~f; \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ +} + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..3e914effcdf6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -430,6 +430,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_arg
[PATCH v5 0/3] btrfs: balance: improve kernel logs
v4->v5: Mainly address David review comment [1]. [1] https://patchwork.kernel.org/patch/10425987/ pls ref to individual patch 2/3 for details. v3->v4: Pls ref to individual patches. Based on misc-next. v2->v3: Inspried by describe_relocation(), improves it, makes it a helper function and use it to log the balance operations. Kernel logs are very important for the forensic investigations of the issues, these patchs make balance logs easy to review. Anand Jain (3): btrfs: add helper function describe_block_group() btrfs: balance: add args info during start and resume btrfs: balance: add kernel log for end or paused fs/btrfs/relocation.c | 30 +-- fs/btrfs/volumes.c| 222 +- fs/btrfs/volumes.h| 1 + 3 files changed, 223 insertions(+), 30 deletions(-) -- 1.8.3.1
Re: [PATCH v4.1b 2/3] btrfs: balance: add args info during start and resume
On 05/31/2018 05:47 PM, David Sterba wrote: On Fri, May 25, 2018 at 11:05:47AM +0800, Anand Jain wrote: Balance arg info is an important information to be reviewed for the system audit. So this patch adds them to the kernel log. Example: ->btrfs bal start -f -mprofiles=raid1,convert=single,soft -dlimit=10..20,usage=50 /btrfs kernel: BTRFS info (device sdb): balance: start -f -dusage=50,limit=10..20 -msoft,profiles=raid1,convert=single -ssoft,profiles=raid1,convert=single Signed-off-by: Anand Jain --- v4->v4.1b: Rename last missed sz to size_buf. Please send a full version instead of the incremental bumps to individual patches. I try to take the last version but am never sure what exactly gets merged. +static u32 describe_balance_args(struct btrfs_balance_args *bargs, char *buf, +u32 size_buf) +{ + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128]; + + if (!flags) + return 0; + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + bp += snprintf(bp, buf - bp + size_buf, "soft,"); I have some suspicion that this snprintf conscturct is not safe when the value of 'buf - bp + size_buf' becomes accidentally negative. A quick test showed that a sequence of snprintf and incremented bp does not stop writing when the size_buf limit is hit and happily overwrites the memory. We'd have to check that the negative value is never passed to snprintf. The patches are potponed until the issue is resolved, either to verify that it's a false alert or the bounds are correctly checked. The bug should not be in the original code as the buffer is known to be large enough for all the printed bg flags. Agreed, possibilities of accidental -ve value for snprintf. It followed the original code, but as you said the data length is more deterministic there. I am fixing this in both new and old original code. Pls find v5. Sorry for the delay, I took some time to get a fresh look at it. Thanks, Anand I'm sorry to delay the patches, the log messages are useful, but I want to be sure that there are no random memory overwrites introduced. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs-progs: add cli to forget one or all scanned devices
This patch adds cli btrfs device forget [dev] to remove the given device structure in the kernel if the device is unmounted. If no argument is given it shall remove all stale (device which are not mounted) from the kernel. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- cmds-device.c | 72 --- ioctl.h | 2 ++ 2 files changed, 61 insertions(+), 13 deletions(-) diff --git a/cmds-device.c b/cmds-device.c index 2a05f70a76a9..280d6f555377 100644 --- a/cmds-device.c +++ b/cmds-device.c @@ -254,10 +254,32 @@ static int cmd_device_delete(int argc, char **argv) return _cmd_device_remove(argc, argv, cmd_device_delete_usage); } +static int btrfs_forget_devices(char *path) +{ + struct btrfs_ioctl_vol_args args; + int ret; + int fd; + + fd = open("/dev/btrfs-control", O_RDWR); + if (fd < 0) + return -errno; + + memset(, 0, sizeof(args)); + if (path) + strncpy_null(args.name, path); + ret = ioctl(fd, BTRFS_IOC_FORGET_DEV, ); + if (ret) + ret = -errno; + close(fd); + return ret; +} + static const char * const cmd_device_scan_usage[] = { - "btrfs device scan [(-d|--all-devices)| [...]]", - "Scan devices for a btrfs filesystem", + "btrfs device scan [(-d|--all-devices)|(-u|--forget)| "\ + "[...]]", + "Scan or forget (deregister) devices for a btrfs filesystem", " -d|--all-devices (deprecated)", + " -u|--forget [ ..]", NULL }; @@ -267,37 +289,53 @@ static int cmd_device_scan(int argc, char **argv) int devstart; int all = 0; int ret = 0; + int forget = 0; optind = 0; while (1) { int c; static const struct option long_options[] = { { "all-devices", no_argument, NULL, 'd'}, + { "forget", no_argument, NULL, 'u'}, { NULL, 0, NULL, 0} }; - c = getopt_long(argc, argv, "d", long_options, NULL); + c = getopt_long(argc, argv, "du", long_options, NULL); if (c < 0) break; switch (c) { case 'd': all = 1; break; + case 'u': + forget = 1; + break; default: usage(cmd_device_scan_usage); } } devstart = optind; + if (all && forget) + usage(cmd_device_scan_usage); + if (all && check_argc_max(argc - optind, 1)) usage(cmd_device_scan_usage); if (all || argc - optind == 0) { - printf("Scanning for Btrfs filesystems\n"); - ret = btrfs_scan_devices(); - error_on(ret, "error %d while scanning", ret); - ret = btrfs_register_all_devices(); - error_on(ret, "there are %d errors while registering devices", ret); + if (forget) { + ret = btrfs_forget_devices(NULL); + error_on(ret, "'%s', forget failed", +strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems\n"); + ret = btrfs_scan_devices(); + error_on(ret, "error %d while scanning", ret); + ret = btrfs_register_all_devices(); + error_on(ret, + "there are %d errors while registering devices", + ret); + } goto out; } @@ -315,11 +353,19 @@ static int cmd_device_scan(int argc, char **argv) ret = 1; goto out; } - printf("Scanning for Btrfs filesystems in '%s'\n", path); - if (btrfs_register_one_device(path) != 0) { - ret = 1; - free(path); - goto out; + if (forget) { + ret = btrfs_forget_devices(path); + if (ret) + error("Can't forget '%s': %s", + path, strerror(-ret)); + } else { + printf("Scanning for Btrfs filesystems in '%s'\n", + path); + if (btrfs_register_on
[PATCH RESEND v11] Add cli and ioctl to forget scanned device(s)
v11: btrfs-progs: Bring the code into the else part of if(forget). Use strerror to print the erorr instead of ret. v10: Make btrfs-progs changes more readable. With an effort to keep the known bug [1] as it is.. [1] The cli 'btrfs device scan --all /dev/sdb' which should have scanned only one device, ends up scanning all the devices and I am not trying to fix this bug in this patch because.. . -d|--all is marked as deprecated, I hope -d option would go away . For now some script might be using this bug as a feature, and fixing this bug might lead to mount failure. v9: Make forget as a btrfs device scan option. Use forget in the fstests, now you can run fstests with btrfs as rootfs which helps to exercise the uuid_mutex lock. v8: Change log update in the kernel patch. v7: Use struct btrfs_ioctl_vol_args (instead of struct btrfs_ioctl_vol_args_v2) as its inline with other ioctl btrfs-control The CLI usage/features remains same. However internally the ioctl flag is not required to delete all the unmounted devices. Instead leave btrfs_ioctl_vol_args::name NULL. v6: Use the changed fn name btrfs_free_stale_devices(). Change in title: Old v5: Cover-letter: [PATCH v5] Add cli and ioctl to ignore a scanned device Kernel: [PATCH v5] btrfs: introduce feature to ignore a btrfs device Progs: [PATCH v5] btrfs-progs: add 'btrfs device ignore' cli v5: Adds feature to delete all stale devices Reuses btrfs_free_stale_devices() fn and so depends on the patch-set [1] in the ML. Uses struct btrfs_ioctl_vol_args_v2 instead of struct btrfs_ioctl_vol_args as arg Does the device path matching instead of btrfs_device matching (we won't delete the mounted device as btrfs_free_stale_devices() checks for it) v4: No change. But as the ML thread may be confusing, so resend. v3: No change. Send to correct ML. v2: Accepts review from Nikolay, details are in the specific patch. Patch 1/2 is renamed from [PATCH 1/2] btrfs: refactor btrfs_free_stale_device() to get device list delete to [PATCH 1/2] btrfs: add function to device list delete Adds cli and ioctl to forget a scanned device or forget all stale devices in the kernel. Anand Jain (1): btrfs: introduce feature to forget a btrfs device fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) Anand Jain (1): btrfs-progs: add cli to forget one or all scanned devices cmds-device.c | 63 ++- ioctl.h | 2 ++ 2 files changed, 56 insertions(+), 9 deletions(-) [1] Anand Jain (1): fstests: btrfs use forget if not reload common/btrfs| 20 tests/btrfs/124 | 6 +++--- tests/btrfs/125 | 6 +++--- tests/btrfs/154 | 6 +++--- tests/btrfs/164 | 4 ++-- 5 files changed, 31 insertions(+), 11 deletions(-) -- 1.8.3.1
[PATCH] btrfs: introduce feature to forget a btrfs device
Support for a new command 'btrfs dev forget [dev]' is proposed here to undo the effects of 'btrfs dev scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty. IOW(BTRFS_IOCTL_MAGIC, 5, ..) This patch adds new ioctl BTRFS_IOC_FORGET_DEV which can be sent from the /dev/btrfs-control to forget one or all devices, (devices which are not mounted) from the btrfs kernel. The argument it takes is struct btrfs_ioctl_vol_args, and ::name can be set to specify the device path. And all unmounted devices can be removed from the kernel if no device path is provided. Again, the devices are removed only if the relevant fsid aren't mounted. This new cli can provide.. . Release of unwanted btrfs_fs_devices and btrfs_devices memory if the device is not going to be mounted. . Ability to mount the device in degraded mode when one of the other device is corrupted like in split brain raid1. . Running test cases which requires btrfs.ko-reload if the rootfs is btrfs. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 9 + fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 15 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 345c64d810d4..f99db6899004 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2246,6 +2246,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..e1365a122657 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1208,6 +1208,15 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + mutex_lock(_mutex); + btrfs_free_stale_devices(strlen(path) ? path:NULL, NULL); + mutex_unlock(_mutex); + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..180297d04938 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -406,6 +406,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5ca1d21fc4a7..b1be7f828cb4 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -836,6 +836,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- 1.8.3.1
[PATCH 4/9 v2.1] btrfs: fix UAF due to race between replace start and cancel
replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail to stop the scrub thread, so the scrub thread continue with the scrub for replace which then shall try to write to the target device and which is already freed by the cancel thread. Please ref to the logs below. So scrub_setup_ctx() warns as tgtdev is null. static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { :: if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters the scrub_write_page_to_dev_replace() without the target device it panics static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { :: bio_set_dev(bio, sbio->dev->bdev); <== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00a0 :: [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] :: [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain --- v2->v2.1: Fix compiler warning. (I couldn't reproduce on gcc 4.8.5) fs/btrfs/dev-replace.c: In function ‘btrfs_dev_replace_cancel’: fs/btrfs/dev-replace.c:865:9: warning: ‘result’ may be used uninitialized in this function [-Wmaybe-uninitialized] return result; ^~ v1->v2: pls ref to the cover letter fs/btrfs/dev-replace.c | 61 -- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 35ce10f18607..d32d696d931c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + /* +* btrfs_dev_replace_finishing() will handle the cleanup part +*/ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* +* scrub doing the replace isn't running so do the cleanup here +*/
Re: [PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel
On 11/14/2018 01:24 AM, David Sterba wrote: On Sun, Nov 11, 2018 at 10:22:19PM +0800, Anand Jain wrote: replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail to stop the scrub thread, so the scrub thread continue with the scrub for replace which then shall try to write to the target device and which is already freed by the cancel thread. Please ref to the logs below. So scrub_setup_ctx() warns as tgtdev is null. static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { :: if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters the scrub_write_page_to_dev_replace() without the target device it panics static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { :: bio_set_dev(bio, sbio->dev->bdev); <== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00a0 :: [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] :: [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 61 -- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 35ce10f18607..d32d696d931c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + /* +* btrfs_dev_replace_finishing() will handle the cleanup part +*/ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* +* scrub doing the replace isn't running so do the cleanup here +*/ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL;
Re: [PATCH V7] Btrfs: enhance raid1/10 balance heuristic
I am ok with the least used path approach here for the IO routing that's probably most reasonable in generic configurations. It can be default read mirror policy as well. But as I mentioned. Not all configurations would agree to the heuristic approach here. For example: To make use of the SAN storage cache to get high IO throughput read must access based on the LBA, And this heuristic would make matter worst. There are plans to add more options read_mirror_policy [1]. [1] https://patchwork.kernel.org/patch/10403299/ I would rather provide the configuration tune-ables to the use cases rather than fixing it using heuristic. Heuristic are good only with the known set of IO pattern for which heuristic is designed for. This is not the first time you are assuming heuristic would provide the best possible performance in all use cases. As I mentioned in the compression heuristic there was no problem statement that you wanted to address using heuristic, theoretically the integrated compression heuristic would have to do a lot more computation when all the file-extents are compressible, its not convenience to me how compression heuristic would help on a desktop machine where most of the files are compressible. IMO heuristic are good only for a set of types of workload. Giving an option to move away from it for the manual tuning is desired. Thanks, Anand On 11/12/2018 07:58 PM, Timofey Titovets wrote: From: Timofey Titovets Currently btrfs raid1/10 balancer bаlance requests to mirrors, based on pid % num of mirrors. Make logic understood: - if one of underline devices are non rotational - Queue length to underline devices By default try use pid % num_mirrors guessing, but: - If one of mirrors are non rotational, repick optimal to it - If underline mirror have less queue length then optimal, repick to that mirror For avoid round-robin request balancing, lets round down queue length: - By 8 for rotational devs - By 2 for all non rotational devs Some bench results from mail list (Dmitrii Tcvetkov ): Benchmark summary (arithmetic mean of 3 runs): Mainline Patch RAID1 | 18.9 MiB/s | 26.5 MiB/s RAID10 | 30.7 MiB/s | 30.7 MiB/s mainline, fio got lucky to read from first HDD (quite slow HDD): Jobs: 1 (f=1): [r(1)][100.0%][r=8456KiB/s,w=0KiB/s][r=264,w=0 IOPS] read: IOPS=265, BW=8508KiB/s (8712kB/s)(499MiB/60070msec) lat (msec): min=2, max=825, avg=60.17, stdev=65.06 mainline, fio got lucky to read from second HDD (much more modern): Jobs: 1 (f=1): [r(1)][8.7%][r=11.9MiB/s,w=0KiB/s][r=380,w=0 IOPS] read: IOPS=378, BW=11.8MiB/s (12.4MB/s)(710MiB/60051msec) lat (usec): min=416, max=644286, avg=42312.74, stdev=48518.56 mainline, fio got lucky to read from an SSD: Jobs: 1 (f=1): [r(1)][100.0%][r=436MiB/s,w=0KiB/s][r=13.9k,w=0 IOPS] read: IOPS=13.9k, BW=433MiB/s (454MB/s)(25.4GiB/60002msec) lat (usec): min=343, max=16319, avg=1152.52, stdev=245.36 With the patch, 2 HDDs: Jobs: 1 (f=1): [r(1)][100.0%][r=17.5MiB/s,w=0KiB/s][r=560,w=0 IOPS] read: IOPS=560, BW=17.5MiB/s (18.4MB/s)(1053MiB/60052msec) lat (usec): min=435, max=341037, avg=28511.64, stdev=3.14 With the patch, HDD(old one)+SSD: Jobs: 1 (f=1): [r(1)][100.0%][r=371MiB/s,w=0KiB/s][r=11.9k,w=0 IOPS] read: IOPS=11.6k, BW=361MiB/s (379MB/s)(21.2GiB/60084msec) lat (usec): min=363, max=346752, avg=1381.73, stdev=6948.32 Changes: v1 -> v2: - Use helper part_in_flight() from genhd.c to get queue length - Move guess code to guess_optimal() - Change balancer logic, try use pid % mirror by default Make balancing on spinning rust if one of underline devices are overloaded v2 -> v3: - Fix arg for RAID10 - use sub_stripes, instead of num_stripes v3 -> v4: - Rebased on latest misc-next v4 -> v5: - Rebased on latest misc-next v5 -> v6: - Fix spelling - Include bench results v6 -> v7: - Fixes based on Nikolay Borisov review: * Assume num == 2 * Remove "for" loop based on that assumption, where possible * No functional changes Signed-off-by: Timofey Titovets Tested-by: Dmitrii Tcvetkov Reviewed-by: Dmitrii Tcvetkov --- block/genhd.c | 1 + fs/btrfs/volumes.c | 100 - 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index be5bab20b2ab..939f0c6a2d79 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
Re: [PATCH RFC] btrfs: harden agaist duplicate fsid
On 11/13/2018 11:31 PM, David Sterba wrote: On Mon, Oct 01, 2018 at 09:31:04PM +0800, Anand Jain wrote: + /* + * we are going to replace the device path, make sure its the + * same device if the device mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(_devices->device_list_mutex); + return ERR_PTR(-EEXIST); It would be _really_ nice to have an informative error message printed here. Aside from the possibility of an admin accidentally making a block-level copy of the volume, this code triggering could represent an attempted attack against the system, so it's arguably something that should be reported as happening. Personally, I think a WARN_ON_ONCE for this would make sense, ideally per-volume if possible. Ah. Will add an warn. Thanks, Anand The requested error message is not in the patch you posted or I have missed that (https://patchwork.kernel.org/patch/10641041/) . No you didn't miss. I missed it. When you integrated this into for-next, I should have sent out v2. Sorry. Thanks for taking care of it. Austin, is the following ok for you? "BTRFS: duplicate device fsid:devid for %pU:%llu old:%s new:%s\n" BTRFS: duplicate device fsid:devid 7c667b96-59eb-43ad-9ae9-c878f6ad51d8:2 old:/dev/sda6 new:/dev/sdb6 As the UUID and paths are long I tried to squeeeze the rest so it's still comprehensible but this would be better confirmed. Thanks. Thanks, Anand
Re: [PATCH RFC RESEND] btrfs: harden agaist duplicate fsid
On 11/13/2018 11:21 PM, David Sterba wrote: On Mon, Oct 15, 2018 at 10:45:17AM +0800, Anand Jain wrote: (Thanks for the comments on requiring to warn_on if we fail the device change.) (This fixes an ugly bug, I appreciate if you have any further comments). Its not that impossible to imagine that a device OR a btrfs image is been copied just by using the dd or the cp command. Which in case both the copies of the btrfs will have the same fsid. If on the system with automount enabled, the copied FS gets scanned. We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after its been copied. For example: Initially.. /dev/mmcblk0p4 is mounted as / lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part / |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:01 14.9G 0 disk |-sda48:414G 0 part / |-sda28:21 500M 0 part |-sda38:31 256M 0 part `-sda18:11 256M 0 part mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Signed-off-by: Anand Jain I'm adding the patch to misc-next now, with an update message that matches the format when a device is scanned. "BTRFS: device fsid %pU devid %llu moved old:%s new:%s\n", That way it should be possible to grep for all messages that are related to the scanning ioctl. Right. Looks fine to me. Thanks, Anand
Re: [RFC] BTRFS_DEV_REPLACE_ITEM_STATE_* doesn't match with on disk
David, Gentle ping. Thanks, Anand On 11/12/2018 03:50 PM, Nikolay Borisov wrote: On 12.11.18 г. 6:58 ч., Anand Jain wrote: The dev_replace_state defines are miss matched between the BTRFS_IOCTL_DEV_REPLACE_STATE_* and BTRFS_DEV_REPLACE_ITEM_STATE_* [1]. [1] - btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - The BTRFS_DEV_REPLACE_ITEM_STATE_* series is unused in both btrfs.ko and btrfs-progs, the on-disk also follows BTRFS_IOCTL_DEV_REPLACE_STATE_* (we set dev_replace->replace_state using the BTRFS_IOCTL_DEV_REPLACE_STATE_* defines and write to the on-disk). 359 btrfs_set_dev_replace_replace_state(eb, ptr, 360 dev_replace->replace_state); IMO it should be ok to delete the BTRFS_DEV_REPLACE_ITEM_STATE_* altogether? But how about the userland progs other than btrfs-progs? If not at least fix the miss match as in [2], any comments? Unfortunately you are right. This seems to stem from sloppy job back in the days of initial dev-replace support. BTRFS_DEV_REPLACE_ITEM_STATE_* were added in e922e087a35c ("Btrfs: enhance btrfs structures for device replace support"), yet they were never used. And the IOCTL_DEV_REPLACE_STATE* were added in e93c89c1 ("Btrfs: add new sources for device replace code"). It looks like the ITEM_STATE* definitions were stillborn so to speak and personally I'm in favor of removing them. They shouldn't have been merged in the first place and indeed the patch doesn't even have a Reviewed-by tag. So it originated from the, I'd say, spartan days of btrfs development... David, any code which is using BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED is inherently broken, so how about we remove those definitions, then when it's compilation is broken in the future the author will actually have a chance to fix it, though it's highly unlikely anyone is relying on those definitions. [2] -- diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index aff1356c2bb8..9ffa7534cadf 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -805,9 +805,9 @@ struct btrfs_dev_stats_item { #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 4 struct btrfs_dev_replace_item { /* -- Thanks, Anand
Re: [PATCH] btrfs: fix computation of max fs size for multiple device fs tests
On 11/06/2018 11:34 PM, fdman...@kernel.org wrote: From: Filipe Manana We were sorting numerical values with the 'sort' tool without telling it that we are sorting numbers, giving us unexpected ordering. So just pass the '-n' option to the 'sort' tool. Example: $ echo -e "11\n9\n20" | sort 11 20 9 $ echo -e "11\n9\n20" | sort -n 9 11 20 Signed-off-by: Filipe Manana Thanks for the fix. Reviewed-by: Anand Jain --- tests/btrfs/124 | 2 +- tests/btrfs/125 | 2 +- tests/btrfs/154 | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/btrfs/124 b/tests/btrfs/124 index ce3ad6aa..a52c65f6 100755 --- a/tests/btrfs/124 +++ b/tests/btrfs/124 @@ -61,7 +61,7 @@ dev2=`echo $SCRATCH_DEV_POOL | awk '{print $2}'` dev1_sz=`blockdev --getsize64 $dev1` dev2_sz=`blockdev --getsize64 $dev2` # get min of both -max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort | head -1` +max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz" | sort -n | head -1` # Need disks with more than 2G. if [ $max_fs_sz -lt 20 ]; then _scratch_dev_pool_put diff --git a/tests/btrfs/125 b/tests/btrfs/125 index e38de264..5ac68b67 100755 --- a/tests/btrfs/125 +++ b/tests/btrfs/125 @@ -68,7 +68,7 @@ dev2_sz=`blockdev --getsize64 $dev2` dev3_sz=`blockdev --getsize64 $dev3` # get min of both. -max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort | head -1` +max_fs_sz=`echo -e "$dev1_sz\n$dev2_sz\n$dev3_sz" | sort -n | head -1` # Need disks with more than 2G if [ $max_fs_sz -lt 20 ]; then _scratch_dev_pool_put diff --git a/tests/btrfs/154 b/tests/btrfs/154 index 99ea232a..cd6c688f 100755 --- a/tests/btrfs/154 +++ b/tests/btrfs/154 @@ -51,7 +51,7 @@ DEV1_SZ=`blockdev --getsize64 $DEV1` DEV2_SZ=`blockdev --getsize64 $DEV2` # get min -MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort | head -1` +MAX_FS_SZ=`echo -e "$DEV1_SZ\n$DEV2_SZ" | sort -n | head -1` # Need disks with more than 2G if [ $MAX_FS_SZ -lt 20 ]; then _scratch_dev_pool_put
Re: Full filesystem btrfs rebalance kernel panic to read-only lock
On 11/12/2018 10:12 AM, Qu Wenruo wrote: On 2018/11/12 上午9:35, Anand Jain wrote: On 11/09/2018 09:21 AM, Qu Wenruo wrote: On 2018/11/9 上午6:40, Pieter Maes wrote: Hello, So, I've had the full disk issue, so when I tried re-balancing, I got a panic, that pushed filesystem read-only and I'm unable to balance or grow the filesystem now. fs info: btrfs fi show / Label: none uuid: 9b591b6b-6040-437e-9398-6883ca3bf1bb Total devices 1 FS bytes used 614.94GiB devid 1 size 750.00GiB used 750.00GiB path /dev/mapper/vg0-root btrfs fi df / Data, single: total=740.94GiB, used=610.75GiB System, DUP: total=32.00MiB, used=112.00KiB Metadata, DUP: total=4.50GiB, used=3.94GiB Metadata usage the the biggest problem. It's already used up. GlobalReserve, single: total=512.00MiB, used=255.06MiB And the reserved space is also been used, that's a pretty bad news. btrfs sub list -ta / ID gen top level path -- --- - btrfs --version btrfs-progs v4.4 Log when booting machine now from root: [ 54.746700] [ cut here ] [ 54.746701] BTRFS: Transaction aborted (error -28) Transaction can't even be done due to lack of space. [snip] When booting to a net/livecd rescue First I run a check with repair: enabling repair mode Checking filesystem on /dev/vg0/root UUID: 9b591b6b-6040-437e-9398-6883ca3bf1bb checking extents Fixed 0 roots. checking free space cache cache and super generation don't match, space cache will be invalidated checking fs roots reset nbytes for ino 6228034 root 5 It's a minor problem. So the fs itself is still pretty health. checking csums checking root refs found 664259596288 bytes used err is 0 total csum bytes: 619404608 total tree bytes: 4237737984 total fs tree bytes: 1692581888 total extent tree bytes: 1461665792 btree space waste bytes: 945044758 file data blocks allocated: 1568329531392 referenced 537131163648 But then when I try to mount the fs: [snip] rescue kernel: 4.9.120 I've grown the blockdevice, but there is no way I can grow the fs, it doesn't want to mount in my rescue system, and it only mounts read-only when booting from it, so I can't do it from there either Btrfs-progs could do it with some extra dirty work. (I purposed offline device resize idea, but didn't implement it yet) You could use this branch: https://github.com/adam900710/btrfs-progs/tree/dirty_fix Qu, The online resize should work here right? Nope, the user reported unable to mount RW, due to exhausted metadata space. And due to the failure of RW mount, reported can't do online resize, thus we need to do offline one. Its nice tool fixed the issue here, but in the long term we need a way to free some space IMO. Source of the problem is unable to mount RW when metadata space is full. A serious issue. Adding more disk space was viable workaround at this use case, which might not be true in all use cases. Like user may just want to mount and free some space. I think we need to fine tune the reserve space usage like distinguish the reserve space allocation between the new metadata item VS modification of the old metadata items. And reserve a space for the modification of the metadata, so that mount and freeing of some files will work. Thanks, Anand Thanks, Qu Thanks, Anand It's a quick and dirty fix to allow "btrfs-corrupt-block -X " to extent device size to max. Please try above command to see if it solves your problem. Thanks, Qu I hope someone can help me out with this. Thanks!
[PATCH] btrfs: remove redundant replace_state init
dev_replace::replace_state has been set to BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED (0) in the same function, So delete the line which sets replace_state = 0; Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index cc25a34f87b0..cdc4313446a9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -59,7 +59,6 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; - dev_replace->replace_state = 0; dev_replace->time_started = 0; dev_replace->time_stopped = 0; atomic64_set(_replace->num_write_errors, 0); -- 1.8.3.1
[RFC] BTRFS_DEV_REPLACE_ITEM_STATE_* doesn't match with on disk
The dev_replace_state defines are miss matched between the BTRFS_IOCTL_DEV_REPLACE_STATE_* and BTRFS_DEV_REPLACE_ITEM_STATE_* [1]. [1] - btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 btrfs.h:#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 btrfs_tree.h:#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - The BTRFS_DEV_REPLACE_ITEM_STATE_* series is unused in both btrfs.ko and btrfs-progs, the on-disk also follows BTRFS_IOCTL_DEV_REPLACE_STATE_* (we set dev_replace->replace_state using the BTRFS_IOCTL_DEV_REPLACE_STATE_* defines and write to the on-disk). 359 btrfs_set_dev_replace_replace_state(eb, ptr, 360 dev_replace->replace_state); IMO it should be ok to delete the BTRFS_DEV_REPLACE_ITEM_STATE_* altogether? But how about the userland progs other than btrfs-progs? If not at least fix the miss match as in [2], any comments? [2] -- diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index aff1356c2bb8..9ffa7534cadf 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -805,9 +805,9 @@ struct btrfs_dev_stats_item { #define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 #define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 #define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 4 struct btrfs_dev_replace_item { /* -- Thanks, Anand
Re: Full filesystem btrfs rebalance kernel panic to read-only lock
On 11/09/2018 09:21 AM, Qu Wenruo wrote: On 2018/11/9 上午6:40, Pieter Maes wrote: Hello, So, I've had the full disk issue, so when I tried re-balancing, I got a panic, that pushed filesystem read-only and I'm unable to balance or grow the filesystem now. fs info: btrfs fi show / Label: none uuid: 9b591b6b-6040-437e-9398-6883ca3bf1bb Total devices 1 FS bytes used 614.94GiB devid 1 size 750.00GiB used 750.00GiB path /dev/mapper/vg0-root btrfs fi df / Data, single: total=740.94GiB, used=610.75GiB System, DUP: total=32.00MiB, used=112.00KiB Metadata, DUP: total=4.50GiB, used=3.94GiB Metadata usage the the biggest problem. It's already used up. GlobalReserve, single: total=512.00MiB, used=255.06MiB And the reserved space is also been used, that's a pretty bad news. btrfs sub list -ta / ID gen top level path -- --- - btrfs --version btrfs-progs v4.4 Log when booting machine now from root: [ 54.746700] [ cut here ] [ 54.746701] BTRFS: Transaction aborted (error -28) Transaction can't even be done due to lack of space. [snip] When booting to a net/livecd rescue First I run a check with repair: enabling repair mode Checking filesystem on /dev/vg0/root UUID: 9b591b6b-6040-437e-9398-6883ca3bf1bb checking extents Fixed 0 roots. checking free space cache cache and super generation don't match, space cache will be invalidated checking fs roots reset nbytes for ino 6228034 root 5 It's a minor problem. So the fs itself is still pretty health. checking csums checking root refs found 664259596288 bytes used err is 0 total csum bytes: 619404608 total tree bytes: 4237737984 total fs tree bytes: 1692581888 total extent tree bytes: 1461665792 btree space waste bytes: 945044758 file data blocks allocated: 1568329531392 referenced 537131163648 But then when I try to mount the fs: [snip] rescue kernel: 4.9.120 I've grown the blockdevice, but there is no way I can grow the fs, it doesn't want to mount in my rescue system, and it only mounts read-only when booting from it, so I can't do it from there either Btrfs-progs could do it with some extra dirty work. (I purposed offline device resize idea, but didn't implement it yet) You could use this branch: https://github.com/adam900710/btrfs-progs/tree/dirty_fix Qu, The online resize should work here right? Thanks, Anand It's a quick and dirty fix to allow "btrfs-corrupt-block -X " to extent device size to max. Please try above command to see if it solves your problem. Thanks, Qu I hope someone can help me out with this. Thanks!
[PATCH 9/9] btrfs: add explicit check for replace result no error
We recast the replace return status BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no error. And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0, which is also declared as 0, so we just return. Instead add it to the if statement so that there is enough clarity while reading the code. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 40a0942b4659..cc25a34f87b0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, args->start.cont_reading_from_srcdev_mode); args->result = ret; /* don't warn if EINPROGRESS, someone else might be running scrub */ - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) - ret = 0; + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) + return 0; return ret; } -- 1.8.3.1
[PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful
In btrfs_dev_replace_cancel() we should check if the btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to cancel the replace again. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 22 +- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d32d696d931c..9fc3cb8d3918 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_replace_write_unlock(dev_replace); break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: - result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); - /* -* btrfs_dev_replace_finishing() will handle the cleanup part -*/ - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + ret = btrfs_scrub_cancel(fs_info); + if (ret) { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* +* btrfs_dev_replace_finishing() will handle the cleanup part +*/ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: /* -- 1.8.3.1
[PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state
When the replace state is placed in the suspended state, btrfs_scrub_cancel() should fail with -ENOTCONN as there is no scrub running, as a safety catch check if btrfs_scrub_cancel() returns -ENOTCONN and assert if it doesn't. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9fc3cb8d3918..1dc8e86546db 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -831,7 +831,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + /* scrub for replace must not be running in suspended state */ + ret = btrfs_scrub_cancel(fs_info); + ASSERT(ret != -ENOTCONN); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { -- 1.8.3.1
[PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish
When we successfully cancel the replace its scrub returns -ECANCELED, which then passed to btrfs_dev_replace_finishing(), it cleans up based on the scrub returned status and propagates the same -ECANCELED back the parent function. As of now only user can cancel the replace-scrub, so its ok to quieten the warn here. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1dc8e86546db..9031a362921a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); } @@ -954,7 +954,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), _replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + WARN_ON(ret && ret != -ECANCELED); clear_bit(BTRFS_FS_EXCL_OP, _info->flags); return 0; -- 1.8.3.1
[PATCH 8/9] btrfs: user requsted replace cancel is not an error
As of now only user requested replace cancel can cancel the replace-scrub so no need to log error for it. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 9031a362921a..40a0942b4659 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -622,7 +622,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(fs_info, + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, -- 1.8.3.1
[PATCH 3/9] btrfs: replace back to suspend state if EXCL OP is running
In a secnario where balance and replace co-exists as below, start balance; pause balance; start replace; reboot and when system restarts balance restarts first and the replace restart will fail as EXCL OP lock is already held by the balance. If so place the replace state back to BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 4 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 3c29b0976087..35ce10f18607 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -897,6 +897,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. */ if (test_and_set_bit(BTRFS_FS_EXCL_OP, _info->flags)) { + btrfs_dev_replace_write_lock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + btrfs_dev_replace_write_unlock(dev_replace); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; -- 1.8.3.1
[PATCH 2/9] btrfs: replace go back to suspended if target missing
At the time of forced unmount we place the running replace to BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes back and suppose the target device is missing, then let the replace state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any matching scrub running as part of replace. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 59991165e126..3c29b0976087 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -884,6 +884,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; btrfs_dev_replace_write_unlock(dev_replace); return 0; } -- 1.8.3.1
[PATCH 1/9] btrfs: mark btrfs_dev_replace_start() as static
There isn't any other consumer other than in its own file dev-replace.c. Signed-off-by: Anand Jain Reviewed-by: Nikolay Borisov --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/dev-replace.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2aa48aecc52b..59991165e126 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -390,7 +390,7 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) { diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 795c551f5b5e..27e3bb0cca11 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -13,9 +13,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, - const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, - int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); -- 1.8.3.1
[PATCH 0/9 v2] fix replace-start and replace-cancel racing
v1->v2: 2/9: Drop writeback required 3/9: Drop writeback required 7/9: Use the condition within the WARN_ON() 6/9: Use the condition within the ASSERT() Replace-start and replace-cancel threads can race to create a messy situation leading to UAF. We use the scrub code to write the blocks on the replace target. So if we haven't have set the replace-scrub-running yet, without this patch we just ignore the error and free the target device. When this happens the system panics with UAF error. Its nice to see that btrfs_dev_replace_finishing() already handles the ECANCELED (replace canceled) situation, but for an unknown reason we aren't using it to cleanup the replace cancel situation, instead we just let the replace cancel ioctl thread to cleanup the target device and return and out of synchronous with the scrub code. This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel() to check if the scrub was really running. And if its not then shall return an error to the user (replace not started error) so that user can retry replace cancel. And uses btrfs_dev_replace_finishing() code to cleanup after successful cancel of the replace scrub. Further, a suspended replace, when tries to restart, and if it fails (for example target device missing, or excl ops running) it goes to the started state, and so the cli 'btrfs replace status /mnt' hangs with no progress. So patches 2/9 and 3/9 fixes that. As the originals code idea of ECANCELED was limited to the situation of the error only and not user requested, there are unnecessary error log and warn log which 7/9 and 8/9 patches fixes. Patches 1/9 and 9/9 are good to have fixes. Makes a function static and code readability good. Testing: (I did some attempt to convert these into xfstests but need a mechanism where kernel thread can wait for user land script. I thought I could do it using ebfp, but needs more digging on how). As of now hand tested with using procfs to hold kernel thread at (wait_for_user(..)) until user land issues go. 1. umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs && fillfs /btrfs 1 btrfs replace start /dev/sdb /dev/sdc /btrfs wait_for_user("scrub running is set..waiting"); AND OR wait_for_user("scrub running is NOT set..waiting"); btrfs replace cancel /btrfs wait_for_user_go(); btrfs replace status /btrfs 2. umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs && fillfs /btrfs 1 btrfs replace start /dev/sdb /dev/sdc /btrfs wait_for_user("scrub running is set..waiting"); AND OR wait_for_user("scrub running is NOT set..waiting"); reboot mount -o device=/dev/sdc /dev/sdb /btrfs wait_for_user_go(); btrfs replace status /btrfs btrfs replace cancel /btrfs btrfs replace status /btrfs 3. umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs && fillfs /btrfs 1 btrfs replace start /dev/sdb /dev/sdc /btrfs wait_for_user("scrub running is set..waiting"); AND OR wait_for_user("scrub running is NOT set..waiting"); reboot mount -o degraded /dev/sdb /btrfs btrfs replace status /btrfs btrfs replace cancel /btrfs btrfs replace status /btrfs umount /btrfs mount /dev/sdb /btrfs Anand Jain (9): btrfs: mark btrfs_dev_replace_start() as static btrfs: replace go back to suspended if target missing btrfs: replace back to suspend state if EXCL OP is running btrfs: fix UAF due to race between replace start and cancel btrfs: replace cancel is successful if scrub cancel is successful btrfs: replace's scrub must not be running in replace suspended state btrfs: quiten warn if the replace is canceled at finish btrfs: user requsted replace cancel is not an error btrfs: add explicit check for replace result no error fs/btrfs/dev-replace.c | 87 ++ fs/btrfs/dev-replace.h | 3 -- 2 files changed, 59 insertions(+), 31 deletions(-) -- 1.8.3.1 >From e5a84ccf4f73ec405bc7f5ad812b04762f287e2d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 7 Nov 2018 18:51:19 +0800 Anand Jain (9): btrfs: mark btrfs_dev_replace_start() as static btrfs: replace go back to suspended if target missing btrfs: replace back to suspend state if EXCL OP is running btrfs: fix UAF due to race between replace start and cancel btrfs: replace cancel is successful if scrub cancel is successful btrfs: replace's scrub must not be running in replace suspended state btrfs: quiten warn if the replace is canceled at finish btrfs: user requsted replace cancel is not an error btrfs: add explicit check for replace result no error fs/btrfs/dev-replace.c | 90 ++ fs/btrfs/dev-replace.h | 3 -- 2 files changed, 62 insertions(+), 31 deletions(-) -- 1.8.3.1
[PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel
replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail to stop the scrub thread, so the scrub thread continue with the scrub for replace which then shall try to write to the target device and which is already freed by the cancel thread. Please ref to the logs below. So scrub_setup_ctx() warns as tgtdev is null. static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { :: if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters the scrub_write_page_to_dev_replace() without the target device it panics static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { :: bio_set_dev(bio, sbio->dev->bdev); <== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00a0 :: [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] :: [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 61 -- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 35ce10f18607..d32d696d931c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + /* +* btrfs_dev_replace_finishing() will handle the cleanup part +*/ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* +* scrub doing the replace isn't running so do the cleanup here +*/ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_
Re: [PATCH 2/9] btrfs: replace go back to suspended if target missing
On 11/07/2018 08:35 PM, Nikolay Borisov wrote: On 7.11.18 г. 13:43 ч., Anand Jain wrote: At the time of forced unmount we place the running replace to BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes back and suppose the target device is missing, then let the replace state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any matching scrub running as part of replace. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 59991165e126..47d6768a9cde 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -884,6 +884,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->item_needs_writeback = 1; Why do we need items_needs_writeback = 1 here, nothing is changed w.r.t on-disk data? You are right. We don't need writeback. Will fix. Thanks, Anand btrfs_dev_replace_write_unlock(dev_replace); return 0; }
Re: [PATCH] btrfs: Check for missing device before bio submission in btrfs_map_bio
On 11/08/2018 10:16 PM, Nikolay Borisov wrote: Before btrfs_map_bio submits all stripe bio it does a number of checks to ensure the device for every stripe is present. However, it doesn't do a DEV_STATE_MISSING check, instead this is relegated to the lower level btrfs_schedule_bio (in the async submission case, sync submission doesn't check DEV_STATE_MISSING at all). Additionally btrfs_schedule_bios does the duplicate device->bdev check which has already been performed in btrfs_map_bio. This patch moves the DEV_STATE_MISSING check in btrfs_map_bio and removes the duplicate device->bdev check. Doing so ensures that no bio cloning/submission happens for both async/sync requests in the face of missing device. This makes the async io submission path slightly shorter in terms of instruction count. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: Anand Jain Thanks, Anand --- fs/btrfs/volumes.c | 9 ++--- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 44c5e8ccb644..3312cad65209 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6106,12 +6106,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (test_bit(BTRFS_DEV_STATE_MISSING, >dev_state) || - !device->bdev) { - bio_io_error(bio); - return; - } - /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { btrfsic_submit_bio(bio); @@ -6240,7 +6234,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || + if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, + >dev_state) || (bio_op(first_bio) == REQ_OP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, >dev_state))) { bbio_error(bbio, first_bio, logical);
Re: [PATCH v15.1 00/13] Btrfs In-band De-duplication
De-duplication must also let use cases to enable de-duplication on per subvolume level, using the subvolume properties. Similar to compression and future-encryption. Thanks, Anand
Re: [PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state
On 11/08/2018 04:52 PM, Nikolay Borisov wrote: On 8.11.18 г. 10:33 ч., Anand Jain wrote: On 11/07/2018 08:19 PM, Nikolay Borisov wrote: On 7.11.18 г. 13:43 ч., Anand Jain wrote: + /* scrub for replace must not be running in suspended state */ + if (btrfs_scrub_cancel(fs_info) != -ENOTCONN) + ASSERT(0); ASSERT(btrfs_scrub_cancel(fs_info) == -ENOTCONN) There will be substantial difference in code when compiled with and without CONFIG_BTRFS_ASSERT [1]. That is, btrfs_scrub_cancel(fs_info) won't be run at all, I would like to keep it as it is. Fair point, in that case do: ret = btrfs_scrub_cancel(fs_info); ASSERT(ret != -ENOTCONN); Fixed. Thanks, Anand result [1] -- ./fs/btrfs/ctree.h #ifdef CONFIG_BTRFS_ASSERT __cold static inline void assfail(const char *expr, const char *file, int line) { pr_err("assertion failed: %s, file: %s, line: %d\n", expr, file, line); BUG(); } #define ASSERT(expr) \ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #else #define ASSERT(expr) ((void)0) #endif --- Thanks, Anand
Re: [PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state
On 11/07/2018 08:19 PM, Nikolay Borisov wrote: On 7.11.18 г. 13:43 ч., Anand Jain wrote: + /* scrub for replace must not be running in suspended state */ + if (btrfs_scrub_cancel(fs_info) != -ENOTCONN) + ASSERT(0); ASSERT(btrfs_scrub_cancel(fs_info) == -ENOTCONN) There will be substantial difference in code when compiled with and without CONFIG_BTRFS_ASSERT [1]. That is, btrfs_scrub_cancel(fs_info) won't be run at all, I would like to keep it as it is. [1] -- ./fs/btrfs/ctree.h #ifdef CONFIG_BTRFS_ASSERT __cold static inline void assfail(const char *expr, const char *file, int line) { pr_err("assertion failed: %s, file: %s, line: %d\n", expr, file, line); BUG(); } #define ASSERT(expr)\ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #else #define ASSERT(expr)((void)0) #endif --- Thanks, Anand
Re: [PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish
On 11/07/2018 08:17 PM, Nikolay Borisov wrote: On 7.11.18 г. 13:43 ч., Anand Jain wrote: - WARN_ON(ret); + if (ret != -ECANCELED) + WARN_ON(ret); WARN_ON(ret && ret != -ECANCELED) Will fix. Thanks, Anand
Re: [PATCH 9/9] btrfs: add explicit check for replace result no error
On 11/07/2018 08:15 PM, Nikolay Borisov wrote: On 7.11.18 г. 13:43 ч., Anand Jain wrote: We recast the replace return status BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no error. And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0, which is also declared as 0, so we just return. Instead add it to the if statement so that there is enough clarity while reading the code. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index cf3554554616..ca44998189c7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, args->start.cont_reading_from_srcdev_mode); args->result = ret; /* don't warn if EINPROGRESS, someone else might be running scrub */ - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) - ret = 0; + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR can only be returned from btrfs_dev_replace_cancel, It can return 0 and which is also BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR which this patch makes it explicit. Looking at this again tells me that, as btrfs_dev_replace_start() is internal helper function, its better if it free from all usage of BTRFS_IOCTL_DEV_REPLACE_RESULT*. Thanks, Anand so what you are doing with checking ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is redundant. So using BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR is at the very least misleading here. I suggest you drop this patch. + return 0; return ret; }
[PATCH 7/9] btrfs: quiten warn if the replace is canceled at finish
When we successfully cancel the replace its scrub returns -ECANCELED, which then passed to btrfs_dev_replace_finishing(), it cleans up based on the scrub returned status and propagates the same -ECANCELED back the parent function. So skip the -ECANCELED error to log the WARN. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dae2b920f1a9..c14c41b70287 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -497,7 +497,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); } @@ -956,7 +956,8 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), _replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + if (ret != -ECANCELED) + WARN_ON(ret); clear_bit(BTRFS_FS_EXCL_OP, _info->flags); return 0; -- 1.8.3.1
[PATCH 8/9] btrfs: user requsted replace cancel is not an error
As of now only user requested replace cancel can cancel the replace-scrub so no need to log error for it. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c14c41b70287..cf3554554616 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -622,7 +622,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(fs_info, + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, -- 1.8.3.1
[PATCH 6/9] btrfs: replace's scrub must not be running in replace suspended state
When the replace state is placed in the suspended state, btrfs_scrub_cancel() should fail with -ENOTCONN as there is no scrub running, as a safety catch check if btrfs_scrub_cancel() returns -ENOTCONN and assert if it doesn't. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c092ed559714..dae2b920f1a9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -831,7 +831,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + /* scrub for replace must not be running in suspended state */ + if (btrfs_scrub_cancel(fs_info) != -ENOTCONN) + ASSERT(0); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { -- 1.8.3.1
[PATCH 9/9] btrfs: add explicit check for replace result no error
We recast the replace return status BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to 0, to indicate no error. And since BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR should also return 0, which is also declared as 0, so we just return. Instead add it to the if statement so that there is enough clarity while reading the code. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index cf3554554616..ca44998189c7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -533,8 +533,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, args->start.cont_reading_from_srcdev_mode); args->result = ret; /* don't warn if EINPROGRESS, someone else might be running scrub */ - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) - ret = 0; + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) + return 0; return ret; } -- 1.8.3.1
[PATCH 0/9] fix replace-start and replace-cancel racing
Replace-start and replace-cancel threads can race to create a messy situation leading to UAF. We use the scrub code to write the blocks on the replace target. So if we haven't have set the replace-scrub-running yet, without this patch we just ignore the error and free the target device. When this happens the system panics with UAF error. Its nice to see that btrfs_dev_replace_finishing() already handles the ECANCELED (replace canceled) situation, but for an unknown reason we aren't using it to cleanup the replace cancel situation, instead we just let the replace cancel ioctl thread to cleanup the target device and return and out of synchronous with the scrub code. This patch 4/9, 5/9 and 6/9 uses the return code of btrfs_scrub_cancel() to check if the scrub was really running. And if its not then shall return an error to the user (replace not started error) so that user can retry replace cancel. And uses btrfs_dev_replace_finishing() code to cleanup after successful cancel of the replace scrub. Further, a suspended replace, when tries to restart, and if it fails (for example target device missing, or excl ops running) it goes to the started state, and so the cli 'btrfs replace status /mnt' hangs with no progress. So patches 2/9 and 3/9 fixes that. As the originals code idea of ECANCELED was limited to the situation of the error only and not user requested, there are unnecessary error log and warn log which 7/9 and 8/9 patches fixes. Patches 1/9 and 9/9 are good to have fixes. Makes a function static and code readability good. Testing: (I did some attempt to convert these into xfstests but need a mechanism where kernel thread can wait for user land script. I thought I could do it using ebfp, but needs more digging on how). As of now hand tested with using procfs to hold kernel thread at (wait_for_user(..)) until user land issues go. 1. umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs && fillfs /btrfs 1 btrfs replace start /dev/sdb /dev/sdc /btrfs wait_for_user("scrub running is set..waiting"); AND OR wait_for_user("scrub running is NOT set..waiting"); btrfs replace cancel /btrfs wait_for_user_go(); btrfs replace status /btrfs 2. umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs && fillfs /btrfs 1 btrfs replace start /dev/sdb /dev/sdc /btrfs wait_for_user("scrub running is set..waiting"); AND OR wait_for_user("scrub running is NOT set..waiting"); reboot mount -o device=/dev/sdc /dev/sdb /btrfs wait_for_user_go(); btrfs replace status /btrfs btrfs replace cancel /btrfs btrfs replace status /btrfs 3. umount /btrfs; wipefs -a /dev/sd[b-f] && mkfs.btrfs -fq /dev/sdb && mount /dev/sdb /btrfs && fillfs /btrfs 1 btrfs replace start /dev/sdb /dev/sdc /btrfs wait_for_user("scrub running is set..waiting"); AND OR wait_for_user("scrub running is NOT set..waiting"); reboot mount -o degraded /dev/sdb /btrfs btrfs replace status /btrfs btrfs replace cancel /btrfs btrfs replace status /btrfs umount /btrfs mount /dev/sdb /btrfs Anand Jain (9): btrfs: mark btrfs_dev_replace_start() as static btrfs: replace go back to suspended if target missing btrfs: replace back to suspend state if EXCL OP is running btrfs: fix UAF due to race between replace start and cancel btrfs: replace cancel is successful if scrub cancel is successful btrfs: replace's scrub must not be running in replace suspended state btrfs: quiten warn if the replace is canceled at finish btrfs: user requsted replace cancel is not an error btrfs: add explicit check for replace result no error fs/btrfs/dev-replace.c | 90 ++ fs/btrfs/dev-replace.h | 3 -- 2 files changed, 62 insertions(+), 31 deletions(-) -- 1.8.3.1
[PATCH 5/9] btrfs: replace cancel is successful if scrub cancel is successful
In btrfs_dev_replace_cancel() we should check if the btrfs_scrub_cancel() is successful. If the btrfs_scrub_cancel() fails, return BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED so that user can try to cancel the replace again. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 22 +- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 90c124edec76..c092ed559714 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -799,18 +799,22 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) btrfs_dev_replace_write_unlock(dev_replace); break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: - result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); - /* -* btrfs_dev_replace_finishing() will handle the cleanup part -*/ - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + ret = btrfs_scrub_cancel(fs_info); + if (ret) { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* +* btrfs_dev_replace_finishing() will handle the cleanup part +*/ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: /* -- 1.8.3.1
[PATCH 2/9] btrfs: replace go back to suspended if target missing
At the time of forced unmount we place the running replace to BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes back and suppose the target device is missing, then let the replace state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any matching scrub running as part of replace. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 59991165e126..47d6768a9cde 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -884,6 +884,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->item_needs_writeback = 1; btrfs_dev_replace_write_unlock(dev_replace); return 0; } -- 1.8.3.1
[PATCH 1/9] btrfs: mark btrfs_dev_replace_start() as static
There isn't any other consumer other than in its own file dev-replace.c. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/dev-replace.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2aa48aecc52b..59991165e126 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -390,7 +390,7 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) { diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 795c551f5b5e..27e3bb0cca11 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -13,9 +13,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, - const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, - int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); -- 1.8.3.1
[PATCH 4/9] btrfs: fix UAF due to race between replace start and cancel
replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set the btrfs_scrub_cancel() will fail to stop the scrub thread, so the scrub thread continue with the scrub for replace which then shall try to write to the target device and which is already freed by the cancel thread. Please ref to the logs below. So scrub_setup_ctx() warns as tgtdev is null. static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { :: if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] :: [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters the scrub_write_page_to_dev_replace() without the target device it panics static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { :: bio_set_dev(bio, sbio->dev->bdev); <== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00a0 :: [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] :: [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 61 -- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e001c2418940..90c124edec76 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -797,39 +797,56 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + /* +* btrfs_dev_replace_finishing() will handle the cleanup part +*/ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* +* scrub doing the replace isn't running so do the cleanup here +*/ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_
[PATCH 3/9] btrfs: replace back to suspend state if EXCL OP is running
In a secnario where balance and replace co-exists as below, start balance; pause balance; start replace; reboot and when system restarts balance restarts first and the replace restart will fail as EXCL OP lock is already held by the balance. If so place the replace state back to BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state. Signed-off-by: Anand Jain --- fs/btrfs/dev-replace.c | 5 + 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 47d6768a9cde..e001c2418940 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -898,6 +898,11 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. */ if (test_and_set_bit(BTRFS_FS_EXCL_OP, _info->flags)) { + btrfs_dev_replace_write_lock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_write_unlock(dev_replace); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; -- 1.8.3.1
[PATCH v3] fstests: btrfs verify hardening agaist duplicate fsid
We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after its been copied. So this test case reproduces this issue. For example: Initially.. /dev/mmcblk0p4 is mounted as / lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part / |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:01 14.9G 0 disk |-sda48:414G 0 part / |-sda28:21 500M 0 part |-sda38:31 256M 0 part `-sda18:11 256M 0 part mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take the sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Signed-off-by: Anand Jain --- v2->v3: Check the return code and use _fail to verify and accordingly fix golden output. Rename dev_foo(bar) to device_1(2) Don't log dd retun to $seqres.full v1->v2: dont play around with dev patch use it as it is. do not use SCRATCH_MNT instead create it at the TEST_DIR and its related changes. golden out changes tests/btrfs/173 | 82 + tests/btrfs/173.out | 2 ++ tests/btrfs/group | 1 + 3 files changed, 85 insertions(+) create mode 100755 tests/btrfs/173 create mode 100644 tests/btrfs/173.out diff --git a/tests/btrfs/173 b/tests/btrfs/173 new file mode 100755 index ..342ae92b4781 --- /dev/null +++ b/tests/btrfs/173 @@ -0,0 +1,82 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Oracle. All Rights Reserved. +# +# FS QA Test 173 +# +# Fuzzy test for FS image duplication. +# Could be fixed by +#[patch] btrfs: harden agaist duplicate fsid +# +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +mnt=$TEST_DIR/$seq.mnt +_cleanup() +{ + rm -rf $mnt > /dev/null 2>&1 + cd / + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# remove previous $seqres.full before test +rm -f $seqres.full + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch_dev_pool 2 +_scratch_dev_pool_get 2 + +device_1=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') +device_2=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') + +echo device_1=$device_1 device_2=$device_2 >> $seqres.full + +rm -rf $mnt > /dev/null 2>&1 +mkdir $mnt +_mkfs_dev $device_1 +_mount $device_1 $mnt + +[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \ + _fail "mounted device changed" + +for sb_bytenr in 65536 67108864 +do + echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\ + "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full + dd status=none if=$device_1 of=$device_2 bs=1 seek=$sb_bytenr \ + skip=$sb_bytenr count=4096 > /dev/null 2>&1 + echo ..:$? >> $seqres.full +done + +#Original device is mounted, scan of its clone should fail +$BTRFS_UTIL_PROG device scan $device_2 >> $seqres.full 2>&1 +[[ $? != 1 ]] && _fail "cloned device scan should fail" + +[[ $(findmnt $mnt | grep -v TARGET | awk '{print $2}') != $device_1 ]] && \ + _fail "mounted device changed" + +#Original device scan should be successful +$BTRFS_UTIL_PROG device scan $device_1 >> $seqres.full 2>&1 +[[ $? != 0 ]] && \ + _fail "if it fails here, then it means subvolume mount at boot may fail "\ + &
Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid
On 10/26/2018 11:52 PM, Nikolay Borisov wrote: On 26.10.18 г. 18:34 ч., Anand Jain wrote: On 10/26/2018 11:02 PM, Nikolay Borisov wrote: On 8.10.18 г. 21:28 ч., Anand Jain wrote: We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after its been copied. So this test case reproduces this issue. For example: Initially.. /dev/mmcblk0p4 is mounted as / lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part / |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:0 1 14.9G 0 disk |-sda4 8:4 1 4G 0 part / |-sda2 8:2 1 500M 0 part |-sda3 8:3 1 256M 0 part `-sda1 8:1 1 256M 0 part mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take the sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Is there a pending fix for this? Yes. https://patchwork.kernel.org/patch/10641041/ Test case header mentioned it. Signed-off-by: Anand Jain --- v1->v2: dont play around with dev patch use it as it is. do not use SCRATCH_MNT instead create it at the TEST_DIR and its related changes. golden out changes tests/btrfs/173 | 88 + tests/btrfs/173.out | 6 tests/btrfs/group | 1 + 3 files changed, 95 insertions(+) create mode 100755 tests/btrfs/173 create mode 100644 tests/btrfs/173.out diff --git a/tests/btrfs/173 b/tests/btrfs/173 new file mode 100755 index ..b466ae921e19 --- /dev/null +++ b/tests/btrfs/173 @@ -0,0 +1,88 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Oracle. All Rights Reserved. +# +# FS QA Test 173 +# +# Fuzzy test for FS image duplication. +# Could be fixed by +# [patch] btrfs: harden agaist duplicate fsid +# +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +mnt=$TEST_DIR/$seq.mnt +_cleanup() +{ + rm -rf $mnt > /dev/null 2>&1 + cd / + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# remove previous $seqres.full before test +rm -f $seqres.full + +# real QA test starts here + +# Modify as appropriate. +_supported_fs btrfs +_supported_os Linux +_require_scratch_dev_pool 2 +_scratch_dev_pool_get 2 + +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') Naming devices _foo and _bar just shows you could not care less about the test. Hmm. Will make it device_1 and device_2. Either use sane names - primary_dev/secondary dev or device_1 and device_2. + +echo dev_foo=$dev_foo >> $seqres.full +echo dev_bar=$dev_bar >> $seqres.full +echo | tee -a $seqres.full + +rm -rf $mnt > /dev/null 2>&1 So what is $mnt? I can see it's used by very few tests and it's not obvious? Generally you should either define it as a local variable or use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked with Eric re. the use of $mnt in tests and his conclusion is : " looks like a bug" No its not. As few lines above, I have assigned it as.. mnt=$TEST_DIR/$seq.mnt I missed that, I will recommend moving the assignment near where you set the devices cleanup() is using $mnt. cleanup() may get called for any trap before the actual test. Thanks, Anand +mkdir $mnt +_mkfs_dev $dev_foo +_mount $dev_foo $mnt + +check_btrfs_mount() +{ + local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}') + [[ $x == $dev_foo ]] && echo DEV_FOO + [[ $x == $dev_bar ]] && echo
Re: [PATCH v2 rev log added] fstests: btrfs verify hardening agaist duplicate fsid
On 10/26/2018 11:02 PM, Nikolay Borisov wrote: On 8.10.18 г. 21:28 ч., Anand Jain wrote: We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after its been copied. So this test case reproduces this issue. For example: Initially.. /dev/mmcblk0p4 is mounted as / lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part / |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. lsblk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:01 14.9G 0 disk |-sda48:414G 0 part / |-sda28:21 500M 0 part |-sda38:31 256M 0 part `-sda18:11 256M 0 part mmcblk0 179:00 29.2G 0 disk |-mmcblk0p4 179:404G 0 part |-mmcblk0p2 179:20 500M 0 part /boot |-mmcblk0p3 179:30 256M 0 part [SWAP] `-mmcblk0p1 179:10 256M 0 part /boot/efi btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take the sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Is there a pending fix for this? Yes. https://patchwork.kernel.org/patch/10641041/ Test case header mentioned it. Signed-off-by: Anand Jain --- v1->v2: dont play around with dev patch use it as it is. do not use SCRATCH_MNT instead create it at the TEST_DIR and its related changes. golden out changes tests/btrfs/173 | 88 + tests/btrfs/173.out | 6 tests/btrfs/group | 1 + 3 files changed, 95 insertions(+) create mode 100755 tests/btrfs/173 create mode 100644 tests/btrfs/173.out diff --git a/tests/btrfs/173 b/tests/btrfs/173 new file mode 100755 index ..b466ae921e19 --- /dev/null +++ b/tests/btrfs/173 @@ -0,0 +1,88 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Oracle. All Rights Reserved. +# +# FS QA Test 173 +# +# Fuzzy test for FS image duplication. +# Could be fixed by +#[patch] btrfs: harden agaist duplicate fsid +# +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo "QA output created by $seq" + +here=`pwd` +tmp=/tmp/$$ +status=1 # failure is the default! +trap "_cleanup; exit \$status" 0 1 2 3 15 + +mnt=$TEST_DIR/$seq.mnt +_cleanup() +{ + rm -rf $mnt > /dev/null 2>&1 + cd / + rm -f $tmp.* +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# remove previous $seqres.full before test +rm -f $seqres.full + +# real QA test starts here + +# Modify as appropriate. +_supported_fs btrfs +_supported_os Linux +_require_scratch_dev_pool 2 +_scratch_dev_pool_get 2 + +dev_foo=$(echo $SCRATCH_DEV_POOL | awk '{print $1}') +dev_bar=$(echo $SCRATCH_DEV_POOL | awk '{print $2}') Naming devices _foo and _bar just shows you could not care less about the test. Hmm. Will make it device_1 and device_2. Either use sane names - primary_dev/secondary dev or device_1 and device_2. + +echo dev_foo=$dev_foo >> $seqres.full +echo dev_bar=$dev_bar >> $seqres.full +echo | tee -a $seqres.full + +rm -rf $mnt > /dev/null 2>&1 So what is $mnt? I can see it's used by very few tests and it's not obvious? Generally you should either define it as a local variable or use one of the SCRATCH_MNT/TEST_MNT global variables. Also I checked with Eric re. the use of $mnt in tests and his conclusion is : " looks like a bug" No its not. As few lines above, I have assigned it as.. mnt=$TEST_DIR/$seq.mnt +mkdir $mnt +_mkfs_dev $dev_foo +_mount $dev_foo $mnt + +check_btrfs_mount() +{ + local x=$(findmnt $mnt | grep -v TARGET | awk '{print $2}') + [[ $x == $dev_foo ]] && echo DEV_FOO + [[ $x == $dev_bar ]] && echo DEV_BAR +} Same thing here re. DEV_(FOO|BAR). + +echo MNT $(check_btrfs_mount) + +for sb_bytenr in 65536 67108864 +do + echo -n "dd status=none if=$dev_foo of=$dev_bar bs=1 "\ + "seek=$sb_bytenr skip=$sb_bytenr count=4096" >> $seqres.full