Hi Jan, > Hi Wang, > > On 01.05.2013 09:29, Wang Shilong wrote: >> Hello Jan, >> >>> If qgroup tracking is out of sync, a rescan operation can be started. It >>> iterates the complete extent tree and recalculates all qgroup tracking data. >>> This is an expensive operation and should not be used unless required. >>> >>> A filesystem under rescan can still be umounted. The rescan continues on the >>> next mount. Status information is provided with a separate ioctl while a >>> rescan operation is in progress. >>> >>> Signed-off-by: Jan Schmidt <[email protected]> >>> --- >>> fs/btrfs/ctree.h | 17 ++- >>> fs/btrfs/disk-io.c | 5 + >>> fs/btrfs/ioctl.c | 83 ++++++++++-- >>> fs/btrfs/qgroup.c | 318 >>> ++++++++++++++++++++++++++++++++++++++++++-- >>> include/uapi/linux/btrfs.h | 12 ++- >>> 5 files changed, 400 insertions(+), 35 deletions(-) >>> >>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h >>> index 412c306..e4f28a6 100644 >>> --- a/fs/btrfs/ctree.h >>> +++ b/fs/btrfs/ctree.h >>> @@ -1021,9 +1021,9 @@ struct btrfs_block_group_item { >>> */ >>> #define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) >>> /* >>> - * SCANNING is set during the initialization phase >>> + * RESCAN is set during the initialization phase >>> */ >>> -#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1) >>> +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) >>> /* >>> * Some qgroup entries are known to be out of date, >>> * either because the configuration has changed in a way that >>> @@ -1052,7 +1052,7 @@ struct btrfs_qgroup_status_item { >>> * only used during scanning to record the progress >>> * of the scan. It contains a logical address >>> */ >>> - __le64 scan; >>> + __le64 rescan; >>> } __attribute__ ((__packed__)); >>> >>> struct btrfs_qgroup_info_item { >>> @@ -1603,6 +1603,11 @@ struct btrfs_fs_info { >>> /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ >>> u64 qgroup_seq; >>> >>> + /* qgroup rescan items */ >>> + struct mutex qgroup_rescan_lock; /* protects the progress item */ >>> + struct btrfs_key qgroup_rescan_progress; >>> + struct btrfs_workers qgroup_rescan_workers; >>> + >>> /* filesystem state */ >>> unsigned long fs_state; >>> >>> @@ -2888,8 +2893,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_version, struct >>> btrfs_qgroup_status_item, >>> version, 64); >>> BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, >>> flags, 64); >>> -BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item, >>> - scan, 64); >>> +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, >>> + rescan, 64); >>> >>> /* btrfs_qgroup_info_item */ >>> BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, >>> @@ -3834,7 +3839,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle >>> *trans, >>> struct btrfs_fs_info *fs_info); >>> int btrfs_quota_disable(struct btrfs_trans_handle *trans, >>> struct btrfs_fs_info *fs_info); >>> -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info); >>> +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); >>> int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, >>> struct btrfs_fs_info *fs_info, u64 src, u64 dst); >>> int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, >>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c >>> index 7717363..63e9348 100644 >>> --- a/fs/btrfs/disk-io.c >>> +++ b/fs/btrfs/disk-io.c >>> @@ -2010,6 +2010,7 @@ static void btrfs_stop_all_workers(struct >>> btrfs_fs_info *fs_info) >>> btrfs_stop_workers(&fs_info->caching_workers); >>> btrfs_stop_workers(&fs_info->readahead_workers); >>> btrfs_stop_workers(&fs_info->flush_workers); >>> + btrfs_stop_workers(&fs_info->qgroup_rescan_workers); >>> } >>> >>> /* helper to cleanup tree roots */ >>> @@ -2301,6 +2302,7 @@ int open_ctree(struct super_block *sb, >>> fs_info->qgroup_seq = 1; >>> fs_info->quota_enabled = 0; >>> fs_info->pending_quota_state = 0; >>> + mutex_init(&fs_info->qgroup_rescan_lock); >>> >>> btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); >>> btrfs_init_free_cluster(&fs_info->data_alloc_cluster); >>> @@ -2529,6 +2531,8 @@ int open_ctree(struct super_block *sb, >>> btrfs_init_workers(&fs_info->readahead_workers, "readahead", >>> fs_info->thread_pool_size, >>> &fs_info->generic_worker); >>> + btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, >>> + &fs_info->generic_worker); >>> >>> /* >>> * endios are largely parallel and should have a very >>> @@ -2563,6 +2567,7 @@ int open_ctree(struct super_block *sb, >>> ret |= btrfs_start_workers(&fs_info->caching_workers); >>> ret |= btrfs_start_workers(&fs_info->readahead_workers); >>> ret |= btrfs_start_workers(&fs_info->flush_workers); >>> + ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); >>> if (ret) { >>> err = -ENOMEM; >>> goto fail_sb_buffer; >>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c >>> index d0af96a..5e93bb8 100644 >>> --- a/fs/btrfs/ioctl.c >>> +++ b/fs/btrfs/ioctl.c >>> @@ -3701,12 +3701,10 @@ static long btrfs_ioctl_quota_ctl(struct file >>> *file, void __user *arg) >>> } >>> >>> down_write(&root->fs_info->subvol_sem); >>> - if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { >>> - trans = btrfs_start_transaction(root->fs_info->tree_root, 2); >>> - if (IS_ERR(trans)) { >>> - ret = PTR_ERR(trans); >>> - goto out; >>> - } >>> + trans = btrfs_start_transaction(root->fs_info->tree_root, 2); >>> + if (IS_ERR(trans)) { >>> + ret = PTR_ERR(trans); >>> + goto out; >>> } >>> >>> switch (sa->cmd) { >>> @@ -3716,9 +3714,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, >>> void __user *arg) >>> case BTRFS_QUOTA_CTL_DISABLE: >>> ret = btrfs_quota_disable(trans, root->fs_info); >>> break; >>> - case BTRFS_QUOTA_CTL_RESCAN: >>> - ret = btrfs_quota_rescan(root->fs_info); >>> - break; >>> default: >>> ret = -EINVAL; >>> break; >>> @@ -3727,11 +3722,9 @@ static long btrfs_ioctl_quota_ctl(struct file *file, >>> void __user *arg) >>> if (copy_to_user(arg, sa, sizeof(*sa))) >>> ret = -EFAULT; >>> >>> - if (trans) { >>> - err = btrfs_commit_transaction(trans, root->fs_info->tree_root); >>> - if (err && !ret) >>> - ret = err; >>> - } >>> + err = btrfs_commit_transaction(trans, root->fs_info->tree_root); >>> + if (err && !ret) >>> + ret = err; >>> out: >>> kfree(sa); >>> up_write(&root->fs_info->subvol_sem); >>> @@ -3886,6 +3879,64 @@ drop_write: >>> return ret; >>> } >>> >>> +static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) >>> +{ >>> + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; >>> + struct btrfs_ioctl_quota_rescan_args *qsa; >>> + int ret; >>> + >>> + if (!capable(CAP_SYS_ADMIN)) >>> + return -EPERM; >>> + >>> + ret = mnt_want_write_file(file); >>> + if (ret) >>> + return ret; >>> + >>> + qsa = memdup_user(arg, sizeof(*qsa)); >>> + if (IS_ERR(qsa)) { >>> + ret = PTR_ERR(qsa); >>> + goto drop_write; >>> + } >>> + >>> + if (qsa->flags) { >>> + ret = -EINVAL; >>> + goto out; >>> + } >>> + >>> + ret = btrfs_qgroup_rescan(root->fs_info); >>> + >>> +out: >>> + kfree(qsa); >>> +drop_write: >>> + mnt_drop_write_file(file); >>> + return ret; >>> +} >>> + >>> +static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user >>> *arg) >>> +{ >>> + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; >>> + struct btrfs_ioctl_quota_rescan_args *qsa; >>> + int ret = 0; >>> + >>> + if (!capable(CAP_SYS_ADMIN)) >>> + return -EPERM; >>> + >>> + qsa = kzalloc(sizeof(*qsa), GFP_NOFS); >>> + if (!qsa) >>> + return -ENOMEM; >>> + >>> + if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >>> + qsa->flags = 1; >>> + qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; >>> + } >>> + >>> + if (copy_to_user(arg, qsa, sizeof(*qsa))) >>> + ret = -EFAULT; >>> + >>> + kfree(qsa); >>> + return ret; >>> +} >>> + >>> static long btrfs_ioctl_set_received_subvol(struct file *file, >>> void __user *arg) >>> { >>> @@ -4124,6 +4175,10 @@ long btrfs_ioctl(struct file *file, unsigned int >>> return btrfs_ioctl_qgroup_create(file, argp); >>> case BTRFS_IOC_QGROUP_LIMIT: >>> return btrfs_ioctl_qgroup_limit(file, argp); >>> + case BTRFS_IOC_QUOTA_RESCAN: >>> + return btrfs_ioctl_quota_rescan(file, argp); >>> + case BTRFS_IOC_QUOTA_RESCAN_STATUS: >>> + return btrfs_ioctl_quota_rescan_status(file, argp); >>> case BTRFS_IOC_DEV_REPLACE: >>> return btrfs_ioctl_dev_replace(root, argp); >>> case BTRFS_IOC_GET_FSLABEL: >>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c >>> index c50e5a5..664d457 100644 >>> --- a/fs/btrfs/qgroup.c >>> +++ b/fs/btrfs/qgroup.c >>> @@ -31,13 +31,13 @@ >>> #include "locking.h" >>> #include "ulist.h" >>> #include "backref.h" >>> +#include "extent_io.h" >>> >>> /* TODO XXX FIXME >>> * - subvol delete -> delete when ref goes to 0? delete limits also? >>> * - reorganize keys >>> * - compressed >>> * - sync >>> - * - rescan >>> * - copy also limits on subvol creation >>> * - limit >>> * - caches fuer ulists >>> @@ -98,6 +98,14 @@ struct btrfs_qgroup_list { >>> struct btrfs_qgroup *member; >>> }; >>> >>> +struct qgroup_rescan { >>> + struct btrfs_work work; >>> + struct btrfs_fs_info *fs_info; >>> +}; >>> + >>> +static void qgroup_rescan_start(struct btrfs_fs_info *fs_info, >>> + struct qgroup_rescan *qscan); >>> + >>> /* must be called with qgroup_ioctl_lock held */ >>> static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, >>> u64 qgroupid) >>> @@ -298,7 +306,20 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info >>> *fs_info) >>> } >>> fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, >>> ptr); >>> - /* FIXME read scan element */ >>> + fs_info->qgroup_rescan_progress.objectid = >>> + btrfs_qgroup_status_rescan(l, ptr); >>> + if (fs_info->qgroup_flags & >>> + BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >>> + struct qgroup_rescan *qscan = >>> + kmalloc(sizeof(*qscan), GFP_NOFS); >>> + if (!qscan) { >>> + ret = -ENOMEM; >>> + goto out; >>> + } >>> + fs_info->qgroup_rescan_progress.type = 0; >>> + fs_info->qgroup_rescan_progress.offset = 0; >>> + qgroup_rescan_start(fs_info, qscan); >>> + } >>> goto next1; >>> } >>> >>> @@ -719,7 +740,8 @@ static int update_qgroup_status_item(struct >>> btrfs_trans_handle *trans, >>> ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); >>> btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); >>> btrfs_set_qgroup_status_generation(l, ptr, trans->transid); >>> - /* XXX scan */ >>> + btrfs_set_qgroup_status_rescan(l, ptr, >>> + fs_info->qgroup_rescan_progress.objectid); >>> >>> btrfs_mark_buffer_dirty(l); >>> >>> @@ -830,7 +852,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, >>> fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | >>> BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; >>> btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); >>> - btrfs_set_qgroup_status_scan(leaf, ptr, 0); >>> + btrfs_set_qgroup_status_rescan(leaf, ptr, 0); >>> >>> btrfs_mark_buffer_dirty(leaf); >>> >>> @@ -944,10 +966,11 @@ out: >>> return ret; >>> } >>> >>> -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info) >>> +static void qgroup_dirty(struct btrfs_fs_info *fs_info, >>> + struct btrfs_qgroup *qgroup) >>> { >>> - /* FIXME */ >>> - return 0; >>> + if (list_empty(&qgroup->dirty)) >>> + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); >>> } >>> >>> int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, >>> @@ -1155,13 +1178,6 @@ out: >>> return ret; >>> } >>> >>> -static void qgroup_dirty(struct btrfs_fs_info *fs_info, >>> - struct btrfs_qgroup *qgroup) >>> -{ >>> - if (list_empty(&qgroup->dirty)) >>> - list_add(&qgroup->dirty, &fs_info->dirty_qgroups); >>> -} >>> - >>> /* >>> * btrfs_qgroup_record_ref is called when the ref is added or deleted. it >>> puts >>> * the modification into a list that's later used by btrfs_end_transaction to >>> @@ -1388,6 +1404,15 @@ int btrfs_qgroup_account_ref(struct >>> btrfs_trans_handle *trans, >>> BUG(); >>> } >>> >>> + mutex_lock(&fs_info->qgroup_rescan_lock); >>> + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >>> + if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { >>> + mutex_unlock(&fs_info->qgroup_rescan_lock); >>> + return 0; >>> + } >>> + } >>> + mutex_unlock(&fs_info->qgroup_rescan_lock); >>> + >>> /* >>> * the delayed ref sequence number we pass depends on the direction of >>> * the operation. for add operations, we pass (node->seq - 1) to skip >>> @@ -1401,7 +1426,15 @@ int btrfs_qgroup_account_ref(struct >>> btrfs_trans_handle *trans, >>> if (ret < 0) >>> return ret; >>> >>> + mutex_lock(&fs_info->qgroup_rescan_lock); >>> spin_lock(&fs_info->qgroup_lock); >>> + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >>> + if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { >>> + ret = 0; >>> + goto unlock; >>> + } >>> + } >>> + >>> quota_root = fs_info->quota_root; >>> if (!quota_root) >>> goto unlock; >>> @@ -1443,6 +1476,7 @@ int btrfs_qgroup_account_ref(struct >>> btrfs_trans_handle *trans, >>> >>> unlock: >>> spin_unlock(&fs_info->qgroup_lock); >>> + mutex_unlock(&fs_info->qgroup_rescan_lock); >> >> >> Why do you hold qgroup_rescan_lock when doing qgroup accounting here? >> I can understand that we hold qgroup_rescan_lock when we update >> qgroup_flag(at first in qgroup_account_ref()), >> However, is it necessary that we hold qgroup_rescan_lock when we are doing >> qgroup >> accounting step1,2,3?? >> >> Or am i missing something here? > > We need the lock for the check added above. This check needs the mutex > lock, while the three accounting steps need a spin lock (which was not > modified by my patch). We cannot call mutex_unlock while holding a spin > lock, because mutex_unlock might schedule.
Yeah, but do we need check that again? Considering we check: "fs_info->qgroup_rescan_progress.objectid <= node->bytenr" before find_all_roots() is called, if we can continue, that means "objected > node->bytenr". The point is that is it possible that "objected <= node-bytenr" after find_all_roots() when we are doing qgroup accounting. Here i think group_rescan_progress.objectid can only go larger, so it is not necessary to check it again when we are doing qgroup accounting steps thus we can save qgroup_rescan_lock usage here. What do you think of this, please correct me if i am wrong. Thanks, Wang > > Thanks, > -Jan > >> Thanks, >> Wang -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to [email protected] More majordomo info at http://vger.kernel.org/majordomo-info.html
