On Fri, Mar 19, 2021 at 10:52 AM Johannes Thumshirn <johannes.thumsh...@wdc.com> wrote: > > When a file gets deleted on a zoned file system, the space freed is not > returned back into the block group's free space, but is migrated to > zone_unusable. > > As this zone_unusable space is behind the current write pointer it is not > possible to use it for new allocations. In the current implementation a > zone is reset once all of the block group's space is accounted as zone > unusable. > > This behaviour can lead to premature ENOSPC errors on a busy file system. > > Instead of only reclaiming the zone once it is completely unusable, > kick off a reclaim job once the amount of unusable bytes exceeds a user > configurable threshold between 51% and 100%. It can be set per mounted > filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75% > per default. > > Similar to reclaiming unused block groups, these dirty block groups are > added to a to_reclaim list and then on a transaction commit, the reclaim > process is triggered but after we deleted unused block groups, which will > free space for the relocation process. > > Signed-off-by: Johannes Thumshirn <johannes.thumsh...@wdc.com> > --- > > AFAICT sysfs_create_files() does not have the ability to provide a is_visible > callback, so the bg_reclaim_threshold sysfs file is visible for non zoned > filesystems as well, even though only for zoned filesystems we're adding block > groups to the reclaim list. I'm all ears for a approach that is sensible in > this regard. > > > fs/btrfs/block-group.c | 84 ++++++++++++++++++++++++++++++++++++ > fs/btrfs/block-group.h | 2 + > fs/btrfs/ctree.h | 3 ++ > fs/btrfs/disk-io.c | 11 +++++ > fs/btrfs/free-space-cache.c | 9 +++- > fs/btrfs/sysfs.c | 35 +++++++++++++++ > fs/btrfs/volumes.c | 2 +- > fs/btrfs/volumes.h | 1 + > include/trace/events/btrfs.h | 12 ++++++ > 9 files changed, 157 insertions(+), 2 deletions(-) > > diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c > index 9ae3ac96a521..af9026795ddd 100644 > --- a/fs/btrfs/block-group.c > +++ b/fs/btrfs/block-group.c > @@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) > spin_unlock(&fs_info->unused_bgs_lock); > } > > +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) > +{ > + struct btrfs_block_group *bg; > + struct btrfs_space_info *space_info; > + int ret = 0; > + > + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) > + return; > + > + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) > + return; > + > + mutex_lock(&fs_info->reclaim_bgs_lock); > + while (!list_empty(&fs_info->reclaim_bgs)) { > + bg = list_first_entry(&fs_info->reclaim_bgs, > + struct btrfs_block_group, > + bg_list); > + list_del_init(&bg->bg_list); > + > + space_info = bg->space_info; > + mutex_unlock(&fs_info->reclaim_bgs_lock); > + > + /* Don't want to race with allocators so take the groups_sem > */ > + down_write(&space_info->groups_sem); > + > + spin_lock(&bg->lock); > + if (bg->reserved || bg->pinned || bg->ro) { > + /* > + * We want to bail if we made new allocations or have > + * outstanding allocations in this block group. We do > + * the ro check in case balance is currently acting on > + * this block group. > + */ > + spin_unlock(&bg->lock); > + up_write(&space_info->groups_sem); > + goto next; > + } > + spin_unlock(&bg->lock); > + > + ret = inc_block_group_ro(bg, 0); > + up_write(&space_info->groups_sem); > + if (ret < 0) { > + ret = 0; > + goto next; > + } > + > + btrfs_info(fs_info, "reclaiming chunk %llu", bg->start); > + trace_btrfs_reclaim_block_group(bg); > + ret = btrfs_relocate_chunk(fs_info, bg->start);
I think you forgot to test this with lockdep enabled, it should have complained loudly otherwise. btrfs_relocate_chunk() calls lockdep_assert_head() to verify we are holding fs_info->reclaim_bgs_lock, but we just unlocked it. Thanks. > + if (ret) > + btrfs_err(fs_info, "error relocating chunk %llu", > + bg->start); > + > +next: > + btrfs_put_block_group(bg); > + mutex_lock(&fs_info->reclaim_bgs_lock); > + } > + mutex_unlock(&fs_info->reclaim_bgs_lock); > + btrfs_exclop_finish(fs_info); > +} > + > +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) > +{ > + struct btrfs_fs_info *fs_info = bg->fs_info; > + > + mutex_lock(&fs_info->reclaim_bgs_lock); > + if (list_empty(&bg->bg_list)) { > + btrfs_get_block_group(bg); > + trace_btrfs_add_reclaim_block_group(bg); > + list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); > + } > + mutex_unlock(&fs_info->reclaim_bgs_lock); > +} > + > static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key > *key, > struct btrfs_path *path) > { > @@ -3390,6 +3464,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) > } > spin_unlock(&info->unused_bgs_lock); > > + mutex_lock(&info->reclaim_bgs_lock); > + while (!list_empty(&info->reclaim_bgs)) { > + block_group = list_first_entry(&info->reclaim_bgs, > + struct btrfs_block_group, > + bg_list); > + list_del_init(&block_group->bg_list); > + btrfs_put_block_group(block_group); > + } > + mutex_unlock(&info->reclaim_bgs_lock); > + > spin_lock(&info->block_group_cache_lock); > while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { > block_group = rb_entry(n, struct btrfs_block_group, > diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h > index 3ecc3372a5ce..e75c79676241 100644 > --- a/fs/btrfs/block-group.h > +++ b/fs/btrfs/block-group.h > @@ -264,6 +264,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle > *trans, > u64 group_start, struct extent_map *em); > void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); > void btrfs_mark_bg_unused(struct btrfs_block_group *bg); > +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); > +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); > int btrfs_read_block_groups(struct btrfs_fs_info *info); > int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, > u64 type, u64 chunk_offset, u64 size); > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 34ec82d6df3e..0b438b97fed4 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -938,6 +938,7 @@ struct btrfs_fs_info { > struct list_head unused_bgs; > struct mutex unused_bg_unpin_mutex; > struct mutex reclaim_bgs_lock; > + struct list_head reclaim_bgs; > > /* Cached block sizes */ > u32 nodesize; > @@ -978,6 +979,8 @@ struct btrfs_fs_info { > spinlock_t treelog_bg_lock; > u64 treelog_bg; > > + int bg_reclaim_threshold; > + > #ifdef CONFIG_BTRFS_FS_REF_VERIFY > spinlock_t ref_verify_lock; > struct rb_root block_tree; > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index f9250f14fc1e..d4fccf113df1 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -1815,6 +1815,13 @@ static int cleaner_kthread(void *arg) > * unused block groups. > */ > btrfs_delete_unused_bgs(fs_info); > + > + /* > + * Reclaim block groups in the reclaim_bgs list after we > deleted > + * all unused block_groups. This possibly gives us some more > free > + * space. > + */ > + btrfs_reclaim_bgs(fs_info); > sleep: > clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, > &fs_info->flags); > if (kthread_should_park()) > @@ -2797,12 +2804,14 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > mutex_init(&fs_info->reloc_mutex); > mutex_init(&fs_info->delalloc_root_mutex); > mutex_init(&fs_info->zoned_meta_io_lock); > + mutex_init(&fs_info->reclaim_bgs_lock); > seqlock_init(&fs_info->profiles_lock); > > INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); > INIT_LIST_HEAD(&fs_info->space_info); > INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); > INIT_LIST_HEAD(&fs_info->unused_bgs); > + INIT_LIST_HEAD(&fs_info->reclaim_bgs); > #ifdef CONFIG_BTRFS_DEBUG > INIT_LIST_HEAD(&fs_info->allocated_roots); > INIT_LIST_HEAD(&fs_info->allocated_ebs); > @@ -2891,6 +2900,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > fs_info->swapfile_pins = RB_ROOT; > > fs_info->send_in_progress = 0; > + > + fs_info->bg_reclaim_threshold = 75; > } > > static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct > super_block *sb) > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c > index 9988decd5717..e54466fc101f 100644 > --- a/fs/btrfs/free-space-cache.c > +++ b/fs/btrfs/free-space-cache.c > @@ -11,6 +11,7 @@ > #include <linux/ratelimit.h> > #include <linux/error-injection.h> > #include <linux/sched/mm.h> > +#include "misc.h" > #include "ctree.h" > #include "free-space-cache.h" > #include "transaction.h" > @@ -2539,6 +2540,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info > *fs_info, > static int __btrfs_add_free_space_zoned(struct btrfs_block_group > *block_group, > u64 bytenr, u64 size, bool used) > { > + struct btrfs_fs_info *fs_info = block_group->fs_info; > struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; > u64 offset = bytenr - block_group->start; > u64 to_free, to_unusable; > @@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct > btrfs_block_group *block_group, > } > > /* All the region is now unusable. Mark it as unused and reclaim */ > - if (block_group->zone_unusable == block_group->length) > + if (block_group->zone_unusable == block_group->length) { > btrfs_mark_bg_unused(block_group); > + } else if (block_group->zone_unusable >= > + div_factor_fine(block_group->length, > + fs_info->bg_reclaim_threshold)) { > + btrfs_mark_bg_to_reclaim(block_group); > + } > > return 0; > } > diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c > index 6eb1c50fa98c..bf38c7c6b804 100644 > --- a/fs/btrfs/sysfs.c > +++ b/fs/btrfs/sysfs.c > @@ -965,6 +965,40 @@ static ssize_t btrfs_read_policy_store(struct kobject > *kobj, > } > BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, > btrfs_read_policy_store); > > +static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, > + struct kobj_attribute *a, > + char *buf) > +{ > + struct btrfs_fs_info *fs_info = to_fs_info(kobj); > + ssize_t ret; > + > + ret = scnprintf(buf, PAGE_SIZE, "%d\n", > fs_info->bg_reclaim_threshold); > + > + return ret; > +} > + > +static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, > + struct kobj_attribute *a, > + const char *buf, size_t len) > +{ > + struct btrfs_fs_info *fs_info = to_fs_info(kobj); > + int thresh; > + int ret; > + > + ret = kstrtoint(buf, 10, &thresh); > + if (ret) > + return ret; > + > + if (thresh <= 50 || thresh > 100) > + return -EINVAL; > + > + fs_info->bg_reclaim_threshold = thresh; > + > + return len; > +} > +BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, > + btrfs_bg_reclaim_threshold_store); > + > static const struct attribute *btrfs_attrs[] = { > BTRFS_ATTR_PTR(, label), > BTRFS_ATTR_PTR(, nodesize), > @@ -976,6 +1010,7 @@ static const struct attribute *btrfs_attrs[] = { > BTRFS_ATTR_PTR(, exclusive_operation), > BTRFS_ATTR_PTR(, generation), > BTRFS_ATTR_PTR(, read_policy), > + BTRFS_ATTR_PTR(, bg_reclaim_threshold), > NULL, > }; > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index fb785ff53a27..c78b5ce49d47 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -3098,7 +3098,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle > *trans, u64 chunk_offset) > return ret; > } > > -static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 > chunk_offset) > +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) > { > struct btrfs_root *root = fs_info->chunk_root; > struct btrfs_trans_handle *trans; > diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h > index d4c3e0dd32b8..9c0d84e5ec06 100644 > --- a/fs/btrfs/volumes.h > +++ b/fs/btrfs/volumes.h > @@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, > u32 size_buf); > int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); > int btrfs_recover_balance(struct btrfs_fs_info *fs_info); > int btrfs_pause_balance(struct btrfs_fs_info *fs_info); > +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); > int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); > int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); > int btrfs_uuid_scan_kthread(void *data); > diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h > index 0551ea65374f..a41dd8a0c730 100644 > --- a/include/trace/events/btrfs.h > +++ b/include/trace/events/btrfs.h > @@ -1903,6 +1903,18 @@ DEFINE_EVENT(btrfs__block_group, > btrfs_add_unused_block_group, > TP_ARGS(bg_cache) > ); > > +DEFINE_EVENT(btrfs__block_group, btrfs_add_reclaim_block_group, > + TP_PROTO(const struct btrfs_block_group *bg_cache), > + > + TP_ARGS(bg_cache) > +); > + > +DEFINE_EVENT(btrfs__block_group, btrfs_reclaim_block_group, > + TP_PROTO(const struct btrfs_block_group *bg_cache), > + > + TP_ARGS(bg_cache) > +); > + > DEFINE_EVENT(btrfs__block_group, btrfs_skip_unused_block_group, > TP_PROTO(const struct btrfs_block_group *bg_cache), > > -- > 2.30.0 > -- Filipe David Manana, “Whether you think you can, or you think you can't — you're right.”