Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones
On Thu, Apr 15, 2021 at 10:58:35PM +0900, Johannes Thumshirn wrote: > @@ -2974,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) > fs_info->swapfile_pins = RB_ROOT; > > fs_info->send_in_progress = 0; > + > + fs_info->bg_reclaim_threshold = 75; The value should be a named constant, visibly defined in zoned.h, with a comment explaining what it means.
Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones
On Thu, Apr 15, 2021 at 10:58:35PM +0900, Johannes Thumshirn wrote: > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -954,10 +954,14 @@ struct btrfs_fs_info { > struct work_struct async_data_reclaim_work; > struct work_struct preempt_reclaim_work; > > + /* Used to reclaim data space in the background */ > + struct work_struct reclaim_bgs_work; > + > spinlock_t unused_bgs_lock; > struct list_head unused_bgs; > struct mutex unused_bg_unpin_mutex; > struct mutex reclaim_bgs_lock; > + struct list_head reclaim_bgs; > > /* Cached block sizes */ > u32 nodesize; > @@ -998,6 +1002,8 @@ struct btrfs_fs_info { > spinlock_t treelog_bg_lock; > u64 treelog_bg; > > + int bg_reclaim_threshold; That's spreading related members over too many lines; please put them together, with comments explaining what they mean if that's not obvious (like for the work struct).
Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones
On Thu, Apr 15, 2021 at 3:00 PM Johannes Thumshirn wrote: > > When a file gets deleted on a zoned file system, the space freed is not > returned back into the block group's free space, but is migrated to > zone_unusable. > > As this zone_unusable space is behind the current write pointer it is not > possible to use it for new allocations. In the current implementation a > zone is reset once all of the block group's space is accounted as zone > unusable. > > This behaviour can lead to premature ENOSPC errors on a busy file system. > > Instead of only reclaiming the zone once it is completely unusable, > kick off a reclaim job once the amount of unusable bytes exceeds a user > configurable threshold between 51% and 100%. It can be set per mounted > filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75% > per default. > > Similar to reclaiming unused block groups, these dirty block groups are > added to a to_reclaim list and then on a transaction commit, the reclaim > process is triggered but after we deleted unused block groups, which will > free space for the relocation process. 
> > Signed-off-by: Johannes Thumshirn > --- > fs/btrfs/block-group.c | 96 > fs/btrfs/block-group.h | 3 ++ > fs/btrfs/ctree.h | 6 +++ > fs/btrfs/disk-io.c | 13 + > fs/btrfs/free-space-cache.c | 9 +++- > fs/btrfs/sysfs.c | 35 + > fs/btrfs/volumes.c | 2 +- > fs/btrfs/volumes.h | 1 + > include/trace/events/btrfs.h | 12 + > 9 files changed, 175 insertions(+), 2 deletions(-) > > diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c > index bbb5a6e170c7..3f06ea42c013 100644 > --- a/fs/btrfs/block-group.c > +++ b/fs/btrfs/block-group.c > @@ -1485,6 +1485,92 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) > spin_unlock(&fs_info->unused_bgs_lock); > } > > +void btrfs_reclaim_bgs_work(struct work_struct *work) > +{ > + struct btrfs_fs_info *fs_info = > + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); > + struct btrfs_block_group *bg; > + struct btrfs_space_info *space_info; > + int ret = 0; > + > + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) > + return; > + > + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) > + return; > + > + mutex_lock(&fs_info->reclaim_bgs_lock); > + spin_lock(&fs_info->unused_bgs_lock); > + while (!list_empty(&fs_info->reclaim_bgs)) { > + bg = list_first_entry(&fs_info->reclaim_bgs, > + struct btrfs_block_group, > + bg_list); > + list_del_init(&bg->bg_list); > + > + space_info = bg->space_info; > + spin_unlock(&fs_info->unused_bgs_lock); > + > + /* Don't want to race with allocators so take the groups_sem > */ > + down_write(&space_info->groups_sem); > + > + spin_lock(&bg->lock); > + if (bg->reserved || bg->pinned || bg->ro) { > + /* > +* We want to bail if we made new allocations or have > +* outstanding allocations in this block group. We do > +* the ro check in case balance is currently acting on > +* this block group. 
> +*/ > + spin_unlock(&bg->lock); > + up_write(&space_info->groups_sem); > + goto next; > + } > + spin_unlock(&bg->lock); > + > + ret = inc_block_group_ro(bg, 0); > + up_write(&space_info->groups_sem); > + if (ret < 0) { > + ret = 0; It's pointless to set ret to 0. > + goto next; > + } > + > + btrfs_info(fs_info, "reclaiming chunk %llu", bg->start); > + trace_btrfs_reclaim_block_group(bg); > + ret = btrfs_relocate_chunk(fs_info, bg->start); > + if (ret) > + btrfs_err(fs_info, "error relocating chunk %llu", > + bg->start); > + > +next: > + btrfs_put_block_group(bg); > + spin_lock(&fs_info->unused_bgs_lock); > + } > + spin_unlock(&fs_info->unused_bgs_lock); > + mutex_unlock(&fs_info->reclaim_bgs_lock); > + btrfs_exclop_finish(fs_info); > +} > + > +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) > +{ > + spin_lock(&fs_info->unused_bgs_lock); > + if (!list_empty(&fs_info->reclaim_bgs)) > + queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); > + spin_unlock(&fs_info->unused_bgs_lock); > +} > + > +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) > +{ > + struct btrfs_fs_info
Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones
On 15/04/2021 20:37, Josef Bacik wrote: > On 4/15/21 9:58 AM, Johannes Thumshirn wrote: >> When a file gets deleted on a zoned file system, the space freed is not >> returned back into the block group's free space, but is migrated to >> zone_unusable. >> >> As this zone_unusable space is behind the current write pointer it is not >> possible to use it for new allocations. In the current implementation a >> zone is reset once all of the block group's space is accounted as zone >> unusable. >> >> This behaviour can lead to premature ENOSPC errors on a busy file system. >> >> Instead of only reclaiming the zone once it is completely unusable, >> kick off a reclaim job once the amount of unusable bytes exceeds a user >> configurable threshold between 51% and 100%. It can be set per mounted >> filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75% >> per default. >> >> Similar to reclaiming unused block groups, these dirty block groups are >> added to a to_reclaim list and then on a transaction commit, the reclaim >> process is triggered but after we deleted unused block groups, which will >> free space for the relocation process. 
>> >> Signed-off-by: Johannes Thumshirn >> --- >> fs/btrfs/block-group.c | 96 >> fs/btrfs/block-group.h | 3 ++ >> fs/btrfs/ctree.h | 6 +++ >> fs/btrfs/disk-io.c | 13 + >> fs/btrfs/free-space-cache.c | 9 +++- >> fs/btrfs/sysfs.c | 35 + >> fs/btrfs/volumes.c | 2 +- >> fs/btrfs/volumes.h | 1 + >> include/trace/events/btrfs.h | 12 + >> 9 files changed, 175 insertions(+), 2 deletions(-) >> >> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c >> index bbb5a6e170c7..3f06ea42c013 100644 >> --- a/fs/btrfs/block-group.c >> +++ b/fs/btrfs/block-group.c >> @@ -1485,6 +1485,92 @@ void btrfs_mark_bg_unused(struct btrfs_block_group >> *bg) >> spin_unlock(&fs_info->unused_bgs_lock); >> } >> >> +void btrfs_reclaim_bgs_work(struct work_struct *work) >> +{ >> +struct btrfs_fs_info *fs_info = >> +container_of(work, struct btrfs_fs_info, reclaim_bgs_work); >> +struct btrfs_block_group *bg; >> +struct btrfs_space_info *space_info; >> +int ret = 0; >> + >> +if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) >> +return; >> + >> +if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) >> +return; >> + >> +mutex_lock(&fs_info->reclaim_bgs_lock); >> +spin_lock(&fs_info->unused_bgs_lock); >> +while (!list_empty(&fs_info->reclaim_bgs)) { >> +bg = list_first_entry(&fs_info->reclaim_bgs, >> + struct btrfs_block_group, >> + bg_list); >> +list_del_init(&bg->bg_list); >> + >> +space_info = bg->space_info; >> +spin_unlock(&fs_info->unused_bgs_lock); >> + >> +/* Don't want to race with allocators so take the groups_sem */ >> +down_write(&space_info->groups_sem); >> + >> +spin_lock(&bg->lock); >> +if (bg->reserved || bg->pinned || bg->ro) { >> +/* >> + * We want to bail if we made new allocations or have >> + * outstanding allocations in this block group. We do >> + * the ro check in case balance is currently acting on >> + * this block group. 
>> + */ >> +spin_unlock(&bg->lock); >> +up_write(&space_info->groups_sem); >> +goto next; >> +} >> +spin_unlock(&bg->lock); >> + > > Here I think we want a > > if (btrfs_fs_closing()) > goto next; > > so we don't block out a umount for all eternity. Thanks, Right, will add.
Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones
On 4/15/21 9:58 AM, Johannes Thumshirn wrote: When a file gets deleted on a zoned file system, the space freed is not returned back into the block group's free space, but is migrated to zone_unusable. As this zone_unusable space is behind the current write pointer it is not possible to use it for new allocations. In the current implementation a zone is reset once all of the block group's space is accounted as zone unusable. This behaviour can lead to premature ENOSPC errors on a busy file system. Instead of only reclaiming the zone once it is completely unusable, kick off a reclaim job once the amount of unusable bytes exceeds a user configurable threshold between 51% and 100%. It can be set per mounted filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75% per default. Similar to reclaiming unused block groups, these dirty block groups are added to a to_reclaim list and then on a transaction commit, the reclaim process is triggered but after we deleted unused block groups, which will free space for the relocation process. 
Signed-off-by: Johannes Thumshirn --- fs/btrfs/block-group.c | 96 fs/btrfs/block-group.h | 3 ++ fs/btrfs/ctree.h | 6 +++ fs/btrfs/disk-io.c | 13 + fs/btrfs/free-space-cache.c | 9 +++- fs/btrfs/sysfs.c | 35 + fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 1 + include/trace/events/btrfs.h | 12 + 9 files changed, 175 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bbb5a6e170c7..3f06ea42c013 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1485,6 +1485,92 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +void btrfs_reclaim_bgs_work(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info = + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); + struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + int ret = 0; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&bg->bg_list); + + space_info = bg->space_info; + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* +* We want to bail if we made new allocations or have +* outstanding allocations in this block group. We do +* the ro check in case balance is currently acting on +* this block group. +*/ + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&bg->lock); + Here I think we want a if (btrfs_fs_closing()) goto next; so we don't block out a umount for all eternity. Thanks, Josef