Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones

2021-04-16 Thread David Sterba
On Thu, Apr 15, 2021 at 10:58:35PM +0900, Johannes Thumshirn wrote:
> @@ -2974,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>   fs_info->swapfile_pins = RB_ROOT;
>  
>   fs_info->send_in_progress = 0;
> +
> + fs_info->bg_reclaim_threshold = 75;

The value should be a named constant defined in zoned.h, with a comment
explaining what it means.
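
For illustration, the requested change might look like this (the constant
name here is hypothetical, not from the patch):

	/* zoned.h */
	/*
	 * Default percentage of unusable (zone_unusable) space in a block
	 * group at which background reclaim kicks in.
	 */
	#define BTRFS_DEFAULT_RECLAIM_THRESH	75

	/* disk-io.c, in btrfs_init_fs_info() */
	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;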


Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones

2021-04-16 Thread David Sterba
On Thu, Apr 15, 2021 at 10:58:35PM +0900, Johannes Thumshirn wrote:
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -954,10 +954,14 @@ struct btrfs_fs_info {
>   struct work_struct async_data_reclaim_work;
>   struct work_struct preempt_reclaim_work;
>  
> + /* Used to reclaim data space in the background */
> + struct work_struct reclaim_bgs_work;
> +
>   spinlock_t unused_bgs_lock;
>   struct list_head unused_bgs;
>   struct mutex unused_bg_unpin_mutex;
>   struct mutex reclaim_bgs_lock;
> + struct list_head reclaim_bgs;
>  
>   /* Cached block sizes */
>   u32 nodesize;
> @@ -998,6 +1002,8 @@ struct btrfs_fs_info {
>   spinlock_t treelog_bg_lock;
>   u64 treelog_bg;
>  
> + int bg_reclaim_threshold;

That's spreading related members over too many places. Please put them
together, with comments explaining what they mean if that's not obvious
(as for the work struct).
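
For instance, the reclaim-related members could be grouped like this (a
sketch; the comments are illustrative, not from the patch):

	spinlock_t unused_bgs_lock;
	struct list_head unused_bgs;
	struct mutex unused_bg_unpin_mutex;

	/* Background reclaim of (nearly) unusable zoned block groups. */
	struct mutex reclaim_bgs_lock;
	struct list_head reclaim_bgs;
	struct work_struct reclaim_bgs_work;
	/* Percentage of zone_unusable space at which reclaim kicks in. */
	int bg_reclaim_threshold;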


Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones

2021-04-16 Thread Filipe Manana
On Thu, Apr 15, 2021 at 3:00 PM Johannes Thumshirn wrote:
>
> When a file gets deleted on a zoned file system, the space freed is not
> returned to the block group's free space, but is migrated to
> zone_unusable.
>
> As this zone_unusable space is behind the current write pointer it is not
> possible to use it for new allocations. In the current implementation a
> zone is reset once all of the block group's space is accounted as zone
> unusable.
>
> This behaviour can lead to premature ENOSPC errors on a busy file system.
>
> Instead of only reclaiming the zone once it is completely unusable,
> kick off a reclaim job once the amount of unusable bytes exceeds a
> user-configurable threshold between 51% and 100%. It can be set per
> mounted filesystem via the sysfs tunable bg_reclaim_threshold, which
> defaults to 75%.
>
> Similar to reclaiming unused block groups, these dirty block groups are
> added to a to_reclaim list, and then on a transaction commit the reclaim
> process is triggered, but only after unused block groups have been
> deleted, which frees up space for the relocation process.
>
> Signed-off-by: Johannes Thumshirn 
> ---
>  fs/btrfs/block-group.c   | 96 
>  fs/btrfs/block-group.h   |  3 ++
>  fs/btrfs/ctree.h |  6 +++
>  fs/btrfs/disk-io.c   | 13 +
>  fs/btrfs/free-space-cache.c  |  9 +++-
>  fs/btrfs/sysfs.c | 35 +
>  fs/btrfs/volumes.c   |  2 +-
>  fs/btrfs/volumes.h   |  1 +
>  include/trace/events/btrfs.h | 12 +
>  9 files changed, 175 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index bbb5a6e170c7..3f06ea42c013 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1485,6 +1485,92 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
> spin_unlock(&fs_info->unused_bgs_lock);
>  }
>
> +void btrfs_reclaim_bgs_work(struct work_struct *work)
> +{
> +   struct btrfs_fs_info *fs_info =
> +   container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
> +   struct btrfs_block_group *bg;
> +   struct btrfs_space_info *space_info;
> +   int ret = 0;
> +
> +   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
> +   return;
> +
> +   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
> +   return;
> +
> +   mutex_lock(&fs_info->reclaim_bgs_lock);
> +   spin_lock(&fs_info->unused_bgs_lock);
> +   while (!list_empty(&fs_info->reclaim_bgs)) {
> +   bg = list_first_entry(&fs_info->reclaim_bgs,
> + struct btrfs_block_group,
> + bg_list);
> +   list_del_init(&bg->bg_list);
> +
> +   space_info = bg->space_info;
> +   spin_unlock(&fs_info->unused_bgs_lock);
> +
> +   /* Don't want to race with allocators so take the groups_sem */
> +   down_write(&space_info->groups_sem);
> +
> +   spin_lock(&bg->lock);
> +   if (bg->reserved || bg->pinned || bg->ro) {
> +   /*
> +* We want to bail if we made new allocations or have
> +* outstanding allocations in this block group.  We do
> +* the ro check in case balance is currently acting on
> +* this block group.
> +*/
> +   spin_unlock(&bg->lock);
> +   up_write(&space_info->groups_sem);
> +   goto next;
> +   }
> +   spin_unlock(&bg->lock);
> +
> +   ret = inc_block_group_ro(bg, 0);
> +   up_write(&space_info->groups_sem);
> +   if (ret < 0) {
> +   ret = 0;

It's pointless to set ret to 0.

> +   goto next;
> +   }
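
With the dead store dropped, the error path would simply read (a sketch
of the suggested cleanup):

	ret = inc_block_group_ro(bg, 0);
	up_write(&space_info->groups_sem);
	if (ret < 0)
		goto next;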
> +
> +   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
> +   trace_btrfs_reclaim_block_group(bg);
> +   ret = btrfs_relocate_chunk(fs_info, bg->start);
> +   if (ret)
> +   btrfs_err(fs_info, "error relocating chunk %llu",
> + bg->start);
> +
> +next:
> +   btrfs_put_block_group(bg);
> +   spin_lock(&fs_info->unused_bgs_lock);
> +   }
> +   spin_unlock(&fs_info->unused_bgs_lock);
> +   mutex_unlock(&fs_info->reclaim_bgs_lock);
> +   btrfs_exclop_finish(fs_info);
> +}
> +
> +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
> +{
> +   spin_lock(&fs_info->unused_bgs_lock);
> +   if (!list_empty(&fs_info->reclaim_bgs))
> +   queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
> +   spin_unlock(&fs_info->unused_bgs_lock);
> +}
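
Per the commit message, this queueing is presumably driven from the
cleaner thread once a transaction commit has deleted unused block groups;
the disk-io.c hunk is not quoted in this message, so the placement below
is an assumption:

	/* In cleaner_kthread(), assumed placement: */
	btrfs_delete_unused_bgs(fs_info);
	/*
	 * Reclaim block groups only after unused ones were deleted, which
	 * gives the relocation process free space to work with.
	 */
	btrfs_reclaim_bgs(fs_info);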
> +
> +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
> +{
> +   struct btrfs_fs_info *fs_info = bg->fs_info;
> +
> +   spin_lock(&fs_info->unused_bgs_lock);
> +   if (list_empty(&bg->bg_list)) {
> +   btrfs_get_block_group(bg);
> +   trace_btrfs_add_reclaim_block_group(bg);
> +   list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
> +   }
> +   spin_unlock(&fs_info->unused_bgs_lock);
> +}
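
The diffstat shows a free-space-cache.c hunk that is not quoted here; the
threshold check feeding this list plausibly looks something like the
following sketch (assumed, using the existing div_factor_fine() helper):

	/*
	 * Sketch (assumed): queue the block group for reclaim once its
	 * unusable space crosses bg_reclaim_threshold percent of its size.
	 */
	if (bg->zone_unusable >=
	    div_factor_fine(bg->length, fs_info->bg_reclaim_threshold))
		btrfs_mark_bg_to_reclaim(bg);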

Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones

2021-04-15 Thread Johannes Thumshirn
On 15/04/2021 20:37, Josef Bacik wrote:
> On 4/15/21 9:58 AM, Johannes Thumshirn wrote:
>> [...]
>>
>> +void btrfs_reclaim_bgs_work(struct work_struct *work)
>> +{
>> [...]
>> +/* Don't want to race with allocators so take the groups_sem */
>> +down_write(&space_info->groups_sem);
>> +
>> +spin_lock(&bg->lock);
>> +if (bg->reserved || bg->pinned || bg->ro) {
>> +/*
>> + * We want to bail if we made new allocations or have
>> + * outstanding allocations in this block group.  We do
>> + * the ro check in case balance is currently acting on
>> + * this block group.
>> + */
>> +spin_unlock(&bg->lock);
>> +up_write(&space_info->groups_sem);
>> +goto next;
>> +}
>> +spin_unlock(&bg->lock);
>> +
> 
> Here I think we want a
> 
> if (btrfs_fs_closing())
>   goto next;
> 
> so we don't block out a umount for all eternity.  Thanks,

Right, will add.
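
For reference, a sketch of how that check might slot in (the exact
placement is assumed; note that groups_sem is still held at this point and
must be dropped before bailing):

	spin_unlock(&bg->lock);

	/* Sketch: don't let background reclaim block an unmount forever. */
	if (btrfs_fs_closing(fs_info)) {
		up_write(&space_info->groups_sem);
		goto next;
	}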


Re: [PATCH v4 3/3] btrfs: zoned: automatically reclaim zones

2021-04-15 Thread Josef Bacik

On 4/15/21 9:58 AM, Johannes Thumshirn wrote:

[...]

+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
[...]
+   /* Don't want to race with allocators so take the groups_sem */
+   down_write(&space_info->groups_sem);
+
+   spin_lock(&bg->lock);
+   if (bg->reserved || bg->pinned || bg->ro) {
+   /*
+* We want to bail if we made new allocations or have
+* outstanding allocations in this block group.  We do
+* the ro check in case balance is currently acting on
+* this block group.
+*/
+   spin_unlock(&bg->lock);
+   up_write(&space_info->groups_sem);
+   goto next;
+   }
+   spin_unlock(&bg->lock);
+


Here I think we want a

if (btrfs_fs_closing())
	goto next;

so we don't block out a umount for all eternity.  Thanks,

Josef