Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-04-07 Thread Johannes Thumshirn
On 19/03/2021 18:59, Josef Bacik wrote:
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index f9250f14fc1e..d4fccf113df1 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -1815,6 +1815,13 @@ static int cleaner_kthread(void *arg)
>>   * unused block groups.
>>   */
>>  btrfs_delete_unused_bgs(fs_info);
>> +
>> +/*
>> + * Reclaim block groups in the reclaim_bgs list after we deleted
>> + * all unused block_groups. This possibly gives us some more 
>> free
>> + * space.
>> + */
>> +btrfs_reclaim_bgs(fs_info);
> Reclaim can be a super long process, and we use the cleaner to keep up with 
> other things, like delayed iputs and deleted snapshots.  I'd rather offload 
> this 
> to it's own worker thread so that the cleaner doesn't get bogged down in the 
> relocation work.
> 

As I finally have time to get back to this, kthread or workqueue? What does make
more sense here?


Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-04-07 Thread Johannes Thumshirn
On 23/03/2021 07:45, Anand Jain wrote:
> 
> On 19/03/2021 18:48, Johannes Thumshirn wrote:
>> When a file gets deleted on a zoned file system, the space freed is not
>> returned back into the block group's free space, but is migrated to
>> zone_unusable.
>>
>> As this zone_unusable space is behind the current write pointer it is not
>> possible to use it for new allocations. In the current implementation a
>> zone is reset once all of the block group's space is accounted as zone
>> unusable.
>>
>> This behaviour can lead to premature ENOSPC errors on a busy file system.
>>
>> Instead of only reclaiming the zone once it is completely unusable,
>> kick off a reclaim job once the amount of unusable bytes exceeds a user
>> configurable threshold between 51% and 100%. It can be set per mounted
>> filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
>> per default.
>>
>> Similar to reclaiming unused block groups, these dirty block groups are
>> added to a to_reclaim list and then on a transaction commit, the reclaim
>> process is triggered but after we deleted unused block groups, which will
>> free space for the relocation process.
>>
> 
> Still, in the code below, I don't see the zone write pointer reset of
> the zone_unusable done here. Where does that happen? Or what did I miss?

This is indeed correct, relocation doesn't call trim/zone-reset. I've added
a patch to the series taking this into account. Thanks for spotting.


Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-25 Thread Johannes Thumshirn
On 23/03/2021 11:22, Johannes Thumshirn wrote:
> On 23/03/2021 10:57, Filipe Manana wrote:
>> On Fri, Mar 19, 2021 at 10:52 AM Johannes Thumshirn
>>  wrote:
>>>
>>> When a file gets deleted on a zoned file system, the space freed is not
>>> returned back into the block group's free space, but is migrated to
>>> zone_unusable.
>>>
>>> As this zone_unusable space is behind the current write pointer it is not
>>> possible to use it for new allocations. In the current implementation a
>>> zone is reset once all of the block group's space is accounted as zone
>>> unusable.
>>>
>>> This behaviour can lead to premature ENOSPC errors on a busy file system.
>>>
>>> Instead of only reclaiming the zone once it is completely unusable,
>>> kick off a reclaim job once the amount of unusable bytes exceeds a user
>>> configurable threshold between 51% and 100%. It can be set per mounted
>>> filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
>>> per default.
>>>
>>> Similar to reclaiming unused block groups, these dirty block groups are
>>> added to a to_reclaim list and then on a transaction commit, the reclaim
>>> process is triggered but after we deleted unused block groups, which will
>>> free space for the relocation process.
>>>
>>> Signed-off-by: Johannes Thumshirn 
>>> ---
>>>
>>> AFAICT sysfs_create_files() does not have the ability to provide a 
>>> is_visible
>>> callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
>>> filesystems as well, even though only for zoned filesystems we're adding 
>>> block
>>> groups to the reclaim list. I'm all ears for a approach that is sensible in
>>> this regard.
>>>
>>>
>>>  fs/btrfs/block-group.c   | 84 
>>>  fs/btrfs/block-group.h   |  2 +
>>>  fs/btrfs/ctree.h |  3 ++
>>>  fs/btrfs/disk-io.c   | 11 +
>>>  fs/btrfs/free-space-cache.c  |  9 +++-
>>>  fs/btrfs/sysfs.c | 35 +++
>>>  fs/btrfs/volumes.c   |  2 +-
>>>  fs/btrfs/volumes.h   |  1 +
>>>  include/trace/events/btrfs.h | 12 ++
>>>  9 files changed, 157 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>>> index 9ae3ac96a521..af9026795ddd 100644
>>> --- a/fs/btrfs/block-group.c
>>> +++ b/fs/btrfs/block-group.c
>>> @@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group 
>>> *bg)
>>> spin_unlock(&fs_info->unused_bgs_lock);
>>>  }
>>>
>>> +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
>>> +{
>>> +   struct btrfs_block_group *bg;
>>> +   struct btrfs_space_info *space_info;
>>> +   int ret = 0;
>>> +
>>> +   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
>>> +   return;
>>> +
>>> +   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
>>> +   return;
>>> +
>>> +   mutex_lock(&fs_info->reclaim_bgs_lock);
>>> +   while (!list_empty(&fs_info->reclaim_bgs)) {
>>> +   bg = list_first_entry(&fs_info->reclaim_bgs,
>>> + struct btrfs_block_group,
>>> + bg_list);
>>> +   list_del_init(&bg->bg_list);
>>> +
>>> +   space_info = bg->space_info;
>>> +   mutex_unlock(&fs_info->reclaim_bgs_lock);
>>> +
>>> +   /* Don't want to race with allocators so take the 
>>> groups_sem */
>>> +   down_write(&space_info->groups_sem);
>>> +
>>> +   spin_lock(&bg->lock);
>>> +   if (bg->reserved || bg->pinned || bg->ro) {
>>> +   /*
>>> +* We want to bail if we made new allocations or 
>>> have
>>> +* outstanding allocations in this block group.  We 
>>> do
>>> +* the ro check in case balance is currently acting 
>>> on
>>> +* this block group.
>>> +*/
>>> +   spin_unlock(&bg->lock);
>>> +   up_write(&space_info->groups_sem);
>>> +   goto next;
>>> +   }
>>> +   spin_unlock(&bg->lock);
>>> +
>>> +   ret = inc_block_group_ro(bg, 0);
>>> +   up_write(&space_info->groups_sem);
>>> +   if (ret < 0) {
>>> +   ret = 0;
>>> +   goto next;
>>> +   }
>>> +
>>> +   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
>>> +   trace_btrfs_reclaim_block_group(bg);
>>> +   ret = btrfs_relocate_chunk(fs_info, bg->start);
>>
>> I think you forgot to test this with lockdep enabled, it should have
>> complained loudly otherwise.
>>
>> btrfs_relocate_chunk() calls lockdep_assert_head() to verify we are
>> holding fs_info->reclaim_bgs_lock,
>> but we just unlocked it.
> 
> I thought I did, but you're right. I'll need to send a v3 anyways addressing

OK as this was bugging m

Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-23 Thread Johannes Thumshirn
On 23/03/2021 10:57, Filipe Manana wrote:
> On Fri, Mar 19, 2021 at 10:52 AM Johannes Thumshirn
>  wrote:
>>
>> When a file gets deleted on a zoned file system, the space freed is not
>> returned back into the block group's free space, but is migrated to
>> zone_unusable.
>>
>> As this zone_unusable space is behind the current write pointer it is not
>> possible to use it for new allocations. In the current implementation a
>> zone is reset once all of the block group's space is accounted as zone
>> unusable.
>>
>> This behaviour can lead to premature ENOSPC errors on a busy file system.
>>
>> Instead of only reclaiming the zone once it is completely unusable,
>> kick off a reclaim job once the amount of unusable bytes exceeds a user
>> configurable threshold between 51% and 100%. It can be set per mounted
>> filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
>> per default.
>>
>> Similar to reclaiming unused block groups, these dirty block groups are
>> added to a to_reclaim list and then on a transaction commit, the reclaim
>> process is triggered but after we deleted unused block groups, which will
>> free space for the relocation process.
>>
>> Signed-off-by: Johannes Thumshirn 
>> ---
>>
>> AFAICT sysfs_create_files() does not have the ability to provide a is_visible
>> callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
>> filesystems as well, even though only for zoned filesystems we're adding 
>> block
>> groups to the reclaim list. I'm all ears for a approach that is sensible in
>> this regard.
>>
>>
>>  fs/btrfs/block-group.c   | 84 
>>  fs/btrfs/block-group.h   |  2 +
>>  fs/btrfs/ctree.h |  3 ++
>>  fs/btrfs/disk-io.c   | 11 +
>>  fs/btrfs/free-space-cache.c  |  9 +++-
>>  fs/btrfs/sysfs.c | 35 +++
>>  fs/btrfs/volumes.c   |  2 +-
>>  fs/btrfs/volumes.h   |  1 +
>>  include/trace/events/btrfs.h | 12 ++
>>  9 files changed, 157 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index 9ae3ac96a521..af9026795ddd 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group 
>> *bg)
>> spin_unlock(&fs_info->unused_bgs_lock);
>>  }
>>
>> +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
>> +{
>> +   struct btrfs_block_group *bg;
>> +   struct btrfs_space_info *space_info;
>> +   int ret = 0;
>> +
>> +   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
>> +   return;
>> +
>> +   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
>> +   return;
>> +
>> +   mutex_lock(&fs_info->reclaim_bgs_lock);
>> +   while (!list_empty(&fs_info->reclaim_bgs)) {
>> +   bg = list_first_entry(&fs_info->reclaim_bgs,
>> + struct btrfs_block_group,
>> + bg_list);
>> +   list_del_init(&bg->bg_list);
>> +
>> +   space_info = bg->space_info;
>> +   mutex_unlock(&fs_info->reclaim_bgs_lock);
>> +
>> +   /* Don't want to race with allocators so take the groups_sem 
>> */
>> +   down_write(&space_info->groups_sem);
>> +
>> +   spin_lock(&bg->lock);
>> +   if (bg->reserved || bg->pinned || bg->ro) {
>> +   /*
>> +* We want to bail if we made new allocations or have
>> +* outstanding allocations in this block group.  We 
>> do
>> +* the ro check in case balance is currently acting 
>> on
>> +* this block group.
>> +*/
>> +   spin_unlock(&bg->lock);
>> +   up_write(&space_info->groups_sem);
>> +   goto next;
>> +   }
>> +   spin_unlock(&bg->lock);
>> +
>> +   ret = inc_block_group_ro(bg, 0);
>> +   up_write(&space_info->groups_sem);
>> +   if (ret < 0) {
>> +   ret = 0;
>> +   goto next;
>> +   }
>> +
>> +   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
>> +   trace_btrfs_reclaim_block_group(bg);
>> +   ret = btrfs_relocate_chunk(fs_info, bg->start);
> 
> I think you forgot to test this with lockdep enabled, it should have
> complained loudly otherwise.
> 
> btrfs_relocate_chunk() calls lockdep_assert_head() to verify we are
> holding fs_info->reclaim_bgs_lock,
> but we just unlocked it.

I thought I did, but you're right. I'll need to send a v3 anyways addressing
Josef and Anand's comments.

Thanks,
Johannes


Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-23 Thread Filipe Manana
On Fri, Mar 19, 2021 at 10:52 AM Johannes Thumshirn
 wrote:
>
> When a file gets deleted on a zoned file system, the space freed is not
> returned back into the block group's free space, but is migrated to
> zone_unusable.
>
> As this zone_unusable space is behind the current write pointer it is not
> possible to use it for new allocations. In the current implementation a
> zone is reset once all of the block group's space is accounted as zone
> unusable.
>
> This behaviour can lead to premature ENOSPC errors on a busy file system.
>
> Instead of only reclaiming the zone once it is completely unusable,
> kick off a reclaim job once the amount of unusable bytes exceeds a user
> configurable threshold between 51% and 100%. It can be set per mounted
> filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
> per default.
>
> Similar to reclaiming unused block groups, these dirty block groups are
> added to a to_reclaim list and then on a transaction commit, the reclaim
> process is triggered but after we deleted unused block groups, which will
> free space for the relocation process.
>
> Signed-off-by: Johannes Thumshirn 
> ---
>
> AFAICT sysfs_create_files() does not have the ability to provide a is_visible
> callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
> filesystems as well, even though only for zoned filesystems we're adding block
> groups to the reclaim list. I'm all ears for a approach that is sensible in
> this regard.
>
>
>  fs/btrfs/block-group.c   | 84 
>  fs/btrfs/block-group.h   |  2 +
>  fs/btrfs/ctree.h |  3 ++
>  fs/btrfs/disk-io.c   | 11 +
>  fs/btrfs/free-space-cache.c  |  9 +++-
>  fs/btrfs/sysfs.c | 35 +++
>  fs/btrfs/volumes.c   |  2 +-
>  fs/btrfs/volumes.h   |  1 +
>  include/trace/events/btrfs.h | 12 ++
>  9 files changed, 157 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 9ae3ac96a521..af9026795ddd 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
> spin_unlock(&fs_info->unused_bgs_lock);
>  }
>
> +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
> +{
> +   struct btrfs_block_group *bg;
> +   struct btrfs_space_info *space_info;
> +   int ret = 0;
> +
> +   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
> +   return;
> +
> +   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
> +   return;
> +
> +   mutex_lock(&fs_info->reclaim_bgs_lock);
> +   while (!list_empty(&fs_info->reclaim_bgs)) {
> +   bg = list_first_entry(&fs_info->reclaim_bgs,
> + struct btrfs_block_group,
> + bg_list);
> +   list_del_init(&bg->bg_list);
> +
> +   space_info = bg->space_info;
> +   mutex_unlock(&fs_info->reclaim_bgs_lock);
> +
> +   /* Don't want to race with allocators so take the groups_sem 
> */
> +   down_write(&space_info->groups_sem);
> +
> +   spin_lock(&bg->lock);
> +   if (bg->reserved || bg->pinned || bg->ro) {
> +   /*
> +* We want to bail if we made new allocations or have
> +* outstanding allocations in this block group.  We do
> +* the ro check in case balance is currently acting on
> +* this block group.
> +*/
> +   spin_unlock(&bg->lock);
> +   up_write(&space_info->groups_sem);
> +   goto next;
> +   }
> +   spin_unlock(&bg->lock);
> +
> +   ret = inc_block_group_ro(bg, 0);
> +   up_write(&space_info->groups_sem);
> +   if (ret < 0) {
> +   ret = 0;
> +   goto next;
> +   }
> +
> +   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
> +   trace_btrfs_reclaim_block_group(bg);
> +   ret = btrfs_relocate_chunk(fs_info, bg->start);

I think you forgot to test this with lockdep enabled, it should have
complained loudly otherwise.

btrfs_relocate_chunk() calls lockdep_assert_head() to verify we are
holding fs_info->reclaim_bgs_lock,
but we just unlocked it.

Thanks.



> +   if (ret)
> +   btrfs_err(fs_info, "error relocating chunk %llu",
> + bg->start);
> +
> +next:
> +   btrfs_put_block_group(bg);
> +   mutex_lock(&fs_info->reclaim_bgs_lock);
> +   }
> +   mutex_unlock(&fs_info->reclaim_bgs_lock);
> +   btrfs_exclop_finish(fs_info);
> +}
> +
> +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
> 

Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-23 Thread Naohiro Aota
On Fri, Mar 19, 2021 at 01:59:02PM -0400, Josef Bacik wrote:
> On 3/19/21 6:48 AM, Johannes Thumshirn wrote:
> > When a file gets deleted on a zoned file system, the space freed is not
> > returned back into the block group's free space, but is migrated to
> > zone_unusable.
> > 
> > As this zone_unusable space is behind the current write pointer it is not
> > possible to use it for new allocations. In the current implementation a
> > zone is reset once all of the block group's space is accounted as zone
> > unusable.
> > 
> > This behaviour can lead to premature ENOSPC errors on a busy file system.
> > 
> > Instead of only reclaiming the zone once it is completely unusable,
> > kick off a reclaim job once the amount of unusable bytes exceeds a user
> > configurable threshold between 51% and 100%. It can be set per mounted
> > filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
> > per default.
> > 
> > Similar to reclaiming unused block groups, these dirty block groups are
> > added to a to_reclaim list and then on a transaction commit, the reclaim
> > process is triggered but after we deleted unused block groups, which will
> > free space for the relocation process.
> > 
> > Signed-off-by: Johannes Thumshirn 
> > ---
> > 
> > AFAICT sysfs_create_files() does not have the ability to provide a 
> > is_visible
> > callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
> > filesystems as well, even though only for zoned filesystems we're adding 
> > block
> > groups to the reclaim list. I'm all ears for a approach that is sensible in
> > this regard.
> > 
> > 
> >   fs/btrfs/block-group.c   | 84 
> >   fs/btrfs/block-group.h   |  2 +
> >   fs/btrfs/ctree.h |  3 ++
> >   fs/btrfs/disk-io.c   | 11 +
> >   fs/btrfs/free-space-cache.c  |  9 +++-
> >   fs/btrfs/sysfs.c | 35 +++
> >   fs/btrfs/volumes.c   |  2 +-
> >   fs/btrfs/volumes.h   |  1 +
> >   include/trace/events/btrfs.h | 12 ++
> >   9 files changed, 157 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> > index 9ae3ac96a521..af9026795ddd 100644
> > --- a/fs/btrfs/block-group.c
> > +++ b/fs/btrfs/block-group.c
> > @@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group 
> > *bg)
> > spin_unlock(&fs_info->unused_bgs_lock);
> >   }
> > +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
> > +{
> > +   struct btrfs_block_group *bg;
> > +   struct btrfs_space_info *space_info;
> > +   int ret = 0;
> > +
> > +   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
> > +   return;
> > +
> > +   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
> > +   return;
> > +
> > +   mutex_lock(&fs_info->reclaim_bgs_lock);
> > +   while (!list_empty(&fs_info->reclaim_bgs)) {
> > +   bg = list_first_entry(&fs_info->reclaim_bgs,
> > + struct btrfs_block_group,
> > + bg_list);
> > +   list_del_init(&bg->bg_list);
> > +
> > +   space_info = bg->space_info;
> > +   mutex_unlock(&fs_info->reclaim_bgs_lock);
> > +
> > +   /* Don't want to race with allocators so take the groups_sem */
> > +   down_write(&space_info->groups_sem);
> > +
> > +   spin_lock(&bg->lock);
> > +   if (bg->reserved || bg->pinned || bg->ro) {
> 
> How do we deal with backup supers in zoned again?  Will they show up as
> readonly?  If so we may not want the bg->ro check, but I may be insane.

No superblock/backups are placed into a zone composing a block group,
because, if placed, it becomes a hole blocking sequential writes. The
zones containing superblock/backups are reserved and no device extents
are allocated there. So, bg->ro == 0, if the block group is read-write.


Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-22 Thread Anand Jain



On 19/03/2021 18:48, Johannes Thumshirn wrote:

When a file gets deleted on a zoned file system, the space freed is not
returned back into the block group's free space, but is migrated to
zone_unusable.

As this zone_unusable space is behind the current write pointer it is not
possible to use it for new allocations. In the current implementation a
zone is reset once all of the block group's space is accounted as zone
unusable.

This behaviour can lead to premature ENOSPC errors on a busy file system.

Instead of only reclaiming the zone once it is completely unusable,
kick off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%. It can be set per mounted
filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
per default.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list and then on a transaction commit, the reclaim
process is triggered but after we deleted unused block groups, which will
free space for the relocation process.



Still, in the code below, I don't see the zone write pointer reset of
the zone_unusable done here. Where does that happen? Or what did I miss?


Signed-off-by: Johannes Thumshirn 
---




AFAICT sysfs_create_files() does not have the ability to provide a is_visible
callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
filesystems as well, even though only for zoned filesystems we're adding block
groups to the reclaim list. I'm all ears for a approach that is sensible in
this regard.


All per mounted filesystem kobjects remain visible, even if it doesn't
make sense in some configs. So it is consistent with the rest.

Thanks, Anand




  fs/btrfs/block-group.c   | 84 
  fs/btrfs/block-group.h   |  2 +
  fs/btrfs/ctree.h |  3 ++
  fs/btrfs/disk-io.c   | 11 +
  fs/btrfs/free-space-cache.c  |  9 +++-
  fs/btrfs/sysfs.c | 35 +++
  fs/btrfs/volumes.c   |  2 +-
  fs/btrfs/volumes.h   |  1 +
  include/trace/events/btrfs.h | 12 ++
  9 files changed, 157 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9ae3ac96a521..af9026795ddd 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
  }
  
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)

+{
+   struct btrfs_block_group *bg;
+   struct btrfs_space_info *space_info;
+   int ret = 0;
+
+   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+   return;
+
+   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+   return;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   while (!list_empty(&fs_info->reclaim_bgs)) {
+   bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+   list_del_init(&bg->bg_list);
+
+   space_info = bg->space_info;
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+   /* Don't want to race with allocators so take the groups_sem */
+   down_write(&space_info->groups_sem);
+
+   spin_lock(&bg->lock);
+   if (bg->reserved || bg->pinned || bg->ro) {
+   /*
+* We want to bail if we made new allocations or have
+* outstanding allocations in this block group.  We do
+* the ro check in case balance is currently acting on
+* this block group.
+*/
+   spin_unlock(&bg->lock);
+   up_write(&space_info->groups_sem);
+   goto next;
+   }
+   spin_unlock(&bg->lock);
+
+   ret = inc_block_group_ro(bg, 0);
+   up_write(&space_info->groups_sem);
+   if (ret < 0) {
+   ret = 0;
+   goto next;
+   }
+
+   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
+   trace_btrfs_reclaim_block_group(bg);
+   ret = btrfs_relocate_chunk(fs_info, bg->start);
+   if (ret)
+   btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+
+next:
+   btrfs_put_block_group(bg);
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   }
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+   btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+   struct btrfs_fs_info *fs_info = bg->fs_info;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   if (list_empty(&bg->bg_list)) {
+

Re: [PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-19 Thread Josef Bacik

On 3/19/21 6:48 AM, Johannes Thumshirn wrote:

When a file gets deleted on a zoned file system, the space freed is not
returned back into the block group's free space, but is migrated to
zone_unusable.

As this zone_unusable space is behind the current write pointer it is not
possible to use it for new allocations. In the current implementation a
zone is reset once all of the block group's space is accounted as zone
unusable.

This behaviour can lead to premature ENOSPC errors on a busy file system.

Instead of only reclaiming the zone once it is completely unusable,
kick off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%. It can be set per mounted
filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
per default.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list and then on a transaction commit, the reclaim
process is triggered but after we deleted unused block groups, which will
free space for the relocation process.

Signed-off-by: Johannes Thumshirn 
---

AFAICT sysfs_create_files() does not have the ability to provide a is_visible
callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
filesystems as well, even though only for zoned filesystems we're adding block
groups to the reclaim list. I'm all ears for a approach that is sensible in
this regard.


  fs/btrfs/block-group.c   | 84 
  fs/btrfs/block-group.h   |  2 +
  fs/btrfs/ctree.h |  3 ++
  fs/btrfs/disk-io.c   | 11 +
  fs/btrfs/free-space-cache.c  |  9 +++-
  fs/btrfs/sysfs.c | 35 +++
  fs/btrfs/volumes.c   |  2 +-
  fs/btrfs/volumes.h   |  1 +
  include/trace/events/btrfs.h | 12 ++
  9 files changed, 157 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9ae3ac96a521..af9026795ddd 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
  }
  
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)

+{
+   struct btrfs_block_group *bg;
+   struct btrfs_space_info *space_info;
+   int ret = 0;
+
+   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+   return;
+
+   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+   return;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   while (!list_empty(&fs_info->reclaim_bgs)) {
+   bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+   list_del_init(&bg->bg_list);
+
+   space_info = bg->space_info;
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+   /* Don't want to race with allocators so take the groups_sem */
+   down_write(&space_info->groups_sem);
+
+   spin_lock(&bg->lock);
+   if (bg->reserved || bg->pinned || bg->ro) {


How do we deal with backup supers in zoned again?  Will they show up as 
readonly?  If so we may not want the bg->ro check, but I may be insane.



+   /*
+* We want to bail if we made new allocations or have
+* outstanding allocations in this block group.  We do
+* the ro check in case balance is currently acting on
+* this block group.
+*/
+   spin_unlock(&bg->lock);
+   up_write(&space_info->groups_sem);
+   goto next;
+   }
+   spin_unlock(&bg->lock);
+
+   ret = inc_block_group_ro(bg, 0);
+   up_write(&space_info->groups_sem);
+   if (ret < 0) {
+   ret = 0;
+   goto next;
+   }
+
+   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
+   trace_btrfs_reclaim_block_group(bg);
+   ret = btrfs_relocate_chunk(fs_info, bg->start);
+   if (ret)
+   btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+
+next:
+   btrfs_put_block_group(bg);
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   }
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+   btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+   struct btrfs_fs_info *fs_info = bg->fs_info;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   if (list_empty(&bg->bg_list)) {
+   btrfs_get_block_group(bg);
+   trace_btrfs_add_reclaim_block_group(bg);
+   list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs

[PATCH v2 2/2] btrfs: zoned: automatically reclaim zones

2021-03-19 Thread Johannes Thumshirn
When a file gets deleted on a zoned file system, the space freed is not
returned back into the block group's free space, but is migrated to
zone_unusable.

As this zone_unusable space is behind the current write pointer it is not
possible to use it for new allocations. In the current implementation a
zone is reset once all of the block group's space is accounted as zone
unusable.

This behaviour can lead to premature ENOSPC errors on a busy file system.

Instead of only reclaiming the zone once it is completely unusable,
kick off a reclaim job once the amount of unusable bytes exceeds a user
configurable threshold between 51% and 100%. It can be set per mounted
filesystem via the sysfs tunable bg_reclaim_threshold which is set to 75%
per default.

Similar to reclaiming unused block groups, these dirty block groups are
added to a to_reclaim list and then on a transaction commit, the reclaim
process is triggered but after we deleted unused block groups, which will
free space for the relocation process.

Signed-off-by: Johannes Thumshirn 
---

AFAICT sysfs_create_files() does not have the ability to provide a is_visible
callback, so the bg_reclaim_threshold sysfs file is visible for non zoned
filesystems as well, even though only for zoned filesystems we're adding block
groups to the reclaim list. I'm all ears for a approach that is sensible in
this regard.


 fs/btrfs/block-group.c   | 84 
 fs/btrfs/block-group.h   |  2 +
 fs/btrfs/ctree.h |  3 ++
 fs/btrfs/disk-io.c   | 11 +
 fs/btrfs/free-space-cache.c  |  9 +++-
 fs/btrfs/sysfs.c | 35 +++
 fs/btrfs/volumes.c   |  2 +-
 fs/btrfs/volumes.h   |  1 +
 include/trace/events/btrfs.h | 12 ++
 9 files changed, 157 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 9ae3ac96a521..af9026795ddd 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1485,6 +1485,80 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
 }
 
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_block_group *bg;
+   struct btrfs_space_info *space_info;
+   int ret = 0;
+
+   if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+   return;
+
+   if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+   return;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   while (!list_empty(&fs_info->reclaim_bgs)) {
+   bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+   list_del_init(&bg->bg_list);
+
+   space_info = bg->space_info;
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+   /* Don't want to race with allocators so take the groups_sem */
+   down_write(&space_info->groups_sem);
+
+   spin_lock(&bg->lock);
+   if (bg->reserved || bg->pinned || bg->ro) {
+   /*
+* We want to bail if we made new allocations or have
+* outstanding allocations in this block group.  We do
+* the ro check in case balance is currently acting on
+* this block group.
+*/
+   spin_unlock(&bg->lock);
+   up_write(&space_info->groups_sem);
+   goto next;
+   }
+   spin_unlock(&bg->lock);
+
+   ret = inc_block_group_ro(bg, 0);
+   up_write(&space_info->groups_sem);
+   if (ret < 0) {
+   ret = 0;
+   goto next;
+   }
+
+   btrfs_info(fs_info, "reclaiming chunk %llu", bg->start);
+   trace_btrfs_reclaim_block_group(bg);
+   ret = btrfs_relocate_chunk(fs_info, bg->start);
+   if (ret)
+   btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+
+next:
+   btrfs_put_block_group(bg);
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   }
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+   btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+   struct btrfs_fs_info *fs_info = bg->fs_info;
+
+   mutex_lock(&fs_info->reclaim_bgs_lock);
+   if (list_empty(&bg->bg_list)) {
+   btrfs_get_block_group(bg);
+   trace_btrfs_add_reclaim_block_group(bg);
+   list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+   }
+   mutex_unlock(&fs_info->reclaim_bgs_lock);
+}
+
 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key 
*key,
   struct btrfs_path *path)
 {
@@ -3