Re: [PATCH 07/19] btrfs: do sequential extent allocation in HMZONED mode

Naohiro Aota Tue, 18 Jun 2019 01:28:29 -0700

On 2019/06/13 23:07, Josef Bacik wrote:
> On Fri, Jun 07, 2019 at 10:10:13PM +0900, Naohiro Aota wrote:
>> @@ -9616,7 +9701,8 @@ static int inc_block_group_ro(struct 
>> btrfs_block_group_cache *cache, int force)
>>      }
>>   
>>      num_bytes = cache->key.offset - cache->reserved - cache->pinned -
>> -                cache->bytes_super - btrfs_block_group_used(&cache->item);
>> +                cache->bytes_super - cache->unusable -
>> +                btrfs_block_group_used(&cache->item);
>>      sinfo_used = btrfs_space_info_used(sinfo, true);
>>   
>>      if (sinfo_used + num_bytes + min_allocable_bytes <=
>> @@ -9766,6 +9852,7 @@ void btrfs_dec_block_group_ro(struct 
>> btrfs_block_group_cache *cache)
>>      if (!--cache->ro) {
>>              num_bytes = cache->key.offset - cache->reserved -
>>                          cache->pinned - cache->bytes_super -
>> +                        cache->unusable -
>>                          btrfs_block_group_used(&cache->item);
> 
> You've done this in a few places, but not all the places, most notably
> btrfs_space_info_used() which is used in the space reservation code a lot.


I added "unusable" to struct btrfs_block_group_cache, but added
nothing to struct btrfs_space_info. Once an extent is allocated and
freed in an ALLOC_SEQ Block Group, that extent is never reused
until we remove the BG. I'm accounting the size of such regions
in "cache->unusable" and in "space_info->bytes_readonly". So,
btrfs_space_info_used() does not need to be modified.

I admit it's confusing here. I can add "bytes_zone_unusable" to
struct btrfs_space_info, if that would be better.

>>              sinfo->bytes_readonly -= num_bytes;
>>              list_del_init(&cache->ro_list);
>> @@ -10200,11 +10287,240 @@ static void link_block_group(struct 
>> btrfs_block_group_cache *cache)
>>      }
>>   }
>>   
>> +static int
>> +btrfs_get_block_group_alloc_offset(struct btrfs_block_group_cache *cache)
>> +{
>> +    struct btrfs_fs_info *fs_info = cache->fs_info;
>> +    struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
>> +    struct extent_map *em;
>> +    struct map_lookup *map;
>> +    struct btrfs_device *device;
>> +    u64 logical = cache->key.objectid;
>> +    u64 length = cache->key.offset;
>> +    u64 physical = 0;
>> +    int ret, alloc_type;
>> +    int i, j;
>> +    u64 *alloc_offsets = NULL;
>> +
>> +#define WP_MISSING_DEV ((u64)-1)
>> +
>> +    /* Sanity check */
>> +    if (!IS_ALIGNED(length, fs_info->zone_size)) {
>> +            btrfs_err(fs_info, "unaligned block group at %llu + %llu",
>> +                      logical, length);
>> +            return -EIO;
>> +    }
>> +
>> +    /* Get the chunk mapping */
>> +    em_tree = &fs_info->mapping_tree.map_tree;
>> +    read_lock(&em_tree->lock);
>> +    em = lookup_extent_mapping(em_tree, logical, length);
>> +    read_unlock(&em_tree->lock);
>> +
>> +    if (!em)
>> +            return -EINVAL;
>> +
>> +    map = em->map_lookup;
>> +
>> +    /*
>> +     * Get the zone type: if the group is mapped to a non-sequential zone,
>> +     * there is no need for the allocation offset (fit allocation is OK).
>> +     */
>> +    alloc_type = -1;
>> +    alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets),
>> +                            GFP_NOFS);
>> +    if (!alloc_offsets) {
>> +            free_extent_map(em);
>> +            return -ENOMEM;
>> +    }
>> +
>> +    for (i = 0; i < map->num_stripes; i++) {
>> +            int is_sequential;
>> +            struct blk_zone zone;
>> +
>> +            device = map->stripes[i].dev;
>> +            physical = map->stripes[i].physical;
>> +
>> +            if (device->bdev == NULL) {
>> +                    alloc_offsets[i] = WP_MISSING_DEV;
>> +                    continue;
>> +            }
>> +
>> +            is_sequential = btrfs_dev_is_sequential(device, physical);
>> +            if (alloc_type == -1)
>> +                    alloc_type = is_sequential ?
>> +                                    BTRFS_ALLOC_SEQ : BTRFS_ALLOC_FIT;
>> +
>> +            if ((is_sequential && alloc_type != BTRFS_ALLOC_SEQ) ||
>> +                (!is_sequential && alloc_type == BTRFS_ALLOC_SEQ)) {
>> +                    btrfs_err(fs_info, "found block group of mixed zone 
>> types");
>> +                    ret = -EIO;
>> +                    goto out;
>> +            }
>> +
>> +            if (!is_sequential)
>> +                    continue;
>> +
>> +            /* this zone will be used for allocation, so mark this
>> +             * zone non-empty
>> +             */
>> +            clear_bit(physical >> device->zone_size_shift,
>> +                      device->empty_zones);
>> +
>> +            /*
>> +             * The group is mapped to a sequential zone. Get the zone write
>> +             * pointer to determine the allocation offset within the zone.
>> +             */
>> +            WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
>> +            ret = btrfs_get_dev_zone(device, physical, &zone, GFP_NOFS);
>> +            if (ret == -EIO || ret == -EOPNOTSUPP) {
>> +                    ret = 0;
>> +                    alloc_offsets[i] = WP_MISSING_DEV;
>> +                    continue;
>> +            } else if (ret) {
>> +                    goto out;
>> +            }
>> +
>> +
>> +            switch (zone.cond) {
>> +            case BLK_ZONE_COND_OFFLINE:
>> +            case BLK_ZONE_COND_READONLY:
>> +                    btrfs_err(fs_info, "Offline/readonly zone %llu",
>> +                              physical >> device->zone_size_shift);
>> +                    alloc_offsets[i] = WP_MISSING_DEV;
>> +                    break;
>> +            case BLK_ZONE_COND_EMPTY:
>> +                    alloc_offsets[i] = 0;
>> +                    break;
>> +            case BLK_ZONE_COND_FULL:
>> +                    alloc_offsets[i] = fs_info->zone_size;
>> +                    break;
>> +            default:
>> +                    /* Partially used zone */
>> +                    alloc_offsets[i] =
>> +                            ((zone.wp - zone.start) << SECTOR_SHIFT);
>> +                    break;
>> +            }
>> +    }
>> +
>> +    if (alloc_type == BTRFS_ALLOC_FIT)
>> +            goto out;
>> +
>> +    switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
>> +    case 0: /* single */
>> +    case BTRFS_BLOCK_GROUP_DUP:
>> +    case BTRFS_BLOCK_GROUP_RAID1:
>> +            cache->alloc_offset = WP_MISSING_DEV;
>> +            for (i = 0; i < map->num_stripes; i++) {
>> +                    if (alloc_offsets[i] == WP_MISSING_DEV)
>> +                            continue;
>> +                    if (cache->alloc_offset == WP_MISSING_DEV)
>> +                            cache->alloc_offset = alloc_offsets[i];
>> +                    if (alloc_offsets[i] == cache->alloc_offset)
>> +                            continue;
>> +
>> +                    btrfs_err(fs_info,
>> +                              "write pointer mismatch: block group %llu",
>> +                              logical);
>> +                    cache->wp_broken = 1;
>> +            }
>> +            break;
>> +    case BTRFS_BLOCK_GROUP_RAID0:
>> +            cache->alloc_offset = 0;
>> +            for (i = 0; i < map->num_stripes; i++) {
>> +                    if (alloc_offsets[i] == WP_MISSING_DEV) {
>> +                            btrfs_err(fs_info,
>> +                                      "cannot recover write pointer: block 
>> group %llu",
>> +                                      logical);
>> +                            cache->wp_broken = 1;
>> +                            continue;
>> +                    }
>> +
>> +                    if (alloc_offsets[0] < alloc_offsets[i]) {
>> +                            btrfs_err(fs_info,
>> +                                      "write pointer mismatch: block group 
>> %llu",
>> +                                      logical);
>> +                            cache->wp_broken = 1;
>> +                            continue;
>> +                    }
>> +
>> +                    cache->alloc_offset += alloc_offsets[i];
>> +            }
>> +            break;
>> +    case BTRFS_BLOCK_GROUP_RAID10:
>> +            /*
>> +             * Pass1: check write pointer of RAID1 level: each pointer
>> +             * should be equal.
>> +             */
>> +            for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
>> +                    int base = i*map->sub_stripes;
>> +                    u64 offset = WP_MISSING_DEV;
>> +
>> +                    for (j = 0; j < map->sub_stripes; j++) {
>> +                            if (alloc_offsets[base+j] == WP_MISSING_DEV)
>> +                                    continue;
>> +                            if (offset == WP_MISSING_DEV)
>> +                                    offset = alloc_offsets[base+j];
>> +                            if (alloc_offsets[base+j] == offset)
>> +                                    continue;
>> +
>> +                            btrfs_err(fs_info,
>> +                                      "write pointer mismatch: block group 
>> %llu",
>> +                                      logical);
>> +                            cache->wp_broken = 1;
>> +                    }
>> +                    for (j = 0; j < map->sub_stripes; j++)
>> +                            alloc_offsets[base+j] = offset;
>> +            }
>> +
>> +            /* Pass2: check write pointer of RAID1 level */
>> +            cache->alloc_offset = 0;
>> +            for (i = 0; i < map->num_stripes / map->sub_stripes; i++) {
>> +                    int base = i*map->sub_stripes;
>> +
>> +                    if (alloc_offsets[base] == WP_MISSING_DEV) {
>> +                            btrfs_err(fs_info,
>> +                                      "cannot recover write pointer: block 
>> group %llu",
>> +                                      logical);
>> +                            cache->wp_broken = 1;
>> +                            continue;
>> +                    }
>> +
>> +                    if (alloc_offsets[0] < alloc_offsets[base]) {
>> +                            btrfs_err(fs_info,
>> +                                      "write pointer mismatch: block group 
>> %llu",
>> +                                      logical);
>> +                            cache->wp_broken = 1;
>> +                            continue;
>> +                    }
>> +
>> +                    cache->alloc_offset += alloc_offsets[base];
>> +            }
>> +            break;
>> +    case BTRFS_BLOCK_GROUP_RAID5:
>> +    case BTRFS_BLOCK_GROUP_RAID6:
>> +            /* RAID5/6 is not supported yet */
>> +    default:
>> +            btrfs_err(fs_info, "Unsupported profile on HMZONED %llu",
>> +                    map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
>> +            ret = -EINVAL;
>> +            goto out;
>> +    }
>> +
>> +out:
>> +    cache->alloc_type = alloc_type;
>> +    kfree(alloc_offsets);
>> +    free_extent_map(em);
>> +
>> +    return ret;
>> +}
>> +
> 
> Move this to the zoned device file that you create.

Sure.

>>   static struct btrfs_block_group_cache *
>>   btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
>>                             u64 start, u64 size)
>>   {
>>      struct btrfs_block_group_cache *cache;
>> +    int ret;
>>   
>>      cache = kzalloc(sizeof(*cache), GFP_NOFS);
>>      if (!cache)
>> @@ -10238,6 +10554,16 @@ btrfs_create_block_group_cache(struct btrfs_fs_info 
>> *fs_info,
>>      atomic_set(&cache->trimming, 0);
>>      mutex_init(&cache->free_space_lock);
>>      btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
>> +    cache->alloc_type = BTRFS_ALLOC_FIT;
>> +    cache->alloc_offset = 0;
>> +
>> +    if (btrfs_fs_incompat(fs_info, HMZONED)) {
>> +            ret = btrfs_get_block_group_alloc_offset(cache);
>> +            if (ret) {
>> +                    kfree(cache);
>> +                    return NULL;
>> +            }
>> +    }
>>   
>>      return cache;
>>   }
>> @@ -10310,6 +10636,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info 
>> *info)
>>      int need_clear = 0;
>>      u64 cache_gen;
>>      u64 feature;
>> +    u64 unusable;
>>      int mixed;
>>   
>>      feature = btrfs_super_incompat_flags(info->super_copy);
>> @@ -10415,6 +10742,26 @@ int btrfs_read_block_groups(struct btrfs_fs_info 
>> *info)
>>                      free_excluded_extents(cache);
>>              }
>>   
>> +            switch (cache->alloc_type) {
>> +            case BTRFS_ALLOC_FIT:
>> +                    unusable = cache->bytes_super;
>> +                    break;
>> +            case BTRFS_ALLOC_SEQ:
>> +                    WARN_ON(cache->bytes_super != 0);
>> +                    unusable = cache->alloc_offset -
>> +                            btrfs_block_group_used(&cache->item);
>> +                    /* we only need ->free_space in ALLOC_SEQ BGs */
>> +                    cache->last_byte_to_unpin = (u64)-1;
>> +                    cache->cached = BTRFS_CACHE_FINISHED;
>> +                    cache->free_space_ctl->free_space =
>> +                            cache->key.offset - cache->alloc_offset;
>> +                    cache->unusable = unusable;
>> +                    free_excluded_extents(cache);
>> +                    break;
>> +            default:
>> +                    BUG();
>> +            }
>> +
>>              ret = btrfs_add_block_group_cache(info, cache);
>>              if (ret) {
>>                      btrfs_remove_free_space_cache(cache);
>> @@ -10425,7 +10772,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info 
>> *info)
>>              trace_btrfs_add_block_group(info, cache, 0);
>>              update_space_info(info, cache->flags, found_key.offset,
>>                                btrfs_block_group_used(&cache->item),
>> -                              cache->bytes_super, &space_info);
>> +                              unusable, &space_info);
>>   
>>              cache->space_info = space_info;
>>   
>> @@ -10438,6 +10785,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info 
>> *info)
>>                      ASSERT(list_empty(&cache->bg_list));
>>                      btrfs_mark_bg_unused(cache);
>>              }
>> +
>> +            if (cache->wp_broken)
>> +                    inc_block_group_ro(cache, 1);
>>      }
>>   
>>      list_for_each_entry_rcu(space_info, &info->space_info, list) {
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index f74dc259307b..cc69dc71f4c1 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -2326,8 +2326,11 @@ int __btrfs_add_free_space(struct btrfs_fs_info 
>> *fs_info,
>>                         u64 offset, u64 bytes)
>>   {
>>      struct btrfs_free_space *info;
>> +    struct btrfs_block_group_cache *block_group = ctl->private;
>>      int ret = 0;
>>   
>> +    WARN_ON(block_group && block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
>>      info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
>>      if (!info)
>>              return -ENOMEM;
>> @@ -2376,6 +2379,28 @@ int __btrfs_add_free_space(struct btrfs_fs_info 
>> *fs_info,
>>      return ret;
>>   }
>>   
>> +int __btrfs_add_free_space_seq(struct btrfs_block_group_cache *block_group,
>> +                           u64 bytenr, u64 size)
>> +{
>> +    struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>> +    u64 offset = bytenr - block_group->key.objectid;
>> +    u64 to_free, to_unusable;
>> +
>> +    spin_lock(&ctl->tree_lock);
>> +    if (offset >= block_group->alloc_offset)
>> +            to_free = size;
>> +    else if (offset + size <= block_group->alloc_offset)
>> +            to_free = 0;
>> +    else
>> +            to_free = offset + size - block_group->alloc_offset;
>> +    to_unusable = size - to_free;
>> +    ctl->free_space += to_free;
>> +    block_group->unusable += to_unusable;
>> +    spin_unlock(&ctl->tree_lock);
>> +    return 0;
>> +
>> +}
>> +
>>   int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
>>                          u64 offset, u64 bytes)
>>   {
>> @@ -2384,6 +2409,8 @@ int btrfs_remove_free_space(struct 
>> btrfs_block_group_cache *block_group,
>>      int ret;
>>      bool re_search = false;
>>   
>> +    WARN_ON(block_group->alloc_type == BTRFS_ALLOC_SEQ);
>> +
> 
> These should probably be ASSERT() right?  Want to make sure the developers
> really notice a problem when testing.  Thanks,
> 
> Josef
> 

Agreed. I will use ASSERT.

Re: [PATCH 07/19] btrfs: do sequential extent allocation in HMZONED mode

Reply via email to