On 2021/03/15 14:55, Naohiro Aota wrote:
> This commit moves the location of superblock logging zones. The location of
> the logging zones is determined based on fixed block addresses instead of
> on fixed zone numbers.

David,

Any comments on this? It would be nice to get this settled in this cycle so that
we have a stable on-disk format going forward. btrfs-tools and libblkid zoned
support patches also depend on this.

> 
> By locating the superblock zones using fixed addresses, we can scan a
> dumped file system image without the zone information. And, no drawbacks
> exist.
> 
> We use the following three pairs of zones containing fixed offset
> locations, regardless of the device zone size.
> 
>   - Primary superblock: zone starting at offset 0 and the following zone
>   - First copy: zone containing offset 64GB and the following zone
>   - Second copy: zone containing offset 256GB and the following zone
> 
> If the location of the zones is outside of the disk, we don't record the
> superblock copy.
> 
> These addresses are arbitrary, but using addresses that are too large
> reduces superblock reliability for smaller devices, so we do not want to
> exceed 1T to cover all cases nicely.
> 
> Also, LBAs are generally distributed initially across one head (platter
> side) up to one or more zones, then go on the next head backward (the other
> side of the same platter), and on to the following head/platter. Thus using
> non-sequential fixed addresses for superblock logging, such as 0/64G/256G,
> likely results in each superblock copy being on a different head/platter
> which improves chances of recovery in case of superblock read error.
> 
> These zones are reserved for superblock logging and never used for data or
> metadata blocks. Zones containing the offsets used to store superblocks in
> a regular btrfs volume (non-zoned case) are also reserved to avoid
> confusion.
> 
> Note that we only reserve the 2 zones per primary/copy actually used for
> superblock logging. We don't reserve the ranges possibly containing
> superblock with the largest supported zone size (0-16GB, 64G-80GB,
> 256G-272GB).
> 
> The first copy position is much larger than for a regular btrfs volume
> (64M).  This increase is to avoid overlapping with the log zones for the
> primary superblock. This higher location is arbitrary but allows supporting
> devices with very large zone size, up to 32GB. But we only allow zone sizes
> up to 8GB for now.
> 
> Signed-off-by: Naohiro Aota <naohiro.a...@wdc.com>
> ---
>  fs/btrfs/zoned.c | 39 +++++++++++++++++++++++++++++++--------
>  1 file changed, 31 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index 43948bd40e02..6a72ca1f7988 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -21,9 +21,24 @@
>  /* Pseudo write pointer value for conventional zone */
>  #define WP_CONVENTIONAL ((u64)-2)
>  
> +/*
> + * Location of the first zone of superblock logging zone pairs.
> + * - Primary superblock: the zone containing offset 0 (zone 0)
> + * - First superblock copy: the zone containing offset 64G
> + * - Second superblock copy: the zone containing offset 256G
> + */
> +#define BTRFS_PRIMARY_SB_LOG_ZONE 0ULL
> +#define BTRFS_FIRST_SB_LOG_ZONE (64ULL * SZ_1G)
> +#define BTRFS_SECOND_SB_LOG_ZONE (256ULL * SZ_1G)
> +#define BTRFS_FIRST_SB_LOG_ZONE_SHIFT const_ilog2(BTRFS_FIRST_SB_LOG_ZONE)
> +#define BTRFS_SECOND_SB_LOG_ZONE_SHIFT const_ilog2(BTRFS_SECOND_SB_LOG_ZONE)
> +
>  /* Number of superblock log zones */
>  #define BTRFS_NR_SB_LOG_ZONES 2
>  
> +/* Max size of supported zone size */
> +#define BTRFS_MAX_ZONE_SIZE SZ_8G
> +
>  static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void 
> *data)
>  {
>       struct blk_zone *zones = data;
> @@ -111,11 +126,8 @@ static int sb_write_pointer(struct block_device *bdev, 
> struct blk_zone *zones,
>  }
>  
>  /*
> - * The following zones are reserved as the circular buffer on ZONED btrfs.
> - *  - The primary superblock: zones 0 and 1
> - *  - The first copy: zones 16 and 17
> - *  - The second copy: zones 1024 or zone at 256GB which is minimum, and
> - *                     the following one
> + * Get the zone number of the first zone of a pair of contiguous zones used
> + * for superblock logging.
>   */
>  static inline u32 sb_zone_number(int shift, int mirror)
>  {
> @@ -123,8 +135,8 @@ static inline u32 sb_zone_number(int shift, int mirror)
>  
>       switch (mirror) {
>       case 0: return 0;
> -     case 1: return 16;
> -     case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
> +     case 1: return 1 << (BTRFS_FIRST_SB_LOG_ZONE_SHIFT - shift);
> +     case 2: return 1 << (BTRFS_SECOND_SB_LOG_ZONE_SHIFT - shift);
>       }
>  
>       return 0;
> @@ -300,10 +312,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
>               zone_sectors = bdev_zone_sectors(bdev);
>       }
>  
> -     nr_sectors = bdev_nr_sectors(bdev);
>       /* Check if it's power of 2 (see is_power_of_2) */
>       ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
>       zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
> +
> +     /* We reject devices with a zone size larger than 8GB. */
> +     if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
> +             btrfs_err_in_rcu(fs_info,
> +                              "zoned: %s: zone size %llu is too large",
> +                              rcu_str_deref(device->name),
> +                              zone_info->zone_size);
> +             ret = -EINVAL;
> +             goto out;
> +     }
> +
> +     nr_sectors = bdev_nr_sectors(bdev);
>       zone_info->zone_size_shift = ilog2(zone_info->zone_size);
>       zone_info->max_zone_append_size =
>               (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
> 


-- 
Damien Le Moal
Western Digital Research

Reply via email to