On 4/9/25 8:51 AM, Benjamin Marzinski wrote:
> dm_revalidate_zones() only allowed new or previously unzoned devices to
> call blk_revalidate_disk_zones(). If the device was already zoned,
> disk->nr_zones would always equal md->nr_zones, so dm_revalidate_zones()
> returned without doing any work. This would make the zoned settings for
> the device not match the new table. If the device had zone write plug
> resources, it could run into errors like bdev_zone_is_seq() reading
> invalid memory because disk->conv_zones_bitmap was the wrong size.
>
> If the device doesn't have any zone write plug resources, calling
> blk_revalidate_disk_zones() will always correctly update device. If
> blk_revalidate_disk_zones() fails, it can still overwrite or clear the
> current disk->nr_zones value. In this case, DM must restore the previous
> value of disk->nr_zones, so that the zoned settings will continue to
> match the previous that it failed back to.
s/previous/previous value ?
s/failed/fell ?
>
> If the device already has zone write plug resources,
> blk_revalidate_disk_zones() will not correctly update them, if it is
> called for arbitrary zoned device changes. Since there is not much need
> for this ability, the easiest solution is to disallow any table reloads
> that change the zoned settings, for devices that already have zone plug
> resources. Specifically, if a device already has zone plug resources
> allocated, it can only switch to another zoned table that also emulates
> zone append. Also, it cannot change the device size or the zone size. A
> device can switch to an error target.
>
> Fixes: bb37d77239af2 ("dm: introduce zone append emulation")
> Signed-off-by: Benjamin Marzinski <[email protected]>
[...]
> @@ -1873,11 +1898,17 @@ int dm_table_set_restrictions(struct dm_table *t,
> struct request_queue *q,
> limits->features &= ~BLK_FEAT_DAX;
>
> /* For a zoned table, setup the zone related queue attributes. */
> - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> - (limits->features & BLK_FEAT_ZONED)) {
> - r = dm_set_zones_restrictions(t, q, limits);
> - if (r)
> - return r;
> + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
> + if (limits->features & BLK_FEAT_ZONED) {
> + r = dm_set_zones_restrictions(t, q, limits);
> + if (r)
> + return r;
> + } else if (dm_has_zone_plugs(t->md)) {
> + DMWARN("%s: device has zone write plug resources. "
> + "Cannot switch to non-zoned table.",
> + dm_device_name(t->md));
> + return -EINVAL;
> + }
I am confused with this one. We can only have zone write plugs if the device is
zoned. So it seems that the check for "if (dm_has_zone_plugs(t->md)" should
really be inside dm_set_zones_restrictions(). Or is this trying to detect a
table change that would switch the device from zoned to non-zoned ? If that is
the case, regardless of the zone write plug initialization state, we should
never allow such change.
> }
>
> if (dm_table_supports_atomic_writes(t))
> diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
> index 11e19281bb64..1d419734fefc 100644
> --- a/drivers/md/dm-zone.c
> +++ b/drivers/md/dm-zone.c
> @@ -158,22 +158,22 @@ int dm_revalidate_zones(struct dm_table *t, struct
> request_queue *q)
> {
> struct mapped_device *md = t->md;
> struct gendisk *disk = md->disk;
> + unsigned int nr_zones = disk->nr_zones;
> int ret;
>
> if (!get_capacity(disk))
> return 0;
>
> - /* Revalidate only if something changed. */
> - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) {
> - DMINFO("%s using %s zone append",
> - disk->disk_name,
> - queue_emulates_zone_append(q) ? "emulated" : "native");
> - md->nr_zones = 0;
> - }
> -
> - if (md->nr_zones)
> + /*
> + * Do not revalidate if zone append emulation resources have already
> + * been allocated.
Maybe better to rephrased this (to be precise) to something like:
/*
* Do not revalidate if zone write plug resources have already
* been allocated.
*/
> + */
> + if (dm_has_zone_plugs(md))
> return 0;
>
> + DMINFO("%s using %s zone append", disk->disk_name,
> + queue_emulates_zone_append(q) ? "emulated" : "native");
> +
> /*
> * Our table is not live yet. So the call to dm_get_live_table()
> * in dm_blk_report_zones() will fail. Set a temporary pointer to
> @@ -187,6 +187,7 @@ int dm_revalidate_zones(struct dm_table *t, struct
> request_queue *q)
>
> if (ret) {
> DMERR("Revalidate zones failed %d", ret);
> + disk->nr_zones = nr_zones;
> return ret;
> }
>
> @@ -383,12 +384,28 @@ int dm_set_zones_restrictions(struct dm_table *t,
> struct request_queue *q,
> lim->max_open_zones = 0;
> lim->max_active_zones = 0;
> lim->max_hw_zone_append_sectors = 0;
> + lim->max_zone_append_sectors = 0;
> lim->zone_write_granularity = 0;
> lim->chunk_sectors = 0;
> lim->features &= ~BLK_FEAT_ZONED;
> return 0;
> }
>
> + if (get_capacity(disk) && dm_has_zone_plugs(t->md)) {
> + if (q->limits.chunk_sectors != lim->chunk_sectors) {
> + DMWARN("%s: device has zone write plug resources. "
> + "Cannot change zone size",
> + disk->disk_name);
> + return -EINVAL;
> + }
> + if (lim->max_hw_zone_append_sectors != 0 &&
> + !dm_table_is_wildcard(t)) {
> + DMWARN("%s: device has zone write plug resources. "
> + "New table must emulate zone append",
> + disk->disk_name);
> + return -EINVAL;
> + }
> + }
I have some concerns about this: what if the new table has the same capacity
and the same zone size but the types of zones changed ? We are not checking
this here and we cannot actually check that without doing a report zones. So I
really think we should just use the big hammer here and simply only allow the
wildcard target and no other change.
> /*
> * Warn once (when the capacity is not yet set) if the mapped device is
> * partially using zone resources of the target devices as that leads to
> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
> index 292414da871d..240f6dab8dda 100644
> --- a/drivers/md/dm.c
> +++ b/drivers/md/dm.c
> @@ -2429,6 +2429,12 @@ static struct dm_table *__bind(struct mapped_device
> *md, struct dm_table *t,
> size = dm_table_get_size(t);
>
> old_size = dm_get_size(md);
> +
> + if (!dm_table_supports_size_change(t, old_size, size)) {
> + old_map = ERR_PTR(-EINVAL);
> + goto out;
> + }
And I guess we could probably move the "wildcard-only is allowed" change check
here as that would further simplify the revalidation code. No ?
> +
> set_capacity(md->disk, size);
>
> ret = dm_table_set_restrictions(t, md->queue, limits);
> diff --git a/drivers/md/dm.h b/drivers/md/dm.h
> index e5d3a9f46a91..245f52b59215 100644
> --- a/drivers/md/dm.h
> +++ b/drivers/md/dm.h
> @@ -58,6 +58,7 @@ void dm_table_event_callback(struct dm_table *t,
> void (*fn)(void *), void *context);
> struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
> bool dm_table_has_no_data_devices(struct dm_table *table);
> +bool dm_table_is_wildcard(struct dm_table *t);
> int dm_calculate_queue_limits(struct dm_table *table,
> struct queue_limits *limits);
> int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
> @@ -72,6 +73,8 @@ struct target_type
> *dm_table_get_immutable_target_type(struct dm_table *t);
> struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
> struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
> bool dm_table_request_based(struct dm_table *t);
> +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size,
> + sector_t new_size);
>
> void dm_lock_md_type(struct mapped_device *md);
> void dm_unlock_md_type(struct mapped_device *md);
> @@ -111,12 +114,14 @@ bool dm_is_zone_write(struct mapped_device *md, struct
> bio *bio);
> int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
> sector_t sector, unsigned int nr_zones,
> unsigned long *need_reset);
> +#define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL)
> #else
> #define dm_blk_report_zones NULL
> static inline bool dm_is_zone_write(struct mapped_device *md, struct bio
> *bio)
> {
> return false;
> }
> +#define dm_has_zone_plugs(md) false
> #endif
>
> /*
--
Damien Le Moal
Western Digital Research