On 2020/05/31 22:06, Hannes Reinecke wrote:
> On 5/31/20 11:10 AM, Damien Le Moal wrote:
>> On Fri, 2020-05-29 at 19:39 +0200, Hannes Reinecke wrote:
>>> Remove the hard-coded limit of two devices and support an unlimited
>>> number of additional zoned devices.
>>> With that we need to increase the device-mapper version number to
>>> 3.0.0 as we've modified the interface.
>>>
>>> Signed-off-by: Hannes Reinecke <[email protected]>
>>> ---
>>>   drivers/md/dm-zoned-metadata.c |  15 +++++-
>>>   drivers/md/dm-zoned-target.c   | 106 
>>> ++++++++++++++++++++++++-----------------
>>>   2 files changed, 75 insertions(+), 46 deletions(-)
>>>
>>> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
>>> index 044c152eb756..221163ae5f68 100644
>>> --- a/drivers/md/dm-zoned-metadata.c
>>> +++ b/drivers/md/dm-zoned-metadata.c
>>> @@ -1523,7 +1523,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>>>              */
>>>             zmd->sb[0].zone = dmz_get(zmd, 0);
>>>   
>>> -           zoned_dev = &zmd->dev[1];
>>> +           for (i = 1; i < zmd->nr_devs; i++) {
>>> +                   zoned_dev = &zmd->dev[i];
>>> +
>>> +                   ret = blkdev_report_zones(zoned_dev->bdev, 0,
>>> +                                             BLK_ALL_ZONES,
>>> +                                             dmz_init_zone, zoned_dev);
>>> +                   if (ret < 0) {
>>> +                           DMDEBUG("(%s): Failed to report zones, error 
>>> %d",
>>> +                                   zmd->devname, ret);
>>> +                           dmz_drop_zones(zmd);
>>> +                           return ret;
>>> +                   }
>>> +           }
>>> +           return 0;
>>>     }
>>>   
>>>     /*
>>> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
>>> index aa3d26d16441..4a51738d4b0d 100644
>>> --- a/drivers/md/dm-zoned-target.c
>>> +++ b/drivers/md/dm-zoned-target.c
>>> @@ -13,8 +13,6 @@
>>>   
>>>   #define DMZ_MIN_BIOS              8192
>>>   
>>> -#define DMZ_MAX_DEVS               2
>>> -
>>>   /*
>>>    * Zone BIO context.
>>>    */
>>> @@ -40,10 +38,10 @@ struct dm_chunk_work {
>>>    * Target descriptor.
>>>    */
>>>   struct dmz_target {
>>> -   struct dm_dev           *ddev[DMZ_MAX_DEVS];
>>> +   struct dm_dev           **ddev;
>>>     unsigned int            nr_ddevs;
>>>   
>>> -   unsigned long           flags;
>>> +   unsigned int            flags;
>>>   
>>>     /* Zoned block device information */
>>>     struct dmz_dev          *dev;
>>> @@ -764,7 +762,7 @@ static void dmz_put_zoned_device(struct dm_target *ti)
>>>     struct dmz_target *dmz = ti->private;
>>>     int i;
>>>   
>>> -   for (i = 0; i < DMZ_MAX_DEVS; i++) {
>>> +   for (i = 0; i < dmz->nr_ddevs; i++) {
>>>             if (dmz->ddev[i]) {
>>>                     dm_put_device(ti, dmz->ddev[i]);
>>>                     dmz->ddev[i] = NULL;
>>> @@ -777,21 +775,35 @@ static int dmz_fixup_devices(struct dm_target *ti)
>>>     struct dmz_target *dmz = ti->private;
>>>     struct dmz_dev *reg_dev, *zoned_dev;
>>>     struct request_queue *q;
>>> +   sector_t zone_nr_sectors = 0;
>>> +   int i;
>>>   
>>>     /*
>>> -    * When we have two devices, the first one must be a regular block
>>> -    * device and the second a zoned block device.
>>> +    * When we have more than one device, the first one must be a
>>> +    * regular block device and the others zoned block devices.
>>>      */
>>> -   if (dmz->ddev[0] && dmz->ddev[1]) {
>>> +   if (dmz->nr_ddevs > 1) {
>>>             reg_dev = &dmz->dev[0];
>>>             if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
>>>                     ti->error = "Primary disk is not a regular device";
>>>                     return -EINVAL;
>>>             }
>>> -           zoned_dev = &dmz->dev[1];
>>> -           if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
>>> -                   ti->error = "Secondary disk is not a zoned device";
>>> -                   return -EINVAL;
>>> +           for (i = 1; i < dmz->nr_ddevs; i++) {
>>> +                   zoned_dev = &dmz->dev[i];
>>> +                   if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
>>> +                           ti->error = "Secondary disk is not a zoned 
>>> device";
>>> +                           return -EINVAL;
>>> +                   }
>>> +                   q = bdev_get_queue(zoned_dev->bdev);
>>> +                   if (zone_nr_sectors &&
>>> +                       zone_nr_sectors != blk_queue_zone_sectors(q)) {
>>> +                           ti->error = "Zone nr sectors mismatch";
>>> +                           return -EINVAL;
>>> +                   }
>>> +                   zone_nr_sectors = blk_queue_zone_sectors(q);
>>> +                   zoned_dev->zone_nr_sectors = zone_nr_sectors;
>>> +                   zoned_dev->nr_zones =
>>> +                           blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>>             }
>>>     } else {
>>>             reg_dev = NULL;
>>> @@ -800,17 +812,24 @@ static int dmz_fixup_devices(struct dm_target *ti)
>>>                     ti->error = "Disk is not a zoned device";
>>>                     return -EINVAL;
>>>             }
>>> +           q = bdev_get_queue(zoned_dev->bdev);
>>> +           zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
>>> +           zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>>     }
>>> -   q = bdev_get_queue(zoned_dev->bdev);
>>> -   zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
>>> -   zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>>   
>>>     if (reg_dev) {
>>> -           reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors;
>>> +           sector_t zone_offset;
>>> +
>>> +           reg_dev->zone_nr_sectors = zone_nr_sectors;
>>>             reg_dev->nr_zones =
>>>                     DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
>>>                                           reg_dev->zone_nr_sectors);
>>> -           zoned_dev->zone_offset = reg_dev->nr_zones;
>>> +           reg_dev->zone_offset = 0;
>>> +           zone_offset = reg_dev->nr_zones;
>>> +           for (i = 1; i < dmz->nr_ddevs; i++) {
>>> +                   dmz->dev[i].zone_offset = zone_offset;
>>> +                   zone_offset += dmz->dev[i].nr_zones;
>>> +           }
>>>     }
>>>     return 0;
>>>   }
>>> @@ -824,7 +843,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int 
>>> argc, char **argv)
>>>     int ret, i;
>>>   
>>>     /* Check arguments */
>>> -   if (argc < 1 || argc > 2) {
>>> +   if (argc < 1) {
>>>             ti->error = "Invalid argument count";
>>>             return -EINVAL;
>>>     }
>>> @@ -835,32 +854,31 @@ static int dmz_ctr(struct dm_target *ti, unsigned int 
>>> argc, char **argv)
>>>             ti->error = "Unable to allocate the zoned target descriptor";
>>>             return -ENOMEM;
>>>     }
>>> -   dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL);
>>> +   dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
>>>     if (!dmz->dev) {
>>>             ti->error = "Unable to allocate the zoned device descriptors";
>>>             kfree(dmz);
>>>             return -ENOMEM;
>>>     }
>>> +   dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
>>> +   if (!dmz->ddev) {
>>> +           ti->error = "Unable to allocate the dm device descriptors";
>>> +           ret = -ENOMEM;
>>> +           goto err;
>>> +   }
>>>     dmz->nr_ddevs = argc;
>>> +
>>>     ti->private = dmz;
>>>   
>>>     /* Get the target zoned block device */
>>> -   ret = dmz_get_zoned_device(ti, argv[0], 0, argc);
>>> -   if (ret)
>>> -           goto err;
>>> -
>>> -   if (argc == 2) {
>>> -           ret = dmz_get_zoned_device(ti, argv[1], 1, argc);
>>> -           if (ret) {
>>> -                   dmz_put_zoned_device(ti);
>>> -                   goto err;
>>> -           }
>>> +   for (i = 0; i < argc; i++) {
>>> +           ret = dmz_get_zoned_device(ti, argv[i], i, argc);
>>> +           if (ret)
>>> +                   goto err_dev;
>>>     }
>>>     ret = dmz_fixup_devices(ti);
>>> -   if (ret) {
>>> -           dmz_put_zoned_device(ti);
>>> -           goto err;
>>> -   }
>>> +   if (ret)
>>> +           goto err_dev;
>>>   
>>>     /* Initialize metadata */
>>>     ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
>>> @@ -1056,13 +1074,13 @@ static int dmz_iterate_devices(struct dm_target *ti,
>>>     struct dmz_target *dmz = ti->private;
>>>     unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
>>>     sector_t capacity;
>>> -   int r;
>>> +   int i, r;
>>>   
>>> -   capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1);
>>> -   r = fn(ti, dmz->ddev[0], 0, capacity, data);
>>> -   if (!r && dmz->ddev[1]) {
>>> -           capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1);
>>> -           r = fn(ti, dmz->ddev[1], 0, capacity, data);
>>> +   for (i = 0; i < dmz->nr_ddevs; i++) {
>>> +           capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
>>> +           r = fn(ti, dmz->ddev[i], 0, capacity, data);
>>> +           if (r)
>>> +                   break;
>>>     }
>>>     return r;
>>>   }
>>> @@ -1083,9 +1101,7 @@ static void dmz_status(struct dm_target *ti, 
>>> status_type_t type,
>>>                    dmz_nr_zones(dmz->metadata),
>>>                    dmz_nr_unmap_cache_zones(dmz->metadata),
>>>                    dmz_nr_cache_zones(dmz->metadata));
>>> -           for (i = 0; i < DMZ_MAX_DEVS; i++) {
>>> -                   if (!dmz->ddev[i])
>>> -                           continue;
>>> +           for (i = 0; i < dmz->nr_ddevs; i++) {
>>>                     /*
>>>                      * For a multi-device setup the first device
>>>                      * contains only cache zones.
>>> @@ -1104,8 +1120,8 @@ static void dmz_status(struct dm_target *ti, 
>>> status_type_t type,
>>>             dev = &dmz->dev[0];
>>>             format_dev_t(buf, dev->bdev->bd_dev);
>>>             DMEMIT("%s", buf);
>>> -           if (dmz->dev[1].bdev) {
>>> -                   dev = &dmz->dev[1];
>>> +           for (i = 1; i < dmz->nr_ddevs; i++) {
>>> +                   dev = &dmz->dev[i];
>>>                     format_dev_t(buf, dev->bdev->bd_dev);
>>>                     DMEMIT(" %s", buf);
>>>             }
>>> @@ -1133,7 +1149,7 @@ static int dmz_message(struct dm_target *ti, unsigned 
>>> int argc, char **argv,
>>>   
>>>   static struct target_type dmz_type = {
>>>     .name            = "zoned",
>>> -   .version         = {2, 0, 0},
>>> +   .version         = {3, 0, 0},
>>>     .features        = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
>>>     .module          = THIS_MODULE,
>>>     .ctr             = dmz_ctr,
>>
>> Looks all good to me, but thinking more about it, don't we need to add
>> a device index in the super blocks? The reason is that if the drive
>> configuration changes between stop/start (drives removed, added or
>> changed slots), the drive names will change and while the userspace
>> will still be able to find the group of drives constituting the target
>> (using the UUID), there is no obvious way to find out what the original
>> drive order was. Since the kernel side relies on the drives being passed
>> to the ctr function in the order of the mapping, we need to preserve
>> that. Or change also the kernel side to use the index in the super
>> block to put each drive in its correct dmz->dev[] slot.
>>
> Already taken care of; here's where the tertiary superblocks come in.
> Each superblock carries its own position (in the 'sb_block' field).
> This is the _absolute_ position within the entire setup, not the
> relative per-device block number.
> And it also has the absolute number of blocks in the 'nr_chunks' field.
> 
> Hence we know exactly where this superblock (and, by implication, the
> zones following this superblock) should end up. And we know how large
> the entire setup will be. So we can insert the superblock at the right
> position and then check if we have enough zones for the entire
> device.

I do not get it though. Where is that checked? At least in this patch, drives
are initialized in the order of the ctr arguments, and this loop:

+               for (i = 1; i < dmz->nr_ddevs; i++) {
+                       dmz->dev[i].zone_offset = zone_offset;
+                       zone_offset += dmz->dev[i].nr_zones;
+               }

in dmz_fixup_devices() sets the zone offset for each device in the same order.
So for a given chunk mapped to a zone identified by its ID, if the device order
changes, the zone ID will change and the chunk will not be mapped to the correct
zone. What am I missing here?


> 
> Not sure if dmzadm does it, though; but it should be easy enough to
> implement.
> 
> Cheers,
> 
> Hannes
> 


-- 
Damien Le Moal
Western Digital Research



--
dm-devel mailing list
[email protected]
https://www.redhat.com/mailman/listinfo/dm-devel

Reply via email to