On Wed, Nov 15, 2017 at 11:25:12AM +0100, Mariusz Dabrowski wrote:
> Hi all,
> 
> In order to be consistent with pass-through drive behavior, RAID queue
> limits should be calculated so that the minimal I/O, optimal I/O and
> discard granularity sizes are met from a single drive's perspective.
> Currently the MD driver ignores the queue limits reported by the members
> and bases all calculations on the chunk (stripe) size.

We already use blk_stack_limits, which takes care of these.
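
For reference, the relevant stacking rules in blk_stack_limits() are roughly
the following (a simplified sketch of block/blk-settings.c; the real function
also handles alignment, block sizes, max_sectors and more):

        /* simplified excerpt: how blk_stack_limits() combines the top (t)
         * and bottom (b) limits for the fields discussed here */
        t->io_min = max(t->io_min, b->io_min);
        t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
        t->discard_granularity = max(t->discard_granularity,
                                     b->discard_granularity);
        t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
                                              b->max_discard_sectors);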

> I am working on a patch which
> changes this. Since kernel 4.13, drives which support streams (write hints)
> may report a discard_alignment or discard_granularity bigger than the array
> stripe (for more details see the NVMe 1.3 specification, chapter 9.3
> "Streams"), so the result of the current calculation is not optimal for
> those drives. For example, for a 4-disk RAID 0 with a chunk size smaller
> than the discard_granularity of the member drives, the discard_granularity
> of the RAID array should be equal to 4 * member_discard_granularity.
> 
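To make the arithmetic concrete, here is a quick sketch (the numbers are
hypothetical, not taken from the patch):

        /* hypothetical 4-disk RAID 0 whose chunk is smaller than the
         * members' discard granularity */
        unsigned int nr_disks = 4;
        unsigned int member_granularity = 64 * 1024;    /* 64 KiB */
        unsigned int chunk_size = 16 * 1024;            /* 16 KiB */

        /* a whole discard unit then has to span all members */
        unsigned int array_granularity = chunk_size < member_granularity ?
                nr_disks * member_granularity :         /* 256 KiB here */
                chunk_size;
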
> The problem is that for some drives there is a risk that the result of this
> calculation exceeds the unsigned int range. The current implementation of
> nvme_config_discard() in the NVMe driver can suffer from the same problem:
> 
>       if (ctrl->nr_streams && ns->sws && ns->sgs) {
>               unsigned int sz = logical_block_size * ns->sws * ns->sgs;
> 
>               ns->queue->limits.discard_alignment = sz;
>               ns->queue->limits.discard_granularity = sz;
>       }
> 
> One potential solution is to change the type of some queue limits
> (at least discard_granularity and discard_alignment, maybe more, such as
> max_discard_sectors?) from 32 bits to 64 bits.
> 
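To illustrate the overflow risk (hypothetical values, chosen only to show the
wrap-around):

        /* hypothetical stream parameters -- not taken from a real drive */
        unsigned int logical_block_size = 4096;
        unsigned int sws = 1024;        /* stream write size, in blocks     */
        unsigned int sgs = 2048;        /* stream granularity, in SWS units */

        unsigned int sz32 = logical_block_size * sws * sgs; /* 2^33 wraps to 0 */
        u64 sz64 = (u64)logical_block_size * sws * sgs;     /* 8 GiB, correct  */
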
> What are your thoughts on this? Do you expect any problems with changing
> this in the block layer? Would you be willing to make such a change?
> 
> Signed-off-by: Mariusz Dabrowski <mariusz.dabrow...@intel.com>
> ---
> 
>  drivers/md/md.c     | 24 ++++++++++++++++++++++++
>  drivers/md/md.h     |  1 +
>  drivers/md/raid0.c  | 23 ++++++++++++++++++++---
>  drivers/md/raid10.c | 30 +++++++++++++++++++++++++-----
>  drivers/md/raid5.c  | 25 +++++++++++++++++++------
>  5 files changed, 89 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0ff1bbf6c90e..e0dc114cab19 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -66,6 +66,7 @@
>  #include <linux/raid/md_u.h>
>  #include <linux/slab.h>
>  #include <linux/percpu-refcount.h>
> +#include <linux/lcm.h>
>  
>  #include <trace/events/block.h>
>  #include "md.h"
> @@ -679,6 +680,29 @@ struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
>  }
>  EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
>  
> +void md_rdev_get_queue_limits(struct mddev *mddev, struct queue_limits *limits)
> +{
> +     struct md_rdev *rdev;
> +
> +     memset(limits, 0, sizeof(struct queue_limits));
> +
> +     rdev_for_each(rdev, mddev) {
> +             struct queue_limits *rdev_limits;
> +
> +             rdev_limits = &rdev->bdev->bd_queue->limits;
> +             limits->io_min = max(limits->io_min, rdev_limits->io_min);
> +             limits->io_opt = lcm_not_zero(limits->io_opt,
> +                                           rdev_limits->io_opt);
> +             limits->discard_granularity =
> +                     max(limits->discard_granularity,
> +                         rdev_limits->discard_granularity);
> +             limits->discard_alignment =
> +                     max(limits->discard_alignment,
> +                         rdev_limits->discard_alignment);
> +     }
> +}

Isn't this exactly what blk_stack_limits does?

> +EXPORT_SYMBOL_GPL(md_rdev_get_queue_limits);
> +
>  static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
>  {
>       struct md_rdev *rdev;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index d8287d3cd1bf..5b514b9bb535 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -702,6 +702,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk);
>  extern void md_update_sb(struct mddev *mddev, int force);
>  extern void md_kick_rdev_from_array(struct md_rdev * rdev);
>  struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
> +void md_rdev_get_queue_limits(struct mddev *mddev, struct queue_limits *limits);
>  
>  static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
>  {
> diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
> index 5a00fc118470..f1dcf7fd3eb1 100644
> --- a/drivers/md/raid0.c
> +++ b/drivers/md/raid0.c
> @@ -382,15 +382,31 @@ static int raid0_run(struct mddev *mddev)
>       if (mddev->queue) {
>               struct md_rdev *rdev;
>               bool discard_supported = false;
> +             struct queue_limits limits;
> +             unsigned int io_min, io_opt;
> +             unsigned int granularity, alignment;
> +             unsigned int chunk_size = mddev->chunk_sectors << 9;
>  
> +             md_rdev_get_queue_limits(mddev, &limits);
>               blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
>               blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
>               blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
>               blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
>  
> -             blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
> -             blk_queue_io_opt(mddev->queue,
> -                              (mddev->chunk_sectors << 9) * mddev->raid_disks);
> +             io_min = chunk_size > limits.io_min ?
> +                      chunk_size : limits.io_min * mddev->raid_disks;
> +             io_opt = max(chunk_size, limits.io_min) * mddev->raid_disks;
> +             granularity = chunk_size > limits.discard_granularity ?
> +                           limits.discard_granularity :
> +                           limits.discard_granularity * mddev->raid_disks;
> +             alignment = chunk_size > limits.discard_granularity ?
> +                         limits.discard_alignment :
> +                         limits.discard_alignment * mddev->raid_disks;
> +
> +             blk_queue_io_min(mddev->queue, io_min);
> +             blk_queue_io_opt(mddev->queue, io_opt);
> +             mddev->queue->limits.discard_granularity = granularity;
> +             mddev->queue->limits.discard_alignment = alignment;

We use blk_stack_limits below, so why do we need this change?
  
>               rdev_for_each(rdev, mddev) {
>                       disk_stack_limits(mddev->gendisk, rdev->bdev,
> @@ -398,6 +414,7 @@ static int raid0_run(struct mddev *mddev)
>                       if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
>                               discard_supported = true;
>               }
> +
>               if (!discard_supported)
>                       queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
>               else
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 374df5796649..5ff1635a2551 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -3623,7 +3623,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
>  static int raid10_run(struct mddev *mddev)
>  {
>       struct r10conf *conf;
> -     int i, disk_idx, chunk_size;
> +     int i, disk_idx;
> +     unsigned int chunk_size, io_min, io_opt;
> +     unsigned int granularity, alignment;
> +     struct queue_limits limits;
>       struct raid10_info *disk;
>       struct md_rdev *rdev;
>       sector_t size;
> @@ -3649,16 +3652,33 @@ static int raid10_run(struct mddev *mddev)
>  
>       chunk_size = mddev->chunk_sectors << 9;
>       if (mddev->queue) {
> +             int factor;
> +
> +             md_rdev_get_queue_limits(mddev, &limits);
>               blk_queue_max_discard_sectors(mddev->queue,
>                                             mddev->chunk_sectors);
>               blk_queue_max_write_same_sectors(mddev->queue, 0);
>               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
> -             blk_queue_io_min(mddev->queue, chunk_size);
> +
>               if (conf->geo.raid_disks % conf->geo.near_copies)
> -                     blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
> +                     factor = conf->geo.raid_disks;
>               else
> -                     blk_queue_io_opt(mddev->queue, chunk_size *
> -                                      (conf->geo.raid_disks / conf->geo.near_copies));
> +                     factor = conf->geo.raid_disks / conf->geo.near_copies;
> +
> +             io_min = chunk_size > limits.io_min ?
> +                              chunk_size : limits.io_min * factor;
> +             io_opt = max(chunk_size, limits.io_min) * factor;
> +             granularity = chunk_size > limits.discard_granularity ?
> +                                   limits.discard_granularity :
> +                                   limits.discard_granularity * factor;
> +             alignment = chunk_size > limits.discard_alignment ?
> +                                 limits.discard_alignment :
> +                                 limits.discard_alignment * factor;
> +
> +             blk_queue_io_min(mddev->queue, io_min);
> +             blk_queue_io_opt(mddev->queue, io_opt);
> +             mddev->queue->limits.discard_granularity = granularity;
> +             mddev->queue->limits.discard_alignment = alignment;
>       }
>  
>       rdev_for_each(rdev, mddev) {
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 4fd34ff2e830..adbbe979c363 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7508,7 +7508,10 @@ static int raid5_run(struct mddev *mddev)
>       md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
>  
>       if (mddev->queue) {
> -             int chunk_size;
> +             unsigned int chunk_size, io_min, io_opt;
> +             unsigned int granularity, alignment;
> +             struct queue_limits limits;
> +
>               /* read-ahead size must cover two whole stripes, which
>                * is 2 * (datadisks) * chunksize where 'n' is the
>                * number of raid devices
> @@ -7519,10 +7522,14 @@ static int raid5_run(struct mddev *mddev)
>               if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
>                       mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
>  
> +             md_rdev_get_queue_limits(mddev, &limits);
>               chunk_size = mddev->chunk_sectors << 9;
> -             blk_queue_io_min(mddev->queue, chunk_size);
> -             blk_queue_io_opt(mddev->queue, chunk_size *
> -                              (conf->raid_disks - conf->max_degraded));
> +             io_min = chunk_size > limits.io_min ?
> +                             chunk_size : limits.io_min * data_disks;
> +             io_opt = max(chunk_size, limits.io_min) * data_disks;
> +             blk_queue_io_min(mddev->queue, io_min);
> +             blk_queue_io_opt(mddev->queue, io_opt);
> +
>               mddev->queue->limits.raid_partial_stripes_expensive = 1;
>               /*
>                * We can only discard a whole stripe. It doesn't make sense to
> @@ -7533,8 +7540,14 @@ static int raid5_run(struct mddev *mddev)
>                * currently assumes that */
>               while ((stripe-1) & stripe)
>                       stripe = (stripe | (stripe-1)) + 1;
> -             mddev->queue->limits.discard_alignment = stripe;
> -             mddev->queue->limits.discard_granularity = stripe;
> +             granularity = chunk_size > limits.discard_granularity ?
> +                           stripe :
> +                           limits.discard_granularity * data_disks;
> +             alignment = chunk_size > limits.discard_granularity ?
> +                         stripe :
> +                         limits.discard_alignment * data_disks;
> +             mddev->queue->limits.discard_granularity = granularity;
> +             mddev->queue->limits.discard_alignment = alignment;
>  
>               blk_queue_max_write_same_sectors(mddev->queue, 0);
>               blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
> -- 
> 2.15.0