Please add [email protected] to the Cc list for this series,
and also Cc the NVMe subsystem maintainers.

Also, do you have performance numbers for WRR?

On 06/16/2019 03:16 AM, Weiping Zhang wrote:
> Now nvme supports six types of hardware queues:
> poll:         if the io was marked for poll
> wrr_low:      weighted round robin low
> wrr_medium:   weighted round robin medium
> wrr_high:     weighted round robin high
> read:         for reads, if the blkcg's wrr is none and the io is not poll
> default:      for write/flush, if the blkcg's wrr is none and the io is not poll
>
> The read, default and poll submission queues are given medium priority
> by default.
>
> Signed-off-by: Weiping Zhang <[email protected]>
> ---
>   drivers/nvme/host/pci.c   | 195 +++++++++++++++++++++++++++++++++-------------
>   include/linux/interrupt.h |   2 +-
>   2 files changed, 144 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index f562154551ce..ee9f3239f3e7 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -73,16 +73,28 @@ static const struct kernel_param_ops queue_count_ops = {
>       .get = param_get_int,
>   };
>
> -static int write_queues;
> -module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
> -MODULE_PARM_DESC(write_queues,
> -     "Number of queues to use for writes. If not set, reads and writes "
> +static int read_queues;
> +module_param_cb(read_queues, &queue_count_ops, &read_queues, 0644);
> +MODULE_PARM_DESC(read_queues,
> +     "Number of queues to use for reads. If not set, reads and writes "
>       "will share a queue set.");
>
>   static int poll_queues = 0;
>   module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
>   MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
>
> +static int wrr_high_queues = 0;
> +module_param_cb(wrr_high_queues, &queue_count_ops, &wrr_high_queues, 0644);
> +MODULE_PARM_DESC(wrr_high_queues, "Number of queues to use for WRR high.");
> +
> +static int wrr_medium_queues = 0;
> +module_param_cb(wrr_medium_queues, &queue_count_ops, &wrr_medium_queues, 0644);
> +MODULE_PARM_DESC(wrr_medium_queues, "Number of queues to use for WRR medium.");
> +
> +static int wrr_low_queues = 0;
> +module_param_cb(wrr_low_queues, &queue_count_ops, &wrr_low_queues, 0644);
> +MODULE_PARM_DESC(wrr_low_queues, "Number of queues to use for WRR low.");
> +
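(For anyone trying this out: these are plain module parameters, so something
like "modprobe nvme poll_queues=2 wrr_high_queues=4 wrr_medium_queues=2
wrr_low_queues=2" should exercise all the new sets; counts beyond the
available interrupt vectors get clamped in nvme_calc_irq_sets() below.)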
>   struct nvme_dev;
>   struct nvme_queue;
>
> @@ -226,9 +238,17 @@ struct nvme_iod {
>       struct scatterlist *sg;
>   };
>
> +static inline bool nvme_is_enable_wrr(struct nvme_dev *dev)
> +{
> +     return dev->io_queues[HCTX_TYPE_WRR_LOW] +
> +             dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
> +             dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
> +}
> +
>   static unsigned int max_io_queues(void)
>   {
> -     return num_possible_cpus() + write_queues + poll_queues;
> +     return num_possible_cpus() + read_queues + poll_queues +
> +             wrr_high_queues + wrr_medium_queues + wrr_low_queues;
>   }
>
>   static unsigned int max_queue_count(void)
> @@ -1156,19 +1176,23 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
>   }
>
>   static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
> -                                             struct nvme_queue *nvmeq)
> +                                     struct nvme_queue *nvmeq, int wrr)
>   {
>       struct nvme_ctrl *ctrl = &dev->ctrl;
>       struct nvme_command c;
>       int flags = NVME_QUEUE_PHYS_CONTIG;
>
> -     /*
> -      * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
> -      * set. Since URGENT priority is zeroes, it makes all queues
> -      * URGENT.
> -      */
> -     if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
> -             flags |= NVME_SQ_PRIO_MEDIUM;
> +     if (!nvme_is_enable_wrr(dev)) {
> +             /*
> +              * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
> +              * set. Since URGENT priority is zeroes, it makes all queues
> +              * URGENT.
> +              */
> +             if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
> +                     flags |= NVME_SQ_PRIO_MEDIUM;
> +     } else {
> +             flags |= wrr;
> +     }
>
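(For readers following the priority plumbing: the wrr value OR'd into flags
above lands in the QPRIO field, bits 02:01 of Dword 11 of the Create I/O SQ
command. For reference, the encodings from include/linux/nvme.h:

    NVME_SQ_PRIO_URGENT = (0 << 1)
    NVME_SQ_PRIO_HIGH   = (1 << 1)
    NVME_SQ_PRIO_MEDIUM = (2 << 1)
    NVME_SQ_PRIO_LOW    = (3 << 1)

URGENT being all zero bits is exactly why the NVME_QUIRK_MEDIUM_PRIO_SQ
workaround above exists.)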
>       /*
>        * Note: we (ab)use the fact that the prp fields survive if no data
> @@ -1534,11 +1558,46 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
>       wmb(); /* ensure the first interrupt sees the initialization */
>   }
>
> -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
> +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
>   {
>       struct nvme_dev *dev = nvmeq->dev;
> -     int result;
> +     int start, end, result, wrr;
> +     bool polled = false;
>       u16 vector = 0;
> +     enum hctx_type type;
> +
> +     /* qid 0 is the admin queue; io queue indexes start at 1 */
> +     start = 1;
> +     /* get the hardware context type based on qid */
> +     for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
> +             end = start + dev->io_queues[type] - 1;
> +             if (qid >= start && qid <= end)
> +                     break;
> +             start = end + 1;
> +     }
> +
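(A quick worked example of the qid -> type mapping above, assuming the series
declares the hctx types in the order default, read, wrr_low, wrr_medium,
wrr_high, poll: with io_queues[] = {2, 2, 1, 1, 1, 1}, qids 1-2 map to
DEFAULT, 3-4 to READ, 5/6/7 to WRR_LOW/MEDIUM/HIGH, and 8 to POLL.)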
> +     if (nvme_is_enable_wrr(dev)) {
> +             /* read, poll and default queues get medium priority by default */
> +             switch (type) {
> +             case HCTX_TYPE_POLL:
> +                     polled = true;
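> +                     /* fall through */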
> +             case HCTX_TYPE_DEFAULT:
> +             case HCTX_TYPE_READ:
> +             case HCTX_TYPE_WRR_MEDIUM:
> +                     wrr = NVME_SQ_PRIO_MEDIUM;
> +                     break;
> +             case HCTX_TYPE_WRR_LOW:
> +                     wrr = NVME_SQ_PRIO_LOW;
> +                     break;
> +             case HCTX_TYPE_WRR_HIGH:
> +                     wrr = NVME_SQ_PRIO_HIGH;
> +                     break;
> +             default:
> +                     return -EINVAL;
> +             }
> +     } else {
> +             wrr = 0;
> +     }
>
>       clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
>
> @@ -1555,7 +1614,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
>       if (result)
>               return result;
>
> -     result = adapter_alloc_sq(dev, qid, nvmeq);
> +     result = adapter_alloc_sq(dev, qid, nvmeq, wrr);
>       if (result < 0)
>               return result;
>       else if (result)
> @@ -1726,7 +1785,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
>
>   static int nvme_create_io_queues(struct nvme_dev *dev)
>   {
> -     unsigned i, max, rw_queues;
> +     unsigned i, max;
>       int ret = 0;
>
>       for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
> @@ -1737,17 +1796,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
>       }
>
>       max = min(dev->max_qid, dev->ctrl.queue_count - 1);
> -     if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
> -             rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
> -                             dev->io_queues[HCTX_TYPE_READ];
> -     } else {
> -             rw_queues = max;
> -     }
>
>       for (i = dev->online_queues; i <= max; i++) {
> -             bool polled = i > rw_queues;
> -
> -             ret = nvme_create_queue(&dev->queues[i], i, polled);
> +             ret = nvme_create_queue(&dev->queues[i], i);
>               if (ret)
>                       break;
>       }
> @@ -2028,35 +2079,73 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
>   static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
>   {
>       struct nvme_dev *dev = affd->priv;
> -     unsigned int nr_read_queues;
> +     unsigned int nr_total, nr, nr_read, nr_default;
> +     unsigned int nr_wrr_high, nr_wrr_medium, nr_wrr_low;
> +     unsigned int nr_sets;
>
>       /*
>        * If there is no interrupt available for queues, ensure that
>        * the default queue is set to 1. The affinity set size is
>        * also set to one, but the irq core ignores it for this case.
> -      *
> -      * If only one interrupt is available or 'write_queue' == 0, combine
> -      * write and read queues.
> -      *
> -      * If 'write_queues' > 0, ensure it leaves room for at least one read
> -      * queue.
>        */
> -     if (!nrirqs) {
> +     if (!nrirqs)
>               nrirqs = 1;
> -             nr_read_queues = 0;
> -     } else if (nrirqs == 1 || !write_queues) {
> -             nr_read_queues = 0;
> -     } else if (write_queues >= nrirqs) {
> -             nr_read_queues = 1;
> -     } else {
> -             nr_read_queues = nrirqs - write_queues;
> -     }
>
> -     dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -     affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -     dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
> -     affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
> -     affd->nr_sets = nr_read_queues ? 2 : 1;
> +     nr_total = nrirqs;
> +
> +     nr_read = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
> +
> +     /* set default to 1; all leftover queues are added back to default at the end */
> +     nr = nr_default = 1;
> +     nr_sets = 1;
> +
> +     nr_total -= nr;
> +     if (!nr_total)
> +             goto done;
> +
> +     /* read queues */
> +     nr_sets++;
> +     nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
> +     nr_total -= nr;
> +     if (!nr_total)
> +             goto done;
> +
> +     /* wrr low queues */
> +     nr_sets++;
> +     nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
> +     nr_total -= nr;
> +     if (!nr_total)
> +             goto done;
> +
> +     /* wrr medium queues */
> +     nr_sets++;
> +     nr_wrr_medium = nr = wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
> +     nr_total -= nr;
> +     if (!nr_total)
> +             goto done;
> +
> +     /* wrr high queues */
> +     nr_sets++;
> +     nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
> +     nr_total -= nr;
> +     if (!nr_total)
> +             goto done;
> +
> +     /* give all remaining queues to default */
> +     nr_default += nr_total;
> +
> +done:
> +     dev->io_queues[HCTX_TYPE_DEFAULT] = nr_default;
> +     affd->set_size[HCTX_TYPE_DEFAULT] = nr_default;
> +     dev->io_queues[HCTX_TYPE_READ] = nr_read;
> +     affd->set_size[HCTX_TYPE_READ] = nr_read;
> +     dev->io_queues[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
> +     affd->set_size[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
> +     dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
> +     affd->set_size[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
> +     dev->io_queues[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
> +     affd->set_size[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
> +     affd->nr_sets = nr_sets;
>   }
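(Worked example of the cascade above: with nrirqs = 8, read_queues = 2,
wrr_low_queues = 2, wrr_medium_queues = 2 and wrr_high_queues = 4, default
first takes 1 vector, read takes 2, wrr_low 2, wrr_medium 2, and wrr_high is
clamped to the single leftover vector, giving 1/2/2/2/1 with nr_sets = 5.
Only when every class is fully satisfied does the remainder flow back into
default.)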
>
>   static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
> @@ -2171,10 +2260,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>               nvme_suspend_io_queues(dev);
>               goto retry;
>       }
> -     dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
> +     dev_info(dev->ctrl.device, "%d/%d/%d/%d/%d/%d "
> +                     "default/read/poll/wrr_low/wrr_medium/wrr_high 
> queues\n",
>                                       dev->io_queues[HCTX_TYPE_DEFAULT],
>                                       dev->io_queues[HCTX_TYPE_READ],
> -                                     dev->io_queues[HCTX_TYPE_POLL]);
> +                                     dev->io_queues[HCTX_TYPE_POLL],
> +                                     dev->io_queues[HCTX_TYPE_WRR_LOW],
> +                                     dev->io_queues[HCTX_TYPE_WRR_MEDIUM],
> +                                     dev->io_queues[HCTX_TYPE_WRR_HIGH]);
>       return 0;
>   }
>
> @@ -2263,9 +2356,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
>       if (!dev->ctrl.tagset) {
>               dev->tagset.ops = &nvme_mq_ops;
>               dev->tagset.nr_hw_queues = dev->online_queues - 1;
> -             dev->tagset.nr_maps = 2; /* default + read */
> -             if (dev->io_queues[HCTX_TYPE_POLL])
> -                     dev->tagset.nr_maps++;
> +             dev->tagset.nr_maps = HCTX_MAX_TYPES;
>               dev->tagset.timeout = NVME_IO_TIMEOUT;
>               dev->tagset.numa_node = dev_to_node(dev->dev);
>               dev->tagset.queue_depth =
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index c7eef32e7739..ea726c2f95cc 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -259,7 +259,7 @@ struct irq_affinity_notify {
>       void (*release)(struct kref *ref);
>   };
>
> -#define      IRQ_AFFINITY_MAX_SETS  4
> +#define      IRQ_AFFINITY_MAX_SETS  7
>
>   /**
>    * struct irq_affinity - Description for automatic irq affinity assignments
>
