Now the nvme driver supports six types of hardware queue:

poll:           for IO marked for polling
wrr_low:        weighted round robin, low priority
wrr_medium:     weighted round robin, medium priority
wrr_high:       weighted round robin, high priority
read:           for reads, when the blkcg's WRR is none and the IO is not polled
default:        for writes/flushes, when the blkcg's WRR is none and the IO is not polled

The submission queues of the read, default and poll types are given
medium priority by default.
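
As an illustration (the queue counts below are made up, not a
recommendation), the new sets can be sized via module parameters at
load time:

    modprobe nvme read_queues=2 poll_queues=2 \
            wrr_high_queues=1 wrr_medium_queues=2 wrr_low_queues=1

Interrupt vectors are distributed to the non-poll sets in the order
default (at least one), read, wrr_low, wrr_medium, wrr_high, and any
remainder goes back to default. With 8 vectors and the parameters
above, for example:

    default  = 1            /* 7 vectors left */
    read     = min(2, 7)    /* 5 left */
    wrr_low  = min(1, 5)    /* 4 left */
    wrr_med  = min(2, 4)    /* 2 left */
    wrr_high = min(1, 2)    /* 1 left */
    default += 1            /* remainder */

for a final split of 2 default, 2 read, 1 wrr_low, 2 wrr_medium and
1 wrr_high.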

Signed-off-by: Weiping Zhang <[email protected]>
---
 drivers/nvme/host/pci.c   | 195 +++++++++++++++++++++++++++++++++-------------
 include/linux/interrupt.h |   2 +-
 2 files changed, 144 insertions(+), 53 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f562154551ce..ee9f3239f3e7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -73,16 +73,28 @@ static const struct kernel_param_ops queue_count_ops = {
        .get = param_get_int,
 };
 
-static int write_queues;
-module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
-MODULE_PARM_DESC(write_queues,
-       "Number of queues to use for writes. If not set, reads and writes "
+static int read_queues;
+module_param_cb(read_queues, &queue_count_ops, &read_queues, 0644);
+MODULE_PARM_DESC(read_queues,
+       "Number of queues to use for reads. If not set, reads and writes "
        "will share a queue set.");
 
 static int poll_queues = 0;
 module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
 MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
 
+static int wrr_high_queues = 0;
+module_param_cb(wrr_high_queues, &queue_count_ops, &wrr_high_queues, 0644);
+MODULE_PARM_DESC(wrr_high_queues, "Number of queues to use for WRR high.");
+
+static int wrr_medium_queues = 0;
+module_param_cb(wrr_medium_queues, &queue_count_ops, &wrr_medium_queues, 0644);
+MODULE_PARM_DESC(wrr_medium_queues, "Number of queues to use for WRR medium.");
+
+static int wrr_low_queues = 0;
+module_param_cb(wrr_low_queues, &queue_count_ops, &wrr_low_queues, 0644);
+MODULE_PARM_DESC(wrr_low_queues, "Number of queues to use for WRR low.");
+
 struct nvme_dev;
 struct nvme_queue;
 
@@ -226,9 +238,17 @@ struct nvme_iod {
        struct scatterlist *sg;
 };
 
+static inline bool nvme_is_enable_wrr(struct nvme_dev *dev)
+{
+       return dev->io_queues[HCTX_TYPE_WRR_LOW] +
+               dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
+               dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
+}
+
 static unsigned int max_io_queues(void)
 {
-       return num_possible_cpus() + write_queues + poll_queues;
+       return num_possible_cpus() + read_queues + poll_queues +
+               wrr_high_queues + wrr_medium_queues + wrr_low_queues;
 }
 
 static unsigned int max_queue_count(void)
@@ -1156,19 +1176,23 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
-                                               struct nvme_queue *nvmeq)
+                                       struct nvme_queue *nvmeq, int wrr)
 {
        struct nvme_ctrl *ctrl = &dev->ctrl;
        struct nvme_command c;
        int flags = NVME_QUEUE_PHYS_CONTIG;
 
-       /*
-        * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
-        * set. Since URGENT priority is zeroes, it makes all queues
-        * URGENT.
-        */
-       if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
-               flags |= NVME_SQ_PRIO_MEDIUM;
+       if (!nvme_is_enable_wrr(dev)) {
+               /*
+                * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
+                * set. Since URGENT priority is zeroes, it makes all queues
+                * URGENT.
+                */
+               if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
+                       flags |= NVME_SQ_PRIO_MEDIUM;
+       } else {
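+               /* wrr carries the NVME_SQ_PRIO_* flag selected by the caller */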
+               flags |= wrr;
+       }
 
        /*
         * Note: we (ab)use the fact that the prp fields survive if no data
@@ -1534,11 +1558,46 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
        wmb(); /* ensure the first interrupt sees the initialization */
 }
 
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 {
        struct nvme_dev *dev = nvmeq->dev;
-       int result;
+       int start, end, result, wrr;
+       bool polled = false;
        u16 vector = 0;
+       enum hctx_type type;
+
+       /* qid 0 is the admin queue; io queues are indexed from 1 */
+       start = 1;
+       /* get the hardware context type based on qid */
+       for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
+               end = start + dev->io_queues[type] - 1;
+               if (qid >= start && qid <= end)
+                       break;
+               start = end + 1;
+       }
+
+       if (nvme_is_enable_wrr(dev)) {
+               /* read, poll and default queues use medium priority by default */
+               switch (type) {
+               case HCTX_TYPE_POLL:
+                       polled = true;
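+                       /* fall through */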
+               case HCTX_TYPE_DEFAULT:
+               case HCTX_TYPE_READ:
+               case HCTX_TYPE_WRR_MEDIUM:
+                       wrr = NVME_SQ_PRIO_MEDIUM;
+                       break;
+               case HCTX_TYPE_WRR_LOW:
+                       wrr = NVME_SQ_PRIO_LOW;
+                       break;
+               case HCTX_TYPE_WRR_HIGH:
+                       wrr = NVME_SQ_PRIO_HIGH;
+                       break;
+               default:
+                       return -EINVAL;
+               }
+       } else {
+               wrr = 0;
+       }
 
        clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
@@ -1555,7 +1614,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
        if (result)
                return result;
 
-       result = adapter_alloc_sq(dev, qid, nvmeq);
+       result = adapter_alloc_sq(dev, qid, nvmeq, wrr);
        if (result < 0)
                return result;
        else if (result)
@@ -1726,7 +1785,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
-       unsigned i, max, rw_queues;
+       unsigned i, max;
        int ret = 0;
 
        for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1737,17 +1796,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
        }
 
        max = min(dev->max_qid, dev->ctrl.queue_count - 1);
-       if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
-               rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
-                               dev->io_queues[HCTX_TYPE_READ];
-       } else {
-               rw_queues = max;
-       }
 
        for (i = dev->online_queues; i <= max; i++) {
-               bool polled = i > rw_queues;
-
-               ret = nvme_create_queue(&dev->queues[i], i, polled);
+               ret = nvme_create_queue(&dev->queues[i], i);
                if (ret)
                        break;
        }
@@ -2028,35 +2079,73 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 {
        struct nvme_dev *dev = affd->priv;
-       unsigned int nr_read_queues;
+       unsigned int nr_total, nr, nr_read, nr_default;
+       unsigned int nr_wrr_high, nr_wrr_medium, nr_wrr_low;
+       unsigned int nr_sets;
 
        /*
         * If there is no interrupt available for queues, ensure that
         * the default queue is set to 1. The affinity set size is
         * also set to one, but the irq core ignores it for this case.
-        *
-        * If only one interrupt is available or 'write_queue' == 0, combine
-        * write and read queues.
-        *
-        * If 'write_queues' > 0, ensure it leaves room for at least one read
-        * queue.
         */
-       if (!nrirqs) {
+       if (!nrirqs)
                nrirqs = 1;
-               nr_read_queues = 0;
-       } else if (nrirqs == 1 || !write_queues) {
-               nr_read_queues = 0;
-       } else if (write_queues >= nrirqs) {
-               nr_read_queues = 1;
-       } else {
-               nr_read_queues = nrirqs - write_queues;
-       }
 
-       dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-       affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
-       dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
-       affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
-       affd->nr_sets = nr_read_queues ? 2 : 1;
+       nr_total = nrirqs;
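+       /* poll queues do not use interrupt vectors and are not counted here */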
+
+       nr_read = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
+
+       /* reserve one default queue; leftover vectors are added back to default at the end */
+       nr = nr_default = 1;
+       nr_sets = 1;
+
+       nr_total -= nr;
+       if (!nr_total)
+               goto done;
+
+       /* read queues */
+       nr_sets++;
+       nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
+       nr_total -= nr;
+       if (!nr_total)
+               goto done;
+
+       /* wrr low queues */
+       nr_sets++;
+       nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
+       nr_total -= nr;
+       if (!nr_total)
+               goto done;
+
+       /* wrr medium queues */
+       nr_sets++;
+       nr_wrr_medium = nr = wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
+       nr_total -= nr;
+       if (!nr_total)
+               goto done;
+
+       /* wrr high queues */
+       nr_sets++;
+       nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
+       nr_total -= nr;
+       if (!nr_total)
+               goto done;
+
+       /* assign all remaining vectors to the default set */
+       nr_default += nr_total;
+
+done:
+       dev->io_queues[HCTX_TYPE_DEFAULT] = nr_default;
+       affd->set_size[HCTX_TYPE_DEFAULT] = nr_default;
+       dev->io_queues[HCTX_TYPE_READ] = nr_read;
+       affd->set_size[HCTX_TYPE_READ] = nr_read;
+       dev->io_queues[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+       affd->set_size[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
+       dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+       affd->set_size[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
+       dev->io_queues[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+       affd->set_size[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
+       affd->nr_sets = nr_sets;
 }
 
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
@@ -2171,10 +2260,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
                nvme_suspend_io_queues(dev);
                goto retry;
        }
-       dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+       dev_info(dev->ctrl.device, "%d/%d/%d/%d/%d/%d "
+                       "default/read/poll/wrr_low/wrr_medium/wrr_high queues\n",
                                        dev->io_queues[HCTX_TYPE_DEFAULT],
                                        dev->io_queues[HCTX_TYPE_READ],
-                                       dev->io_queues[HCTX_TYPE_POLL]);
+                                       dev->io_queues[HCTX_TYPE_POLL],
+                                       dev->io_queues[HCTX_TYPE_WRR_LOW],
+                                       dev->io_queues[HCTX_TYPE_WRR_MEDIUM],
+                                       dev->io_queues[HCTX_TYPE_WRR_HIGH]);
        return 0;
 }
 
@@ -2263,9 +2356,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
        if (!dev->ctrl.tagset) {
                dev->tagset.ops = &nvme_mq_ops;
                dev->tagset.nr_hw_queues = dev->online_queues - 1;
-               dev->tagset.nr_maps = 2; /* default + read */
-               if (dev->io_queues[HCTX_TYPE_POLL])
-                       dev->tagset.nr_maps++;
+               dev->tagset.nr_maps = HCTX_MAX_TYPES;
                dev->tagset.timeout = NVME_IO_TIMEOUT;
                dev->tagset.numa_node = dev_to_node(dev->dev);
                dev->tagset.queue_depth =
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..ea726c2f95cc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,7 +259,7 @@ struct irq_affinity_notify {
        void (*release)(struct kref *ref);
 };
 
-#define        IRQ_AFFINITY_MAX_SETS  4
+#define        IRQ_AFFINITY_MAX_SETS  7
 
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
-- 
2.14.1
