Re: [PATCH 01/11] nvme: provide optimized poll function for separate poll queues

2018-11-16 Thread Jens Axboe
On 11/16/18 1:35 AM, Christoph Hellwig wrote:
> On Thu, Nov 15, 2018 at 12:51:25PM -0700, Jens Axboe wrote:
>> If we have separate poll queues, we know that they aren't using
>> interrupts. Hence we don't need to disable interrupts around
>> finding completions.
>>
>> Provide a separate set of blk_mq_ops for such devices.
> 
> This looks ok, but I'd prefer if we could offer to just support
> polling with the separate queue.  That way we get ourselves out of
> all kinds of potential races of the interrupt path vs poll path.

As Keith mentioned, we do use polling to find missing completions
in case of timeouts. And that has actually been really useful.
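
Roughly, the timeout handler leans on the poll path today before it
escalates to abort/reset; a paraphrased sketch (details elided, not
part of this patch):

	/*
	 * Paraphrased sketch of the existing timeout path: before
	 * aborting or resetting, poll the CQ once in case the
	 * completion is already there and the interrupt was missed.
	 */
	static enum blk_eh_timer_return nvme_timeout(struct request *req,
						     bool reserved)
	{
		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
		struct nvme_queue *nvmeq = iod->nvmeq;

		/* Did we just miss an interrupt? */
		if (__nvme_poll(nvmeq, req->tag)) {
			dev_warn(nvmeq->dev->ctrl.device,
				 "I/O %d QID %d timeout, completion polled\n",
				 req->tag, nvmeq->qid);
			return BLK_EH_DONE;
		}

		/* ... abort/reset handling follows ... */
		return BLK_EH_RESET_TIMER;
	}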

I'd rather keep such a change separate. If we do go down that
route, then there are more optimizations we can make.

Finally, let's not forget that polling is/was still a win even when
we did trigger interrupts. That's how NVMe has worked since polling
was first introduced. While the newer stuff is a lot more efficient,
I don't think we should totally abandon that easy opt-in to polling
for existing hardware unless we have strong reasons to do so.
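
For comparison, polling an interrupt-driven queue has to keep that
queue's IRQ handler out, so the CQ lock is taken with interrupts
disabled. Roughly this (paraphrased, not part of this patch):

	/*
	 * Paraphrased sketch of the existing poll on an interrupt-capable
	 * queue: cq_lock is taken with interrupts off because the queue's
	 * IRQ handler can race with the poll path.
	 */
	static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
	{
		u16 start, end;
		bool found;

		if (!nvme_cqe_pending(nvmeq))
			return 0;

		spin_lock_irq(&nvmeq->cq_lock);
		found = nvme_process_cq(nvmeq, &start, &end, tag);
		spin_unlock_irq(&nvmeq->cq_lock);

		nvme_complete_cqes(nvmeq, start, end);
		return found;
	}

The poll-only variant in this patch is the same loop minus the
interrupt disabling, since no IRQ ever fires for those queues.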

-- 
Jens Axboe



Re: [PATCH 01/11] nvme: provide optimized poll function for separate poll queues

2018-11-16 Thread Christoph Hellwig
On Thu, Nov 15, 2018 at 12:51:25PM -0700, Jens Axboe wrote:
> If we have separate poll queues, we know that they aren't using
> interrupts. Hence we don't need to disable interrupts around
> finding completions.
> 
> Provide a separate set of blk_mq_ops for such devices.

This looks ok, but I'd prefer if we could offer to just support
polling with the separate queue.  That way we get ourselves out of
all kinds of potential races of the interrupt path vs poll path.


[PATCH 01/11] nvme: provide optimized poll function for separate poll queues

2018-11-15 Thread Jens Axboe
If we have separate poll queues, we know that they aren't using
interrupts. Hence we don't need to disable interrupts around
finding completions.

Provide a separate set of blk_mq_ops for such devices.

Signed-off-by: Jens Axboe 
---
 drivers/nvme/host/pci.c | 45 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ffbab5b01df4..fc7dd49f22fc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1082,6 +1082,23 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
return __nvme_poll(nvmeq, tag);
 }
 
+static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+   struct nvme_queue *nvmeq = hctx->driver_data;
+   u16 start, end;
+   bool found;
+
+   if (!nvme_cqe_pending(nvmeq))
+   return 0;
+
+   spin_lock(&nvmeq->cq_lock);
+   found = nvme_process_cq(nvmeq, &start, &end, tag);
+   spin_unlock(&nvmeq->cq_lock);
+
+   nvme_complete_cqes(nvmeq, start, end);
+   return found;
+}
+
 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 {
struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -1584,17 +1601,25 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
.timeout= nvme_timeout,
 };
 
+#define NVME_SHARED_MQ_OPS \
+   .queue_rq   = nvme_queue_rq,\
+   .rq_flags_to_type   = nvme_rq_flags_to_type,\
+   .complete   = nvme_pci_complete_rq, \
+   .init_hctx  = nvme_init_hctx,   \
+   .init_request   = nvme_init_request,\
+   .map_queues = nvme_pci_map_queues,  \
+   .timeout= nvme_timeout  \
+
 static const struct blk_mq_ops nvme_mq_ops = {
-   .queue_rq   = nvme_queue_rq,
-   .rq_flags_to_type   = nvme_rq_flags_to_type,
-   .complete   = nvme_pci_complete_rq,
-   .init_hctx  = nvme_init_hctx,
-   .init_request   = nvme_init_request,
-   .map_queues = nvme_pci_map_queues,
-   .timeout= nvme_timeout,
+   NVME_SHARED_MQ_OPS,
.poll   = nvme_poll,
 };
 
+static const struct blk_mq_ops nvme_mq_poll_noirq_ops = {
+   NVME_SHARED_MQ_OPS,
+   .poll   = nvme_poll_noirq,
+};
+
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
@@ -2274,7 +2299,11 @@ static int nvme_dev_add(struct nvme_dev *dev)
int ret;
 
if (!dev->ctrl.tagset) {
-   dev->tagset.ops = &nvme_mq_ops;
+   if (!dev->io_queues[NVMEQ_TYPE_POLL])
+           dev->tagset.ops = &nvme_mq_ops;
+   else
+           dev->tagset.ops = &nvme_mq_poll_noirq_ops;
+
dev->tagset.nr_hw_queues = dev->online_queues - 1;
dev->tagset.nr_maps = NVMEQ_TYPE_NR;
dev->tagset.timeout = NVME_IO_TIMEOUT;
-- 
2.17.1