On 07/14/2014 02:41 PM, Christoph Hellwig wrote:
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+                         unsigned int hctx_idx)
+{
+       struct nvme_dev *dev = data;
+       struct nvme_queue *nvmeq = dev->queues[
+                                       (hctx_idx % dev->queue_count) + 1];
+
+       /* nvmeq queues are shared between namespaces. We assume here that
+        * blk-mq map the tags so they match up with the nvme queue tags */
+       if (!nvmeq->hctx)
+               nvmeq->hctx = hctx;
+       else
+               WARN_ON(nvmeq->hctx->tags != hctx->tags);


This looks wrong to me, as you're overwriting the value of nvmeq->hctx for each
new request_queue.  But only ->tags is ever used from nvmeq->hctx, so you
should rather set up nvmeq->tags in nvme_dev_add.

Ack
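
Something like this in nvme_dev_add(), I think (rough sketch only; nvmeq->tags
would be a new field, and I'm assuming the tag set's tags[] array lines up with
our 1-based queue numbering):

	/* after blk_mq_alloc_tag_set(&dev->tagset) succeeds */
	for (i = 0; i < dev->tagset.nr_hw_queues; i++)
		dev->queues[i + 1]->tags = dev->tagset.tags[i];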


+static int nvme_init_request(void *data, struct request *req,
+                               unsigned int hctx_idx, unsigned int rq_idx,
+                               unsigned int numa_node)
+{
+       struct nvme_dev *dev = data;
+       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+       struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+
+       WARN_ON(!nvmeq);
+       cmd->nvmeq = nvmeq;

Shouldn't this fail instead of the warn_on?

Yes, ack
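
Something like this, then (sketch; returning an error from ->init_request makes
the tag set setup fail instead of continuing with a NULL queue):

	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq)
		return -EINVAL;

	cmd->nvmeq = nvmeq;
	return 0;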


+static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
  {
+       struct nvme_ns *ns = hctx->queue->queuedata;
+       struct nvme_queue *nvmeq = hctx->driver_data;
+       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
        struct nvme_iod *iod;
+       enum dma_data_direction dma_dir;
+       int psegs = req->nr_phys_segments;
+       int result = BLK_MQ_RQ_QUEUE_BUSY;
+       /*
+        * Requeued IO has already been prepped
+        */
+       iod = req->special;
+       if (iod)
+               goto submit_iod;

+       iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
        if (!iod)
+               return result;

So there's still a memory allocation for each request here.  Any reason
this can't be preallocated at least for reasonable sized I/O?

Not at all. I've held off on adding optimizations in this first pass; the
follow-on patches can implement them. Jens already has a patch for this in his
tree, and it also removes GFP_ATOMIC.


No need for GFP_ATOMIC here either, and you probably need a mempool to
guarantee forward progress.
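
Good point on the mempool. Roughly what I have in mind as a follow-on (sketch
only; the field name, element count, and segment cap are made up here):

	/* at probe time, hypothetical dev->iod_mempool */
	dev->iod_mempool = mempool_create_kmalloc_pool(16,
				sizeof(struct nvme_iod) +
				NVME_MAX_SEGS * sizeof(struct scatterlist));
	if (!dev->iod_mempool)
		return -ENOMEM;

	/* in nvme_queue_rq(); no GFP_ATOMIC needed, per your comment */
	iod = mempool_alloc(dev->iod_mempool, GFP_NOIO);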

+       if (req->cmd_flags & REQ_DISCARD) {
                void *range;
                /*
                 * We reuse the small pool to allocate the 16-byte range here
@@ -752,33 +602,53 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
                                                GFP_ATOMIC,
                                                &iod->first_dma);
+               if (!range)
+                       goto finish_cmd;
                iod_list(iod)[0] = (__le64 *)range;
                iod->npages = 0;
        } else if (psegs) {
+               dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+               sg_init_table(iod->sg, psegs);
+               iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
+               if (!iod->nents) {
+                       result = BLK_MQ_RQ_QUEUE_ERROR;
+                       goto finish_cmd;
                }
+
+               if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
+                       goto finish_cmd;
+
+               if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
+                                               blk_rq_bytes(req), GFP_ATOMIC))
+                       goto finish_cmd;
+       }

Would be nice to factor these two into helpers, which could also avoid
the submit_iod goto.

Agreed. q_suspended probably isn't necessary any more, but I'd like to wait
with that until this has gone upstream, to keep the patch flow simple.
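
Roughly, the two branches could become helpers along these lines (names are
placeholders), which also gets rid of the submit_iod label:

	static int nvme_setup_discard_range(struct nvme_queue *nvmeq,
					    struct nvme_iod *iod)
	{
		void *range;

		/* reuse the small pool for the 16-byte discard range */
		range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
				       &iod->first_dma);
		if (!range)
			return BLK_MQ_RQ_QUEUE_BUSY;
		iod_list(iod)[0] = (__le64 *)range;
		iod->npages = 0;
		return 0;
	}

	static int nvme_map_rq(struct nvme_queue *nvmeq, struct request *req,
			       struct nvme_iod *iod)
	{
		enum dma_data_direction dma_dir = rq_data_dir(req) ?
					DMA_TO_DEVICE : DMA_FROM_DEVICE;

		sg_init_table(iod->sg, req->nr_phys_segments);
		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
		if (!iod->nents)
			return BLK_MQ_RQ_QUEUE_ERROR;
		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
			return BLK_MQ_RQ_QUEUE_BUSY;
		if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
						blk_rq_bytes(req), GFP_ATOMIC))
			return BLK_MQ_RQ_QUEUE_BUSY;
		return 0;
	}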


+
+       if (req->cmd_flags & REQ_DISCARD) {
+               nvme_submit_discard(nvmeq, ns, req, iod);
+               goto queued;
+       }
+
+       if (req->cmd_flags & REQ_FLUSH) {
+               nvme_submit_flush(nvmeq, ns, req->tag);
+               goto queued;
        }
-       return 0;

+       nvme_submit_iod(nvmeq, iod, ns);
+ queued:

A simple

        if (req->cmd_flags & REQ_DISCARD)
                nvme_submit_discard(nvmeq, ns, req, iod);
        else if (req->cmd_flags & REQ_FLUSH)
                nvme_submit_flush(nvmeq, ns, req->tag);
        else
                nvme_submit_iod(nvmeq, iod, ns);

seems preferable here.

Ack


+static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
  {
+       struct nvme_queue *nvmeq = data;
+       struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+       unsigned int tag = 0;

+       tag = 0;
+       do {
+               struct request *req;
                void *ctx;
                nvme_completion_fn fn;
+               struct nvme_cmd_info *cmd;
                static struct nvme_completion cqe = {
                        .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
                };
+               int qdepth = nvmeq == nvmeq->dev->queues[0] ?
+                                       nvmeq->dev->admin_tagset.queue_depth :
+                                       nvmeq->dev->tagset.queue_depth;

+               /* zero'd bits are free tags */
+               tag = find_next_zero_bit(tag_map, qdepth, tag);
+               if (tag >= qdepth)
+                       break;
+
+               req = blk_mq_tag_to_rq(hctx->tags, tag++);
+               cmd = blk_mq_rq_to_pdu(req);

Seems like blk-mq would make your life easier by exporting an iterator
that goes over each in-use request instead of the current
blk_mq_tag_busy_iter prototype.  blk_mq_timeout_check would also be able
to make use of that, so maybe that would be a good preparatory patch?

Yes. I'll prepare a patch and send it off to Jens.
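
Something like this prototype, perhaps (name and signature are only a guess):

	typedef void (busy_rq_iter_fn)(struct request *rq, void *data);

	void blk_mq_tag_busy_rq_iter(struct blk_mq_tags *tags,
				     busy_rq_iter_fn *fn, void *data);

nvme_cancel_queue_ios() would then shrink to just the per-request body, without
the find_next_zero_bit()/queue_depth bookkeeping above.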


+static enum blk_eh_timer_return nvme_timeout(struct request *req)
  {
+       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+       struct nvme_queue *nvmeq = cmd->nvmeq;

+       dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
+                                                       nvmeq->qid);
+       if (nvmeq->dev->initialized)
+               nvme_abort_req(req);

+       return BLK_EH_RESET_TIMER;
+}

Aborting a request but then resetting the timer looks wrong to me.

If that's indeed the intended behavior please add a comment explaining
it.

Ack
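
Maybe something along these lines (wording is only a suggestion):

	/*
	 * The abort is issued asynchronously; the request gets completed
	 * (or the controller reset) from the abort path, so keep the
	 * timer running instead of failing the request here.
	 */
	return BLK_EH_RESET_TIMER;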


+
+static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+{
+       if (!dev->admin_q) {

When would it be non-NULL?  Seems like the resume case might be the
case, but I suspect the code could be restructured to avoid even calling
nvme_alloc_admin_tags for that case.

Ack. I want to wait with changing the resume/suspend flow until this has gone
upstream; it should go into a separate patch.


+static void nvme_free_admin_tags(struct nvme_dev *dev)
+{
+       if (dev->admin_q)
+               blk_mq_free_tag_set(&dev->admin_tagset);
+}

When would this be called with the admin queue not initialized?

Is it possible for a pci_driver->remove fn to be called during the probe phase? If not, then this could definitely be removed.


+static void nvme_dev_remove_admin(struct nvme_dev *dev)
+{
+       if (dev->admin_q && !blk_queue_dying(dev->admin_q))
+               blk_cleanup_queue(dev->admin_q);
+}

Same here.


Thanks for taking the time to look through it.
