On 07/14/2014 02:41 PM, Christoph Hellwig wrote:
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+                         unsigned int hctx_idx)
+{
+       struct nvme_dev *dev = data;
+       struct nvme_queue *nvmeq = dev->queues[
+                                       (hctx_idx % dev->queue_count) + 1];
+
+       /* nvmeq queues are shared between namespaces. We assume here that
+        * blk-mq map the tags so they match up with the nvme queue tags */
+       if (!nvmeq->hctx)
+               nvmeq->hctx = hctx;
+       else
+               WARN_ON(nvmeq->hctx->tags != hctx->tags);


This looks wrong to me, as you're overwriting the value of nvmeq->hctx for each
new request_queue.  But only ->tags is ever used from nvmeq->hctx, so you
should rather set up nvmeq->tags in nvme_dev_add.

Ack
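
Something like this in nvme_dev_add(), I think (rough sketch only; nvmeq->tags
would be a new field, and I'm assuming the tag set's tags[] array lines up with
our 1-based queue numbering):

	/* after blk_mq_alloc_tag_set(&dev->tagset) succeeds */
	for (i = 0; i < dev->tagset.nr_hw_queues; i++)
		dev->queues[i + 1]->tags = dev->tagset.tags[i];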


+static int nvme_init_request(void *data, struct request *req,
+                               unsigned int hctx_idx, unsigned int rq_idx,
+                               unsigned int numa_node)
+{
+       struct nvme_dev *dev = data;
+       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+       struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+
+       WARN_ON(!nvmeq);
+       cmd->nvmeq = nvmeq;

Shouldn't this fail instead of the warn_on?

Yes, ack
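
Something like this, then (sketch; returning an error from ->init_request makes
the tag set setup fail instead of continuing with a NULL queue):

	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq)
		return -EINVAL;

	cmd->nvmeq = nvmeq;
	return 0;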


+static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
  {
+       struct nvme_ns *ns = hctx->queue->queuedata;
+       struct nvme_queue *nvmeq = hctx->driver_data;
+       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
        struct nvme_iod *iod;
+       enum dma_data_direction dma_dir;
+       int psegs = req->nr_phys_segments;
+       int result = BLK_MQ_RQ_QUEUE_BUSY;
+       /*
+        * Requeued IO has already been prepped
+        */
+       iod = req->special;
+       if (iod)
+               goto submit_iod;

+       iod = nvme_alloc_iod(psegs, blk_rq_bytes(req), GFP_ATOMIC);
        if (!iod)
+               return result;

So there's still a memory allocation for each request here.  Any reason
this can't be preallocated at least for reasonable sized I/O?

Not at all. I've held off on adding optimizations in this first pass; the
follow-on patches can implement them. Jens already has a patch for this in his
tree, and it also removes GFP_ATOMIC.


No need for GFP_ATOMIC here either, and you probably need a mempool to
guarantee forward progress.
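
Good point on the mempool. Roughly what I have in mind as a follow-on (sketch
only; the field name, element count, and segment cap are made up here):

	/* at probe time, hypothetical dev->iod_mempool */
	dev->iod_mempool = mempool_create_kmalloc_pool(16,
				sizeof(struct nvme_iod) +
				NVME_MAX_SEGS * sizeof(struct scatterlist));
	if (!dev->iod_mempool)
		return -ENOMEM;

	/* in nvme_queue_rq(); no GFP_ATOMIC needed, per your comment */
	iod = mempool_alloc(dev->iod_mempool, GFP_NOIO);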

+       if (req->cmd_flags & REQ_DISCARD) {
                void *range;
                /*
                 * We reuse the small pool to allocate the 16-byte range here
@@ -752,33 +602,53 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
                range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
                                                GFP_ATOMIC,
                                                &iod->first_dma);
+               if (!range)
+                       goto finish_cmd;
                iod_list(iod)[0] = (__le64 *)range;
                iod->npages = 0;
        } else if (psegs) {
+               dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+               sg_init_table(iod->sg, psegs);
+               iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
+               if (!iod->nents) {
+                       result = BLK_MQ_RQ_QUEUE_ERROR;
+                       goto finish_cmd;
                }
+
+               if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
+                       goto finish_cmd;
+
+               if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
+                                               blk_rq_bytes(req), GFP_ATOMIC))
+                       goto finish_cmd;
+       }

Would be nice to factor these two into helpers, which could also avoid
the submit_iod goto.

Agreed. q_suspended probably isn't necessary any more, but I'd like to wait
with that until this has gone upstream, to keep the patch flow simple.
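
Roughly, the two branches could become helpers along these lines (names are
placeholders), which also gets rid of the submit_iod label:

	static int nvme_setup_discard_range(struct nvme_queue *nvmeq,
					    struct nvme_iod *iod)
	{
		void *range;

		/* reuse the small pool for the 16-byte discard range */
		range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
				       &iod->first_dma);
		if (!range)
			return BLK_MQ_RQ_QUEUE_BUSY;
		iod_list(iod)[0] = (__le64 *)range;
		iod->npages = 0;
		return 0;
	}

	static int nvme_map_rq(struct nvme_queue *nvmeq, struct request *req,
			       struct nvme_iod *iod)
	{
		enum dma_data_direction dma_dir = rq_data_dir(req) ?
					DMA_TO_DEVICE : DMA_FROM_DEVICE;

		sg_init_table(iod->sg, req->nr_phys_segments);
		iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
		if (!iod->nents)
			return BLK_MQ_RQ_QUEUE_ERROR;
		if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
			return BLK_MQ_RQ_QUEUE_BUSY;
		if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod,
						blk_rq_bytes(req), GFP_ATOMIC))
			return BLK_MQ_RQ_QUEUE_BUSY;
		return 0;
	}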


+
+       if (req->cmd_flags & REQ_DISCARD) {
+               nvme_submit_discard(nvmeq, ns, req, iod);
+               goto queued;
+       }
+
+       if (req->cmd_flags & REQ_FLUSH) {
+               nvme_submit_flush(nvmeq, ns, req->tag);
+               goto queued;
        }
-       return 0;

+       nvme_submit_iod(nvmeq, iod, ns);
+ queued:

A simple

        if (req->cmd_flags & REQ_DISCARD)
                nvme_submit_discard(nvmeq, ns, req, iod);
        else if (req->cmd_flags & REQ_FLUSH)
                nvme_submit_flush(nvmeq, ns, req->tag);
        else
                nvme_submit_iod(nvmeq, iod, ns);

seems preferable here.

Ack


+static void nvme_cancel_queue_ios(void *data, unsigned long *tag_map)
  {
+       struct nvme_queue *nvmeq = data;
+       struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+       unsigned int tag = 0;

+       tag = 0;
+       do {
+               struct request *req;
                void *ctx;
                nvme_completion_fn fn;
+               struct nvme_cmd_info *cmd;
                static struct nvme_completion cqe = {
                        .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
                };
+               int qdepth = nvmeq == nvmeq->dev->queues[0] ?
+                                       nvmeq->dev->admin_tagset.queue_depth :
+                                       nvmeq->dev->tagset.queue_depth;

+               /* zero'd bits are free tags */
+               tag = find_next_zero_bit(tag_map, qdepth, tag);
+               if (tag >= qdepth)
+                       break;
+
+               req = blk_mq_tag_to_rq(hctx->tags, tag++);
+               cmd = blk_mq_rq_to_pdu(req);

Seems like blk-mq would make your life easier by exporting an iterator
that goes over each in-use request instead of the current
blk_mq_tag_busy_iter prototype.  blk_mq_timeout_check would also be able
to make use of that, so maybe that would be a good preparatory patch?

Yes. I'll prepare a patch and send it off to Jens.
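
Something like this prototype, perhaps (name and signature are only a guess):

	typedef void (busy_rq_iter_fn)(struct request *rq, void *data);

	void blk_mq_tag_busy_rq_iter(struct blk_mq_tags *tags,
				     busy_rq_iter_fn *fn, void *data);

nvme_cancel_queue_ios() would then shrink to just the per-request body, without
the find_next_zero_bit()/queue_depth bookkeeping above.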


+static enum blk_eh_timer_return nvme_timeout(struct request *req)
  {
+       struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+       struct nvme_queue *nvmeq = cmd->nvmeq;

+       dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
+                                                       nvmeq->qid);
+       if (nvmeq->dev->initialized)
+               nvme_abort_req(req);

+       return BLK_EH_RESET_TIMER;
+}

Aborting a request but then resetting the timer looks wrong to me.

If that's indeed the intended behavior please add a comment explaining
it.

Ack
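
Maybe something along these lines (wording is only a suggestion):

	/*
	 * The abort is issued asynchronously; the request gets completed
	 * (or the controller reset) from the abort path, so keep the
	 * timer running instead of failing the request here.
	 */
	return BLK_EH_RESET_TIMER;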


+
+static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+{
+       if (!dev->admin_q) {

When would it be non-NULL?  Seems like the resume case might be the
case, but I suspect the code could be restructured to avoid even calling
nvme_alloc_admin_tags for that case.

Ack. I want to wait with changing the resume/suspend flow until this has gone
upstream; it should go into a separate patch.


+static void nvme_free_admin_tags(struct nvme_dev *dev)
+{
+       if (dev->admin_q)
+               blk_mq_free_tag_set(&dev->admin_tagset);
+}

When would this be called with the admin queue not initialized?

Is it possible for a pci_driver->remove fn to be called during the probe phase? If not, then this could definitely be removed.


+static void nvme_dev_remove_admin(struct nvme_dev *dev)
+{
+       if (dev->admin_q && !blk_queue_dying(dev->admin_q))
+               blk_cleanup_queue(dev->admin_q);
+}

Same here.


Thanks for taking the time to look through it.
