Hi Keith

Would you please take a look at this patch?
I really need your suggestions on it.

Sincerely
Jianchao

On 03/09/2018 10:01 AM, jianchao.wang wrote:
> Hi Keith
> 
> Can I have the honor of getting your comment on this patch?
> 
> Thanks in advance
> Jianchao
> 
> On 03/08/2018 02:19 PM, Jianchao Wang wrote:
>> nvme_dev_disable will issue commands on the adminq to clear the HMB
>> and delete the io cq/sqs, and maybe more in the future. When the
>> adminq gives no response, those commands have to depend on the
>> timeout path. However, nvme_timeout has to invoke nvme_dev_disable
>> before returning, so that the DMA mappings can be released safely.
>> This introduces a dangerous circular dependence. Moreover, the whole
>> of nvme_dev_disable runs under shutdown_lock, even while waiting for
>> the command, which makes things worse.
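>>
>> In other words, the deadlock looks roughly like this (a simplified
>> call-chain sketch for illustration, not the literal code path):
>>
>>     nvme_dev_disable              /* holds shutdown_lock */
>>       -> submit command on adminq (delete io cq/sq, clear HMB)
>>         -> adminq gives no response, command times out
>>           -> nvme_timeout
>>             -> nvme_dev_disable   /* circular dependence */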
>>
>> To avoid this, this patch refactors nvme_timeout. The basic
>> principle, sketched below, is:
>>  - When reset_work needs to be scheduled, hand the expired requests
>>    over to nvme_dev_disable. They will be completed after the
>>    controller is disabled/shut down.
>>  - When a request from nvme_dev_disable or nvme_reset_work expires,
>>    disable the controller directly so that the request can be
>>    completed to wake up the waiter. nvme_pci_disable_ctrl_directly
>>    is introduced for this; it does not send commands on the adminq,
>>    and shutdown_lock is not needed here, because
>>    nvme_abort_requests_sync in nvme_dev_disable can synchronize with
>>    nvme_timeout.
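>>
>> As a condensed sketch, the core decision in the refactored
>> nvme_timeout is roughly (names as in the diff below; abridged for
>> illustration):
>>
>>     if (nvmeq->qid)         /* IO queue */
>>             action = (dev->ctrl.state == NVME_CTRL_RESETTING ||
>>                       iod->aborted) ? RESET : ABORT;
>>     else                    /* admin queue */
>>             action = (dev->ctrl.state == NVME_CTRL_CONNECTING ||
>>                       dev->inflight_flushed) ? DISABLE : RESET;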
>>
>> Signed-off-by: Jianchao Wang <jianchao.w.w...@oracle.com>
>> ---
>>  drivers/nvme/host/pci.c | 199 ++++++++++++++++++++++++++++++++----------------
>>  1 file changed, 134 insertions(+), 65 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index e186158..ce09057 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -70,6 +70,7 @@ struct nvme_queue;
>>  
>>  static void nvme_process_cq(struct nvme_queue *nvmeq);
>>  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
>> +static void nvme_pci_disable_ctrl_directly(struct nvme_dev *dev);
>>  
>>  /*
>>   * Represents an NVM Express device.  Each nvme_dev is a PCI function.
>> @@ -98,6 +99,7 @@ struct nvme_dev {
>>      u32 cmbloc;
>>      struct nvme_ctrl ctrl;
>>      struct completion ioq_wait;
>> +    bool inflight_flushed;
>>  
>>      /* shadow doorbell buffer support: */
>>      u32 *dbbuf_dbs;
>> @@ -1180,73 +1182,13 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
>>                       csts, result);
>>  }
>>  
>> -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>> +static enum blk_eh_timer_return nvme_pci_abort_io_req(struct request *req)
>>  {
>>      struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>>      struct nvme_queue *nvmeq = iod->nvmeq;
>>      struct nvme_dev *dev = nvmeq->dev;
>> -    struct request *abort_req;
>>      struct nvme_command cmd;
>> -    u32 csts = readl(dev->bar + NVME_REG_CSTS);
>> -
>> -    /*
>> -     * Reset immediately if the controller is failed
>> -     */
>> -    if (nvme_should_reset(dev, csts)) {
>> -            nvme_warn_reset(dev, csts);
>> -            nvme_dev_disable(dev, false);
>> -            nvme_reset_ctrl(&dev->ctrl);
>> -            return BLK_EH_HANDLED;
>> -    }
>> -
>> -    /*
>> -     * Did we miss an interrupt?
>> -     */
>> -    if (__nvme_poll(nvmeq, req->tag)) {
>> -            dev_warn(dev->ctrl.device,
>> -                     "I/O %d QID %d timeout, completion polled\n",
>> -                     req->tag, nvmeq->qid);
>> -            return BLK_EH_HANDLED;
>> -    }
>> -
>> -    /*
>> -     * Shutdown immediately if controller times out while starting. The
>> -     * reset work will see the pci device disabled when it gets the forced
>> -     * cancellation error. All outstanding requests are completed on
>> -     * shutdown, so we return BLK_EH_HANDLED.
>> -     */
>> -    switch (dev->ctrl.state) {
>> -    case NVME_CTRL_CONNECTING:
>> -    case NVME_CTRL_RESETTING:
>> -            dev_warn(dev->ctrl.device,
>> -                     "I/O %d QID %d timeout, disable controller\n",
>> -                     req->tag, nvmeq->qid);
>> -            nvme_dev_disable(dev, false);
>> -            set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> -            return BLK_EH_HANDLED;
>> -    default:
>> -            break;
>> -    }
>> -
>> -    /*
>> -     * Shutdown the controller immediately and schedule a reset if the
>> -     * command was already aborted once before and still hasn't been
>> -     * returned to the driver, or if this is the admin queue.
>> -     */
>> -    if (!nvmeq->qid || iod->aborted) {
>> -            dev_warn(dev->ctrl.device,
>> -                     "I/O %d QID %d timeout, reset controller\n",
>> -                     req->tag, nvmeq->qid);
>> -            nvme_dev_disable(dev, false);
>> -            nvme_reset_ctrl(&dev->ctrl);
>> -
>> -            /*
>> -             * Mark the request as handled, since the inline shutdown
>> -             * forces all outstanding requests to complete.
>> -             */
>> -            set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> -            return BLK_EH_HANDLED;
>> -    }
>> +    struct request *abort_req;
>>  
>>      if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
>>              atomic_inc(&dev->ctrl.abort_limit);
>> @@ -1282,6 +1224,105 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>>      return BLK_EH_RESET_TIMER;
>>  }
>>  
>> +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>> +{
>> +    struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>> +    struct nvme_queue *nvmeq = iod->nvmeq;
>> +    struct nvme_dev *dev = nvmeq->dev;
>> +    u32 csts = readl(dev->bar + NVME_REG_CSTS);
>> +    enum {ABORT, RESET, DISABLE} action;
>> +    enum blk_eh_timer_return ret;
>> +    /*
>> +     * This is for nvme_abort_req. This request will be completed
>> +     * by nvme_flush_aborted_requests after the controller is
>> +     * disabled/shut down.
>> +     */
>> +    if (test_bit(NVME_REQ_ABORTED, &nvme_req(req)->flags))
>> +            return BLK_EH_NOT_HANDLED;
>> +
>> +    /*
>> +     * Reset immediately if the controller has failed.
>> +     * Defer the completion to nvme_flush_aborted_requests.
>> +     */
>> +    if (nvme_should_reset(dev, csts)) {
>> +            nvme_warn_reset(dev, csts);
>> +            nvme_reset_ctrl(&dev->ctrl);
>> +            return BLK_EH_RESET_TIMER;
>> +    }
>> +
>> +    /*
>> +     * Did we miss an interrupt?
>> +     */
>> +    if (__nvme_poll(nvmeq, req->tag)) {
>> +            dev_warn(dev->ctrl.device,
>> +                     "I/O %d QID %d timeout, completion polled\n",
>> +                     req->tag, nvmeq->qid);
>> +            return BLK_EH_HANDLED;
>> +    }
>> +
>> +    if (nvmeq->qid) {
>> +            if (dev->ctrl.state == NVME_CTRL_RESETTING ||
>> +                            iod->aborted)
>> +                    action = RESET;
>> +            else
>> +                    action = ABORT;
>> +    } else {
>> +            /*
>> +             * Disable immediately if the controller times out while
>> +             * disabling/shutting down/starting. nvme_dev_disable/
>> +             * nvme_reset_work will see the error.
>> +             * Note: inflight_flushed is set in nvme_dev_disable when it
>> +             * aborts all the inflight requests. This flag is introduced
>> +             * because there is no state to represent the shutdown procedure.
>> +             */
>> +            if (dev->ctrl.state == NVME_CTRL_CONNECTING ||
>> +                    dev->inflight_flushed)
>> +                    action = DISABLE;
>> +            else
>> +                    action = RESET;
>> +    }
>> +
>> +    switch (action) {
>> +    case ABORT:
>> +            ret = nvme_pci_abort_io_req(req);
>> +            break;
>> +    case RESET:
>> +            dev_warn(dev->ctrl.device,
>> +                     "I/O %d QID %d timeout, reset controller\n",
>> +                     req->tag, nvmeq->qid);
>> +            set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> +            nvme_reset_ctrl(&dev->ctrl);
>> +            /*
>> +             * The reset work will take over this request. nvme_abort_req
>> +             * employs blk_abort_request to force the request to be timed
>> +             * out. So we need to return BLK_EH_RESET_TIMER so that
>> +             * RQF_MQ_TIMEOUT_EXPIRED can be cleared.
>> +             */
>> +            ret = BLK_EH_RESET_TIMER;
>> +            break;
>> +    case DISABLE:
>> +            /*
>> +             * We disable the controller directly here so that we can
>> +             * complete the request safely to wake up the nvme_dev_disable/
>> +             * nvme_reset_work that is waiting on the adminq. We cannot
>> +             * return BLK_EH_RESET_TIMER and depend on the error recovery
>> +             * procedure, because it is itself waiting on the timeout path.
>> +             */
>> +            dev_warn(dev->ctrl.device,
>> +                     "I/O %d QID %d timeout, disable controller\n",
>> +                     req->tag, nvmeq->qid);
>> +            nvme_pci_disable_ctrl_directly(dev);
>> +            set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> +            ret = BLK_EH_HANDLED;
>> +            break;
>> +    default:
>> +            WARN_ON(1);
>> +            break;
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>>  static void nvme_free_queue(struct nvme_queue *nvmeq)
>>  {
>>      dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
>> @@ -2169,6 +2210,33 @@ static void nvme_pci_disable(struct nvme_dev *dev)
>>      }
>>  }
>>  
>> +/*
>> + * This is only invoked by nvme_timeout. shutdown_lock is not needed
>> + * here because nvme_abort_requests_sync in nvme_dev_disable will
>> + * synchronize with the timeout path.
>> + */
>> +static void nvme_pci_disable_ctrl_directly(struct nvme_dev *dev)
>> +{
>> +    int i;
>> +    struct pci_dev *pdev = to_pci_dev(dev->dev);
>> +    bool dead = true;
>> +
>> +    if (pci_is_enabled(pdev)) {
>> +            u32 csts = readl(dev->bar + NVME_REG_CSTS);
>> +
>> +            dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
>> +                    pdev->error_state != pci_channel_io_normal);
>> +
>> +            if (!dead)
>> +                    nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
>> +    }
>> +
>> +    for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
>> +            nvme_suspend_queue(&dev->queues[i]);
>> +
>> +    nvme_pci_disable(dev);
>> +}
>> +
>>  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>>  {
>>      int i;
>> @@ -2205,6 +2273,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>>  
>>      }
>>      nvme_stop_queues(&dev->ctrl);
>> +    nvme_abort_requests_sync(&dev->ctrl);
>> +    dev->inflight_flushed = true;
>>  
>>      if (!dead) {
>>              nvme_disable_io_queues(dev);
>> @@ -2215,9 +2285,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>>  
>>      nvme_pci_disable(dev);
>>  
>> -    blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
>> -    blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
>> -
>> +    nvme_flush_aborted_requests(&dev->ctrl);
>> +    dev->inflight_flushed = false;
>>      /*
>>       * The driver will not be starting up queues again if shutting down so
>>       * must flush all entered requests to their failed completion to avoid
>>
> 
> _______________________________________________
> Linux-nvme mailing list
> linux-n...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme
> 
