[PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device
From: Wenbo Wang <wenbo.w...@memblaze.com> Signed-off-by: Wenbo Wang <wenbo.w...@memblaze.com> --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 663c40c..2fec23c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2141,6 +2141,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } -- 1.8.3.1
[PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device
From: Wenbo Wang Signed-off-by: Wenbo Wang --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 663c40c..2fec23c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2141,6 +2141,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } -- 1.8.3.1
RE: [PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device
Keith, Is this patch accepted? Thanks. -Original Message- From: Wenbo Wang Sent: Tuesday, August 9, 2016 11:18 AM To: 'keith.bu...@intel.com'; 'ax...@fb.com' Cc: linux-n...@lists.infradead.org; linux-kernel@vger.kernel.org Subject: [PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device Add a delay before checking device ready for memblaze device Signed-off-by: Wenbo Wang <wenbo.w...@memblaze.com> --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c82282f..ab90e5f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2154,6 +2154,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } -- 1.8.3.1
RE: [PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device
Keith, Is this patch accepted? Thanks. -Original Message- From: Wenbo Wang Sent: Tuesday, August 9, 2016 11:18 AM To: 'keith.bu...@intel.com'; 'ax...@fb.com' Cc: linux-n...@lists.infradead.org; linux-kernel@vger.kernel.org Subject: [PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device Add a delay before checking device ready for memblaze device Signed-off-by: Wenbo Wang --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c82282f..ab90e5f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2154,6 +2154,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } -- 1.8.3.1
[PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device
Add a delay before checking device ready for memblaze device Signed-off-by: Wenbo Wang <wenbo.w...@memblaze.com> --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c82282f..ab90e5f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2154,6 +2154,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } -- 1.8.3.1
[PATCH] nvme/quirk: Add a delay before checking device ready for memblaze device
Add a delay before checking device ready for memblaze device Signed-off-by: Wenbo Wang --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c82282f..ab90e5f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2154,6 +2154,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } -- 1.8.3.1
[PATCH v2] NVMe: do not touch sq door bell if nvmeq has been suspended
If __nvme_submit_cmd races with nvme_dev_disable, nvmeq could have been suspended and dev->bar could have been unmapped. Do not touch sq door bell in this case. Signed-off-by: Wenbo Wang --- drivers/nvme/host/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8b1a725..30795f6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -678,6 +678,11 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); + if (unlikely(nvmeq->cq_vector == -1)) { + ret = BLK_MQ_RQ_QUEUE_BUSY; + spin_unlock_irq(&nvmeq->q_lock); + goto out; + } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); -- 1.8.3.1
[PATCH v2] NVMe: do not touch sq door bell if nvmeq has been suspended
If __nvme_submit_cmd races with nvme_dev_disable, nvmeq could have been suspended and dev->bar could have been unmapped. Do not touch sq door bell in this case. Signed-off-by: Wenbo Wang <mail_weber_w...@163.com> --- drivers/nvme/host/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8b1a725..30795f6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -678,6 +678,11 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); + if (unlikely(nvmeq->cq_vector == -1)) { + ret = BLK_MQ_RQ_QUEUE_BUSY; + spin_unlock_irq(&nvmeq->q_lock); + goto out; + } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); -- 1.8.3.1
RE: [PATCH] NVMe: do not touch sq door bell if nvmeq has been suspended
No, this issue has not been seen yet. Since blk_mq_run_hw_queue tests queue stopped in the very beginning. It should be possible to race with nvme_dev_disable. I agree that the request shall be re-queued. -Original Message- From: Busch, Keith [mailto:keith.bu...@intel.com] Sent: Tuesday, February 2, 2016 12:00 AM To: Wenbo Wang; ax...@fb.com Cc: linux-kernel@vger.kernel.org; Wenbo Wang; linux-n...@lists.infradead.org Subject: RE: [PATCH] NVMe: do not touch sq door bell if nvmeq has been suspended Does this ever happen? The queue should be stopped before the bar is unmapped. If that's insufficient to guard against this, we've another problem this patch does not cover. That command will just timeout since it was accepted by the driver, but not actually submitted to anything. The request needs to be requeued or return BLK_MQ_RQ_QUEUE_BUSY. > -Original Message- > From: Wenbo Wang [mailto:mail_weber_w...@163.com] > Sent: Monday, February 01, 2016 8:42 AM > To: ax...@fb.com; Busch, Keith > Cc: linux-kernel@vger.kernel.org; Wenbo Wang; Wenbo Wang; > linux-n...@lists.infradead.org > Subject: [PATCH] NVMe: do not touch sq door bell if nvmeq has been > suspended > > If __nvme_submit_cmd races with nvme_dev_disable, nvmeq could have > been suspended and dev->bar could have been unmapped. Do not touch sq > door bell in this case. > > Signed-off-by: Wenbo Wang > Reviewed-by: Wenwei Tao > CC: linux-n...@lists.infradead.org > --- > drivers/nvme/host/pci.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index > 8b1a725..2288712 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -325,7 +325,8 @@ static void __nvme_submit_cmd(struct nvme_queue > *nvmeq, > > if (++tail == nvmeq->q_depth) > tail = 0; > - writel(tail, nvmeq->q_db); > + if (likely(nvmeq->cq_vector >= 0)) > + writel(tail, nvmeq->q_db); > nvmeq->sq_tail = tail; > } > > -- > 1.8.3.1
[PATCH] NVMe: do not touch sq door bell if nvmeq has been suspended
If __nvme_submit_cmd races with nvme_dev_disable, nvmeq could have been suspended and dev->bar could have been unmapped. Do not touch sq door bell in this case. Signed-off-by: Wenbo Wang Reviewed-by: Wenwei Tao CC: linux-n...@lists.infradead.org --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8b1a725..2288712 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -325,7 +325,8 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, if (++tail == nvmeq->q_depth) tail = 0; - writel(tail, nvmeq->q_db); + if (likely(nvmeq->cq_vector >= 0)) + writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; } -- 1.8.3.1
RE: [PATCH] NVMe: do not touch sq door bell if nvmeq has been suspended
No, this issue has not been seen yet. Since blk_mq_run_hw_queue tests queue stopped in the very beginning. It should be possible to race with nvme_dev_disable. I agree that the request shall be re-queued. -Original Message- From: Busch, Keith [mailto:keith.bu...@intel.com] Sent: Tuesday, February 2, 2016 12:00 AM To: Wenbo Wang; ax...@fb.com Cc: linux-kernel@vger.kernel.org; Wenbo Wang; linux-n...@lists.infradead.org Subject: RE: [PATCH] NVMe: do not touch sq door bell if nvmeq has been suspended Does this ever happen? The queue should be stopped before the bar is unmapped. If that's insufficient to guard against this, we've another problem this patch does not cover. That command will just timeout since it was accepted by the driver, but not actually submitted to anything. The request needs to be requeued or return BLK_MQ_RQ_QUEUE_BUSY. > -Original Message- > From: Wenbo Wang [mailto:mail_weber_w...@163.com] > Sent: Monday, February 01, 2016 8:42 AM > To: ax...@fb.com; Busch, Keith > Cc: linux-kernel@vger.kernel.org; Wenbo Wang; Wenbo Wang; > linux-n...@lists.infradead.org > Subject: [PATCH] NVMe: do not touch sq door bell if nvmeq has been > suspended > > If __nvme_submit_cmd races with nvme_dev_disable, nvmeq could have > been suspended and dev->bar could have been unmapped. Do not touch sq > door bell in this case. 
> > Signed-off-by: Wenbo Wang <wenbo.w...@memblaze.com> > Reviewed-by: Wenwei Tao <wenwei@memblaze.com> > CC: linux-n...@lists.infradead.org > --- > drivers/nvme/host/pci.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index > 8b1a725..2288712 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -325,7 +325,8 @@ static void __nvme_submit_cmd(struct nvme_queue > *nvmeq, > > if (++tail == nvmeq->q_depth) > tail = 0; > - writel(tail, nvmeq->q_db); > + if (likely(nvmeq->cq_vector >= 0)) > + writel(tail, nvmeq->q_db); > nvmeq->sq_tail = tail; > } > > -- > 1.8.3.1
[PATCH] NVMe: do not touch sq door bell if nvmeq has been suspended
If __nvme_submit_cmd races with nvme_dev_disable, nvmeq could have been suspended and dev->bar could have been unmapped. Do not touch sq door bell in this case. Signed-off-by: Wenbo Wang <wenbo.w...@memblaze.com> Reviewed-by: Wenwei Tao <wenwei@memblaze.com> CC: linux-n...@lists.infradead.org --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8b1a725..2288712 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -325,7 +325,8 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, if (++tail == nvmeq->q_depth) tail = 0; - writel(tail, nvmeq->q_db); + if (likely(nvmeq->cq_vector >= 0)) + writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; } -- 1.8.3.1
[PATCH RFC] blk-mq: no need to modify bt->wake_index if someone has updated it
If someone else has found active waitqueue and updated bt->wake_index, do not modify it again. This saves an atomic read. Signed-off-by: Wenbo Wang CC: linux-bl...@vger.kernel.org CC: linux-kernel@vger.kernel.org --- block/blk-mq-tag.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index abdbb47..5ed9111 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -356,16 +356,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) { - int i, wake_index; + int i, wake_index, orig_wake_index; - wake_index = atomic_read(&bt->wake_index); + orig_wake_index = wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { struct bt_wait_state *bs = &bt->bs[wake_index]; if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); + if (wake_index != orig_wake_index) + atomic_cmpxchg(&bt->wake_index, orig_wake_index, wake_index); return bs; } -- 1.8.3.1 --- Not seeing too much benefit from this patch, but it makes the logic here more clear
[PATCH RFC] blk-mq: no need to modify bt->wake_index if someone has updated it
If someone else has found active waitqueue and updated bt->wake_index, do not modify it again. This saves an atomic read. Signed-off-by: Wenbo Wang <mail_weber_w...@163.com> CC: linux-bl...@vger.kernel.org CC: linux-kernel@vger.kernel.org --- block/blk-mq-tag.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index abdbb47..5ed9111 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -356,16 +356,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) { - int i, wake_index; + int i, wake_index, orig_wake_index; - wake_index = atomic_read(&bt->wake_index); + orig_wake_index = wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { struct bt_wait_state *bs = &bt->bs[wake_index]; if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); + if (wake_index != orig_wake_index) + atomic_cmpxchg(&bt->wake_index, orig_wake_index, wake_index); return bs; } -- 1.8.3.1 --- Not seeing too much benefit from this patch, but it makes the logic here more clear
[PATCH v3] NVMe: init nvme queue before enabling irq
From: Wenbo Wang [v3] Do request irq in nvme_init_queue() to handle request irq failures There is one problem with the original patch. Since init queue happens before request irq, online_queue might be left increased if request irq fails. This version merges request irq into nvme_init_queue() because: 1. It solves the online_queue counting issue if request irq fails. 2. nvme_init_queue() is always followed by request irq, this change simplifies code. This version also solves another possible race condition by moving adminq free irq before iounmap. [v2] Remove duplicate init code in nvme_alloc_queue() During reset process, the nvme_dev->bar (ioremapped) may change, so nvmeq->q_db shall be also updated by nvme_init_queue(). [v1] Currently nvmeq irq is enabled before queue init, so a spurious interrupt triggered nvme_process_cq may access nvmeq->q_db just before it is updated, this could cause kernel panic. Signed-off-by: Wenbo Wang Reviewed-by: Wenwei Tao CC: sta...@vger.kernel.org --- drivers/nvme/host/pci.c | 38 -- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f5c0e26..11e7cb6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1529,9 +1529,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", dev->instance, qid); spin_lock_init(>q_lock); - nvmeq->cq_head = 0; - nvmeq->cq_phase = 1; - nvmeq->q_db = >dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->qid = qid; nvmeq->cq_vector = -1; @@ -1562,8 +1559,9 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, IRQF_SHARED, name, nvmeq); } -static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +static int nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { + int result; struct nvme_dev *dev = nvmeq->dev; spin_lock_irq(>q_lock); @@ -1574,6 +1572,16 @@ static void nvme_init_queue(struct nvme_queue 
*nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); dev->online_queues++; spin_unlock_irq(>q_lock); + + result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + + if (result) { + spin_lock_irq(>q_lock); + nvmeq->cq_vector = -1; + dev->online_queues--; + spin_unlock_irq(>q_lock); + } + return result; } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1590,11 +1598,15 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_cq; - result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + /* +* Init queue door bell ioremap address before enabling irq, if not, +* a spurious interrupt triggered nvme_process_cq may access invalid +* address +*/ + result = nvme_init_queue(nvmeq, qid); if (result < 0) goto release_sq; - nvme_init_queue(nvmeq, qid); return result; release_sq: @@ -1790,11 +1802,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) goto free_nvmeq; nvmeq->cq_vector = 0; - result = queue_request_irq(dev, nvmeq, nvmeq->irqname); - if (result) { - nvmeq->cq_vector = -1; + result = nvme_init_queue(nvmeq, 0); + if (result) goto free_nvmeq; - } return result; @@ -2446,6 +2456,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_release_cmb(dev); } + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, adminq); + size = db_bar_size(dev, nr_io_queues); if (size > 8192) { iounmap(dev->bar); @@ -2461,9 +2474,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) adminq->q_db = dev->dbs; } - /* Deregister the admin queue's interrupt */ - free_irq(dev->entry[0].vector, adminq); - /* * If we enable msix early due to not intx, disable it again before * setting up the full range we need. 
@@ -2496,6 +2506,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) result = queue_request_irq(dev, adminq, adminq->irqname); if (result) { adminq->cq_vector = -1; + dev->online_queues--; goto free_queues; } @@ -3164,7 +3175,6 @@ static void nvme_probe_work(struct work_struct *work) goto disable; } - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) goto disable; -- 1.8.3.1
[PATCH v3] NVMe: init nvme queue before enabling irq
From: Wenbo Wang <wenbo.w...@memblaze.com> [v3] Do request irq in nvme_init_queue() to handle request irq failures There is one problem with the original patch. Since init queue happens before request irq, online_queue might be left increased if request irq fails. This version merges request irq into nvme_init_queue() because: 1. It solves the online_queue counting issue if request irq fails. 2. nvme_init_queue() is always followed by request irq, this change simplifies code. This version also solves another possible race condition by moving adminq free irq before iounmap. [v2] Remove duplicate init code in nvme_alloc_queue() During reset process, the nvme_dev->bar (ioremapped) may change, so nvmeq->q_db shall be also updated by nvme_init_queue(). [v1] Currently nvmeq irq is enabled before queue init, so a spurious interrupt triggered nvme_process_cq may access nvmeq->q_db just before it is updated, this could cause kernel panic. Signed-off-by: Wenbo Wang <wenbo.w...@memblaze.com> Reviewed-by: Wenwei Tao <wenwei@memblaze.com> CC: sta...@vger.kernel.org --- drivers/nvme/host/pci.c | 38 -- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f5c0e26..11e7cb6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1529,9 +1529,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", dev->instance, qid); spin_lock_init(>q_lock); - nvmeq->cq_head = 0; - nvmeq->cq_phase = 1; - nvmeq->q_db = >dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->qid = qid; nvmeq->cq_vector = -1; @@ -1562,8 +1559,9 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, IRQF_SHARED, name, nvmeq); } -static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +static int nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { + int result; struct nvme_dev *dev = nvmeq->dev; spin_lock_irq(>q_lock); 
@@ -1574,6 +1572,16 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); dev->online_queues++; spin_unlock_irq(>q_lock); + + result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + + if (result) { + spin_lock_irq(>q_lock); + nvmeq->cq_vector = -1; + dev->online_queues--; + spin_unlock_irq(>q_lock); + } + return result; } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1590,11 +1598,15 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_cq; - result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + /* +* Init queue door bell ioremap address before enabling irq, if not, +* a spurious interrupt triggered nvme_process_cq may access invalid +* address +*/ + result = nvme_init_queue(nvmeq, qid); if (result < 0) goto release_sq; - nvme_init_queue(nvmeq, qid); return result; release_sq: @@ -1790,11 +1802,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) goto free_nvmeq; nvmeq->cq_vector = 0; - result = queue_request_irq(dev, nvmeq, nvmeq->irqname); - if (result) { - nvmeq->cq_vector = -1; + result = nvme_init_queue(nvmeq, 0); + if (result) goto free_nvmeq; - } return result; @@ -2446,6 +2456,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_release_cmb(dev); } + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, adminq); + size = db_bar_size(dev, nr_io_queues); if (size > 8192) { iounmap(dev->bar); @@ -2461,9 +2474,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) adminq->q_db = dev->dbs; } - /* Deregister the admin queue's interrupt */ - free_irq(dev->entry[0].vector, adminq); - /* * If we enable msix early due to not intx, disable it again before * setting up the full range we need. 
@@ -2496,6 +2506,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) result = queue_request_irq(dev, adminq, adminq->irqname); if (result) { adminq->cq_vector = -1; + dev->online_queues--; goto free_queues; } @@ -3164,7 +3175,6 @@ static void nvme_probe_work(struct work_struct *work
[PATCH] Speed up __run_timers() when jiffie gap is large
From: Wenbo Wang Signed-off-by: Wenbo Wang Reviewed-by: Chong Yuan Currently __run_timers() increases jiffie by one in every loop. This is very slow when the jiffie gap is large. The following test is done on idle cpu 15 (isolated by isolcpus= kernel option). There is a clocksource_watchdog timer expires in every 20s. run_timer_softirq() (invokes __run_timers) takes 120+us to catch up jiffies. [root@localhost tracing]# cat trace_pipe 450.328663 |15) ! 124.138 us | run_timer_softirq(); 470.318675 |15) ! 128.002 us | run_timer_softirq(); 490.308875 |15) | run_timer_softirq() { 490.309048 |15) ! 164.647 us | } 510.299059 |15) | run_timer_softirq() { 510.299230 |15) ! 162.307 us | } After applying this patch, the same softirq takes less than 20us to complete the same work. [root@localhost tracing]# cat trace_pipe 430.058585 |15) + 11.228 us | run_timer_softirq(); 450.048683 |15) 7.936 us| run_timer_softirq(); 470.038760 |15) + 19.443 us | run_timer_softirq(); 490.028856 |15) + 11.361 us | run_timer_softirq(); 510.018953 |15) + 10.085 us | run_timer_softirq(); 530.009038 |15) + 10.075 us | run_timer_softirq(); 550.000242 |15) + 10.958 us | run_timer_softirq(); --- kernel/time/timer.c | 32 +++- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c5..15da6a4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -88,6 +88,7 @@ struct tvec_base { struct tvec tv3; struct tvec tv4; struct tvec tv5; + int tv1_cnt;/* number of newly inserted timers in tv1 */ } cacheline_aligned; struct tvec_base boot_tvec_bases; @@ -363,6 +364,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) if (idx < TVR_SIZE) { int i = expires & TVR_MASK; vec = base->tv1.vec + i; + base->tv1_cnt++; } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { int i = (expires >> TVR_BITS) & TVN_MASK; vec = base->tv2.vec + i; @@ -378,6 +380,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) * or you set a timer to go off in the past */ vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + base->tv1_cnt++; } else { int i; /* If the timeout is larger than MAX_TVAL (on 64-bit @@ -1174,6 +1177,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); return; } + while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; @@ -1182,11 +1186,29 @@ static inline void __run_timers(struct tvec_base *base) /* * Cascade timers: */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); + if (unlikely(!index)) { + if (!cascade(base, &base->tv2, INDEX(0))) { + if ((!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + } + + /* + * We are just crossing the boundary of tv1 (usually 256 jiffies). + * Since there was no new timers inserted to tv1 and all the old + * timers have been processed, it is guaranteed that tv1 has no + * pending timers, so jiffie can jump to the next boundary at + * most. + */ + if (base->tv1_cnt == 0) { + base->timer_jiffies += min_t(unsigned long, + TVR_SIZE - index, jiffies - base->timer_jiffies + 1); + continue; + } + + base->tv1_cnt = 0; + } + ++base->timer_jiffies; list_replace_init(base->tv1.vec + index, head); while (!list_empty(head)) { -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Speed up __run_timers() when jiffie gap is large
From: Wenbo Wang <wenbo.w...@memblaze.com> Signed-off-by: Wenbo Wang <wenbo.w...@memblaze.com> Reviewed-by: Chong Yuan <chong.y...@memblaze.com> Currently __run_timers() increases jiffie by one in every loop. This is very slow when the jiffie gap is large. The following test is done on idle cpu 15 (isolated by isolcpus= kernel option). There is a clocksource_watchdog timer expires in every 20s. run_timer_softirq() (invokes __run_timers) takes 120+us to catch up jiffies. [root@localhost tracing]# cat trace_pipe 450.328663 |15) ! 124.138 us | run_timer_softirq(); 470.318675 |15) ! 128.002 us | run_timer_softirq(); 490.308875 |15) | run_timer_softirq() { 490.309048 |15) ! 164.647 us | } 510.299059 |15) | run_timer_softirq() { 510.299230 |15) ! 162.307 us | } After applying this patch, the same softirq takes less than 20us to complete the same work. [root@localhost tracing]# cat trace_pipe 430.058585 |15) + 11.228 us | run_timer_softirq(); 450.048683 |15) 7.936 us| run_timer_softirq(); 470.038760 |15) + 19.443 us | run_timer_softirq(); 490.028856 |15) + 11.361 us | run_timer_softirq(); 510.018953 |15) + 10.085 us | run_timer_softirq(); 530.009038 |15) + 10.075 us | run_timer_softirq(); 550.000242 |15) + 10.958 us | run_timer_softirq(); --- kernel/time/timer.c | 32 +++- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c5..15da6a4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -88,6 +88,7 @@ struct tvec_base { struct tvec tv3; struct tvec tv4; struct tvec tv5; + int tv1_cnt;/* number of newly inserted timers in tv1 */ } cacheline_aligned; struct tvec_base boot_tvec_bases; @@ -363,6 +364,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) if (idx < TVR_SIZE) { int i = expires & TVR_MASK; vec = base->tv1.vec + i; + base->tv1_cnt++; } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { int i = (expires >> TVR_BITS) & TVN_MASK; vec = base->tv2.vec + i; @@ -378,6 +380,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) * or you set a timer to go off in the past */ vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + base->tv1_cnt++; } else { int i; /* If the timeout is larger than MAX_TVAL (on 64-bit @@ -1174,6 +1177,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); return; } + while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; @@ -1182,11 +1186,29 @@ static inline void __run_timers(struct tvec_base *base) /* * Cascade timers: */ - if (!index && - (!cascade(base, &base->tv2, INDEX(0))) && - (!cascade(base, &base->tv3, INDEX(1))) && - !cascade(base, &base->tv4, INDEX(2))) - cascade(base, &base->tv5, INDEX(3)); + if (unlikely(!index)) { + if (!cascade(base, &base->tv2, INDEX(0))) { + if ((!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + } + + /* + * We are just crossing the boundary of tv1 (usually 256 jiffies). + * Since there was no new timers inserted to tv1 and all the old + * timers have been processed, it is guaranteed that tv1 has no + * pending timers, so jiffie can jump to the next boundary at + * most. + */ + if (base->tv1_cnt == 0) { + base->timer_jiffies += min_t(unsigned long, + TVR_SIZE - index, jiffies - base->timer_jiffies + 1); + continue; + } + + base->tv1_cnt = 0; + } + ++base->timer_jiffies; list_replace_init(base->tv1.vec + index, head); while (!list_empty(head)) { -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Unexpected long wakeup latency on an idle core [3.10 centos 7]
Hi, I am testing wakeup latency on a 40 core server with Centos 7 installed. I used cpuset to isolate cores 10-19 and they are almost idle except some kernel threads. Since currently the latest kernel I have is 3.10, I am not sure if this issue still exists. Problem === Below is the result after testing wakeup latency on idle cores 10 - 13. It shows that the wakeup latency is not consistent, sometimes it is huge (1000+ us) while in most cases it is about 3us. [root@localhost rt-tests]# ./cyclictest -a 10-13 -t 4 -p 99 # /dev/cpu_dma_latency set to 0us policy: fifo: loadavg: 1.00 1.02 0.91 2/633 36355 T: 0 (36098) P:99 I:1000 C:1588612 Min: 2 Act:2 Avg:2 Max: 11 T: 1 (36099) P:99 I:1500 C:1059075 Min: 3 Act:4 Avg:3 Max: 1089<- huge latency T: 2 (36100) P:99 I:2000 C: 794306 Min: 3 Act:4 Avg:3 Max: 1485 T: 3 (36101) P:99 I:2500 C: 635445 Min: 3 Act:3 Avg:3 Max: 11 Kernel Configuration = Dynamic tick is enable. CONFIG_TICK_ONESHOT=y CONFIG_NO_HZ_COMMON=y CONFIG_NO_HZ_FULL=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_HZ_1000=y CONFIG_HZ=1000 Analysis === The long latency is caused by the execution of @run_timer_softirq. ftrace shows that sometimes it takes 500+ us to complete. 13) 0.902 us| run_timer_softirq(); 10) ! 461.167 us | run_timer_softirq(); 10) + 48.572 us | run_timer_softirq(); 12) ! 566.923 us | run_timer_softirq(); 12) 7.564 us| run_timer_softirq(); 10) ! 130.498 us | run_timer_softirq(); Because dynamic tick feature is enable, the idle thread stops tick on the idle core, it makes the gap between local cpu @tvec_bases->jiffies and global @jiffies very large. As a result, it takes lots of time for @__run_timers (invoked by run_timer_softirq) to catch up. Following result shows that sometimes the jiffie gap is over 20. 
[root@localhost tracing]# stap -e 'probe kernel.function("__run_timers") {printf("cpu=%d, %d\n", cpu(), @var("jiffies_64@kernel/timer.c") - $base->timer_jiffies)}' | grep "cpu=1[0123]" | grep -v "cpu=10, 0" cpu=13, 259616 cpu=12, 29 cpu=11, 254670 cpu=12, 170722 cpu=12, 23780 cpu=11, 215427 cpu=12, 21767 cpu=12, 141 cpu=13, 28 cpu=12, 83584 cpu=11, 84569 cpu=11, 59424 I tried to run an infinite loop on core 10 to keep the timer tick alive, and the wakeup latency is much better on this core. Solution === The following two methods may help: 1. Speed up the handling of __run_timers. Currently it increases the local jiffies one by one, which is very slow. 2. Increase local jiffies in the idle thread. Thanks, -Wenbo -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Unexpected long wakeup latency on an idle core [3.10 centos 7]
Hi, I am testing wakeup latency on a 40 core server with Centos 7 installed. I used cpuset to isolate cores 10-19 and they are almost idle except some kernel threads. Since currently the latest kernel I have is 3.10, I am not sure if this issue still exists. Problem === Below is the result after testing wakeup latency on idle cores 10 - 13. It shows that the wakeup latency is not consistent: sometimes it is huge (1000+ us) while in most cases it is about 3us. [root@localhost rt-tests]# ./cyclictest -a 10-13 -t 4 -p 99 # /dev/cpu_dma_latency set to 0us policy: fifo: loadavg: 1.00 1.02 0.91 2/633 36355 T: 0 (36098) P:99 I:1000 C:1588612 Min: 2 Act:2 Avg:2 Max: 11 T: 1 (36099) P:99 I:1500 C:1059075 Min: 3 Act:4 Avg:3 Max: 1089 <- huge latency T: 2 (36100) P:99 I:2000 C: 794306 Min: 3 Act:4 Avg:3 Max: 1485 T: 3 (36101) P:99 I:2500 C: 635445 Min: 3 Act:3 Avg:3 Max: 11 Kernel Configuration = Dynamic tick is enabled. CONFIG_TICK_ONESHOT=y CONFIG_NO_HZ_COMMON=y CONFIG_NO_HZ_FULL=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_HZ_1000=y CONFIG_HZ=1000 Analysis === The long latency is caused by the execution of @run_timer_softirq. ftrace shows that sometimes it takes 500+ us to complete. 13) 0.902 us| run_timer_softirq(); 10) ! 461.167 us | run_timer_softirq(); 10) + 48.572 us | run_timer_softirq(); 12) ! 566.923 us | run_timer_softirq(); 12) 7.564 us| run_timer_softirq(); 10) ! 130.498 us | run_timer_softirq(); Because the dynamic tick feature is enabled, the idle thread stops the tick on the idle core, which makes the gap between the local cpu @tvec_bases->jiffies and the global @jiffies very large. As a result, it takes a lot of time for @__run_timers (invoked by run_timer_softirq) to catch up. The following result shows that sometimes the jiffy gap is over 20. 
[root@localhost tracing]# stap -e 'probe kernel.function("__run_timers") {printf("cpu=%d, %d\n", cpu(), @var("jiffies_64@kernel/timer.c") - $base->timer_jiffies)}' | grep "cpu=1[0123]" | grep -v "cpu=10, 0" cpu=13, 259616 cpu=12, 29 cpu=11, 254670 cpu=12, 170722 cpu=12, 23780 cpu=11, 215427 cpu=12, 21767 cpu=12, 141 cpu=13, 28 cpu=12, 83584 cpu=11, 84569 cpu=11, 59424 I tried to run an infinite loop on core 10 to keep the timer tick alive, and the wakeup latency is much better on this core. Solution === The following two methods may help: 1. Speed up the handling of __run_timers. Currently it increases the local jiffies one by one, which is very slow. 2. Increase local jiffies in the idle thread. Thanks, -Wenbo -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/