Re: [PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-24 Thread Sagi Grimberg



Please prepare a formal one (at least tested in the normal case); either I
or Zhang Yi can test/verify it.


OK.


@@ -1387,10 +1385,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev,
struct nvme_queue *nvmeq,
  static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 int depth, int node)
  {
-   struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
-   node);
-   if (!nvmeq)
-   return NULL;
+   struct nvme_queue *nvmeq = &dev->queues[qid];


Maybe you need to zero *nvmeq again, since that is done by the current code.


Relying on this is not a good idea, so I think we're better off without
it because I want to know about it and fix it.
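
For reference, the explicit re-zeroing under discussion would amount to a
one-liner at the top of nvme_alloc_queue(); a sketch only, not part of
either patch:

	struct nvme_queue *nvmeq = &dev->queues[qid];

	/*
	 * The old per-queue kzalloc_node() handed back zeroed memory on
	 * every reset; with the embedded array, a reused slot keeps its
	 * stale contents unless it is cleared by hand.
	 */
	memset(nvmeq, 0, sizeof(*nvmeq));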


@@ -2470,8 +2465,9 @@ static int nvme_probe(struct pci_dev *pdev, const
struct pci_device_id *id)
 dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
 if (!dev)
 return -ENOMEM;
-   dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
-   GFP_KERNEL, node);
+
+   alloc_size = (num_possible_cpus() + 1) * sizeof(struct nvme_queue *);


The element size should be 'sizeof(struct nvme_queue)'.


Right.
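
With that element-size fix folded in, the probe-time allocation would
presumably end up along these lines (a sketch only; the formal patch may
differ):

	/*
	 * One nvme_queue per possible CPU plus the admin queue; the
	 * element is now the struct itself, not a pointer to it.
	 */
	alloc_size = (num_possible_cpus() + 1) * sizeof(struct nvme_queue);
	dev->queues = kzalloc_node(alloc_size, GFP_KERNEL, node);
	if (!dev->queues)
		goto free;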


Re: [PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-21 Thread Ming Lei
Hi Sagi,

On Thu, Dec 21, 2017 at 10:20:45AM +0200, Sagi Grimberg wrote:
> Ming,
> 
> I'd prefer that we make the pci driver match
> the rest of the drivers in nvme.

OK, this way looks better.

> 
> IMO it would be better to allocate a queues array at probe time
> and simply reuse it at reset sequence.
> 
> Can this (untested) patch also fix the issue you're seeing:

Please prepare a formal one (at least tested in the normal case); either I
or Zhang Yi can test/verify it.

> --
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index f26aaa393016..a8edb29dac16 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool
> shutdown);
>   * Represents an NVM Express device.  Each nvme_dev is a PCI function.
>   */
>  struct nvme_dev {
> -   struct nvme_queue **queues;
> +   struct nvme_queue *queues;
> struct blk_mq_tag_set tagset;
> struct blk_mq_tag_set admin_tagset;
> u32 __iomem *dbs;
> @@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx
> *hctx, void *data,
> unsigned int hctx_idx)
>  {
> struct nvme_dev *dev = data;
> -   struct nvme_queue *nvmeq = dev->queues[0];
> +   struct nvme_queue *nvmeq = &dev->queues[0];
> 
> WARN_ON(hctx_idx != 0);
> WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
> @@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx,
> void *data,
>   unsigned int hctx_idx)
>  {
> struct nvme_dev *dev = data;
> -   struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
> +   struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
> 
> if (!nvmeq->tags)
> nvmeq->tags = &dev->tagset.tags[hctx_idx];
> @@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set,
> struct request *req,
> struct nvme_dev *dev = set->driver_data;
> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
> -   struct nvme_queue *nvmeq = dev->queues[queue_idx];
> +   struct nvme_queue *nvmeq = &dev->queues[queue_idx];
> 
> BUG_ON(!nvmeq);
> iod->nvmeq = nvmeq;
> @@ -1046,7 +1046,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx,
> unsigned int tag)
>  static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
>  {
> struct nvme_dev *dev = to_nvme_dev(ctrl);
> -   struct nvme_queue *nvmeq = dev->queues[0];
> +   struct nvme_queue *nvmeq = &dev->queues[0];
> struct nvme_command c;
> 
> memset(&c, 0, sizeof(c));
> @@ -1290,10 +1290,8 @@ static void nvme_free_queues(struct nvme_dev *dev,
> int lowest)
> int i;
> 
> for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
> -   struct nvme_queue *nvmeq = dev->queues[i];
> dev->ctrl.queue_count--;
> -   dev->queues[i] = NULL;
> -   nvme_free_queue(nvmeq);
> +   nvme_free_queue(&dev->queues[i]);
> }
>  }
> 
> @@ -1325,7 +1323,7 @@ static int nvme_suspend_queue(struct nvme_queue
> *nvmeq)
> 
>  static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
>  {
> -   struct nvme_queue *nvmeq = dev->queues[0];
> +   struct nvme_queue *nvmeq = &dev->queues[0];
> 
> if (!nvmeq)
> return;
> @@ -1387,10 +1385,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev,
> struct nvme_queue *nvmeq,
>  static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
> int depth, int node)
>  {
> -   struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
> -   node);
> -   if (!nvmeq)
> -   return NULL;
> +   struct nvme_queue *nvmeq = &dev->queues[qid];

Maybe you need to zero *nvmeq again, since that is done by the current code.

> 
> nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
>   &nvmeq->cq_dma_addr, GFP_KERNEL);
> @@ -1409,7 +1404,6 @@ static struct nvme_queue *nvme_alloc_queue(struct
> nvme_dev *dev, int qid,
> nvmeq->q_depth = depth;
> nvmeq->qid = qid;
> nvmeq->cq_vector = -1;
> -   dev->queues[qid] = nvmeq;
> dev->ctrl.queue_count++;
> 
> return nvmeq;
> @@ -1592,7 +1586,7 @@ static int nvme_pci_configure_admin_queue(struct
> nvme_dev *dev)
> if (result < 0)
> return result;
> 
> -   nvmeq = dev->queues[0];
> +   nvmeq = &dev->queues[0];
> if (!nvmeq) {
> nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
> dev_to_node(dev->dev));
> @@ -1638,7 +1632,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
> 
> max = min(dev->max_qid, dev->ctrl.queue_count - 1);
> for (i = dev->online_queues; i <= max; 

Re: [PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-21 Thread Johannes Thumshirn
On Thu, Dec 21, 2017 at 10:20:45AM +0200, Sagi Grimberg wrote:
> @@ -2470,8 +2465,9 @@ static int nvme_probe(struct pci_dev *pdev, const
> struct pci_device_id *id)
> dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
> if (!dev)
> return -ENOMEM;
> -   dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
> -   GFP_KERNEL, node);
> +
> +   alloc_size = (num_possible_cpus() + 1) * sizeof(struct nvme_queue *);
> +   dev->queues = kzalloc_node(alloc_size, GFP_KERNEL, node);

Nit:
  dev->queues = kcalloc_node(num_possible_cpus() + 1,
 sizeof(struct nvme_queue *),
 node);
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de    +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850
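
For completeness, kcalloc_node() also takes a gfp_t argument; combined
with the element-size comment from Ming's review, the call would
presumably read (a sketch only):

	/* element size per Ming's comment: the struct, not a pointer */
	dev->queues = kcalloc_node(num_possible_cpus() + 1,
				   sizeof(struct nvme_queue),
				   GFP_KERNEL, node);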


Re: [PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-21 Thread Sagi Grimberg

Ming,

I'd prefer that we make the pci driver match
the rest of the drivers in nvme.

IMO it would be better to allocate a queues array at probe time
and simply reuse it at reset sequence.

Can this (untested) patch also fix the issue you're seeing:
--
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f26aaa393016..a8edb29dac16 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, 
bool shutdown);

  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
 struct nvme_dev {
-   struct nvme_queue **queues;
+   struct nvme_queue *queues;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
@@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx 
*hctx, void *data,

unsigned int hctx_idx)
 {
struct nvme_dev *dev = data;
-   struct nvme_queue *nvmeq = dev->queues[0];
+   struct nvme_queue *nvmeq = &dev->queues[0];

WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx 
*hctx, void *data,

  unsigned int hctx_idx)
 {
struct nvme_dev *dev = data;
-   struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+   struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

if (!nvmeq->tags)
nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set 
*set, struct request *req,

struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
-   struct nvme_queue *nvmeq = dev->queues[queue_idx];
+   struct nvme_queue *nvmeq = &dev->queues[queue_idx];

BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
@@ -1046,7 +1046,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, 
unsigned int tag)

 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 {
struct nvme_dev *dev = to_nvme_dev(ctrl);
-   struct nvme_queue *nvmeq = dev->queues[0];
+   struct nvme_queue *nvmeq = &dev->queues[0];
struct nvme_command c;

memset(&c, 0, sizeof(c));
@@ -1290,10 +1290,8 @@ static void nvme_free_queues(struct nvme_dev 
*dev, int lowest)

int i;

for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
-   struct nvme_queue *nvmeq = dev->queues[i];
dev->ctrl.queue_count--;
-   dev->queues[i] = NULL;
-   nvme_free_queue(nvmeq);
+   nvme_free_queue(&dev->queues[i]);
}
 }

@@ -1325,7 +1323,7 @@ static int nvme_suspend_queue(struct nvme_queue 
*nvmeq)


 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
-   struct nvme_queue *nvmeq = dev->queues[0];
+   struct nvme_queue *nvmeq = &dev->queues[0];

if (!nvmeq)
return;
@@ -1387,10 +1385,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev 
*dev, struct nvme_queue *nvmeq,

 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth, int 
node)

 {
-   struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
-   node);
-   if (!nvmeq)
-   return NULL;
+   struct nvme_queue *nvmeq = &dev->queues[qid];

nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
  &nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -1409,7 +1404,6 @@ static struct nvme_queue *nvme_alloc_queue(struct 
nvme_dev *dev, int qid,

nvmeq->q_depth = depth;
nvmeq->qid = qid;
nvmeq->cq_vector = -1;
-   dev->queues[qid] = nvmeq;
dev->ctrl.queue_count++;

return nvmeq;
@@ -1592,7 +1586,7 @@ static int nvme_pci_configure_admin_queue(struct 
nvme_dev *dev)

if (result < 0)
return result;

-   nvmeq = dev->queues[0];
+   nvmeq = &dev->queues[0];
if (!nvmeq) {
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
dev_to_node(dev->dev));
@@ -1638,7 +1632,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)

max = min(dev->max_qid, dev->ctrl.queue_count - 1);
for (i = dev->online_queues; i <= max; i++) {
-   ret = nvme_create_queue(dev->queues[i], i);
+   ret = nvme_create_queue(&dev->queues[i], i);
if (ret)
break;
}
@@ -1894,7 +1888,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)

 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
-   struct nvme_queue *adminq = dev->queues[0];
+   struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, 

[PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-13 Thread Ming Lei
Setting 'nvmeq' in nvme_init_request() may cause a race, because
.init_request is called from the io scheduler switch path, which may run
while the NVMe device is being reset and its nvme queues are being freed
and recreated. We don't have any synchronization between the two paths.

This patch removes the .init_request callback and sets the nvmeq at
runtime via hctx->driver_data, which is reliable because .init_hctx is
always called after the hw queue is resized in the reset handler. This
fixes the following bug:

[   93.268391] kernel BUG at drivers/nvme/host/pci.c:408!
[   93.274146] invalid opcode:  [#1] SMP
[   93.278618] Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss
nfsv4 dns_resolver nfs lockd grace fscache sunrpc ipmi_ssif vfat fat
intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel
kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel iTCO_wdt
intel_cstate ipmi_si iTCO_vendor_support intel_uncore mxm_wmi mei_me
ipmi_devintf intel_rapl_perf pcspkr sg ipmi_msghandler lpc_ich dcdbas mei
shpchp acpi_power_meter wmi dm_multipath ip_tables xfs libcrc32c sd_mod
mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt
fb_sys_fops ttm drm ahci libahci nvme libata crc32c_intel nvme_core tg3
megaraid_sas ptp i2c_core pps_core dm_mirror dm_region_hash dm_log dm_mod
[   93.349071] CPU: 5 PID: 1842 Comm: sh Not tainted 4.15.0-rc2.ming+ #4
[   93.356256] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.5.5 
08/16/2017
[   93.364801] task: fb8abf2a task.stack: 28bd82d1
[   93.371408] RIP: 0010:nvme_init_request+0x36/0x40 [nvme]
[   93.377333] RSP: 0018:c90002537ca8 EFLAGS: 00010246
[   93.383161] RAX:  RBX:  RCX: 0008
[   93.391122] RDX:  RSI: 880276ae RDI: 88047bae9008
[   93.399084] RBP: 88047bae9008 R08: 88047bae9008 R09: 09dabc00
[   93.407045] R10: 0004 R11: 299c R12: 880186bc1f00
[   93.415007] R13: 880276ae R14:  R15: 0071
[   93.422969] FS:  7f33cf288740() GS:88047ba8() 
knlGS:
[   93.431996] CS:  0010 DS:  ES:  CR0: 80050033
[   93.438407] CR2: 7f33cf28e000 CR3: 00047e5bb006 CR4: 001606e0
[   93.446368] Call Trace:
[   93.449103]  blk_mq_alloc_rqs+0x231/0x2a0
[   93.453579]  blk_mq_sched_alloc_tags.isra.8+0x42/0x80
[   93.459214]  blk_mq_init_sched+0x7e/0x140
[   93.463687]  elevator_switch+0x5a/0x1f0
[   93.467966]  ? elevator_get.isra.17+0x52/0xc0
[   93.472826]  elv_iosched_store+0xde/0x150
[   93.477299]  queue_attr_store+0x4e/0x90
[   93.481580]  kernfs_fop_write+0xfa/0x180
[   93.485958]  __vfs_write+0x33/0x170
[   93.489851]  ? __inode_security_revalidate+0x4c/0x60
[   93.495390]  ? selinux_file_permission+0xda/0x130
[   93.500641]  ? _cond_resched+0x15/0x30
[   93.504815]  vfs_write+0xad/0x1a0
[   93.508512]  SyS_write+0x52/0xc0
[   93.512113]  do_syscall_64+0x61/0x1a0
[   93.516199]  entry_SYSCALL64_slow_path+0x25/0x25
[   93.521351] RIP: 0033:0x7f33ce96aab0
[   93.525337] RSP: 002b:7ffe57570238 EFLAGS: 0246 ORIG_RAX: 
0001
[   93.533785] RAX: ffda RBX: 0006 RCX: 7f33ce96aab0
[   93.541746] RDX: 0006 RSI: 7f33cf28e000 RDI: 0001
[   93.549707] RBP: 7f33cf28e000 R08: 000a R09: 7f33cf288740
[   93.557669] R10: 7f33cf288740 R11: 0246 R12: 7f33cec42400
[   93.565630] R13: 0006 R14: 0001 R15: 
[   93.573592] Code: 4c 8d 40 08 4c 39 c7 74 16 48 8b 00 48 8b 04 08 48 85 c0
74 16 48 89 86 78 01 00 00 31 c0 c3 8d 4a 01 48 63 c9 48 c1 e1 03 eb de <0f>
0b 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 85 f6 53 48 89
[   93.594676] RIP: nvme_init_request+0x36/0x40 [nvme] RSP: c90002537ca8
[   93.602273] ---[ end trace 810dde3993e5f14e ]---

Reported-by: Yi Zhang 
Signed-off-by: Ming Lei 
---
 drivers/nvme/host/pci.c | 21 -
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f5800c3c9082..9eff33b21ca5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -397,19 +397,6 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void 
*data,
return 0;
 }
 
-static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
-   unsigned int hctx_idx, unsigned int numa_node)
-{
-   struct nvme_dev *dev = set->driver_data;
-   struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-   int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
-   struct nvme_queue *nvmeq = dev->queues[queue_idx];
-
-   BUG_ON(!nvmeq);
-   iod->nvmeq = nvmeq;
-   return 0;
-}
-
 static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 {
struct nvme_dev *dev = set->driver_data;
@@ -448,7 +435,8
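
The patch continues beyond this excerpt; as the changelog describes, the
queue is picked up from hctx->driver_data at runtime instead of being
cached by .init_request. A minimal sketch of that idea (illustrative, not
the literal remainder of the patch):

	static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
					  const struct blk_mq_queue_data *bd)
	{
		/* hctx->driver_data is set to the nvme_queue in .init_hctx */
		struct nvme_queue *nvmeq = hctx->driver_data;
		struct nvme_iod *iod = blk_mq_rq_to_pdu(bd->rq);

		/* previously assigned in the removed .init_request callback */
		iod->nvmeq = nvmeq;

		/* ... command setup and submission continue as before ... */
		return BLK_STS_OK;
	}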