Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Bruno Wolff III

On Thu, Dec 21, 2017 at 17:16:03 -0600,
 Bruno Wolff III  wrote:


Enforcing mode alone isn't enough, as I tested that on one machine at 
home and it didn't trigger the problem. I'll try another machine late 
tonight.


I got the problem to occur on my i686 machine when booting in enforcing 
mode. This machine uses RAID 1 via mdraid, which may or may not be a 
factor in this problem. The boot log has a trace at the end and might be 
helpful, so I'm attaching it here.
-- Logs begin at Sun 2017-09-24 07:43:45 CDT, end at Thu 2017-12-21 22:46:47 CST. --
Dec 21 21:36:32 wolff.to kernel: Linux version 4.15.0-0.rc4.git1.2.fc28.i686 (mockbu...@buildvm-15.phx2.fedoraproject.org) (gcc version 7.2.1 20170915 (Red Hat 7.2.1-4) (GCC)) #1 SMP Tue Dec 19 17:26:41 UTC 2017
Dec 21 21:36:32 wolff.to kernel: x86/fpu: x87 FPU will use FXSAVE
Dec 21 21:36:32 wolff.to kernel: e820: BIOS-provided physical RAM map:
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0x-0x0009cbff] usable
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0x0009cc00-0x0009] reserved
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0x000e-0x000f] reserved
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0x0010-0xbfee] usable
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xbfef-0xbfefbfff] ACPI data
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xbfefc000-0xbfef] ACPI NVS
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xbff0-0xbff7] usable
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xbff8-0xbfff] reserved
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xfec0-0xfec0] reserved
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xfee0-0xfee00fff] reserved
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xff80-0xffbf] reserved
Dec 21 21:36:32 wolff.to kernel: BIOS-e820: [mem 0xfff0-0x] reserved
Dec 21 21:36:32 wolff.to kernel: Notice: NX (Execute Disable) protection missing in CPU!
Dec 21 21:36:32 wolff.to kernel: random: fast init done
Dec 21 21:36:32 wolff.to kernel: SMBIOS 2.32 present.
Dec 21 21:36:32 wolff.to kernel: DMI: Hewlett-Packard hp workstation xw8000/0844, BIOS JQ.W1.19US  04/13/05
Dec 21 21:36:32 wolff.to kernel: e820: update [mem 0x-0x0fff] usable ==> reserved
Dec 21 21:36:32 wolff.to kernel: e820: remove [mem 0x000a-0x000f] usable
Dec 21 21:36:32 wolff.to kernel: e820: last_pfn = 0xbff80 max_arch_pfn = 0x10
Dec 21 21:36:32 wolff.to kernel: MTRR default type: uncachable
Dec 21 21:36:32 wolff.to kernel: MTRR fixed ranges enabled:
Dec 21 21:36:32 wolff.to kernel:   0-9 write-back
Dec 21 21:36:32 wolff.to kernel:   A-B uncachable
Dec 21 21:36:32 wolff.to kernel:   C-F write-protect
Dec 21 21:36:32 wolff.to kernel: MTRR variable ranges enabled:
Dec 21 21:36:32 wolff.to kernel:   0 base 0 mask F8000 write-back
Dec 21 21:36:32 wolff.to kernel:   1 base 08000 mask FC000 write-back
Dec 21 21:36:32 wolff.to kernel:   2 disabled
Dec 21 21:36:32 wolff.to kernel:   3 disabled
Dec 21 21:36:32 wolff.to kernel:   4 disabled
Dec 21 21:36:32 wolff.to kernel:   5 disabled
Dec 21 21:36:32 wolff.to kernel:   6 disabled
Dec 21 21:36:32 wolff.to kernel:   7 disabled
Dec 21 21:36:32 wolff.to kernel: x86/PAT: Configuration [0-7]: WB  WC  UC- UC  WB  WC  UC- UC
Dec 21 21:36:32 wolff.to kernel: found SMP MP-table at [mem 0x000f63a0-0x000f63af] mapped at [(ptrval)]
Dec 21 21:36:32 wolff.to kernel: initial memory mapped: [mem 0x-0x0a7f]
Dec 21 21:36:32 wolff.to kernel: Base memory trampoline at [(ptrval)] 98000 size 16384
Dec 21 21:36:32 wolff.to kernel: BRK [0x0a53f000, 0x0a53] PGTABLE
Dec 21 21:36:32 wolff.to kernel: BRK [0x0a54, 0x0a541fff] PGTABLE
Dec 21 21:36:32 wolff.to kernel: BRK [0x0a542000, 0x0a542fff] PGTABLE
Dec 21 21:36:32 wolff.to kernel: RAMDISK: [mem 0x36732000-0x37fe]
Dec 21 21:36:32 wolff.to kernel: Allocated new RAMDISK: [mem 0x34e74000-0x367318b1]
Dec 21 21:36:32 wolff.to kernel: Move RAMDISK from [mem 0x36732000-0x37fef8b1] to [mem 0x34e74000-0x367318b1]
Dec 21 21:36:32 wolff.to kernel: ACPI: Early table checksum verification disabled
Dec 21 21:36:32 wolff.to kernel: ACPI: RSDP 0x000F6370 14 (v00 PTLTD )
Dec 21 21:36:32 wolff.to kernel: ACPI: RSDT 0xBFEF8D1D 34 (v01 PTLTDRSDT   0604  LTP )
Dec 21 21:36:32 wolff.to kernel: ACPI: FACP 0xBFEFBDB5 74 (v01 INTEL  PLACER   0604 PTL  0008)
Dec 21 21:36:32 wolff.to kernel: ACPI: DSDT 0xBFEF8D51 003064 (v01 hp silvertn 0604 MSFT 010E)
Dec 21 21:36:32 wolff.to kernel: ACPI: FACS 0xBFEFCFC0 40
Dec 21 21:36:32 wolff.to kernel: ACPI: _HP_ 0xBFEFBE29 000113 (v01 HPINVT HPINVENT

Re: [PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-21 Thread Ming Lei
Hi Sagi,

On Thu, Dec 21, 2017 at 10:20:45AM +0200, Sagi Grimberg wrote:
> Ming,
> 
> I'd prefer that we make the pci driver match
> the rest of the drivers in nvme.

OK, this way looks better.

> 
> IMO it would be better to allocate a queues array at probe time
> and simply reuse it at reset sequence.
> 
> Can this (untested) patch also fix the issue you're seeing:

Please prepare a formal one (at least tested in the normal case); either I
or Zhang Yi can test/verify it.

> --
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index f26aaa393016..a8edb29dac16 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool
> shutdown);
>   * Represents an NVM Express device.  Each nvme_dev is a PCI function.
>   */
>  struct nvme_dev {
> -   struct nvme_queue **queues;
> +   struct nvme_queue *queues;
> struct blk_mq_tag_set tagset;
> struct blk_mq_tag_set admin_tagset;
> u32 __iomem *dbs;
> @@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx
> *hctx, void *data,
> unsigned int hctx_idx)
>  {
> struct nvme_dev *dev = data;
> -   struct nvme_queue *nvmeq = dev->queues[0];
> +   struct nvme_queue *nvmeq = &dev->queues[0];
> 
> WARN_ON(hctx_idx != 0);
> WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
> @@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx,
> void *data,
>   unsigned int hctx_idx)
>  {
> struct nvme_dev *dev = data;
> -   struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
> +   struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
> 
> if (!nvmeq->tags)
> nvmeq->tags = &dev->tagset.tags[hctx_idx];
> @@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set,
> struct request *req,
> struct nvme_dev *dev = set->driver_data;
> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
> -   struct nvme_queue *nvmeq = dev->queues[queue_idx];
> +   struct nvme_queue *nvmeq = &dev->queues[queue_idx];
> 
> BUG_ON(!nvmeq);
> iod->nvmeq = nvmeq;
> @@ -1046,7 +1046,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx,
> unsigned int tag)
>  static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
>  {
> struct nvme_dev *dev = to_nvme_dev(ctrl);
> -   struct nvme_queue *nvmeq = dev->queues[0];
> +   struct nvme_queue *nvmeq = &dev->queues[0];
> struct nvme_command c;
> 
> memset(&c, 0, sizeof(c));
> @@ -1290,10 +1290,8 @@ static void nvme_free_queues(struct nvme_dev *dev,
> int lowest)
> int i;
> 
> for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
> -   struct nvme_queue *nvmeq = dev->queues[i];
> dev->ctrl.queue_count--;
> -   dev->queues[i] = NULL;
> -   nvme_free_queue(nvmeq);
> +   nvme_free_queue(&dev->queues[i]);
> }
>  }
> 
> @@ -1325,7 +1323,7 @@ static int nvme_suspend_queue(struct nvme_queue
> *nvmeq)
> 
>  static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
>  {
> -   struct nvme_queue *nvmeq = dev->queues[0];
> +   struct nvme_queue *nvmeq = &dev->queues[0];
> 
> if (!nvmeq)
> return;
> @@ -1387,10 +1385,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev,
> struct nvme_queue *nvmeq,
>  static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
> int depth, int node)
>  {
> -   struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
> -   node);
> -   if (!nvmeq)
> -   return NULL;
> +   struct nvme_queue *nvmeq = &dev->queues[qid];

Maybe you need to zero *nvmeq again, since that is done in the current code.
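For illustration, one way to keep the old zeroing behavior in the embedded-array
scheme might be (a sketch only; the placement inside nvme_alloc_queue() is our
assumption, not part of the posted patch):

	struct nvme_queue *nvmeq = &dev->queues[qid];

	/* The embedded entry may hold stale state from a previous reset;
	 * kzalloc_node() used to hand back zeroed memory, so clear it
	 * explicitly before reinitializing the queue. */
	memset(nvmeq, 0, sizeof(*nvmeq));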

> 
> nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
>   &nvmeq->cq_dma_addr, GFP_KERNEL);
> @@ -1409,7 +1404,6 @@ static struct nvme_queue *nvme_alloc_queue(struct
> nvme_dev *dev, int qid,
> nvmeq->q_depth = depth;
> nvmeq->qid = qid;
> nvmeq->cq_vector = -1;
> -   dev->queues[qid] = nvmeq;
> dev->ctrl.queue_count++;
> 
> return nvmeq;
> @@ -1592,7 +1586,7 @@ static int nvme_pci_configure_admin_queue(struct
> nvme_dev *dev)
> if (result < 0)
> return result;
> 
> -   nvmeq = dev->queues[0];
> +   nvmeq = &dev->queues[0];
> if (!nvmeq) {
> nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
> dev_to_node(dev->dev));
> @@ -1638,7 +1632,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
> 
> max = min(dev->max_qid, dev->ctrl.queue_count - 1);
> for (i = dev->online_queues; i <= max; 

Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Bruno Wolff III

On Thu, Dec 21, 2017 at 12:15:31 -0600,
 Bruno Wolff III  wrote:


One important thing I have just found is that it looks like the 
problem only happens when booting in enforcing mode. If I boot in 
permissive mode it does not happen. My home machines are currently set 
to boot in permissive mode and I'll test this evening to see if I can 
reproduce the problem if I change them to enforcing mode. If so I'll 
be able to do lots of testing during my vacation.


Enforcing mode alone isn't enough, as I tested that on one machine at 
home and it didn't trigger the problem. I'll try another machine late 
tonight.


Re: [PATCH 3/3] block: Polling completion performance optimization

2017-12-21 Thread Keith Busch
On Thu, Dec 21, 2017 at 03:17:41PM -0700, Jens Axboe wrote:
> On 12/21/17 2:34 PM, Keith Busch wrote:
> > It would be nice, but the driver doesn't know a request's completion
> > is going to be polled. 
> 
> That's trivially solvable though, since the information is available
> at submission time.
> 
> > Even if it did, we don't have a spec-defined
> > way to tell the controller not to send an interrupt with this command's
> > completion, which would be negated anyway if any interrupt-driven IO
> > is mixed in the same queue. We could possibly create a special queue
> > with interrupts disabled for this purpose if we can pass the HIPRI hint
> > through the request.
> 
> There's no way to do it per IO, right. But you can create a sq/cq pair
> without interrupts enabled. This would also allow you to scale better
> with multiple users of polling, a case where we currently don't
> perform as well as spdk, for instance.

Would you be open to having blk-mq provide special hi-pri hardware contexts
for all these requests to come through? Maybe one per NUMA node? If not,
I don't think we have enough unused bits in the NVMe command id to stash
the hctx id to extract the original request.


Re: [PATCH 3/3] block: Polling completion performance optimization

2017-12-21 Thread Jens Axboe
On 12/21/17 2:34 PM, Keith Busch wrote:
> On Thu, Dec 21, 2017 at 02:00:04PM -0700, Jens Axboe wrote:
>> On 12/21/17 1:56 PM, Scott Bauer wrote:
>>> On 12/21/2017 01:46 PM, Keith Busch wrote:
>>>> @@ -181,7 +181,10 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>>>>    struct task_struct *waiter = bio->bi_private;
>>>>  
>>>>    WRITE_ONCE(bio->bi_private, NULL);
>>>> -  wake_up_process(waiter);
>>>> +  if (current != waiter)
>>>> +  wake_up_process(waiter);
>>>> +  else
>>>> +  __set_current_state(TASK_RUNNING);
>>>
>>> Do we actually need to set this to TASK_RUNNING? If we get here we're 
>>> already running, right?
>>>
>>> Everywhere I see uses of __set_current_state(TASK_RUNNING) it's after we've 
>>> done a set_current_state(TASK_INTERRUPTIBLE).
>>
>> We'd only be TASK_RUNNING if the IRQ got to it first. And that's something 
>> that
>> should be removed as well - I suspect that'd be a much bigger win, getting 
>> rid
>> of the IRQ trigger for polled IO, than most of the micro optimizations. For
>> Keith's testing, looks like he reduced the cost by turning on coalescing, but
>> it'd be cheaper (and better) to not have to rely on that.
> 
> It would be nice, but the driver doesn't know a request's completion
> is going to be polled. 

That's trivially solvable though, since the information is available
at submission time.

> Even if it did, we don't have a spec-defined
> way to tell the controller not to send an interrupt with this command's
> completion, which would be negated anyway if any interrupt-driven IO
> is mixed in the same queue. We could possibly create a special queue
> with interrupts disabled for this purpose if we can pass the HIPRI hint
> through the request.

There's no way to do it per IO, right. But you can create a sq/cq pair
without interrupts enabled. This would also allow you to scale better
with multiple users of polling, a case where we currently don't
perform as well as spdk, for instance.

-- 
Jens Axboe



Re: [PATCH 3/3] block: Polling completion performance optimization

2017-12-21 Thread Keith Busch
On Thu, Dec 21, 2017 at 02:00:04PM -0700, Jens Axboe wrote:
> On 12/21/17 1:56 PM, Scott Bauer wrote:
> > On 12/21/2017 01:46 PM, Keith Busch wrote:
> >> @@ -181,7 +181,10 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
> >>struct task_struct *waiter = bio->bi_private;
> >>  
> >>WRITE_ONCE(bio->bi_private, NULL);
> >> -  wake_up_process(waiter);
> >> +  if (current != waiter)
> >> +  wake_up_process(waiter);
> >> +  else
> >> +  __set_current_state(TASK_RUNNING);
> > 
> > Do we actually need to set this to TASK_RUNNING? If we get here we're 
> > already running, right?
> > 
> > Everywhere I see uses of __set_current_state(TASK_RUNNING) it's after we've 
> > done a set_current_state(TASK_INTERRUPTIBLE).
> 
> We'd only be TASK_RUNNING if the IRQ got to it first. And that's something 
> that
> should be removed as well - I suspect that'd be a much bigger win, getting rid
> of the IRQ trigger for polled IO, than most of the micro optimizations. For
> Keith's testing, looks like he reduced the cost by turning on coalescing, but
> it'd be cheaper (and better) to not have to rely on that.

It would be nice, but the driver doesn't know a request's completion
is going to be polled. Even if it did, we don't have a spec-defined
way to tell the controller not to send an interrupt with this command's
completion, which would be negated anyway if any interrupt-driven IO
is mixed in the same queue. We could possibly create a special queue
with interrupts disabled for this purpose if we can pass the HIPRI hint
through the request.


Re: [PATCH 3/3] block: Polling completion performance optimization

2017-12-21 Thread Scott Bauer


On 12/21/2017 01:46 PM, Keith Busch wrote:
> When a request completion is polled, the completion task wakes itself
> up. This is unnecessary, as the task can just set itself back to
> running.
> 
> Signed-off-by: Keith Busch 
> ---
>  fs/block_dev.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 4a181fcb5175..ce67ffe73430 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -181,7 +181,10 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>   struct task_struct *waiter = bio->bi_private;
>  
>   WRITE_ONCE(bio->bi_private, NULL);
> - wake_up_process(waiter);
> + if (current != waiter)
> + wake_up_process(waiter);
> + else
> + __set_current_state(TASK_RUNNING);

Do we actually need to set this to TASK_RUNNING? If we get here we're already 
running, right?

Everywhere I see uses of __set_current_state(TASK_RUNNING) it's after we've 
done a set_current_state(TASK_INTERRUPTIBLE).



Re: [PATCH 1/3] nvme/pci: Start request after doorbell ring

2017-12-21 Thread Jens Axboe
On 12/21/17 2:02 PM, Keith Busch wrote:
> On Thu, Dec 21, 2017 at 01:53:44PM -0700, Jens Axboe wrote:
>> Turns out that wasn't what patch 2 was. And the code is right there
>> above as well, and under the q_lock, so I guess that race doesn't
>> exist.
>>
>> But that does bring up the fact if we should always be doing the
>> nvme_process_cq(nvmeq) after IO submission. For direct/hipri IO,
>> maybe it's better to make the submission path faster and skip it?
> 
> Yes, I am okay to remove the opportunistic nvme_process_cq in the
> submission path. Even under deeply queued IO, I've not seen this provide
> any measurable benefit.

I haven't either, but I'm curious if others have. It's mostly just extra
overhead; I haven't seen it provide a latency reduction of any kind.

-- 
Jens Axboe



Re: [PATCH 3/3] block: Polling completion performance optimization

2017-12-21 Thread Jens Axboe
On 12/21/17 1:56 PM, Scott Bauer wrote:
> 
> 
> On 12/21/2017 01:46 PM, Keith Busch wrote:
>> When a request completion is polled, the completion task wakes itself
>> up. This is unnecessary, as the task can just set itself back to
>> running.
>>
>> Signed-off-by: Keith Busch 
>> ---
>>  fs/block_dev.c | 5 -
>>  1 file changed, 4 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 4a181fcb5175..ce67ffe73430 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -181,7 +181,10 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
>>  struct task_struct *waiter = bio->bi_private;
>>  
>>  WRITE_ONCE(bio->bi_private, NULL);
>> -wake_up_process(waiter);
>> +if (current != waiter)
>> +wake_up_process(waiter);
>> +else
>> +__set_current_state(TASK_RUNNING);
> 
> Do we actually need to set this to TASK_RUNNING? If we get here we're already 
> running, right?
> 
> Everywhere I see uses of __set_current_state(TASK_RUNNING) it's after we've 
> done a set_current_state(TASK_INTERRUPTIBLE).

We'd only be TASK_RUNNING if the IRQ got to it first. And that's something that
should be removed as well - I suspect that'd be a much bigger win, getting rid
of the IRQ trigger for polled IO, than most of the micro optimizations. For
Keith's testing, looks like he reduced the cost by turning on coalescing, but
it'd be cheaper (and better) to not have to rely on that.

In any case, the __set_current_state() is very cheap. Given that and the
above, I'd rather keep that.

-- 
Jens Axboe



Re: [PATCH 1/3] nvme/pci: Start request after doorbell ring

2017-12-21 Thread Keith Busch
On Thu, Dec 21, 2017 at 01:53:44PM -0700, Jens Axboe wrote:
> Turns out that wasn't what patch 2 was. And the code is right there
> above as well, and under the q_lock, so I guess that race doesn't
> exist.
> 
> But that does bring up the fact if we should always be doing the
> nvme_process_cq(nvmeq) after IO submission. For direct/hipri IO,
> maybe it's better to make the submission path faster and skip it?

Yes, I am okay to remove the opportunistic nvme_process_cq in the
submission path. Even under deeply queued IO, I've not seen this provide
any measurable benefit.


Re: [PATCH 2/3] nvme/pci: Remove cq_vector check in IO path

2017-12-21 Thread Jens Axboe
On 12/21/17 1:46 PM, Keith Busch wrote:
> This is a micro-optimization removing an unnecessary check for a disabled
> queue. We no longer need this check because blk-mq provides the ability
> to quiesce queues that nvme uses, and the doorbell registers are never
> unmapped as long as requests are active.

Reviewed-by: Jens Axboe 

-- 
Jens Axboe



Re: [PATCH 1/3] nvme/pci: Start request after doorbell ring

2017-12-21 Thread Jens Axboe
On 12/21/17 1:49 PM, Jens Axboe wrote:
> On 12/21/17 1:46 PM, Keith Busch wrote:
>> This is a performance optimization that allows the hardware to work on
>> a command in parallel with the kernel's stats and timeout tracking.
>>
>> Signed-off-by: Keith Busch 
>> ---
>>  drivers/nvme/host/pci.c | 3 +--
>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index f5800c3c9082..df5550ce0531 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -886,8 +886,6 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  goto out_cleanup_iod;
>>  }
>>  
>> -blk_mq_start_request(req);
>> -
>>  spin_lock_irq(&nvmeq->q_lock);
>>  if (unlikely(nvmeq->cq_vector < 0)) {
>>  ret = BLK_STS_IOERR;
>> @@ -895,6 +893,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  goto out_cleanup_iod;
>>  }
>>  __nvme_submit_cmd(nvmeq, &cmnd);
>> +blk_mq_start_request(req);
>>  nvme_process_cq(nvmeq);
>>  spin_unlock_irq(&nvmeq->q_lock);
>>  return BLK_STS_OK;
> 
> I guess this will work since we hold the q_lock, but you should probably
> include a comment to that effect since it's important that we can't find
> the completion before we've marked the request started.
> 
> Actually, you'd need to reorder this after #2 (I'm assuming, it hasn't
> shown up yet, but I'm guessing it's kill the cq check after submission)
> since if we have multiple CPUs on the same hardware queue, one of
> the other CPUs could find a completion event between your
> __nvme_submit_cmd() and blk_mq_start_request() call. Very short window,
> but it does exist.

Turns out that wasn't what patch 2 was. And the code is right there
above as well, and under the q_lock, so I guess that race doesn't
exist.

But that does bring up the fact if we should always be doing the
nvme_process_cq(nvmeq) after IO submission. For direct/hipri IO,
maybe it's better to make the submission path faster and skip it?

-- 
Jens Axboe



Re: [PATCH 1/3] nvme/pci: Start request after doorbell ring

2017-12-21 Thread Jens Axboe
On 12/21/17 1:46 PM, Keith Busch wrote:
> This is a performance optimization that allows the hardware to work on
> a command in parallel with the kernel's stats and timeout tracking.
> 
> Signed-off-by: Keith Busch 
> ---
>  drivers/nvme/host/pci.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index f5800c3c9082..df5550ce0531 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -886,8 +886,6 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>   goto out_cleanup_iod;
>   }
>  
> - blk_mq_start_request(req);
> -
>   spin_lock_irq(&nvmeq->q_lock);
>   if (unlikely(nvmeq->cq_vector < 0)) {
>   ret = BLK_STS_IOERR;
> @@ -895,6 +893,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>   goto out_cleanup_iod;
>   }
>   __nvme_submit_cmd(nvmeq, &cmnd);
> + blk_mq_start_request(req);
>   nvme_process_cq(nvmeq);
>   spin_unlock_irq(&nvmeq->q_lock);
>   return BLK_STS_OK;

I guess this will work since we hold the q_lock, but you should probably
include a comment to that effect since it's important that we can't find
the completion before we've marked the request started.

Actually, you'd need to reorder this after #2 (I'm assuming, it hasn't
shown up yet, but I'm guessing it's kill the cq check after submission)
since if we have multiple CPUs on the same hardware queue, one of
the other CPUs could find a completion event between your
__nvme_submit_cmd() and blk_mq_start_request() call. Very short window,
but it does exist.
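For instance, the comment could read something like this (a sketch of the
wording only, not taken from the thread):

	__nvme_submit_cmd(nvmeq, &cmnd);
	/*
	 * Submission and completion processing both run under q_lock, so
	 * nvme_process_cq() cannot see this command's completion before
	 * blk_mq_start_request() below has marked the request started.
	 */
	blk_mq_start_request(req);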

-- 
Jens Axboe



[PATCH 0/3] Performance enhancements

2017-12-21 Thread Keith Busch
A few IO micro-optimizations for IO polling and NVMe. I'm really working
to close the performance gap with userspace drivers, and this gets me
halfway there on latency. On the fastest hardware I could get, this series
brings measured roundtrip read latency from 5.7usec down to 5usec.

Note with NVMe, you really need to crank up the interrupt coalescing to
see the completion polling benefit.

Test pre-setup:

  echo performance | tee /sys/devices/system/cpu/cpufreq/policy*/scaling_governor
  echo 0 > /sys/block/nvme0n1/queue/iostats
  echo -1 > /sys/block/nvme0n1/queue/io_poll_delay
  nvme set-feature /dev/nvme0 -f 8 -v 0x4ff
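  # (Feature 8 is NVMe interrupt coalescing; as far as we read the spec,
  # the value 0x4ff requests an aggregation time of 4 (in 100 usec units)
  # and an aggregation threshold of 0xff completion entries.)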

fio profile:

  [global]
  ioengine=pvsync2
  rw=randread
  norandommap
  direct=1
  bs=4k
  hipri

  [hi-pri]
  filename=/dev/nvme0n1
  cpus_allowed=2

Keith Busch (3):
  nvme/pci: Start request after doorbell ring
  nvme/pci: Remove cq_vector check in IO path
  block: Polling completion performance optimization

 drivers/nvme/host/pci.c | 14 +++---
 fs/block_dev.c  |  5 -
 2 files changed, 7 insertions(+), 12 deletions(-)

-- 
2.13.6



[PATCH 2/3] nvme/pci: Remove cq_vector check in IO path

2017-12-21 Thread Keith Busch
This is a micro-optimization removing an unnecessary check for a disabled
queue. We no longer need this check because blk-mq provides the ability
to quiesce queues that nvme uses, and the doorbell registers are never
unmapped as long as requests are active.

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index df5550ce0531..0be5124a3a49 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -887,11 +887,6 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
}
 
spin_lock_irq(&nvmeq->q_lock);
-   if (unlikely(nvmeq->cq_vector < 0)) {
-   ret = BLK_STS_IOERR;
-   spin_unlock_irq(&nvmeq->q_lock);
-   goto out_cleanup_iod;
-   }
__nvme_submit_cmd(nvmeq, &cmnd);
blk_mq_start_request(req);
nvme_process_cq(nvmeq);
@@ -923,11 +918,9 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 {
u16 head = nvmeq->cq_head;
 
-   if (likely(nvmeq->cq_vector >= 0)) {
-   if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+   if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
  nvmeq->dbbuf_cq_ei))
-   writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
-   }
+   writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
 }
 
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
-- 
2.13.6



[PATCH 1/3] nvme/pci: Start request after doorbell ring

2017-12-21 Thread Keith Busch
This is a performance optimization that allows the hardware to work on
a command in parallel with the kernel's stats and timeout tracking.

Signed-off-by: Keith Busch 
---
 drivers/nvme/host/pci.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f5800c3c9082..df5550ce0531 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -886,8 +886,6 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto out_cleanup_iod;
}
 
-   blk_mq_start_request(req);
-
spin_lock_irq(&nvmeq->q_lock);
if (unlikely(nvmeq->cq_vector < 0)) {
ret = BLK_STS_IOERR;
@@ -895,6 +893,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto out_cleanup_iod;
}
__nvme_submit_cmd(nvmeq, &cmnd);
+   blk_mq_start_request(req);
nvme_process_cq(nvmeq);
spin_unlock_irq(&nvmeq->q_lock);
return BLK_STS_OK;
-- 
2.13.6



[PATCH 3/3] block: Polling completion performance optimization

2017-12-21 Thread Keith Busch
When a request completion is polled, the completion task wakes itself
up. This is unnecessary, as the task can just set itself back to
running.

Signed-off-by: Keith Busch 
---
 fs/block_dev.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4a181fcb5175..ce67ffe73430 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -181,7 +181,10 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
struct task_struct *waiter = bio->bi_private;
 
WRITE_ONCE(bio->bi_private, NULL);
-   wake_up_process(waiter);
+   if (current != waiter)
+   wake_up_process(waiter);
+   else
+   __set_current_state(TASK_RUNNING);
 }
 
 static ssize_t
-- 
2.13.6



Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Bruno Wolff III

On Thu, Dec 21, 2017 at 10:02:15 -0700,
 Jens Axboe  wrote:

On 12/21/17 9:42 AM, Bruno Wolff III wrote:

On Thu, Dec 21, 2017 at 23:48:19 +0800,
  weiping zhang  wrote:

output you want. I never saw it for any kernels I compiled myself. Only when
I test kernels built by Fedora do I see it.

Do you see it every boot?


I don't look every boot. The warning gets scrolled off the screen. Once I see
the CPU hang warnings I know the boot is failing. I don't always look
at journalctl later to see what's there.


I'm going to revert a0747a859ef6 for now, since we're now 8 days into this
and no progress has been made on fixing it.


One important thing I have just found is that it looks like the problem 
only happens when booting in enforcing mode. If I boot in permissive 
mode it does not happen. My home machines are currently set to boot in 
permissive mode and I'll test this evening to see if I can reproduce the 
problem if I change them to enforcing mode. If so I'll be able to do lots 
of testing during my vacation.
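For reference (general SELinux mechanics, not something stated in this thread):
on Fedora the mode is usually set via SELINUX=enforcing or SELINUX=permissive
in /etc/selinux/config, and a single boot can be forced permissive from the
GRUB kernel command line:

	# append to the kernel arguments for one boot only
	enforcing=0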


[GIT PULL] Block fixes for 4.15-rc

2017-12-21 Thread Jens Axboe
Hi Linus,

It's been a few weeks, so here's a small collection of fixes that
should go into the current series.

This pull request contains:

- NVMe pull request from Christoph, with a few important fixes.

- kyber hang fix from Omar.

- A blk-throttle fix from Shaohua, fixing a case where we double charge
  a bio.

- Two call_single_data alignment fixes from me, fixing up some unfortunate
  changes that went into 4.14 without being properly reviewed on the
  block side (since nobody was CC'ed on the patch...).

- A bounce buffer fix in two parts, one from me and one from Ming.

- Revert bdi debug error handling patch. It's causing boot issues for some
  folks, and a week down the line, we're still no closer to a fix. Revert
  this patch for now until it's figured out, then we can retry for 4.16.

Please pull!


  git://git.kernel.dk/linux-block.git for-linus



David Disseldorp (1):
  nvme: set discard_alignment to zero

James Smart (1):
  nvme-fc: remove double put reference if admin connect fails

Jens Axboe (5):
  Merge branch 'nvme-4.15' of git://git.infradead.org/nvme into for-linus
  block: fix blk_rq_append_bio
  block: unalign call_single_data in struct request
  null_blk: unalign call_single_data
  Revert "bdi: add error handle for bdi_debug_register"

Keith Busch (2):
  nvme: check hw sectors before setting chunk sectors
  nvme: setup streams after initializing namespace head

Ming Lei (2):
  nvme: call blk_integrity_unregister after queue is cleaned up
  block: don't let passthrough IO go into .make_request_fn()

Omar Sandoval (1):
  kyber: fix another domain token wait queue hang

Shaohua Li (1):
  block-throttle: avoid double charge

 block/bio.c|  2 ++
 block/blk-map.c| 38 ++
 block/blk-throttle.c   |  8 +---
 block/bounce.c |  6 --
 block/kyber-iosched.c  | 37 -
 drivers/block/null_blk.c   |  4 ++--
 drivers/nvme/host/core.c   | 11 ++-
 drivers/nvme/host/fc.c |  1 -
 drivers/scsi/osd/osd_initiator.c   |  4 +++-
 drivers/target/target_core_pscsi.c |  4 ++--
 include/linux/bio.h|  2 ++
 include/linux/blk_types.h  |  9 -
 include/linux/blkdev.h | 25 +
 mm/backing-dev.c   |  5 +
 14 files changed, 94 insertions(+), 62 deletions(-)

-- 
Jens Axboe



Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Jens Axboe
On 12/21/17 9:42 AM, Bruno Wolff III wrote:
> On Thu, Dec 21, 2017 at 23:48:19 +0800,
>   weiping zhang  wrote:
>>> output you want. I never saw it for any kernels I compiled myself. Only when
>>> I test kernels built by Fedora do I see it.
>> Do you see it every boot?
> 
> I don't look every boot. The warning gets scrolled off the screen. Once I see 
> the CPU hang warnings I know the boot is failing. I don't always look 
> at journalctl later to see what's there.

I'm going to revert a0747a859ef6 for now, since we're now 8 days into this
and no progress has been made on fixing it.

-- 
Jens Axboe



Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Bruno Wolff III

On Thu, Dec 21, 2017 at 23:48:19 +0800,
 weiping zhang  wrote:

output you want. I never saw it for any kernels I compiled myself. Only when
I test kernels built by Fedora do I see it.

Do you see it every boot?


I don't look every boot. The warning gets scrolled off the screen. Once I see 
the CPU hang warnings I know the boot is failing. I don't always look 
at journalctl later to see what's there.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread weiping zhang
2017-12-21 23:36 GMT+08:00 Bruno Wolff III :
> On Thu, Dec 21, 2017 at 23:31:40 +0800,
>  weiping zhang  wrote:
>>
>> Does every failed boot trigger the WARNING in device_add_disk?
>
>
> Not that I see. But the message could scroll off the screen. The boot gets
> far enough that systemd copies over dmesg output to permanent storage that I
> can see on my next successful boot. That's where I looked for the warning
> output you want. I never saw it for any kernels I compiled myself. Only when
> I test kernels built by Fedora do I see it.
Do you see it every boot?

> I just tried booting to single user and the boot still hangs.
>
> When I build the kernels, the compiler options are probably a bit different
> than when Fedora does. That might affect what happens during boot.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Bruno Wolff III

On Thu, Dec 21, 2017 at 23:31:40 +0800,
 weiping zhang  wrote:

Does every failed boot trigger the WARNING in device_add_disk?


Not that I see. But the message could scroll off the screen. The boot gets 
far enough that systemd copies over dmesg output to permanent storage that 
I can see on my next successful boot. That's where I looked for the warning 
output you want. I never saw it for any kernels I compiled myself. Only 
when I test kernels built by Fedora do I see it.


I just tried booting to single user and the boot still hangs.

When I build the kernels, the compiler options are probably a bit different 
than when Fedora does. That might affect what happens during boot.


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Bruno Wolff III

On Thu, Dec 21, 2017 at 22:01:33 +0800,
 weiping zhang  wrote:

Hi,
How do you do the bisect? Build every kernel commit one by one?
as you did before:
https://bugzilla.redhat.com/show_bug.cgi?id=1520982


I just did the one bisect using Linus' tree. After each build, I would do 
a test boot and see if the boot was normal or if I got errors and an 
eventual hang before boot.


Since then I have used git revert to revert just the problem commit from 
later kernels (such as v4.15-rc4) and when I do the system boots normally. 
And when I don't do the revert or just use stock Fedora kernels the problem 
occurs every time.
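For reference, the workflow described above boils down to roughly the
following (a sketch; only the commit id comes from this thread, the good/bad
endpoints are assumptions):

	# bisect between a known-good and a known-bad kernel
	git bisect start
	git bisect bad v4.15-rc4
	git bisect good v4.14
	# build and boot-test each step, then mark it:
	git bisect good    # or: git bisect bad
	git bisect reset

	# afterwards, test newer kernels with just the suspect commit removed
	git revert a0747a859ef6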


I also did a couple of tests with Josh Boyer's Fedora kernel tree that 
has Fedora patches on top of the development kernel.



What kernel source code did you use when the warning in device_add_disk occurred?
Was it from Fedora or an official release? If so, could you provide a web link?


That was from an official Fedora kernel. I believe I got it from the 
nodebug repo, but that kernel should be the same as the one that was 
normally used for rawhide. It is at 
https://koji.fedoraproject.org/koji/buildinfo?buildID=1007500 
but I don't know how much longer the binaries will stay available in koji. 


If you use the same kernel source code and the same .config, why can't
your own build trigger that warning?


I don't know. The install script may build the initramfs differently. As 
far as I can tell, if the WARN_ON was triggered, I should have gotten 
output. 


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread weiping zhang
2017-12-21 21:00 GMT+08:00 Bruno Wolff III :
> After today, I won't have physical access to the problem machine until
> January 2nd. So if you guys have any testing suggestions I need them soon if
> they are to get done before my vacation.
> I do plan to try booting to level 1 to see if I can get a login prompt that
> might facilitate testing. The lockups do happen fairly late in the boot
> process. I never get to X, but maybe it will get far enough for a console
> login.
>
Hi,
How do you do the bisect? Build every kernel commit one by one?
as you did before:
https://bugzilla.redhat.com/show_bug.cgi?id=1520982

What kernel source code did you use when the warning in device_add_disk occurred?
Was it from Fedora or an official release? If so, could you provide a web link?

If you use the same kernel source code and the same .config, why can't
your own build trigger that warning?


Re: Regression with a0747a859ef6 ("bdi: add error handle for bdi_debug_register")

2017-12-21 Thread Bruno Wolff III
After today, I won't have physical access to the problem machine until 
January 2nd. So if you guys have any testing suggestions I need them soon 
if they are to get done before my vacation.
I do plan to try booting to level 1 to see if I can get a login prompt 
that might facilitate testing. The lockups do happen fairly late in the 
boot process. I never get to X, but maybe it will get far enough for 
a console login.


Re: severe regression with random I/O from (at least) 4.14?

2017-12-21 Thread Paolo Valente


> On 21 Dec 2017, at 11:57, Paolo Valente  wrote:
> 
> Hi,
> a few minutes ago I bumped into this apparent severe regression, with 
> 4.15-rc4 and an SSD PLEXTOR PX-256M5S. If, with none as I/O scheduler, I do
> fio --name=global --rw=randread --size=512m --name=job1
> 
> I get
> read : io=524288KB, bw=34402KB/s, iops=8600, runt= 15240msec
> 
> This device should reach at least 40 KIOPS. I retried with mq-deadline too, 
> and then with a 4.14, same outcome in all cases. I asked some colleague to 
> repeat the test with his/her device. Same outcome, so far, also with an SSD 
> 850 EVO 250GB.
> 
> I'm sorry if I'm making some colossal mistake.
> 

Forget it, sorry. I was confused by a failed test where iodepth was higher than 
1, and I am sleeping too little ...

Thanks,
Paolo

> Thanks,
> Paolo
> 
> 
> 



Re: [PATCH 03/11] fs: add frozen sb state helpers

2017-12-21 Thread Jan Kara
Hello,

I think I owe you a reply here... Sorry that it took so long.

On Fri 01-12-17 22:13:27, Luis R. Rodriguez wrote:
> On Fri, Dec 01, 2017 at 12:47:24PM +0100, Jan Kara wrote:
> > On Thu 30-11-17 20:05:48, Luis R. Rodriguez wrote:
> > > > In fact, what might be a cleaner solution is to introduce a 
> > > > 'freeze_count'
> > > > for superblock freezing (we already do have this for block devices). 
> > > > Then
> > > > you don't need to differentiate these two cases - but you'd still need 
> > > > to
> > > > properly handle cleanup if freezing of all superblocks fails in the 
> > > > middle.
> > > > So I'm not 100% this works out nicely in the end. But it's certainly 
> > > > worth
> > > > a consideration.
> > > 
> > > Ah, there are three important reasons for doing it the way I did it which 
> > > are
> > > easy to miss, unless you read the commit log message very carefully.
> > > 
> > > 0) The ioctl interface causes a failure to be sent back to userspace if
> > > you issue two consecutive freezes, or two thaws. Ie, once a filesystem is
> > > frozen, a secondary call will result in an error. Likewise for thaw.
> > 
> > Yep. But also note that there's *another* interface to filesystem freezing
> > which behaves differently - freeze_bdev() (used internally by dm). That
> > interface uses the counter and freezing of already frozen device succeeds.
> 
> Ah... so freeze_bdev() semantics matches the same semantics I was looking
> for.
> 
> > IOW it is a mess.
> 
> To say the least.
> 
> > We cannot change the behavior of the ioctl but we could
> > still provide an in-kernel interface to freeze_super() with the same
> > semantics as freeze_bdev() which might be easier to use by suspend - maybe
> > we could call it 'exclusive' (for the current freeze_super() semantics) and
> > 'non-exclusive' (for the freeze_bdev() semantics) since this is very much
> > like O_EXCL open of block devices...
> 
> Sure. Typically we make exclusive calls with the postfix _excl(), so I
> take it you'd be OK with eventually renaming freeze_super() to
> freeze_super_excl()?

In principle yes but let's leave the naming disputes to a later time when
it is clear what API do we actually want to provide.

> I totally missed freeze_bdev() otherwise I think I would have picked up on the
> shared semantics stuff and I would have just made a helper out of what
> freeze_bdev() uses, and then have both in-kernel and freeze_bdev() use it.
> 
> I'll note that it's still not perfectly clear whether the semantics behind
> freeze_bdev() really match what I described above. That still needs to be
> vetted. For instance, does thaw_bdev() keep a superblock frozen if an
> ioctl-initiated freeze had occurred before? If so then great. Otherwise
> I think we'll need to distinguish the ioctl interface. The worst possible case
> is that bdev semantics and in-kernel semantics differ somehow; then that
> will really create a holy fucking mess.

I believe nobody really thought about mixing those two interfaces to fs
freezing and so the behavior is basically defined by the implementation.
That is:

freeze_bdev() on sb frozen by ioctl_fsfreeze() -> EBUSY
freeze_bdev() on sb frozen by freeze_bdev() -> success
ioctl_fsfreeze() on sb frozen by freeze_bdev() -> EBUSY
ioctl_fsfreeze() on sb frozen by ioctl_fsfreeze() -> EBUSY

thaw_bdev() on sb frozen by ioctl_fsfreeze() -> EINVAL
ioctl_fsthaw() on sb frozen by freeze_bdev() -> success

What I propose is the following API:

freeze_super_excl()
  - freezes superblock, returns EBUSY if the superblock is already frozen
(either by another freeze_super_excl() or by freeze_super())
freeze_super()
  - this function will make sure superblock is frozen when the function
returns with success. It can be nested with other freeze_super() or
freeze_super_excl() calls (this second part is different from how
freeze_bdev() behaves currently but AFAICT this behavior is actually
what all current users of freeze_bdev() really want - just make sure
fs cannot be written to)
thaw_super()
  - counterpart to freeze_super(), would fail with EINVAL if we were to
drop the last "freeze reference" but sb was actually frozen with
freeze_super_excl()
thaw_super_excl()
  - counterpart to freeze_super_excl(). Fails with EINVAL if sb was not
frozen with freeze_super_excl() (this is different to current behavior
but I don't believe anyone relies on this and doing otherwise is asking
for data corruption).

I'd implement it by a freeze counter in the superblock (similar to what we
currently have in bdev) where every call to freeze_super() or
freeze_super_excl() would add one. Additionally we'd have a flag in the
superblock whether the first freeze (it could not be any other since those
would fail with EBUSY) came from freeze_super_excl().
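A minimal sketch of that counting scheme (all names, and the omitted locking,
are illustrative only, not existing kernel code):

	/* hypothetical fields in struct super_block */
	int  s_freeze_count;	/* nesting depth of freezes */
	bool s_freeze_excl;	/* first freeze was exclusive */

	static int do_freeze_fs(struct super_block *sb);	/* the real work */
	static int do_thaw_fs(struct super_block *sb);

	int freeze_super_excl(struct super_block *sb)
	{
		if (sb->s_freeze_count)
			return -EBUSY;		/* frozen by anyone: refuse */
		sb->s_freeze_excl = true;
		sb->s_freeze_count = 1;
		return do_freeze_fs(sb);
	}

	int freeze_super(struct super_block *sb)
	{
		if (sb->s_freeze_count++)
			return 0;		/* already frozen, just nest */
		return do_freeze_fs(sb);
	}

	int thaw_super(struct super_block *sb)
	{
		if (sb->s_freeze_count == 1 && sb->s_freeze_excl)
			return -EINVAL;	/* only thaw_super_excl() may drop it */
		if (--sb->s_freeze_count)
			return 0;		/* still held by another freezer */
		return do_thaw_fs(sb);
	}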

Then we could make the ioctl interface use the _excl() variant of the freezing
API, and freeze_bdev() would use the non-exclusive variant (we could drop the
freeze 

severe regression with random I/O from (at least) 4.14?

2017-12-21 Thread Paolo Valente
Hi,
a few minutes ago I bumped into this apparent severe regression, with 4.15-rc4 
and an SSD PLEXTOR PX-256M5S. If, with none as I/O scheduler, I do
fio --name=global --rw=randread --size=512m --name=job1

I get
read : io=524288KB, bw=34402KB/s, iops=8600, runt= 15240msec

This device should reach at least 40 KIOPS. I retried with mq-deadline too, and 
then with a 4.14, same outcome in all cases. I asked some colleague to repeat 
the test with his/her device. Same outcome, so far, also with an SSD 850 EVO 
250GB.

I'm sorry if I'm making some colossal mistake.

Thanks,
Paolo





Re: bfq: BUG bfq_queue: Objects remaining in bfq_queue on __kmem_cache_shutdown() after rmmod

2017-12-21 Thread Guoqing Jiang



On 12/21/2017 03:53 PM, Paolo Valente wrote:



On 21 Dec 2017, at 08:08, Guoqing Jiang  wrote:

Hi,


On 12/08/2017 08:34 AM, Holger Hoffstätte wrote:

So plugging in a device on USB with BFQ as scheduler now works without
hiccup (probably thanks to Ming Lei's last patch), but of course I found
another problem. Unmounting the device after use, changing the scheduler
back to deadline or kyber and rmmod'ing the BFQ module reproducibly gives me:

kernel: =============================================================================
kernel: BUG bfq_queue (Tainted: GB  ): Objects remaining in bfq_queue on __kmem_cache_shutdown()
kernel: -----------------------------------------------------------------------------
kernel:
kernel: INFO: Slab 0xea001601fc00 objects=37 used=3 fp=0x8805807f0360 flags=0x80008100
kernel: CPU: 0 PID: 9967 Comm: rmmod Tainted: GB   4.14.5 #1
kernel: Hardware name: Gigabyte Technology Co., Ltd. P67-DS3-B3/P67-DS3-B3, BIOS F1 05/06/2011
kernel: Call Trace:
kernel:  dump_stack+0x46/0x5e
kernel:  slab_err+0x9e/0xb0
kernel:  ? on_each_cpu_mask+0x35/0x40
kernel:  ? ksm_migrate_page+0x60/0x60
kernel:  ? __kmalloc+0x1c9/0x1d0
kernel:  __kmem_cache_shutdown+0x177/0x350
kernel:  shutdown_cache+0xf/0x130
kernel:  kmem_cache_destroy+0x19e/0x1b0
kernel:  SyS_delete_module+0x168/0x230
kernel:  ? exit_to_usermode_loop+0x39/0x80
kernel:  entry_SYSCALL_64_fastpath+0x13/0x94
kernel: RIP: 0033:0x7f53e4136b97
kernel: RSP: 002b:7ffd660061d8 EFLAGS: 0206 ORIG_RAX: 00b0
kernel: RAX: ffda RBX: 0003 RCX: 7f53e4136b97
kernel: RDX: 000a RSI: 0800 RDI: 006247f8
kernel: RBP:  R08: 7ffd66005171 R09: 
kernel: R10: 7f53e41acbc0 R11: 0206 R12: 00624790
kernel: R13: 7ffd660051d0 R14: 00624790 R15: 00623260
kernel: INFO: Object 0x8805807f @offset=0
kernel: INFO: Object 0x8805807f01b0 @offset=432
kernel: INFO: Object 0x8805807f3cc0 @offset=15552
kernel: kmem_cache_destroy bfq_queue: Slab cache still has objects
kernel: CPU: 0 PID: 9967 Comm: rmmod Tainted: GB   4.14.5 #1
kernel: Hardware name: Gigabyte Technology Co., Ltd. P67-DS3-B3/P67-DS3-B3, BIOS F1 05/06/2011
kernel: Call Trace:
kernel:  dump_stack+0x46/0x5e
kernel:  kmem_cache_destroy+0x191/0x1b0
kernel:  SyS_delete_module+0x168/0x230
kernel:  ? exit_to_usermode_loop+0x39/0x80
kernel:  entry_SYSCALL_64_fastpath+0x13/0x94
kernel: RIP: 0033:0x7f53e4136b97
kernel: RSP: 002b:7ffd660061d8 EFLAGS: 0206 ORIG_RAX: 00b0
kernel: RAX: ffda RBX: 0003 RCX: 7f53e4136b97
kernel: RDX: 000a RSI: 0800 RDI: 006247f8
kernel: RBP:  R08: 7ffd66005171 R09: 
kernel: R10: 7f53e41acbc0 R11: 0206 R12: 00624790
kernel: R13: 7ffd660051d0 R14: 00624790 R15: 00623260


I also encountered a similar bug; it seems there are some async objects which 
are not freed when BFQ_GROUP_IOSCHED is built as "Y".

If I revert part of commit e21b7a0b988772e82e7147e1c659a5afe2ae003c ("block, bfq: 
add full hierarchical scheduling and cgroups support") as below, then I can't see 
the bug again with a simple test.

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 889a8549d97f..c640e64e042f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4710,6 +4710,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
 spin_lock_irq(&bfqd->lock);
 list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
 bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+   bfq_put_async_queues(bfqd, bfqd->root_group);
 spin_unlock_irq(&bfqd->lock);

 hrtimer_cancel(&bfqd->idle_slice_timer);
@@ -4718,7 +4719,6 @@ static void bfq_exit_queue(struct elevator_queue *e)
 blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
  #else
 spin_lock_irq(&bfqd->lock);
-   bfq_put_async_queues(bfqd, bfqd->root_group);
 kfree(bfqd->root_group);
 spin_unlock_irq(&bfqd->lock);

But perhaps we should do it inside blkcg_deactivate_policy, right?


Thank you very much for investigating this. And yes, I removed that 
bfq_put_async_queues invocation long ago, because it had to be done in 
blkcg_deactivate_policy; and actually it was invoked as expected. We are trying 
to understand what is going wrong now.


Thanks for the information. I found another way; does it make sense?

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index da1525ec4c87..0a070daf96c7 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -474,7 +474,10 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
 static void bfq_pd_free(struct blkg_policy_data *pd)
 {
    struct bfq_group *bfqg = pd_to_bfqg(pd);
+   struct bfq_data *bfqd = bfqg->bfqd;

+   if (bfqd)
+

Re: [PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-21 Thread Johannes Thumshirn
On Thu, Dec 21, 2017 at 10:20:45AM +0200, Sagi Grimberg wrote:
> @@ -2470,8 +2465,9 @@ static int nvme_probe(struct pci_dev *pdev, const
> struct pci_device_id *id)
> dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
> if (!dev)
> return -ENOMEM;
> -   dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
> -   GFP_KERNEL, node);
> +
> +   alloc_size = (num_possible_cpus() + 1) * sizeof(struct nvme_queue *);
> +   dev->queues = kzalloc_node(alloc_size, GFP_KERNEL, node);

Nit:
  dev->queues = kcalloc_node(num_possible_cpus() + 1,
 sizeof(struct nvme_queue *),
 GFP_KERNEL, node);
-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


Re: [PATCH V2 6/6] nvme-pci: remove .init_request callback

2017-12-21 Thread Sagi Grimberg

Ming,

I'd prefer that we make the pci driver match
the rest of the drivers in nvme.

IMO it would be better to allocate a queues array at probe time
and simply reuse it at reset sequence.

Can this (untested) patch also fix the issue you're seeing:
--
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f26aaa393016..a8edb29dac16 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, 
bool shutdown);

  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
 struct nvme_dev {
-   struct nvme_queue **queues;
+   struct nvme_queue *queues;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
@@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx 
*hctx, void *data,

unsigned int hctx_idx)
 {
struct nvme_dev *dev = data;
-   struct nvme_queue *nvmeq = dev->queues[0];
+   struct nvme_queue *nvmeq = &dev->queues[0];

WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx 
*hctx, void *data,

  unsigned int hctx_idx)
 {
struct nvme_dev *dev = data;
-   struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+   struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

if (!nvmeq->tags)
nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set 
*set, struct request *req,

struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
-   struct nvme_queue *nvmeq = dev->queues[queue_idx];
+   struct nvme_queue *nvmeq = &dev->queues[queue_idx];

BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
@@ -1046,7 +1046,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, 
unsigned int tag)

 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 {
struct nvme_dev *dev = to_nvme_dev(ctrl);
-   struct nvme_queue *nvmeq = dev->queues[0];
+   struct nvme_queue *nvmeq = &dev->queues[0];
struct nvme_command c;

memset(&c, 0, sizeof(c));
@@ -1290,10 +1290,8 @@ static void nvme_free_queues(struct nvme_dev 
*dev, int lowest)

int i;

for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
-   struct nvme_queue *nvmeq = dev->queues[i];
dev->ctrl.queue_count--;
-   dev->queues[i] = NULL;
-   nvme_free_queue(nvmeq);
+   nvme_free_queue(&dev->queues[i]);
}
 }

@@ -1325,7 +1323,7 @@ static int nvme_suspend_queue(struct nvme_queue 
*nvmeq)


 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
-   struct nvme_queue *nvmeq = dev->queues[0];
+   struct nvme_queue *nvmeq = &dev->queues[0];

if (!nvmeq)
return;
@@ -1387,10 +1385,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev 
*dev, struct nvme_queue *nvmeq,

 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth, int 
node)

 {
-   struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
-   node);
-   if (!nvmeq)
-   return NULL;
+   struct nvme_queue *nvmeq = &dev->queues[qid];

nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
  &nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -1409,7 +1404,6 @@ static struct nvme_queue *nvme_alloc_queue(struct 
nvme_dev *dev, int qid,

nvmeq->q_depth = depth;
nvmeq->qid = qid;
nvmeq->cq_vector = -1;
-   dev->queues[qid] = nvmeq;
dev->ctrl.queue_count++;

return nvmeq;
@@ -1592,7 +1586,7 @@ static int nvme_pci_configure_admin_queue(struct 
nvme_dev *dev)

if (result < 0)
return result;

-   nvmeq = dev->queues[0];
+   nvmeq = &dev->queues[0];
if (!nvmeq) {
nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
dev_to_node(dev->dev));
@@ -1638,7 +1632,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)

max = min(dev->max_qid, dev->ctrl.queue_count - 1);
for (i = dev->online_queues; i <= max; i++) {
-   ret = nvme_create_queue(dev->queues[i], i);
+   ret = nvme_create_queue(&dev->queues[i], i);
if (ret)
break;
}
@@ -1894,7 +1888,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)

 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
-   struct nvme_queue *adminq = dev->queues[0];
+   struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
int result,