Re: Re: [PATCH] bcache: fix error count in memory shrink

2018-03-05 Thread tang . junhui
Hi Mike---

>>Hi Tang Junhui---
>>
>>I'm not really sure about this one.  It changes the semantics of the
>>amount of work done-- nr_to_scan now means number of things to free
>>instead of the number to check.
>>
>The code seems to be designed that way: sc->nr_to_scan marks how many btree
>nodes to scan in c->btree_cache_used, and it doesn't include the btree
>nodes in c->btree_cache_freeable.
>
I am sorry, sc->nr_to_scan should include the btree nodes in
c->btree_cache_freeable. I have sent a new patch.

>>If the system is under severe memory pressure, and most of the cache is
>>essential/actively used, this could greatly increase the amount of work
>>done.
>>
>Yes, I didn't realize this.
>>As to the i < btree_cache_used, it's arguably a bug, but it only reduces
>>the amount of work done if the cache is very, very small.
>>
>I think it is a bug, I'll send a new patch to add a variable to record
>c->btree_cache_used before the for(;;) loop.
>
>>Mike
>>
>>On 01/30/2018 06:46 PM, tang.jun...@zte.com.cn wrote:
>>> From: Tang Junhui 
>>> 
>>> In bch_mca_scan(), nr btree nodes are expected to shrink, so the for(;;)
>>> loop needs to satisfy the condition freed < nr.
>>> And since c->btree_cache_used always decreases after mca_data_free() is called
>>> in the for(;;) loop, we need an automatic variable to record c->btree_cache_used
>>> before the for(;;) loop.
>>> 
>>> Signed-off-by: Tang Junhui 
>>> ---
>>>  drivers/md/bcache/btree.c | 4 +++-
>>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>> 
>>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
>>> index 81e8dc3..7e9ef3d 100644
>>> --- a/drivers/md/bcache/btree.c
>>> +++ b/drivers/md/bcache/btree.c
>>> @@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker 
>>> *shrink,
>>>  struct btree *b, *t;
>>>  unsigned long i, nr = sc->nr_to_scan;
>>>  unsigned long freed = 0;
>>> +unsigned int btree_cache_used;
>>>  
>>>  if (c->shrinker_disabled)
>>>  return SHRINK_STOP;
>>> @@ -700,7 +701,8 @@ static unsigned long bch_mca_scan(struct shrinker 
>>> *shrink,
>>>  }
>>>  }
>>>  
>>> -for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
>>> +btree_cache_used = c->btree_cache_used;
>>> +for (i = 0; freed < nr && i < btree_cache_used; i++) {
>>>  if (list_empty(&c->btree_cache))
>>>  goto out;
>>>  
>>> 
Thanks,
Tang Junhui


[PATCH] [v2] bcache: fix using of loop variable in memory shrink

2018-03-05 Thread tang . junhui
From: Tang Junhui 

In bch_mca_scan(), there is some confusion and a logical error in the use of
the loop variables. In this patch, we clarify them as:
1) nr: the number of btree nodes to scan, which decreases after we scan a
btree node, and should not be less than 0;
2) i: the number of btree nodes already scanned, including both
btree_cache_freeable and btree_cache, which should not be bigger than
btree_cache_used;
3) freed: the number of btree nodes freed.

Signed-off-by: Tang Junhui 
---
 drivers/md/bcache/btree.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81e8dc3..7487f8e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
struct btree *b, *t;
unsigned long i, nr = sc->nr_to_scan;
unsigned long freed = 0;
+   unsigned int btree_cache_used;
 
if (c->shrinker_disabled)
return SHRINK_STOP;
@@ -688,9 +689,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
nr = min_t(unsigned long, nr, mca_can_free(c));
 
i = 0;
+   btree_cache_used = c->btree_cache_used;
list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
-   if (freed >= nr)
-   break;
+   if (nr <= 0)
+   goto out;
 
if (++i > 3 &&
!mca_reap(b, 0, false)) {
@@ -698,9 +700,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
rw_unlock(true, b);
freed++;
}
+   nr--;
}
 
-   for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
+   for (;  (nr--) && i < btree_cache_used; i++) {
if (list_empty(&c->btree_cache))
goto out;
 
-- 
1.8.3.1
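
For reference, the scan logic this patch arrives at can be condensed as
follows (a simplified sketch of bch_mca_scan() after the patch, with locking
and the reap details elided; not a drop-in replacement):

	unsigned long i = 0, nr = sc->nr_to_scan, freed = 0;
	unsigned int btree_cache_used = c->btree_cache_used; /* snapshot: freeing shrinks it */

	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (nr <= 0)			/* scanned as many nodes as requested */
			goto out;

		if (++i > 3 && !mca_reap(b, 0, false)) {
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		}
		nr--;				/* every node visited counts as scanned */
	}

	for (; (nr--) && i < btree_cache_used; i++) {
		/* walk c->btree_cache, freeing clean nodes until nr is exhausted */
		...
	}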



Re: [PATCH] bcache: fix error count in memory shrink

2018-03-05 Thread tang . junhui
Hi Mike---

>Hi Tang Junhui---
>
>I'm not really sure about this one.  It changes the semantics of the
>amount of work done-- nr_to_scan now means number of things to free
>instead of the number to check.
>
The code seems to be designed that way: sc->nr_to_scan marks how many btree
nodes to scan in c->btree_cache_used, and it doesn't include the btree
nodes in c->btree_cache_freeable.

>If the system is under severe memory pressure, and most of the cache is
>essential/actively used, this could greatly increase the amount of work
>done.
>
Yes, I didn't realize this.
>As to the i < btree_cache_used, it's arguably a bug, but it only reduces
>the amount of work done if the cache is very, very small.
>
I think it is a bug, I'll send a new patch to add a variable to record
c->btree_cache_used before the for(;;) loop.

>Mike
>
>On 01/30/2018 06:46 PM, tang.jun...@zte.com.cn wrote:
>> From: Tang Junhui 
>> 
>> In bch_mca_scan(), nr btree nodes are expected to shrink, so the for(;;)
>> loop needs to satisfy the condition freed < nr.
>> And since c->btree_cache_used always decreases after mca_data_free() is called
>> in the for(;;) loop, we need an automatic variable to record c->btree_cache_used
>> before the for(;;) loop.
>> 
>> Signed-off-by: Tang Junhui 
>> ---
>>  drivers/md/bcache/btree.c | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
>> index 81e8dc3..7e9ef3d 100644
>> --- a/drivers/md/bcache/btree.c
>> +++ b/drivers/md/bcache/btree.c
>> @@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker 
>> *shrink,
>>  struct btree *b, *t;
>>  unsigned long i, nr = sc->nr_to_scan;
>>  unsigned long freed = 0;
>> +unsigned int btree_cache_used;
>>  
>>  if (c->shrinker_disabled)
>>  return SHRINK_STOP;
>> @@ -700,7 +701,8 @@ static unsigned long bch_mca_scan(struct shrinker 
>> *shrink,
>>  }
>>  }
>>  
>> -for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
>> +btree_cache_used = c->btree_cache_used;
>> +for (i = 0; freed < nr && i < btree_cache_used; i++) {
>>  if (list_empty(&c->btree_cache))
>>  goto out;
>>  
>> 
Thanks,
Tang Junhui


[PATCH V2] block: null_blk: fix 'Invalid parameters' when loading module

2018-03-05 Thread Ming Lei
On ARM64, the default page size is 64K on some distributions, and
we should allow ARM64 people to play with null_blk.

This patch fixes the issue by extending the page bitmap size to support
non-4KB PAGE_SIZE configurations.

Cc: Bart Van Assche 
Cc: Shaohua Li 
Cc: Kyungchan Koh ,
Cc: weiping zhang 
Cc: Yi Zhang 
Reported-by: Yi Zhang 
Signed-off-by: Ming Lei 
---
 drivers/block/null_blk.c | 46 +-
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 51b16249028a..3c5a684de170 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -72,6 +72,7 @@ enum nullb_device_flags {
NULLB_DEV_FL_CACHE  = 3,
 };
 
+#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2)
 /*
  * nullb_page is a page in memory for nullb devices.
  *
@@ -86,10 +87,10 @@ enum nullb_device_flags {
  */
 struct nullb_page {
struct page *page;
-   unsigned long bitmap;
+   DECLARE_BITMAP(bitmap, MAP_SZ);
 };
-#define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1)
-#define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2)
+#define NULLB_PAGE_LOCK (MAP_SZ - 1)
+#define NULLB_PAGE_FREE (MAP_SZ - 2)
 
 struct nullb_device {
struct nullb *nullb;
@@ -732,7 +733,7 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
if (!t_page->page)
goto out_freepage;
 
-   t_page->bitmap = 0;
+   memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
return t_page;
 out_freepage:
kfree(t_page);
@@ -742,13 +743,20 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
 
 static void null_free_page(struct nullb_page *t_page)
 {
-   __set_bit(NULLB_PAGE_FREE, &t_page->bitmap);
-   if (test_bit(NULLB_PAGE_LOCK, &t_page->bitmap))
+   __set_bit(NULLB_PAGE_FREE, t_page->bitmap);
+   if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
return;
__free_page(t_page->page);
kfree(t_page);
 }
 
+static bool null_page_empty(struct nullb_page *page)
+{
+   int size = MAP_SZ - 2;
+
+   return find_first_bit(page->bitmap, size) == size;
+}
+
 static void null_free_sector(struct nullb *nullb, sector_t sector,
bool is_cache)
 {
@@ -763,9 +771,9 @@ static void null_free_sector(struct nullb *nullb, sector_t 
sector,
 
t_page = radix_tree_lookup(root, idx);
if (t_page) {
-   __clear_bit(sector_bit, &t_page->bitmap);
+   __clear_bit(sector_bit, t_page->bitmap);
 
-   if (!t_page->bitmap) {
+   if (null_page_empty(t_page)) {
ret = radix_tree_delete_item(root, idx, t_page);
WARN_ON(ret != t_page);
null_free_page(ret);
@@ -836,7 +844,7 @@ static struct nullb_page *__null_lookup_page(struct nullb 
*nullb,
t_page = radix_tree_lookup(root, idx);
WARN_ON(t_page && t_page->page->index != idx);
 
-   if (t_page && (for_write || test_bit(sector_bit, &t_page->bitmap)))
+   if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
return t_page;
 
return NULL;
@@ -899,10 +907,10 @@ static int null_flush_cache_page(struct nullb *nullb, 
struct nullb_page *c_page)
 
t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
 
-   __clear_bit(NULLB_PAGE_LOCK, &c_page->bitmap);
-   if (test_bit(NULLB_PAGE_FREE, &c_page->bitmap)) {
+   __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
+   if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
null_free_page(c_page);
-   if (t_page && t_page->bitmap == 0) {
+   if (t_page && null_page_empty(t_page)) {
ret = radix_tree_delete_item(&nullb->dev->data,
idx, t_page);
null_free_page(t_page);
@@ -918,11 +926,11 @@ static int null_flush_cache_page(struct nullb *nullb, 
struct nullb_page *c_page)
 
for (i = 0; i < PAGE_SECTORS;
i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
-   if (test_bit(i, &c_page->bitmap)) {
+   if (test_bit(i, c_page->bitmap)) {
offset = (i << SECTOR_SHIFT);
memcpy(dst + offset, src + offset,
nullb->dev->blocksize);
-   __set_bit(i, &t_page->bitmap);
+   __set_bit(i, t_page->bitmap);
}
}
 
@@ -959,10 +967,10 @@ static int null_make_cache_space(struct nullb *nullb, 
unsigned long n)
 * We found the page which is being flushed to disk by other
 * threads
 */
-   if (test_bit(NULLB_PAGE_LOCK, &c_pages[i]->bitmap))
+   if (test_bit(NULLB_PAGE_LOCK, 

Re: [PATCH] block: null_blk: fix 'Invalid parameters' failure when loading module

2018-03-05 Thread Ming Lei
On Mon, Mar 05, 2018 at 03:57:07PM +, Bart Van Assche wrote:
> On Sat, 2018-03-03 at 10:24 +0800, Ming Lei wrote:
> >  struct nullb_page {
> > struct page *page;
> > -   unsigned long bitmap;
> > +   unsigned long bitmap[DIV_ROUND_UP(MAP_SZ, sizeof(unsigned long) * 8)];
> >  };
> 
> Could DECLARE_BITMAP() have been used here?

Indeed, will do it in V2.

Thanks,
Ming
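
For context, DECLARE_BITMAP(name, bits) declares an unsigned long array
large enough to hold the requested number of bits, so the sizing works out
as in this sketch (my own illustration of the arithmetic, not code from the
patch):

	/* With a 64K PAGE_SIZE and SECTOR_SHIFT = 9, a page holds 128 sectors;
	 * MAP_SZ adds the two LOCK/FREE flag bits, so 130 bits are needed --
	 * more than one unsigned long can hold. */
	#define MAP_SZ	((PAGE_SIZE >> SECTOR_SHIFT) + 2)

	struct nullb_page {
		struct page *page;
		/* expands to: unsigned long bitmap[DIV_ROUND_UP(MAP_SZ, BITS_PER_LONG)],
		 * i.e. a 3-word array for 64K pages on a 64-bit machine */
		DECLARE_BITMAP(bitmap, MAP_SZ);
	};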


Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Logan Gunthorpe



On 05/03/18 05:49 PM, Oliver wrote:

It's in arch/powerpc/kernel/io.c as _memcpy_toio() and it has two full barriers!

Awesome!

Our io.h indicates that our iomem accessors are designed to provide x86ish
strong ordering of accesses to MMIO space. The git log indicates
arch/powerpc/kernel/io.c has barely been touched in the last decade so
odds are most of that code was written in the elder days when people
were less aware of ordering issues. It might just be overly conservative
by today's standards, but maybe not (see below).


Yes, that seems overly conservative.


(I'm not going to suggest ditching the lwsync trick. mpe is not going
to take that patch
without a really good reason)


Well, that's pretty gross. Is this not exactly the situation mmiowb() is 
meant to solve? See [1].


Though, you're right in principle. Even if power was similar to other 
systems in this way, it's still a risk that if these pages get passed 
somewhere in the kernel that uses a spin lock like that without an 
mmiowb() call, then it's going to have a bug. For now, the risk is 
pretty low as we know exactly where all the p2pmem pages will be used 
but if it gets into other places, all bets are off. I did do some work 
trying to make a safe version of io-pages and also trying to change from 
pages to pfn_t in large areas but neither approach seemed likely to get 
any traction in the community, at least not in the near term.


Logan

[1] ACQUIRES VS I/O ACCESSES in 
https://www.kernel.org/doc/Documentation/memory-barriers.txt




Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Oliver
On Tue, Mar 6, 2018 at 4:10 AM, Logan Gunthorpe  wrote:
>
>
> On 05/03/18 09:00 AM, Keith Busch wrote:
>>
>> On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote:
>>>
>>> On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe 
>>> wrote:

 @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue
 *nvmeq,
   {
  u16 tail = nvmeq->sq_tail;
>>>
>>>
 -   if (nvmeq->sq_cmds_io)
-   memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd,
 sizeof(*cmd));
 -   else
-   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>>>
>>>
>>> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
>>> the _toio() variant enforces alignment, does the copy with 4 byte
>>> stores, and has a full barrier after the copy. In comparison our
>>> regular memcpy() does none of those things and may use unaligned and
>>> vector load/stores. For normal (cacheable) memory that is perfectly
>>> fine, but they can cause alignment faults when targeted at MMIO
>>> (cache-inhibited) memory.
>>>
>>> I think in this particular case it might be ok since we know SQEs are
>>> aligned to 64 byte boundaries and the copy is too small to use our
>>> vectorised memcpy(). I'll assume we don't need explicit ordering
>>> between writes of SQEs since the existing code doesn't seem to care
>>> unless the doorbell is being rung, so you're probably fine there too.
>>> That said, I still think this is a little bit sketchy and at the very
>>> least you should add a comment explaining what's going on when the CMB
>>> is being used. If someone more familiar with the NVMe driver could
>>> chime in I would appreciate it.
>>
>>
>> I may not be understanding the concern, but I'll give it a shot.
>>
>> You're right, the start of any SQE is always 64-byte aligned, so that
>> should satisfy alignment requirements.
>>
>> The order when writing multiple/successive SQEs in a submission queue
>> does matter, and this is currently serialized through the q_lock.
>>
>> The order in which the bytes of a single SQE is written doesn't really
>> matter as long as the entire SQE is written into the CMB prior to writing
>> that SQ's doorbell register.
>>
>> The doorbell register is written immediately after copying a command
>> entry into the submission queue (ignore "shadow buffer" features),
>> so the doorbells written to commands submitted is 1:1.
>>
>> If a CMB SQE and DB order is not enforced with the memcpy, then we do
>> need a barrier after the SQE's memcpy and before the doorbell's writel.
>
>
>
> Thanks for the information Keith.
>
> Adding to this: regular memcpy generally also enforces alignment as
> unaligned access to regular memory is typically bad in some way on most
> arches. The generic memcpy_toio also does not have any barrier as it is just
> a call to memcpy. Arm64 also does not appear to have a barrier in its
> implementation and in the short survey I did I could not find any
> implementation with a barrier. I also did not find a ppc implementation in
> the tree but it would be weird for it to add a barrier when other arches do
> not appear to need it.

It's in arch/powerpc/kernel/io.c as _memcpy_toio() and it has two full barriers!

Awesome!

Our io.h indicates that our iomem accessors are designed to provide x86ish
strong ordering of accesses to MMIO space. The git log indicates
arch/powerpc/kernel/io.c has barely been touched in the last decade so
odds are most of that code was written in the elder days when people
were less aware of ordering issues. It might just be overly conservative
by today's standards, but maybe not (see below).

> We've been operating on the assumption that memory mapped by
> devm_memremap_pages() can be treated as regular memory. This is emphasized
> by the fact that it does not return an __iomem pointer.

I think the lack of an __iomem annotation is a bit of a red herring. The comment
header for devm_memremap_pages() outlines the conditions for when using
it is valid, the important one being:

> * 4/ res is expected to be a host memory range that could feasibly be
> *    treated as a "System RAM" range, i.e. not a device mmio range, but
> *    this is not enforced.

Granted, the CMB is a MMIO buffer rather than a register range, but from
my perspective it's still a device MMIO range rather than something we
could feasibly treat as system RAM. The two big reasons being:

a) System RAM is cacheable and coherent while the CMB is neither, and
b) On PPC MMIO accesses are in a separate ordering domain to cacheable
   memory accesses.

The latter isn't as terrifying as it sounds since a full barrier will
order everything
with everything, but it still causes problems. For performance reasons
we want to
minimise the number of cases where we need to use a sync instruction
(full barrier)
in favour of a lightweight sync (lwsync) which only orders cacheable
accesses. To

Re: EXT4 Oops (Re: [PATCH V15 06/22] mmc: block: Add blk-mq support)

2018-03-05 Thread Dmitry Osipenko
On 01.03.2018 19:04, Theodore Ts'o wrote:
> On Thu, Mar 01, 2018 at 10:55:37AM +0200, Adrian Hunter wrote:
>> On 27/02/18 11:28, Adrian Hunter wrote:
>>> On 26/02/18 23:48, Dmitry Osipenko wrote:
 But still something is wrong... I've been getting occasional EXT4 Ooops's, 
 like
 the one below, and __wait_on_bit() is always figuring in the stacktrace. It
 never happened with blk-mq disabled, though it could be a coincidence and
 actually unrelated to blk-mq patches.
>>>
 [ 6625.992337] Unable to handle kernel NULL pointer dereference at virtual
 address 001c
 [ 6625.993004] pgd = 00b30c03
 [ 6625.993257] [001c] *pgd=
 [ 6625.993594] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
 [ 6625.994022] Modules linked in:
 [ 6625.994326] CPU: 1 PID: 19355 Comm: dpkg Not tainted
 4.16.0-rc2-next-20180220-00095-ge9c9f5689a84-dirty #2090
 [ 6625.995078] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
[ 6625.995595] PC is at dx_probe+0x68/0x684
 [ 6625.995947] LR is at __wait_on_bit+0xac/0xc8
> 
> This doesn't seem to make sense; the PC is where we are currently
> executing, and LR is the "Link Register" where the flow of control
> will be returning after the current function returns, right?  Well,
> dx_probe should *not* be returning to __wait_on_bit().  So this just
> seems weird.
> 
> Ignoring the LR register, this stack trace looks sane...  I can't see
> which pointer could be NULL and getting dereferenced, though.  How
> easily can you reproduce the problem?  Can you either (a) translate
> the PC into a line number, or better yet, if you can reproduce, add a
> series of BUG_ON's so we can see what's going on?
> 
> + BUG_ON(!frame);
>   memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
>   frame->bh = ext4_read_dirblock(dir, 0, INDEX);
>   if (IS_ERR(frame->bh))
>   return (struct dx_frame *) frame->bh;
> 
> + BUG_ON(!frame->bh);
> + BUG_ON(!frame->bh->b_data);
>   root = (struct dx_root *) frame->bh->b_data;
>   if (root->info.hash_version != DX_HASH_TEA &&
>   root->info.hash_version != DX_HASH_HALF_MD4 &&
>   root->info.hash_version != DX_HASH_LEGACY) {
> 
> These are "could never" happen scenarios from looking at the code, but
> that will help explain what is going on.
> 
> If this is reliably only happening with mq, the only way I could see
> that if is something is returning an error when it previously wasn't.
> This isn't a problem we're seeing with any of our testing, though.

It happened today again, "BUG_ON(!frame->bh->b_data);" has been trapped.

kernel BUG at fs/ext4/namei.c:751!
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM
Modules linked in:
CPU: 0 PID: 296 Comm: cron Not tainted
4.16.0-rc2-next-20180220-00095-ge9c9f5689a84-dirty #2100
Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
PC is at dx_probe+0x308/0x694
LR is at __wait_on_bit+0xac/0xc8
pc : []lr : []psr: 60040013
sp : d545bc20  ip : c0170e88  fp : d545bc74
r10:   r9 : d545bca0  r8 : d4209300
r7 :   r6 :   r5 : d656e838  r4 : d545bcbc
r3 : 007b  r2 : d5830800  r1 : d5831000  r0 : d4209300
Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 1552004a  DAC: 0051
Process cron (pid: 296, stack limit = 0x4d1ebf14)
Stack: (0xd545bc20 to 0xd545c000)
bc20: 02ea c0c019d4 60040113 014000c0 c029e640 d6cf3540 d545bc7c d545bc48
bc40: c02797f4 c0152804 d545bca4 0007 d5830800  d656e838 0001
bc60: d545bca0  d545bd0c d545bc78 c033d578 c033b904 c029e714 c029b088
bc80: 0148 c0c01984 d65f6be0  d545be10 d545bd24 d545bd00 d5830800
bca0: d65f6bf8 d65f6c0c 0007 d6547720 8420edbe c029eec8  d4209300
bcc0:        
bce0: d545bd48 d65f6be0 d656e838 d65f6be0 d6547720 0001 d545be10 
bd00: d545bd44 d545bd10 c033d7c0 c033d1c8 d545bd34 d656e8b8 d656e838 d545be08
bd20: d656e838  d65f6be0 d656e838 d656e8b8 d6547720 d545bd8c d545bd48
bd40: c028ea50 c033d774  dead4ead   d545bd58 d545bd58
bd60: d6d7f015 d545be08   d545bee8 d545bee8 d545bf28 
bd80: d545bdd4 d545bd90 c028f310 c028e9b0 d545be08 80808080 d545be08 d6d7f010
bda0: d545bdd4 d545bdb0 c028df9c d545be08 d6d7f010  d545bee8 d545bee8
bdc0: d545bf28  d545be04 d545bdd8 c0290e24 c028f160 c0111a1c c0111674
bde0: d545be04 d545bdf0 0001 d6d7f000 d545be08 0001 d545beb4 d545be08
be00: c0293848 c0290da4 d6dd0310 d6547720 8420edbe 0007 d6d7f015 000c
be20: d6dd0310 d6547098 d656e838 0001 0002 0fe0  
be40:  d545be48 c02797f4 0ff0 d6d7f010 c102b4c8 d5522db8 d6d7f000
be60: c130bbdc 004f73f8  0001 d545bf28  d6d7f000 
be80: c0293570 0002 ff9c 0001 ff9c 0001 ff9c d545bee8
bea0: ff9c 004f73f8 

Re: [PATCH v2 04/10] PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches

2018-03-05 Thread Logan Gunthorpe



On 05/03/18 03:28 PM, Bjorn Helgaas wrote:

If you put the #ifdef right here, then it's easier to read because we
can see that "oh, this is a special and uncommon case that I can
probably ignore".


Makes sense. I'll do that.

Thanks,

Logan


Re: [PATCH v2 04/10] PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches

2018-03-05 Thread Bjorn Helgaas
On Thu, Mar 01, 2018 at 12:13:10PM -0700, Logan Gunthorpe wrote:
> 
> 
> On 01/03/18 11:02 AM, Bjorn Helgaas wrote:
> > >   void pci_enable_acs(struct pci_dev *dev)
> > >   {
> > > + if (pci_p2pdma_disable_acs(dev))
> > > + return;
> > 
> > This doesn't read naturally to me.  I do see that when
> > CONFIG_PCI_P2PDMA is not set, pci_p2pdma_disable_acs() does nothing
> > and returns 0, so we'll go ahead and try to enable ACS as before.
> > 
> > But I think it would be clearer to have an #ifdef CONFIG_PCI_P2PDMA
> > right here so it's more obvious that we only disable ACS when it's
> > selected.
> 
> I could do this... however, I wrote it this way because I've read Linus
> dislikes using #ifdef's inside function bodies and I personally agree with
> that sentiment.

I try to avoid #ifdefs too, but in this case the plain reading of the
code makes no sense (we're in a function called "enable_acs()", and
the first thing we do is call a function to "disable_acs()".

Disabling ACS is scary for all the security reasons mentioned
elsewhere, so a reader who knows what ACS does and cares about
virtualization and security *has* to go look up the definition of
pci_p2pdma_disable_acs().

If you put the #ifdef right here, then it's easier to read because we
can see that "oh, this is a special and uncommon case that I can
probably ignore".

Bjorn
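
Concretely, the suggested shape would be something like (a sketch only,
based on the code quoted above):

	void pci_enable_acs(struct pci_dev *dev)
	{
	#ifdef CONFIG_PCI_P2PDMA
		/* special and uncommon case: P2PDMA may need ACS left disabled */
		if (pci_p2pdma_disable_acs(dev))
			return;
	#endif
		/* ... existing ACS enable logic ... */
	}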


Re: Some bcache patches need review

2018-03-05 Thread Michael Lyle
Hey Tang Junhui--

You're right.  I'm sorry, these had slipped through the cracks / I had
lost track of them.  I also have the rest of Coly's patchset to work
through.  I was able to apply the first two, and will continue to work
on my for-next branch.

Mike


On 03/05/2018 02:39 AM, tang.jun...@zte.com.cn wrote:
> Hello Mike
> 
> I sent some patches some time ago, with no response.
> 
> The patches below are very simple; can you or anybody else review them?
> [PATCH] bcache: fix incorrect sysfs output value of strip size
> [PATCH] bcache: fix error return value in memory shrink
> [PATCH] bcache: fix error count in memory shrink
> 
> The patches below are simple too, and important for me and also important
> for bcache; can you or anybody else review them?
> [PATCH] bcache: finish incremental GC
> [PATCH] bcache: calculate the number of incremental GC nodes according to the 
> total of btree nodes
> 
> Thanks,
> Tang Junhui
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



Re: [PATCH] bcache: fix error count in memory shrink

2018-03-05 Thread Michael Lyle
Hi Tang Junhui---

I'm not really sure about this one.  It changes the semantics of the
amount of work done-- nr_to_scan now means number of things to free
instead of the number to check.

If the system is under severe memory pressure, and most of the cache is
essential/actively used, this could greatly increase the amount of work
done.

As to the i < btree_cache_used, it's arguably a bug, but it only reduces
the amount of work done if the cache is very, very small.

Mike

On 01/30/2018 06:46 PM, tang.jun...@zte.com.cn wrote:
> From: Tang Junhui 
> 
> In bch_mca_scan(), nr btree nodes are expected to shrink, so the for(;;)
> loop needs to satisfy the condition freed < nr.
> And since c->btree_cache_used always decreases after mca_data_free() is called
> in the for(;;) loop, we need an automatic variable to record c->btree_cache_used
> before the for(;;) loop.
> 
> Signed-off-by: Tang Junhui 
> ---
>  drivers/md/bcache/btree.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index 81e8dc3..7e9ef3d 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
>   struct btree *b, *t;
>   unsigned long i, nr = sc->nr_to_scan;
>   unsigned long freed = 0;
> + unsigned int btree_cache_used;
>  
>   if (c->shrinker_disabled)
>   return SHRINK_STOP;
> @@ -700,7 +701,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
>   }
>   }
>  
> - for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
> + btree_cache_used = c->btree_cache_used;
> + for (i = 0; freed < nr && i < btree_cache_used; i++) {
>   if (list_empty(&c->btree_cache))
>   goto out;
>  
> 



Re: [PATCH] bcache: fix error return value in memory shrink

2018-03-05 Thread Michael Lyle
LGTM, applied

On 01/30/2018 07:30 PM, tang.jun...@zte.com.cn wrote:
> From: Tang Junhui 
> 
> In bch_mca_scan(), the return value should not be the number of freed btree
> nodes, but the number of pages of freed btree nodes.
> 
> Signed-off-by: Tang Junhui 
> ---
>  drivers/md/bcache/btree.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index 81e8dc3..a5fbccc 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -718,7 +718,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
>   }
>  out:
>   mutex_unlock(&c->bucket_lock);
> - return freed;
> + return freed * c->btree_pages;
>  }
>  
>  static unsigned long bch_mca_count(struct shrinker *shrink,
> 



Re: [PATCH] bcache: fix incorrect sysfs output value of strip size

2018-03-05 Thread Michael Lyle
LGTM, applied.

On 02/10/2018 06:30 PM, tang.jun...@zte.com.cn wrote:
> From: Tang Junhui 
> 
> Stripe size is shown as zero when there are no stripes in the backing device:
> [root@ceph132 ~]# cat /sys/block/sdd/bcache/stripe_size
> 0.0k
> 
> Actually it should be 1T bytes (1 << 31 sectors), but in the sysfs
> interface, stripe_size was converted from sectors to bytes by shifting
> left 9 bits, so the 32-bit variable overflows.
> 
> This patch changes the variable to a 64-bit type before shifting.
> 
> Signed-off-by: Tang Junhui 
> ---
>  drivers/md/bcache/sysfs.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index b418409..240cbec 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -170,7 +170,7 @@
>   sysfs_hprint(dirty_data,
> bcache_dev_sectors_dirty(&dc->disk) << 9);
>  
> - sysfs_hprint(stripe_size,   dc->disk.stripe_size << 9);
> + sysfs_hprint(stripe_size, ((uint64_t)dc->disk.stripe_size) << 9);
>   var_printf(partial_stripes_expensive,   "%u");
>  
>   var_hprint(sequential_cutoff);
> 
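
The overflow is easy to demonstrate in isolation (a standalone userspace
sketch, not driver code):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t stripe_size = 1u << 31;		/* 1 << 31 sectors */

		/* shift done in 32-bit arithmetic: the set bit falls off the top */
		uint64_t wrong = stripe_size << 9;		/* 0 */
		/* widen first, then shift: 1 << 40 bytes, i.e. 1T */
		uint64_t right = (uint64_t)stripe_size << 9;

		printf("wrong=%llu right=%llu\n",
		       (unsigned long long)wrong, (unsigned long long)right);
		return 0;
	}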



Re: [PATCH 0/2] Duplicate UUID fixes for 4.16

2018-03-05 Thread Jens Axboe
On 3/5/18 2:41 PM, Michael Lyle wrote:
> Jens--
> 
> Sorry for a couple of last-minute changes.  Marc Merlin noticed an issue
> with disk imaging where if multiple devices were present with the same
> bcache UUID that a kernel panic could result.  Tang Junhui fixed this.
> I found a related data corruption issue with duplicate backing devices.
> 
> Both are minimal, reviewed, and tested.  As always, thanks for your help.

Applied, thanks Mike.

-- 
Jens Axboe



Re: [PATCH] bcache: don't attach backing with duplicate UUID

2018-03-05 Thread Michael Lyle
Hi Tang Junhui---

Thanks for your review.  I just sent it upstream (with your change) to Jens.

Mike

On 03/04/2018 05:07 PM, tang.jun...@zte.com.cn wrote:
> Hello Mike
> 
> I sent the email from my personal mailbox (110950...@qq.com), but it may have
> failed, so I am resending this email from my office mailbox. Below is the
> mail content I sent previously.
> 
> 
> 
> I am Tang Junhui(tang.jun...@zte.com.cn), This email comes from my
> personal mailbox, since I am not in office today.
> 
>>> From: Tang Junhui 
>>>
>>> Hello, Mike
>>>
>>> This patch looks good, but has some conflicts with this patch:
>>> bcache: fix for data collapse after re-attaching an attached device
>>> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?id=73ac105be390c1de42a2f21643c9778a5e002930
>>> Could you modify your fix base on the previous patch?
> 
>> That doesn't make sense.  This patch was generated from a current tree
>> where it's applied on top of that: (It's based on next when it should
>> really be based on Linus's tree, but it doesn't matter for patch
>> application because there's no changes in next right now to bcache that
>> aren't in Linus's tree).
> 
> Originally, I did not mean merge conflicts, but a logical conflict in the
> code, since the previous patch adds a new input parameter
> set_uuid to bch_cached_dev_attach(): if set_uuid is not NULL,
> we use set_uuid as the cache set uuid; otherwise, we use dc->sb.set_uuid as
> the cache set uuid.
> 
> But now, I have read your patch again, and realized that you did not use
> dc->sb.set_uuid, but dc->sb.uuid, to judge whether the device is a
> duplicate backing device, so it's OK for me.
> 
>> May I add your reviewed-by so I can send this (and your fix) upstream?
> Reviewed-by: Tang Junhui 
> 
> Thanks 
> Tang Junhui
> 
> Thanks
> Tang Junhui
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



[PATCH 1/2] bcache: fix crashes in duplicate cache device register

2018-03-05 Thread Michael Lyle
From: Tang Junhui 

Kernel crashed when registering a duplicate cache device; the call trace is
below:
[  417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G
   W  OE4.15.5-amd64-preempt-sysrq-20171018 #2
[  417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS
N1DET41W (1.15 ) 12/31/2015
[  417.643870] RIP: 0010:bdevname+0x13/0x1e
[  417.643876] RSP: 0018:a3aa9138fd38 EFLAGS: 00010282
[  417.643884] RAX:  RBX: 8c8f2f2f8000 RCX: d6701f8
c7edf
[  417.643890] RDX: a3aa9138fd88 RSI: a3aa9138fd88 RDI: 000
0
[  417.643895] RBP: a3aa9138fde0 R08: a3aa9138fae8 R09: 000
1850e
[  417.643901] R10: 8c8eed34b271 R11: 8c8eed34b250 R12: 000
0
[  417.643906] R13: d6701f78f940 R14: 8c8f38f8 R15: 8c8ea7d
9
[  417.643913] FS:  7fde7e66f500() GS:8c8f6144() knlGS:

[  417.643919] CS:  0010 DS:  ES:  CR0: 80050033
[  417.643925] CR2: 0314 CR3: 0007e6fa0001 CR4: 003
606e0
[  417.643931] DR0:  DR1:  DR2: 000
0
[  417.643938] DR3:  DR6: fffe0ff0 DR7: 000
00400
[  417.643946] Call Trace:
[  417.643978]  register_bcache+0x1117/0x1270 [bcache]
[  417.643994]  ? slab_pre_alloc_hook+0x15/0x3c
[  417.644001]  ? slab_post_alloc_hook.isra.44+0xa/0x1a
[  417.644013]  ? kernfs_fop_write+0xf6/0x138
[  417.644020]  kernfs_fop_write+0xf6/0x138
[  417.644031]  __vfs_write+0x31/0xcc
[  417.644043]  ? current_kernel_time64+0x10/0x36
[  417.644115]  ? __audit_syscall_entry+0xbf/0xe3
[  417.644124]  vfs_write+0xa5/0xe2
[  417.644133]  SyS_write+0x5c/0x9f
[  417.644144]  do_syscall_64+0x72/0x81
[  417.644161]  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
[  417.644169] RIP: 0033:0x7fde7e1c1974
[  417.644175] RSP: 002b:7fff13009a38 EFLAGS: 0246 ORIG_RAX: 000
1
[  417.644183] RAX: ffda RBX: 01658280 RCX: 7fde7e1c
1974
[  417.644188] RDX: 000a RSI: 01658280 RDI: 
0001
[  417.644193] RBP: 000a R08: 0003 R09: 
0077
[  417.644198] R10: 089e R11: 0246 R12: 
0001
[  417.644203] R13: 000a R14: 7fff R15: 

[  417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0
0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8
b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39
[  417.644302] RIP: bdevname+0x13/0x1e RSP: a3aa9138fd38
[  417.644306] CR2: 0314

When registering a duplicate cache device in register_cache(), after the
call to register_cache_set() fails, bch_cache_release() is called, which
frees bdev, so the subsequent bdevname(bdev, name) crashed the kernel.

Since bch_cache_release() frees bdev, this patch makes sure bdev is freed
when register_cache() fails, and does not free bdev again in
register_bcache() when register_cache() fails.

Signed-off-by: Tang Junhui 
Reported-by: Marc MERLIN 
Tested-by: Michael Lyle 
Reviewed-by: Michael Lyle 
Cc: 
---
 drivers/md/bcache/super.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 312895788036..9c141a8aaacc 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1204,7 +1204,7 @@ static void register_bdev(struct cache_sb *sb, struct 
page *sb_page,
 
return;
 err:
-   pr_notice("error opening %s: %s", bdevname(bdev, name), err);
+   pr_notice("error %s: %s", bdevname(bdev, name), err);
bcache_device_stop(&dc->disk);
 }
 
@@ -1883,6 +1883,8 @@ static int register_cache(struct cache_sb *sb, struct 
page *sb_page,
const char *err = NULL; /* must be set for any error case */
int ret = 0;
 
+   bdevname(bdev, name);
+
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
ca->bdev = bdev;
ca->bdev->bd_holder = ca;
@@ -1891,11 +1893,12 @@ static int register_cache(struct cache_sb *sb, struct 
page *sb_page,
bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
get_page(sb_page);
 
-   if (blk_queue_discard(bdev_get_queue(ca->bdev)))
+   if (blk_queue_discard(bdev_get_queue(bdev)))
ca->discard = CACHE_DISCARD(&ca->sb);
 
ret = cache_alloc(ca);
if (ret != 0) {
+   blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
else
@@ -1918,14 +1921,14 @@ static int register_cache(struct cache_sb *sb, struct 
page *sb_page,
goto out;
}
 
-   pr_info("registered cache device %s", bdevname(bdev, name));
+   pr_info("registered 

[PATCH 2/2] bcache: don't attach backing with duplicate UUID

2018-03-05 Thread Michael Lyle
This can happen e.g. during disk cloning.

This is an incomplete fix: it does not catch duplicate UUIDs earlier
when things are still unattached.  It does not unregister the device.
Further changes to cope better with this are planned but conflict with
Coly's ongoing improvements to handling device errors.  In the meantime,
one can manually stop the device after this has happened.

Attempts to attach a duplicate device result in:

[  136.372404] loop: module loaded
[  136.424461] bcache: register_bdev() registered backing device loop0
[  136.424464] bcache: bch_cached_dev_attach() Tried to attach loop0 but 
duplicate UUID already attached

My test procedure is:

  dd if=/dev/sdb1 of=imgfile bs=1024 count=262144
  losetup -f imgfile

Signed-off-by: Michael Lyle 
Reviewed-by: Tang Junhui 
Cc: 
---
 drivers/md/bcache/super.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 9c141a8aaacc..5cace6892958 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -963,6 +963,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct 
cache_set *c,
uint32_t rtime = cpu_to_le32(get_seconds());
struct uuid_entry *u;
char buf[BDEVNAME_SIZE];
+   struct cached_dev *exist_dc, *t;
 
bdevname(dc->bdev, buf);
 
@@ -987,6 +988,16 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct 
cache_set *c,
return -EINVAL;
}
 
+   /* Check whether already attached */
+   list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
+   if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
+   pr_err("Tried to attach %s but duplicate UUID already 
attached",
+   buf);
+
+   return -EINVAL;
+   }
+   }
+
u = uuid_find(c, dc->sb.uuid);
 
if (u &&
-- 
2.14.1



[PATCH 0/2] Duplicate UUID fixes for 4.16

2018-03-05 Thread Michael Lyle
Jens--

Sorry for a couple of last-minute changes.  Marc Merlin noticed an issue
with disk imaging where if multiple devices were present with the same
bcache UUID that a kernel panic could result.  Tang Junhui fixed this.
I found a related data corruption issue with duplicate backing devices.

Both are minimal, reviewed, and tested.  As always, thanks for your help.

Mike


Re: Hangs in balance_dirty_pages with arm-32 LPAE + highmem

2018-03-05 Thread Laura Abbott

On 02/26/2018 06:28 AM, Michal Hocko wrote:

On Fri 23-02-18 11:51:41, Laura Abbott wrote:

Hi,

The Fedora arm-32 build VMs have a somewhat long standing problem
of hanging when running mkfs.ext4 with a bunch of processes stuck
in D state. This has been seen as far back as 4.13 but is still
present on 4.14:


[...]

This looks like everything is blocked on the writeback completing but
the writeback has been throttled. According to the infra team, this problem
is _not_ seen without LPAE (i.e. only 4G of RAM). I did see
https://patchwork.kernel.org/patch/10201593/ but that doesn't seem to
quite match since this seems to be completely stuck. Any suggestions to
narrow the problem down?


How much dirtyable memory does the system have? We do allow only lowmem
to be dirtyable by default on 32b highmem systems. Maybe you have the
lowmem mostly consumed by the kernel memory. Have you tried to enable
highmem_is_dirtyable?



Setting highmem_is_dirtyable did fix the problem. The infrastructure
people seemed satisfied enough with this (and are happy to have the
machines back). I'll see if they are willing to run a few more tests
to get some more state information.

Thanks,
Laura
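
For anyone hitting the same hang, the knob mentioned above can be flipped at
runtime (assuming a highmem-enabled kernel; persist it via sysctl.conf as
needed):

	# allow highmem to count toward the dirtyable memory pool
	sysctl -w vm.highmem_is_dirtyable=1
	# equivalently:
	echo 1 > /proc/sys/vm/highmem_is_dirtyable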


Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Jason Gunthorpe
On Mon, Mar 05, 2018 at 01:42:12PM -0700, Keith Busch wrote:
> On Mon, Mar 05, 2018 at 01:10:53PM -0700, Jason Gunthorpe wrote:
> > So when reading the above mlx code, we see the first wmb() being used
> > to ensure that CPU stores to cachable memory are visible to the DMA
> > triggered by the doorbell ring.
> 
> IIUC, we don't need a similar barrier for NVMe to ensure memory is
> visible to DMA since the SQE memory is allocated DMA coherent when the
> SQ is not within a CMB.

You still need it.

DMA coherent just means you don't need to call the DMA API after
writing, it says nothing about CPU ordering.

eg on x86 DMA coherent is just normal system memory, and you do need
the SFENCE between system memory stores and DMA triggering MMIO,
apparently.

Jason


Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Keith Busch
On Mon, Mar 05, 2018 at 01:10:53PM -0700, Jason Gunthorpe wrote:
> So when reading the above mlx code, we see the first wmb() being used
> to ensure that CPU stores to cachable memory are visible to the DMA
> triggered by the doorbell ring.

IIUC, we don't need a similar barrier for NVMe to ensure memory is
visible to DMA since the SQE memory is allocated DMA coherent when the
SQ is not within a CMB.
 
> The mmiowb() is used to ensure that DB writes are not combined and not
> issued in any order other than implied by the lock that encloses the
> whole thing. This is needed because uar_map is WC memory.
> 
> We don't have ordering with respect to two writel's here, so if ARM
> performance was a concern the writel could be switched to
> writel_relaxed().
> 
> Presumably nvme has similar requirments, although I guess the DB
> register is mapped UC not WC?

Yep, the NVMe DB register is required by the spec to be mapped UC.


Re: [PATCH v2 00/10] Copy Offload in NVMe Fabrics with P2P PCI Memory

2018-03-05 Thread Stephen Bates
>Yes i need to document that some more in hmm.txt...

Hi Jerome, thanks for the explanation. Can I suggest you update hmm.txt with 
what you sent out?

>  I am about to send RFC for nouveau, i am still working out some bugs.

Great. I will keep an eye out for it. An example user of hmm will be very 
helpful.

> i will fix the MAINTAINERS as part of those.

Awesome, thanks.

Stephen




Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Logan Gunthorpe



On 05/03/18 01:10 PM, Jason Gunthorpe wrote:

So when reading the above mlx code, we see the first wmb() being used
to ensure that CPU stores to cachable memory are visible to the DMA
triggered by the doorbell ring.


Oh, yes, that makes sense. Disregard my previous email as I was wrong.

Logan


Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Logan Gunthorpe



On 05/03/18 12:57 PM, Sagi Grimberg wrote:
Keith, while we're on this, regardless of cmb, is SQE memcopy and DB 
update ordering always guaranteed?


If you look at mlx4 (rdma device driver) that works exactly the same as
nvme you will find:
--
     qp->sq.head += nreq;

     /*
  * Make sure that descriptors are written before
  * doorbell record.
  */
     wmb();

     writel(qp->doorbell_qpn,
    to_mdev(ibqp->device)->uar_map + 
MLX4_SEND_DOORBELL);


     /*
  * Make sure doorbells don't leak out of SQ spinlock
  * and reach the HCA out of order.
  */
     mmiowb();
--


To me, it looks like the wmb() is redundant as writel should guarantee 
the order. (Indeed, per Sinan's comment, writel on arm64 starts with a 
wmb() which means, on that platform, there are two wmb() calls in a row.)


The mmiowb() call, on the other hand, looks correct per my understanding 
of it's purpose with respect to the spinlock.


Logan




Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Jason Gunthorpe
On Mon, Mar 05, 2018 at 09:57:27PM +0200, Sagi Grimberg wrote:

> Keith, while we're on this, regardless of cmb, is SQE memcopy and DB update
> ordering always guaranteed?
> 
> If you look at mlx4 (rdma device driver) that works exactly the same as
> nvme you will find:
> --
> qp->sq.head += nreq;
> 
> /*
>  * Make sure that descriptors are written before
>  * doorbell record.
>  */
> wmb();
> 
> writel(qp->doorbell_qpn,
>to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
> 
> /*
>  * Make sure doorbells don't leak out of SQ spinlock
>  * and reach the HCA out of order.
>  */
> mmiowb();
> --
> 
> That seems to explicitly make sure to place a barrier before updating
> the doorbell. So as I see it, either ordering is guaranteed and the
> above code is redundant, or nvme needs to do the same.

A wmb() is always required before operations that can trigger DMA.

The reason ARM has a barrier in writel() is not to make it ordered
with respect to CPU stores to cachable memory, but to make it ordered
with respect to *other* writels.

Eg Linux defines this:

writel(A, mem);
writel(B, mem);

To always produce two TLPs on PCI-E when mem is UC BAR memory.

And ARM cannot guarentee that without the extra barrier.

So now we see stuff like this:

writel_relaxed(A, mem);
writel_relaxed(B, mem+4);

Which says the TLPs to A and B can be issued in any order..

So when reading the above mlx code, we see the first wmb() being used
to ensure that CPU stores to cachable memory are visible to the DMA
triggered by the doorbell ring.

The mmiowb() is used to ensure that DB writes are not combined and not
issued in any order other than implied by the lock that encloses the
whole thing. This is needed because uar_map is WC memory.

We don't have ordering with respect to two writel's here, so if ARM
performance was a concern the writel could be switched to
writel_relaxed().

Presumably nvme has similar requirments, although I guess the DB
register is mapped UC not WC?

Jason


Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Sagi Grimberg



-   if (nvmeq->sq_cmds_io)
-   memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
-   else
-   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));


Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
the _toio() variant enforces alignment, does the copy with 4 byte
stores, and has a full barrier after the copy. In comparison our
regular memcpy() does none of those things and may use unaligned and
vector load/stores. For normal (cacheable) memory that is perfectly
fine, but they can cause alignment faults when targeted at MMIO
(cache-inhibited) memory.

I think in this particular case it might be ok since we know SQEs are
aligned to 64 byte boundaries and the copy is too small to use our
vectorised memcpy(). I'll assume we don't need explicit ordering
between writes of SQEs since the existing code doesn't seem to care
unless the doorbell is being rung, so you're probably fine there too.
That said, I still think this is a little bit sketchy and at the very
least you should add a comment explaining what's going on when the CMB
is being used. If someone more familiar with the NVMe driver could
chime in I would appreciate it.


I may not be understanding the concern, but I'll give it a shot.

You're right, the start of any SQE is always 64-byte aligned, so that
should satisfy alignment requirements.

The order when writing multiple/successive SQEs in a submission queue
does matter, and this is currently serialized through the q_lock.

The order in which the bytes of a single SQE is written doesn't really
matter as long as the entire SQE is written into the CMB prior to writing
that SQ's doorbell register.

The doorbell register is written immediately after copying a command
entry into the submission queue (ignore "shadow buffer" features),
so the doorbells written to commands submitted is 1:1.

If a CMB SQE and DB order is not enforced with the memcpy, then we do
need a barrier after the SQE's memcpy and before the doorbell's writel.


Keith, while we're on this, regardless of cmb, is SQE memcopy and DB 
update ordering always guaranteed?


If you look at mlx4 (rdma device driver) that works exactly the same as
nvme you will find:
--
qp->sq.head += nreq;

/*
 * Make sure that descriptors are written before
 * doorbell record.
 */
wmb();

writel(qp->doorbell_qpn,
   to_mdev(ibqp->device)->uar_map + 
MLX4_SEND_DOORBELL);


/*
 * Make sure doorbells don't leak out of SQ spinlock
 * and reach the HCA out of order.
 */
mmiowb();
--

That seems to explicitly make sure to place a barrier before updating
the doorbell. So as I see it, either ordering is guaranteed and the
above code is redundant, or nvme needs to do the same.

Thoughts?


Re: [PATCH] lightnvm: pblk: refactor init/exit sequences

2018-03-05 Thread Matias Bjørling

On 03/05/2018 03:18 PM, Javier González wrote:

On 5 Mar 2018, at 15.16, Matias Bjørling  wrote:

On 03/05/2018 02:45 PM, Javier González wrote:

On 5 Mar 2018, at 14.38, Matias Bjørling  wrote:

On 03/01/2018 08:29 PM, Javier González wrote:

On 1 Mar 2018, at 19.49, Matias Bjørling  wrote:

On 03/01/2018 04:59 PM, Javier González wrote:

Refactor init and exit sequences to eliminate dependencies among init
modules and improve readability.
Signed-off-by: Javier González 
---
  drivers/lightnvm/pblk-init.c | 415 +--
  1 file changed, 206 insertions(+), 209 deletions(-)
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 25fc70ca07f7..87c390667dd6 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk)
vfree(pblk->trans_map);
  }
  -static int pblk_l2p_init(struct pblk *pblk)
+static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
+{
+   struct pblk_line *line = NULL;
+
+   if (factory_init) {
+   pblk_setup_uuid(pblk);
+   } else {
+   line = pblk_recov_l2p(pblk);
+   if (IS_ERR(line)) {
+   pr_err("pblk: could not recover l2p table\n");
+   return -EFAULT;
+   }
+   }
+
+#ifdef CONFIG_NVM_DEBUG
+   pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
+   /* Free full lines directly as GC has not been started yet */
+   pblk_gc_free_full_lines(pblk);
+
+   if (!line) {
+   /* Configure next line for user data */
+   line = pblk_line_get_first_data(pblk);
+   if (!line) {
+   pr_err("pblk: line list corrupted\n");
+   return -EFAULT;
+   }
+   }
+
+   return 0;
+}
+
+static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
  {
sector_t i;
struct ppa_addr ppa;
@@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk)
for (i = 0; i < pblk->rl.nr_secs; i++)
pblk_trans_map_set(pblk, i, ppa);
  - return 0;
+   return pblk_l2p_recover(pblk, factory_init);
  }
static void pblk_rwb_free(struct pblk *pblk)
@@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk)
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct nvm_addr_format ppaf = geo->ppaf;
-   int power_len;
+   int mod, power_len;
+
+   div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
+   if (mod) {
+   pr_err("pblk: bad configuration of sectors/pages\n");
+   return -EINVAL;
+   }
/* Re-calculate channel and lun format to adapt to configuration */
power_len = get_count_order(geo->nr_chnls);
@@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk)
  {
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
+   int max_write_ppas;
+
+   atomic64_set(&pblk->user_wa, 0);
+   atomic64_set(&pblk->pad_wa, 0);
+   atomic64_set(&pblk->gc_wa, 0);
+   pblk->user_rst_wa = 0;
+   pblk->pad_rst_wa = 0;
+   pblk->gc_rst_wa = 0;
+
+   atomic64_set(&pblk->nr_flush, 0);
+   pblk->nr_flush_rst = 0;
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
geo->nr_planes * geo->all_luns;
  + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+   max_write_ppas = pblk->min_write_pgs * geo->all_luns;
+   pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
+   pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
+
+   if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+   pr_err("pblk: vector list too big(%u > %u)\n",
+   pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
+   return -EINVAL;
+   }
+
+   pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t),
+   GFP_KERNEL);
+   if (!pblk->pad_dist)
+   return -ENOMEM;
+
if (pblk_init_global_caches(pblk))
-   return -ENOMEM;
+   goto fail_free_pad_dist;
/* Internal bios can be at most the sectors signaled by the device. */
pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
@@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk)
if (pblk_set_ppaf(pblk))
goto free_r_end_wq;
  - if (pblk_rwb_init(pblk))
-   goto free_r_end_wq;
-
INIT_LIST_HEAD(&pblk->compl_list);
+
return 0;
free_r_end_wq:
@@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk)
mempool_destroy(pblk->page_bio_pool);
  free_global_caches:
pblk_free_global_caches(pblk);

Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Logan Gunthorpe



On 05/03/18 11:02 AM, Sinan Kaya wrote:

writel has a barrier inside on ARM64.

https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/asm/io.h#L143


Yes, and no barrier inside memcpy_toio as it uses __raw_writes. This 
should be sufficient as we are only accessing addresses that look like 
memory and have no side effects (those enabling doorbell accesses may 
need to worry about this though). Typically, what could happen, in this 
case, is the CPU would issue writes to the BAR normally and the next 
time it programmed the DMA engine it would flush everything via the 
flush in writel.



Why do you need another barrier?


We don't.

Thanks,

Logan


Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Logan Gunthorpe



On 05/03/18 09:00 AM, Keith Busch wrote:

On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote:

On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe  wrote:

@@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
  {
 u16 tail = nvmeq->sq_tail;



-   if (nvmeq->sq_cmds_io)
-   memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
-   else
-   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));


Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
the _toio() variant enforces alignment, does the copy with 4 byte
stores, and has a full barrier after the copy. In comparison our
regular memcpy() does none of those things and may use unaligned and
vector load/stores. For normal (cacheable) memory that is perfectly
fine, but they can cause alignment faults when targeted at MMIO
(cache-inhibited) memory.

I think in this particular case it might be ok since we know SQEs are
aligned to 64 byte boundaries and the copy is too small to use our
vectorised memcpy(). I'll assume we don't need explicit ordering
between writes of SQEs since the existing code doesn't seem to care
unless the doorbell is being rung, so you're probably fine there too.
That said, I still think this is a little bit sketchy and at the very
least you should add a comment explaining what's going on when the CMB
is being used. If someone more familiar with the NVMe driver could
chime in I would appreciate it.


I may not be understanding the concern, but I'll give it a shot.

You're right, the start of any SQE is always 64-byte aligned, so that
should satisfy alignment requirements.

The order when writing multiple/successive SQEs in a submission queue
does matter, and this is currently serialized through the q_lock.

The order in which the bytes of a single SQE is written doesn't really
matter as long as the entire SQE is written into the CMB prior to writing
that SQ's doorbell register.

The doorbell register is written immediately after copying a command
entry into the submission queue (ignore "shadow buffer" features),
so the doorbells written to commands submitted is 1:1.

If a CMB SQE and DB order is not enforced with the memcpy, then we do
need a barrier after the SQE's memcpy and before the doorbell's writel.



Thanks for the information Keith.

Adding to this: regular memcpy generally also enforces alignment as 
unaligned access to regular memory is typically bad in some way on most 
arches. The generic memcpy_toio also does not have any barrier as it is 
just a call to memcpy. Arm64 also does not appear to have a barrier in 
its implementation and in the short survey I did I could not find any 
implementation with a barrier. I also did not find a ppc implementation 
in the tree but it would be weird for it to add a barrier when other 
arches do not appear to need it.


We've been operating on the assumption that memory mapped by 
devm_memremap_pages() can be treated as regular memory. This is 
emphasized by the fact that it does not return an __iomem pointer. If 
this assumption does not hold for an arch then we cannot support P2P DMA 
without an overhaul of many kernel interfaces or creating other backend 
interfaces into the drivers which take different data types (i.e. we'd 
have to bypass the entire block layer when trying to write data in 
p2pmem to an nvme device). This is very undesirable.


Logan





Re: vgdisplay hang on iSCSI session

2018-03-05 Thread Bart Van Assche
On Mon, 2018-03-05 at 10:44 +0100, Jean-Louis Dupond wrote:
> Maybe a long shot, but could it be fixed by 
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/drivers/block/loop.c?h=v4.9.86=56bc086358cac1a2949783646eabd57447b9d672
>  
> ?
> Or shouldn't that fix such kind of issues?
> 
> It seems like some kind of race condition, since it happens so randomly 
> and not on a specific action.

Hello Jean-Louis,

Have you already tested any of the kernel versions shown below? All these kernel
versions should include that fix:

$ git tag --contains 56bc086358cac1a2949783646eabd57447b9d672
v4.9.80
v4.9.81
v4.9.82
v4.9.83
v4.9.84
v4.9.85
v4.9.86

$ git tag --contains ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5
v4.15
v4.15.1
v4.15.2
v4.15.3
v4.15.4
v4.15.5
v4.15.6
v4.15.7

Thanks,

Bart.

Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-03-05 Thread Keith Busch
On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote:
> On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe  wrote:
> > @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
> >  {
> > u16 tail = nvmeq->sq_tail;
> 
> > -   if (nvmeq->sq_cmds_io)
> > -   memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
> > -   else
> > -   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> > +   memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
> 
> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC
> the _toio() variant enforces alignment, does the copy with 4 byte
> stores, and has a full barrier after the copy. In comparison our
> regular memcpy() does none of those things and may use unaligned and
> vector load/stores. For normal (cacheable) memory that is perfectly
> fine, but they can cause alignment faults when targeted at MMIO
> (cache-inhibited) memory.
> 
> I think in this particular case it might be ok since we know SQEs are
> aligned to 64 byte boundaries and the copy is too small to use our
> vectorised memcpy(). I'll assume we don't need explicit ordering
> between writes of SQEs since the existing code doesn't seem to care
> unless the doorbell is being rung, so you're probably fine there too.
> That said, I still think this is a little bit sketchy and at the very
> least you should add a comment explaining what's going on when the CMB
> is being used. If someone more familiar with the NVMe driver could
> chime in I would appreciate it.

I may not be understanding the concern, but I'll give it a shot.

You're right, the start of any SQE is always 64-byte aligned, so that
should satisfy alignment requirements.

The order when writing multiple/successive SQEs in a submission queue
does matter, and this is currently serialized through the q_lock.

The order in which the bytes of a single SQE is written doesn't really
matter as long as the entire SQE is written into the CMB prior to writing
that SQ's doorbell register.

The doorbell register is written immediately after copying a command
entry into the submission queue (ignore "shadow buffer" features),
so doorbell writes map 1:1 to submitted commands.

If a CMB SQE and DB order is not enforced with the memcpy, then we do
need a barrier after the SQE's memcpy and before the doorbell's writel.
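
As a sketch of that ordering requirement (illustrative only: nvmeq->q_db
is the driver's doorbell pointer, and the explicit wmb() is the extra
barrier under discussion, needed only if the doorbell write itself does
not order against the preceding memcpy):

	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	wmb();	/* make the SQE visible in the CMB before the doorbell */
	writel(tail, nvmeq->q_db);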


Re: [PATCH] block: null_blk: fix 'Invalid parameters' failure when loading module

2018-03-05 Thread Bart Van Assche
On Sat, 2018-03-03 at 10:24 +0800, Ming Lei wrote:
>  struct nullb_page {
>   struct page *page;
> - unsigned long bitmap;
> + unsigned long bitmap[DIV_ROUND_UP(MAP_SZ, sizeof(unsigned long) * 8)];
>  };

Could DECLARE_BITMAP() have been used here?
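
For context, DECLARE_BITMAP() (from include/linux/types.h) expands to an
array of exactly this shape, so the struct could equivalently read (a
sketch with the same storage layout):

	/* #define DECLARE_BITMAP(name, bits) \
	 *	unsigned long name[BITS_TO_LONGS(bits)] */
	struct nullb_page {
		struct page *page;
		DECLARE_BITMAP(bitmap, MAP_SZ);
	};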

Thanks,

Bart.

[PATCH v3] blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()

2018-03-05 Thread Joseph Qi
We've triggered a WARNING in blk_throtl_bio() when throttling writeback
io: it complains that blkg->refcnt is already 0 when calling blkg_get(),
and then the kernel crashes with an invalid page request.
After investigating this issue, we've found it is caused by a race
between blkcg_bio_issue_check() and cgroup_rmdir(), which is described
below:

writeback kworker   cgroup_rmdir
  cgroup_destroy_locked
kill_css
  css_killed_ref_fn
css_killed_work_fn
  offline_css
blkcg_css_offline
  blkcg_bio_issue_check
rcu_read_lock
blkg_lookup
  spin_trylock(q->queue_lock)
  blkg_destroy
  spin_unlock(q->queue_lock)
blk_throtl_bio
spin_lock_irq(q->queue_lock)
...
spin_unlock_irq(q->queue_lock)
  rcu_read_unlock

Since RCU can only prevent the blkg from being released while it is in
use, blkg->refcnt can drop to 0 during blkg_destroy(), which schedules
the blkg release.
Trying to blkg_get() in blk_throtl_bio() then triggers the WARNING, and
the corresponding blkg_put() schedules the blkg release again, which
results in a double free.
This race was introduced by commit ae1188963611 ("blkcg: consolidate blkg
creation in blkcg_bio_issue_check()"). Before that commit, the code would
look up first and then look up/create again with the queue_lock held.
Since reviving that logic would be rather drastic, fix the race instead
by only offlining the pd during blkcg_css_offline() and moving the rest
of the destruction (especially blkg_put()) into blkcg_css_free(), which
should be the right approach as discussed.

Fixes: ae1188963611 ("blkcg: consolidate blkg creation in blkcg_bio_issue_check()")
Reported-by: Jiufei Xue 
Cc: sta...@vger.kernel.org #4.3+
Signed-off-by: Joseph Qi 
---
 block/blk-cgroup.c | 52 +-
 include/linux/blk-cgroup.h |  2 ++
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c2033a2..2e9f510 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -307,11 +307,27 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
}
 }
 
+static void blkg_pd_offline(struct blkcg_gq *blkg)
+{
+   int i;
+
+   lockdep_assert_held(blkg->q->queue_lock);
+   lockdep_assert_held(&blkg->blkcg->lock);
+
+   for (i = 0; i < BLKCG_MAX_POLS; i++) {
+   struct blkcg_policy *pol = blkcg_policy[i];
+
+   if (blkg->pd[i] && !blkg->pd_offline[i] && pol->pd_offline_fn) {
+   pol->pd_offline_fn(blkg->pd[i]);
+   blkg->pd_offline[i] = true;
+   }
+   }
+}
+
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
struct blkcg *blkcg = blkg->blkcg;
struct blkcg_gq *parent = blkg->parent;
-   int i;
 
lockdep_assert_held(blkg->q->queue_lock);
lockdep_assert_held(&blkcg->lock);
@@ -320,13 +336,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
WARN_ON_ONCE(list_empty(&blkg->q_node));
WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 
-   for (i = 0; i < BLKCG_MAX_POLS; i++) {
-   struct blkcg_policy *pol = blkcg_policy[i];
-
-   if (blkg->pd[i] && pol->pd_offline_fn)
-   pol->pd_offline_fn(blkg->pd[i]);
-   }
-
if (parent) {
blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
@@ -369,6 +378,7 @@ static void blkg_destroy_all(struct request_queue *q)
struct blkcg *blkcg = blkg->blkcg;
 
spin_lock(&blkcg->lock);
+   blkg_pd_offline(blkg);
blkg_destroy(blkg);
spin_unlock(>lock);
}
@@ -1013,7 +1023,7 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
struct request_queue *q = blkg->q;
 
if (spin_trylock(q->queue_lock)) {
-   blkg_destroy(blkg);
+   blkg_pd_offline(blkg);
spin_unlock(q->queue_lock);
} else {
spin_unlock_irq(&blkcg->lock);
@@ -1032,6 +1042,26 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
struct blkcg *blkcg = css_to_blkcg(css);
int i;
 
+   spin_lock_irq(&blkcg->lock);
+
+   while (!hlist_empty(&blkcg->blkg_list)) {
+   struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
+   struct blkcg_gq,
+   blkcg_node);
+   struct request_queue *q = blkg->q;
+
+  

RE: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue

2018-03-05 Thread Don Brace
> -Original Message-
> From: Kashyap Desai [mailto:kashyap.de...@broadcom.com]
> Sent: Monday, March 05, 2018 1:24 AM
> Subject: RE: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> 
> > -Original Message-
> > From: Laurence Oberman [mailto:lober...@redhat.com]
> > Sent: Saturday, March 3, 2018 3:23 AM
> > Subject: Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> >
> > On Fri, 2018-03-02 at 15:03 +, Don Brace wrote:
> > > > -Original Message-
> > > > From: Laurence Oberman [mailto:lober...@redhat.com]
> > > > Sent: Friday, March 02, 2018 8:09 AM
> > > > Subject: Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> > > >
> > > > On Fri, 2018-03-02 at 10:16 +0800, Ming Lei wrote:
> > > > > On Thu, Mar 01, 2018 at 04:19:34PM -0500, Laurence Oberman wrote:
> > > > > > On Thu, 2018-03-01 at 14:01 -0500, Laurence Oberman wrote:
> > > > > > > On Thu, 2018-03-01 at 16:18 +, Don Brace wrote:
> > > > > > > > > -Original Message-
> > > > > > > > > From: Ming Lei [mailto:ming@redhat.com]
> > > > > > > > > Sent: Tuesday, February 27, 2018 4:08 AM
> > > > > > > > > Subject: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> > > > > > > > >
> > > > >
> > > > > Seems Don ran into IO failure without blk-mq; could you run your
> > > > > tests again in legacy mode?
> > > > >
> > > > > Thanks,
> > > > > Ming
> > > >
> > > > Hello Ming
> > > > I ran multiple passes on Legacy and still see no issues in my test
> > > > bed
> > > >

Tests ran all weekend without issues.


> > > > BOOT_IMAGE=/vmlinuz-4.16.0-rc2.ming+ root=UUID=43f86d71-b1bf-
> 4789-
> > > > a28e-
> > > > 21c6ddc90195 ro crashkernel=256M@64M log_buf_len=64M
> > > > console=ttyS1,115200n8
> > > >
> > > > HEAD of the git kernel I am using
> > > >
> > > > 694e16f scsi: megaraid: improve scsi_mq performance via .host_tagset
> > > > 793686c scsi: hpsa: improve scsi_mq performance via .host_tagset
> > > > 60d5b36 block: null_blk: introduce module parameter of 'g_host_tags'
> > > > 8847067 scsi: Add template flag 'host_tagset'
> > > > a8fbdd6 blk-mq: introduce BLK_MQ_F_HOST_TAGS
> > > > 4710fab blk-mq: introduce 'start_tag' field to 'struct blk_mq_tags'
> > > > 09bb153 scsi: megaraid_sas: fix selection of reply queue
> > > > 52700d8 scsi: hpsa: fix selection of reply queue
> > >
> > > I 

Re: [PATCH] lightnvm: pblk: refactor init/exit sequences

2018-03-05 Thread Javier González
> On 5 Mar 2018, at 15.16, Matias Bjørling  wrote:
> 
> On 03/05/2018 02:45 PM, Javier González wrote:
>>> On 5 Mar 2018, at 14.38, Matias Bjørling  wrote:
>>> 
>>> On 03/01/2018 08:29 PM, Javier González wrote:
> On 1 Mar 2018, at 19.49, Matias Bjørling  wrote:
> 
> On 03/01/2018 04:59 PM, Javier González wrote:
>> Refactor init and exit sequences to eliminate dependencies among init
>> modules and improve readability.
>> Signed-off-by: Javier González 
>> ---
>>  drivers/lightnvm/pblk-init.c | 415 
>> +--
>>  1 file changed, 206 insertions(+), 209 deletions(-)
>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>> index 25fc70ca07f7..87c390667dd6 100644
>> --- a/drivers/lightnvm/pblk-init.c
>> +++ b/drivers/lightnvm/pblk-init.c
>> @@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk)
>>  vfree(pblk->trans_map);
>>  }
>>  -static int pblk_l2p_init(struct pblk *pblk)
>> +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
>> +{
>> +struct pblk_line *line = NULL;
>> +
>> +if (factory_init) {
>> +pblk_setup_uuid(pblk);
>> +} else {
>> +line = pblk_recov_l2p(pblk);
>> +if (IS_ERR(line)) {
>> +pr_err("pblk: could not recover l2p table\n");
>> +return -EFAULT;
>> +}
>> +}
>> +
>> +#ifdef CONFIG_NVM_DEBUG
>> +pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
>> +#endif
>> +
>> +/* Free full lines directly as GC has not been started yet */
>> +pblk_gc_free_full_lines(pblk);
>> +
>> +if (!line) {
>> +/* Configure next line for user data */
>> +line = pblk_line_get_first_data(pblk);
>> +if (!line) {
>> +pr_err("pblk: line list corrupted\n");
>> +return -EFAULT;
>> +}
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
>>  {
>>  sector_t i;
>>  struct ppa_addr ppa;
>> @@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk)
>>  for (i = 0; i < pblk->rl.nr_secs; i++)
>>  pblk_trans_map_set(pblk, i, ppa);
>>  -   return 0;
>> +return pblk_l2p_recover(pblk, factory_init);
>>  }
>>static void pblk_rwb_free(struct pblk *pblk)
>> @@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk)
>>  struct nvm_tgt_dev *dev = pblk->dev;
>>  struct nvm_geo *geo = &dev->geo;
>>  struct nvm_addr_format ppaf = geo->ppaf;
>> -int power_len;
>> +int mod, power_len;
>> +
>> +div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
>> +if (mod) {
>> +pr_err("pblk: bad configuration of sectors/pages\n");
>> +return -EINVAL;
>> +}
>>  /* Re-calculate channel and lun format to adapt to 
>> configuration */
>>  power_len = get_count_order(geo->nr_chnls);
>> @@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk)
>>  {
>>  struct nvm_tgt_dev *dev = pblk->dev;
>>  struct nvm_geo *geo = &dev->geo;
>> +int max_write_ppas;
>> +
>> +atomic64_set(&pblk->user_wa, 0);
>> +atomic64_set(&pblk->pad_wa, 0);
>> +atomic64_set(&pblk->gc_wa, 0);
>> +pblk->user_rst_wa = 0;
>> +pblk->pad_rst_wa = 0;
>> +pblk->gc_rst_wa = 0;
>> +
>> +atomic64_set(&pblk->nr_flush, 0);
>> +pblk->nr_flush_rst = 0;
>>  pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
>>  geo->nr_planes * 
>> geo->all_luns;
>>  +   pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / 
>> PAGE_SIZE);
>> +max_write_ppas = pblk->min_write_pgs * geo->all_luns;
>> +pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
>> +pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
>> +
>> +if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
>> +pr_err("pblk: vector list too big(%u > %u)\n",
>> +pblk->max_write_pgs, 
>> PBLK_MAX_REQ_ADDRS);
>> +return -EINVAL;
>> +}
>> +
>> +pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * 
>> sizeof(atomic64_t),
>> +
>> 

Re: [PATCH] lightnvm: pblk: refactor init/exit sequences

2018-03-05 Thread Matias Bjørling

On 03/05/2018 02:45 PM, Javier González wrote:

On 5 Mar 2018, at 14.38, Matias Bjørling  wrote:

On 03/01/2018 08:29 PM, Javier González wrote:

On 1 Mar 2018, at 19.49, Matias Bjørling  wrote:

On 03/01/2018 04:59 PM, Javier González wrote:

Refactor init and exit sequences to eliminate dependencies among init
modules and improve readability.
Signed-off-by: Javier González 
---
  drivers/lightnvm/pblk-init.c | 415 +--
  1 file changed, 206 insertions(+), 209 deletions(-)
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 25fc70ca07f7..87c390667dd6 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk)
vfree(pblk->trans_map);
  }
  -static int pblk_l2p_init(struct pblk *pblk)
+static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
+{
+   struct pblk_line *line = NULL;
+
+   if (factory_init) {
+   pblk_setup_uuid(pblk);
+   } else {
+   line = pblk_recov_l2p(pblk);
+   if (IS_ERR(line)) {
+   pr_err("pblk: could not recover l2p table\n");
+   return -EFAULT;
+   }
+   }
+
+#ifdef CONFIG_NVM_DEBUG
+   pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
+   /* Free full lines directly as GC has not been started yet */
+   pblk_gc_free_full_lines(pblk);
+
+   if (!line) {
+   /* Configure next line for user data */
+   line = pblk_line_get_first_data(pblk);
+   if (!line) {
+   pr_err("pblk: line list corrupted\n");
+   return -EFAULT;
+   }
+   }
+
+   return 0;
+}
+
+static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
  {
sector_t i;
struct ppa_addr ppa;
@@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk)
for (i = 0; i < pblk->rl.nr_secs; i++)
pblk_trans_map_set(pblk, i, ppa);
  - return 0;
+   return pblk_l2p_recover(pblk, factory_init);
  }
static void pblk_rwb_free(struct pblk *pblk)
@@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk)
struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
struct nvm_addr_format ppaf = geo->ppaf;
-   int power_len;
+   int mod, power_len;
+
+   div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
+   if (mod) {
+   pr_err("pblk: bad configuration of sectors/pages\n");
+   return -EINVAL;
+   }
/* Re-calculate channel and lun format to adapt to configuration */
power_len = get_count_order(geo->nr_chnls);
@@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk)
  {
struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
+   int max_write_ppas;
+
+   atomic64_set(&pblk->user_wa, 0);
+   atomic64_set(&pblk->pad_wa, 0);
+   atomic64_set(&pblk->gc_wa, 0);
+   pblk->user_rst_wa = 0;
+   pblk->pad_rst_wa = 0;
+   pblk->gc_rst_wa = 0;
+
+   atomic64_set(&pblk->nr_flush, 0);
+   pblk->nr_flush_rst = 0;
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
geo->nr_planes * geo->all_luns;
  + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+   max_write_ppas = pblk->min_write_pgs * geo->all_luns;
+   pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
+   pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
+
+   if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+   pr_err("pblk: vector list too big(%u > %u)\n",
+   pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
+   return -EINVAL;
+   }
+
+   pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t),
+   GFP_KERNEL);
+   if (!pblk->pad_dist)
+   return -ENOMEM;
+
if (pblk_init_global_caches(pblk))
-   return -ENOMEM;
+   goto fail_free_pad_dist;
/* Internal bios can be at most the sectors signaled by the device. */
pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
@@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk)
if (pblk_set_ppaf(pblk))
goto free_r_end_wq;
  - if (pblk_rwb_init(pblk))
-   goto free_r_end_wq;
-
	INIT_LIST_HEAD(&pblk->compl_list);
+
return 0;
free_r_end_wq:
@@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk)
mempool_destroy(pblk->page_bio_pool);
  free_global_caches:
pblk_free_global_caches(pblk);
+fail_free_pad_dist:
+   kfree(pblk->pad_dist);
return -ENOMEM;
  }
  @@ -354,14 +420,8 @@ static void 

Re: [PATCH] lightnvm: pblk: refactor init/exit sequences

2018-03-05 Thread Javier González
> On 5 Mar 2018, at 14.38, Matias Bjørling  wrote:
> 
> On 03/01/2018 08:29 PM, Javier González wrote:
>>> On 1 Mar 2018, at 19.49, Matias Bjørling  wrote:
>>> 
>>> On 03/01/2018 04:59 PM, Javier González wrote:
 Refactor init and exit sequences to eliminate dependencies among init
 modules and improve readability.
 Signed-off-by: Javier González 
 ---
  drivers/lightnvm/pblk-init.c | 415 
 +--
  1 file changed, 206 insertions(+), 209 deletions(-)
 diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
 index 25fc70ca07f7..87c390667dd6 100644
 --- a/drivers/lightnvm/pblk-init.c
 +++ b/drivers/lightnvm/pblk-init.c
 @@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk)
vfree(pblk->trans_map);
  }
  -static int pblk_l2p_init(struct pblk *pblk)
 +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
 +{
 +  struct pblk_line *line = NULL;
 +
 +  if (factory_init) {
 +  pblk_setup_uuid(pblk);
 +  } else {
 +  line = pblk_recov_l2p(pblk);
 +  if (IS_ERR(line)) {
 +  pr_err("pblk: could not recover l2p table\n");
 +  return -EFAULT;
 +  }
 +  }
 +
 +#ifdef CONFIG_NVM_DEBUG
 +  pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
 +#endif
 +
 +  /* Free full lines directly as GC has not been started yet */
 +  pblk_gc_free_full_lines(pblk);
 +
 +  if (!line) {
 +  /* Configure next line for user data */
 +  line = pblk_line_get_first_data(pblk);
 +  if (!line) {
 +  pr_err("pblk: line list corrupted\n");
 +  return -EFAULT;
 +  }
 +  }
 +
 +  return 0;
 +}
 +
 +static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
  {
sector_t i;
struct ppa_addr ppa;
 @@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk)
for (i = 0; i < pblk->rl.nr_secs; i++)
pblk_trans_map_set(pblk, i, ppa);
  - return 0;
 +  return pblk_l2p_recover(pblk, factory_init);
  }
static void pblk_rwb_free(struct pblk *pblk)
 @@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk)
struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
struct nvm_addr_format ppaf = geo->ppaf;
 -  int power_len;
 +  int mod, power_len;
 +
 +  div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
 +  if (mod) {
 +  pr_err("pblk: bad configuration of sectors/pages\n");
 +  return -EINVAL;
 +  }
/* Re-calculate channel and lun format to adapt to 
 configuration */
power_len = get_count_order(geo->nr_chnls);
 @@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk)
  {
struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
 +  int max_write_ppas;
 +
 +  atomic64_set(&pblk->user_wa, 0);
 +  atomic64_set(&pblk->pad_wa, 0);
 +  atomic64_set(&pblk->gc_wa, 0);
 +  pblk->user_rst_wa = 0;
 +  pblk->pad_rst_wa = 0;
 +  pblk->gc_rst_wa = 0;
 +
 +  atomic64_set(&pblk->nr_flush, 0);
 +  pblk->nr_flush_rst = 0;
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
geo->nr_planes * geo->all_luns;
  + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
 +  max_write_ppas = pblk->min_write_pgs * geo->all_luns;
 +  pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
 +  pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
 +
 +  if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
 +  pr_err("pblk: vector list too big(%u > %u)\n",
 +  pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
 +  return -EINVAL;
 +  }
 +
 +  pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t),
 +  GFP_KERNEL);
 +  if (!pblk->pad_dist)
 +  return -ENOMEM;
 +
if (pblk_init_global_caches(pblk))
 -  return -ENOMEM;
 +  goto fail_free_pad_dist;
/* Internal bios can be at most the sectors signaled by the 
 device. */
pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
 @@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk)
if (pblk_set_ppaf(pblk))
goto free_r_end_wq;
  - if (pblk_rwb_init(pblk))
 -  goto free_r_end_wq;
 -
	INIT_LIST_HEAD(&pblk->compl_list);
 +
return 0;
free_r_end_wq:
 @@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk)

Re: [PATCH] lightnvm: pblk: refactor init/exit sequences

2018-03-05 Thread Matias Bjørling

On 03/01/2018 08:29 PM, Javier González wrote:



On 1 Mar 2018, at 19.49, Matias Bjørling  wrote:

On 03/01/2018 04:59 PM, Javier González wrote:

Refactor init and exit sequences to eliminate dependencies among init
modules and improve readability.
Signed-off-by: Javier González 
---
  drivers/lightnvm/pblk-init.c | 415 +--
  1 file changed, 206 insertions(+), 209 deletions(-)
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 25fc70ca07f7..87c390667dd6 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk)
vfree(pblk->trans_map);
  }
  -static int pblk_l2p_init(struct pblk *pblk)
+static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
+{
+   struct pblk_line *line = NULL;
+
+   if (factory_init) {
+   pblk_setup_uuid(pblk);
+   } else {
+   line = pblk_recov_l2p(pblk);
+   if (IS_ERR(line)) {
+   pr_err("pblk: could not recover l2p table\n");
+   return -EFAULT;
+   }
+   }
+
+#ifdef CONFIG_NVM_DEBUG
+   pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk));
+#endif
+
+   /* Free full lines directly as GC has not been started yet */
+   pblk_gc_free_full_lines(pblk);
+
+   if (!line) {
+   /* Configure next line for user data */
+   line = pblk_line_get_first_data(pblk);
+   if (!line) {
+   pr_err("pblk: line list corrupted\n");
+   return -EFAULT;
+   }
+   }
+
+   return 0;
+}
+
+static int pblk_l2p_init(struct pblk *pblk, bool factory_init)
  {
sector_t i;
struct ppa_addr ppa;
@@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk)
for (i = 0; i < pblk->rl.nr_secs; i++)
pblk_trans_map_set(pblk, i, ppa);
  - return 0;
+   return pblk_l2p_recover(pblk, factory_init);
  }
static void pblk_rwb_free(struct pblk *pblk)
@@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk)
struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
struct nvm_addr_format ppaf = geo->ppaf;
-   int power_len;
+   int mod, power_len;
+
+   div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
+   if (mod) {
+   pr_err("pblk: bad configuration of sectors/pages\n");
+   return -EINVAL;
+   }
/* Re-calculate channel and lun format to adapt to configuration */
power_len = get_count_order(geo->nr_chnls);
@@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk)
  {
struct nvm_tgt_dev *dev = pblk->dev;
	struct nvm_geo *geo = &dev->geo;
+   int max_write_ppas;
+
+   atomic64_set(&pblk->user_wa, 0);
+   atomic64_set(&pblk->pad_wa, 0);
+   atomic64_set(&pblk->gc_wa, 0);
+   pblk->user_rst_wa = 0;
+   pblk->pad_rst_wa = 0;
+   pblk->gc_rst_wa = 0;
+
+   atomic64_set(&pblk->nr_flush, 0);
+   pblk->nr_flush_rst = 0;
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
geo->nr_planes * geo->all_luns;
  + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+   max_write_ppas = pblk->min_write_pgs * geo->all_luns;
+   pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
+   pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
+
+   if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+   pr_err("pblk: vector list too big(%u > %u)\n",
+   pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS);
+   return -EINVAL;
+   }
+
+   pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t),
+   GFP_KERNEL);
+   if (!pblk->pad_dist)
+   return -ENOMEM;
+
if (pblk_init_global_caches(pblk))
-   return -ENOMEM;
+   goto fail_free_pad_dist;
/* Internal bios can be at most the sectors signaled by the device. */
pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0);
@@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk)
if (pblk_set_ppaf(pblk))
goto free_r_end_wq;
  - if (pblk_rwb_init(pblk))
-   goto free_r_end_wq;
-
	INIT_LIST_HEAD(&pblk->compl_list);
+
return 0;
free_r_end_wq:
@@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk)
mempool_destroy(pblk->page_bio_pool);
  free_global_caches:
pblk_free_global_caches(pblk);
+fail_free_pad_dist:
+   kfree(pblk->pad_dist);
return -ENOMEM;
  }
  @@ -354,14 +420,8 @@ static void pblk_core_free(struct pblk *pblk)
mempool_destroy(pblk->e_rq_pool);
mempool_destroy(pblk->w_rq_pool);
  - 

Re: [PATCH 01/12] lightnvm: simplify geometry structure.

2018-03-05 Thread Javier González
> On 5 Mar 2018, at 14.07, Matias Bjørling  wrote:
> 
> On 03/02/2018 04:21 PM, Javier González wrote:
>> Currently, the device geometry is stored redundantly in the nvm_id and
>> nvm_geo structures at a device level. Moreover, when instantiating
>> targets on a specific number of LUNs, these structures are replicated
>> and manually modified to fit the instance channel and LUN partitioning.
>> Instead, create a generic geometry around nvm_geo, which can be used by
>> (i) the underlying device to describe the geometry of the whole device,
>> and (ii) instances to describe their geometry independently.
>> Signed-off-by: Javier González 
>> ---
>>  drivers/lightnvm/core.c  |  70 +++-
>>  drivers/lightnvm/pblk-core.c |  16 +-
>>  drivers/lightnvm/pblk-gc.c   |   2 +-
>>  drivers/lightnvm/pblk-init.c | 113 +++--
>>  drivers/lightnvm/pblk-read.c |   2 +-
>>  drivers/lightnvm/pblk-recovery.c |  14 +-
>>  drivers/lightnvm/pblk-rl.c   |   2 +-
>>  drivers/lightnvm/pblk-sysfs.c|  35 ++--
>>  drivers/lightnvm/pblk-write.c|   2 +-
>>  drivers/lightnvm/pblk.h  |  83 --
>>  drivers/nvme/host/lightnvm.c | 339 
>> +++
>>  include/linux/lightnvm.h | 198 +++
>>  12 files changed, 451 insertions(+), 425 deletions(-)
>> diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
>> index 19c46ebb1b91..9a417d9cdf0c 100644
>> --- a/drivers/lightnvm/core.c
>> +++ b/drivers/lightnvm/core.c
>> @@ -155,7 +155,7 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct 
>> nvm_dev *dev,
>>  int blun = lun_begin % dev->geo.nr_luns;
>>  int lunid = 0;
>>  int lun_balanced = 1;
>> -int prev_nr_luns;
>> +int sec_per_lun, prev_nr_luns;
>>  int i, j;
>>  nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;
>> @@ -215,18 +215,23 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct 
>> nvm_dev *dev,
>>  if (!tgt_dev)
>>  goto err_ch;
>>  +   /* Inherit device geometry from parent */
>>  memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
>> +
>>  /* Target device only owns a portion of the physical device */
>>  tgt_dev->geo.nr_chnls = nr_chnls;
>> -tgt_dev->geo.all_luns = nr_luns;
>>  tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
>> +tgt_dev->geo.all_luns = nr_luns;
>> +tgt_dev->geo.all_chunks = nr_luns * dev->geo.nr_chks;
>> +
>>  tgt_dev->geo.op = op;
>> -tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
>> +
>> +sec_per_lun = dev->geo.clba * dev->geo.nr_chks;
>> +tgt_dev->geo.total_secs = nr_luns * sec_per_lun;
>> +
>>  tgt_dev->q = dev->q;
>>  tgt_dev->map = dev_map;
>>  tgt_dev->luns = luns;
>> -memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
>> -
>>  tgt_dev->parent = dev;
>>  return tgt_dev;
>> @@ -296,8 +301,6 @@ static int __nvm_config_simple(struct nvm_dev *dev,
>>  static int __nvm_config_extended(struct nvm_dev *dev,
>>   struct nvm_ioctl_create_extended *e)
>>  {
>> -struct nvm_geo *geo = &dev->geo;
>> -
>>  if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
>>  e->lun_begin = 0;
>>  e->lun_end = dev->geo.all_luns - 1;
>> @@ -311,7 +314,7 @@ static int __nvm_config_extended(struct nvm_dev *dev,
>>  return -EINVAL;
>>  }
>>  -   return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
>> +return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end);
>>  }
>>static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create 
>> *create)
>> @@ -406,7 +409,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct 
>> nvm_ioctl_create *create)
>>  tqueue->queuedata = targetdata;
>>  blk_queue_max_hw_sectors(tqueue,
>> -(dev->geo.sec_size >> 9) * NVM_MAX_VLBA);
>> +(dev->geo.csecs >> 9) * NVM_MAX_VLBA);
>>  set_capacity(tdisk, tt->capacity(targetdata));
>>  add_disk(tdisk);
>> @@ -841,40 +844,9 @@ EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
>>static int nvm_core_init(struct nvm_dev *dev)
>>  {
>> -struct nvm_id *id = &dev->identity;
>>  struct nvm_geo *geo = &dev->geo;
>>  int ret;
>>  -   memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
>> -
>> -if (id->mtype != 0) {
>> -pr_err("nvm: memory type not supported\n");
>> -return -EINVAL;
>> -}
>> -
>> -/* Whole device values */
>> -geo->nr_chnls = id->num_ch;
>> -geo->nr_luns = id->num_lun;
>> -
>> -/* Generic device geometry values */
>> -geo->ws_min = id->ws_min;
>> -geo->ws_opt = id->ws_opt;
>> -geo->ws_seq = id->ws_seq;
>> -geo->ws_per_chk = id->ws_per_chk;
>> -geo->nr_chks = id->num_chk;
>> -geo->mccap = id->mccap;
>> -
>> -geo->sec_per_chk = id->clba;
>> -geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
>> -geo->all_luns = 

Re: [PATCH 01/12] lightnvm: simplify geometry structure.

2018-03-05 Thread Matias Bjørling

On 03/02/2018 04:21 PM, Javier González wrote:

Currently, the device geometry is stored redundantly in the nvm_id and
nvm_geo structures at a device level. Moreover, when instantiating
targets on a specific number of LUNs, these structures are replicated
and manually modified to fit the instance channel and LUN partitioning.

Instead, create a generic geometry around nvm_geo, which can be used by
(i) the underlying device to describe the geometry of the whole device,
and (ii) instances to describe their geometry independently.

Signed-off-by: Javier González 
---
  drivers/lightnvm/core.c  |  70 +++-
  drivers/lightnvm/pblk-core.c |  16 +-
  drivers/lightnvm/pblk-gc.c   |   2 +-
  drivers/lightnvm/pblk-init.c | 113 +++--
  drivers/lightnvm/pblk-read.c |   2 +-
  drivers/lightnvm/pblk-recovery.c |  14 +-
  drivers/lightnvm/pblk-rl.c   |   2 +-
  drivers/lightnvm/pblk-sysfs.c|  35 ++--
  drivers/lightnvm/pblk-write.c|   2 +-
  drivers/lightnvm/pblk.h  |  83 --
  drivers/nvme/host/lightnvm.c | 339 +++
  include/linux/lightnvm.h | 198 +++
  12 files changed, 451 insertions(+), 425 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 19c46ebb1b91..9a417d9cdf0c 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -155,7 +155,7 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct 
nvm_dev *dev,
int blun = lun_begin % dev->geo.nr_luns;
int lunid = 0;
int lun_balanced = 1;
-   int prev_nr_luns;
+   int sec_per_lun, prev_nr_luns;
int i, j;
  
  	nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1;

@@ -215,18 +215,23 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct 
nvm_dev *dev,
if (!tgt_dev)
goto err_ch;
  
+	/* Inherit device geometry from parent */

memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
+
/* Target device only owns a portion of the physical device */
tgt_dev->geo.nr_chnls = nr_chnls;
-   tgt_dev->geo.all_luns = nr_luns;
tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
+   tgt_dev->geo.all_luns = nr_luns;
+   tgt_dev->geo.all_chunks = nr_luns * dev->geo.nr_chks;
+
tgt_dev->geo.op = op;
-   tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
+
+   sec_per_lun = dev->geo.clba * dev->geo.nr_chks;
+   tgt_dev->geo.total_secs = nr_luns * sec_per_lun;
+
tgt_dev->q = dev->q;
tgt_dev->map = dev_map;
tgt_dev->luns = luns;
-   memcpy(&tgt_dev->identity, &dev->identity, sizeof(struct nvm_id));
-
tgt_dev->parent = dev;
  
  	return tgt_dev;

@@ -296,8 +301,6 @@ static int __nvm_config_simple(struct nvm_dev *dev,
  static int __nvm_config_extended(struct nvm_dev *dev,
 struct nvm_ioctl_create_extended *e)
  {
-   struct nvm_geo *geo = &dev->geo;
-
	if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
e->lun_begin = 0;
e->lun_end = dev->geo.all_luns - 1;
@@ -311,7 +314,7 @@ static int __nvm_config_extended(struct nvm_dev *dev,
return -EINVAL;
}
  
-	return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);

+   return nvm_config_check_luns(&dev->geo, e->lun_begin, e->lun_end);
  }
  
  static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)

@@ -406,7 +409,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct 
nvm_ioctl_create *create)
tqueue->queuedata = targetdata;
  
  	blk_queue_max_hw_sectors(tqueue,

-   (dev->geo.sec_size >> 9) * NVM_MAX_VLBA);
+   (dev->geo.csecs >> 9) * NVM_MAX_VLBA);
  
  	set_capacity(tdisk, tt->capacity(targetdata));

add_disk(tdisk);
@@ -841,40 +844,9 @@ EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
  
  static int nvm_core_init(struct nvm_dev *dev)

  {
-   struct nvm_id *id = &dev->identity;
	struct nvm_geo *geo = &dev->geo;
int ret;
  
-	memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));

-
-   if (id->mtype != 0) {
-   pr_err("nvm: memory type not supported\n");
-   return -EINVAL;
-   }
-
-   /* Whole device values */
-   geo->nr_chnls = id->num_ch;
-   geo->nr_luns = id->num_lun;
-
-   /* Generic device geometry values */
-   geo->ws_min = id->ws_min;
-   geo->ws_opt = id->ws_opt;
-   geo->ws_seq = id->ws_seq;
-   geo->ws_per_chk = id->ws_per_chk;
-   geo->nr_chks = id->num_chk;
-   geo->mccap = id->mccap;
-
-   geo->sec_per_chk = id->clba;
-   geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
-   geo->all_luns = geo->nr_luns * geo->nr_chnls;
-
-   /* 1.2 spec device geometry values */
-   geo->plane_mode = 1 << geo->ws_seq;
-   geo->nr_planes = geo->ws_opt / geo->ws_min;
-   geo->sec_per_pg = geo->ws_min;
-   

[PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1

2018-03-05 Thread Konstantin Khlebnikov
Rate should never overflow or become zero because it is used as a divisor.
This patch accumulates it with saturation.

Signed-off-by: Konstantin Khlebnikov 
---
 block/bfq-iosched.c |8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index aeca22d91101..a236c8d541b5 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd,
 
 static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
 {
-   u32 rate, weight, divisor;
+   u32 weight, divisor;
+   u64 rate;
 
/*
 * For the convergence property to hold (see comments on
@@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
 */
bfqd->peak_rate *= divisor-1;
bfqd->peak_rate /= divisor;
-   rate /= divisor; /* smoothing constant alpha = 1/divisor */
+   do_div(rate, divisor);  /* smoothing constant alpha = 1/divisor */
 
-   bfqd->peak_rate += rate;
+   /* rate should never overflow or become zero */
+   bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, U32_MAX);
update_thr_responsiveness_params(bfqd);
 
 reset_computation:
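
For illustration, the resulting update amounts to the following
self-contained helper (a sketch of the same arithmetic, not the patch
itself; do_div() and clamp_t() are the kernel helpers used above):

	static u32 peak_rate_update(u32 peak_rate, u64 rate, u32 divisor)
	{
		u64 acc = (u64)peak_rate * (divisor - 1);

		do_div(acc, divisor);	/* old estimate, weight (divisor-1)/divisor */
		do_div(rate, divisor);	/* new sample, weight alpha = 1/divisor */

		/* saturate: peak_rate is later used as a divider, so it
		 * must stay within 1..U32_MAX */
		return clamp_t(u64, acc + rate, 1, U32_MAX);
	}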



loop: properly declare rotational flag of underlying device

2018-03-05 Thread Holger Hoffstätte

The loop driver has always declared the rotational flag of its device as
rotational, even when the device of the mapped file is nonrotational,
as is the case with SSDs or on tmpfs. This can confuse filesystem tools
which are SSD-aware; in my case I frequently forget to tell mkfs.btrfs
that my loop device on tmpfs is nonrotational, and that I really don't
need any automatic metadata redundancy.

The attached patch fixes this by introspecting the rotational flag of the
mapped file's underlying block device, if it exists. If the mapped file's
filesystem has no associated block device - as is the case on e.g. tmpfs -
we assume nonrotational storage. If there is a better way to identify such
non-devices, I'd love to hear it.

On a previous submission (~2 years ago I think) Jens commented that this
"changes the default". That's precisely the point: the default of declaring
rotational behaviour unconditionally is more likely to be wrong than
the other way around. To the best of my knowledge there are no byte-
addressable but physically rotating media without associated block device.


This is fresh against 4.16-rc4 and has been in production since 4.9,
with some minor changes to accommodate initialization order in 4.14.

Please consider for 4.17.

Signed-off-by: Holger Hoffstätte 

cheers,
Holger

diff -rup linux-4.16-rc4/drivers/block/loop.c linux-4.16-rc4-loop/drivers/block/loop.c
--- linux-4.16-rc4/drivers/block/loop.c 2018-03-04 23:54:11.0 +0100
+++ linux-4.16-rc4-loop/drivers/block/loop.c 2018-03-05 00:41:48.013602156 +0100
@@ -829,6 +829,24 @@ static void loop_config_discard(struct l
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
+static void loop_update_rotational(struct loop_device *lo)
+{
+   struct file *file = lo->lo_backing_file;
+   struct inode *file_inode = file->f_mapping->host;
+   struct block_device *file_bdev = file_inode->i_sb->s_bdev;
+   struct request_queue *q = lo->lo_queue;
+   bool nonrot = true;
+
+   /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
+   if (file_bdev)
+   nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev));
+
+   if (nonrot)
+   queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+   else
+   queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
+}
+
 static void loop_unprepare_queue(struct loop_device *lo)
 {
	kthread_flush_worker(&lo->worker);
@@ -929,6 +947,7 @@ static int loop_set_fd(struct loop_devic
loop_update_dio(lo);
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
+   loop_update_rotational(lo);
loop_sysfs_init(lo);
/* let user-space know about the new size */
	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
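
With the patch applied, the expected behaviour for e.g. a tmpfs-backed
loop device would be (a hypothetical session; assumes /tmp is tmpfs and
loop0 is the first free device):

	# truncate -s 1G /tmp/img && losetup -f /tmp/img
	# cat /sys/block/loop0/queue/rotational
	0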


Some bcache patches need review

2018-03-05 Thread tang . junhui
Hello Mike

I sent some patches a while ago, with no response.

The patches below are very simple; can you or anybody else review them?
[PATCH] bcache: fix incorrect sysfs output value of strip size
[PATCH] bcache: fix error return value in memory shrink
[PATCH] bcache: fix error count in memory shrink

The patches below are simple too, and important for me and also for 
bcache; can you or anybody else review them?
[PATCH] bcache: finish incremental GC
[PATCH] bcache: calculate the number of incremental GC nodes according to the 
total of btree nodes

Thanks,
Tang Junhui


Re: vgdisplay hang on iSCSI session

2018-03-05 Thread Jean-Louis Dupond

Hi,

Maybe a long shot, but could it be fixed by 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/drivers/block/loop.c?h=v4.9.86=56bc086358cac1a2949783646eabd57447b9d672 
?

Or shouldn't that fix such kind of issues?

It seems like some kind of race condition, since it happens so randomly 
and not on a specific action.


Thanks
Jean-Louis

Op 2018-03-04 20:26, schreef Bart Van Assche:

On Sun, 2018-03-04 at 20:01 +0100, Jean-Louis Dupond wrote:

I'm running indeed CentOS 6 with the Virt SIG kernels. Already updated
to 4.9.75, but recently hit the problem again.

The first PID that was in D-state (root 27157 0.0 0.0 127664 5196 ? D
06:19 0:00 \_ vgdisplay -c --ignorelockingfailure) had the following
stack:
# cat /proc/27157/stack
[] blk_mq_freeze_queue_wait+0x6f/0xd0
[] blk_freeze_queue+0x1e/0x30
[] blk_mq_freeze_queue+0xe/0x10
[] loop_switch+0x1e/0xd0
[] lo_release+0x7a/0x80
[] __blkdev_put+0x1a7/0x200
[] blkdev_put+0x56/0x140
[] blkdev_close+0x24/0x30
[] __fput+0xc8/0x240
[] fput+0xe/0x10
[] task_work_run+0x68/0xa0
[] exit_to_usermode_loop+0xc6/0xd0
[] do_syscall_64+0x185/0x240
[] entry_SYSCALL64_slow_path+0x25/0x25
[] 0x

Other procs show the following:
# cat /proc/7803/stack
[] __blkdev_get+0x6c/0x3f0
[] blkdev_get+0x5c/0x1c0
[] blkdev_open+0x62/0x80
[] do_dentry_open+0x22a/0x340
[] vfs_open+0x51/0x80
[] do_last+0x435/0x7a0
[] path_openat+0x87/0x1c0
[] do_filp_open+0x85/0xe0
[] do_sys_open+0x11c/0x210
[] SyS_open+0x1e/0x20
[] do_syscall_64+0x7a/0x240
[] entry_SYSCALL64_slow_path+0x25/0x25
[] 0x

An strace hangs again on loop0 open:
stat("/dev/loop0", {st_mode=S_IFBLK|0660, st_rdev=makedev(7, 0), ...}) 
=

0
open("/dev/loop0", O_RDONLY|O_DIRECT|O_NOATIME

And it seems like indeed alot is hanging on loop0:
# cat /sys/block/loop0/mq/0/queued
5957


Hello Jean-Louis,

Is the system still in this state? If so, can you provide the output of the
following command (as an attachment):

find /sys/kernel/debug/block/ -type f \! \( -name poll_stat -o -name
dispatched -o -name merged -o -name completed \)

Thanks,

Bart.


[PATCH v2] bcache: move closure debug file into debug directory

2018-03-05 Thread tang . junhui
LGTM.

Reviewed-by: Tang Junhui 

> In the current code the closure debug file is outside of the debug
> directory, and when unloading the module there is no removal operation
> for the closure debug file, so reloading the module fails when it
> tries to create the file again.
> 
> This patch moves the closure debug file into the "bcache" debug
> directory so that the file gets deleted properly.
> 
> Signed-off-by: Chengguang Xu 
> ---
> Changes since v1:
> - Move closure debug file into "bcache" debug directory instead of
> deleting it individually.
> - Change Signed-off-by mail address.
> 
>  drivers/md/bcache/closure.c | 9 +
>  drivers/md/bcache/closure.h | 5 +++--
>  drivers/md/bcache/debug.c   | 2 +-
>  drivers/md/bcache/super.c   | 3 +--
>  4 files changed, 10 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
> index 7f12920..64b123c 100644
> --- a/drivers/md/bcache/closure.c
> +++ b/drivers/md/bcache/closure.c
> @@ -157,7 +157,7 @@ void closure_debug_destroy(struct closure *cl)
>  }
>  EXPORT_SYMBOL(closure_debug_destroy);
>  
> -static struct dentry *debug;
> +static struct dentry *closure_debug;
>  
>  static int debug_seq_show(struct seq_file *f, void *data)
>  {
> @@ -199,11 +199,12 @@ static int debug_seq_open(struct inode *inode, struct file *file)
>  .release    = single_release
>  };
>  
> -void __init closure_debug_init(void)
> +int __init closure_debug_init(void)
>  {
> -debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
> +closure_debug = debugfs_create_file("closures",
> +0400, debug, NULL, &debug_ops);
> +return IS_ERR_OR_NULL(closure_debug);
>  }
> -
>  #endif
>  
>  MODULE_AUTHOR("Kent Overstreet ");
> diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
> index 3b9dfc9..0fb704d 100644
> --- a/drivers/md/bcache/closure.h
> +++ b/drivers/md/bcache/closure.h
> @@ -105,6 +105,7 @@
>  struct closure;
>  struct closure_syncer;
>  typedef void (closure_fn) (struct closure *);
> +extern struct dentry *debug;
>  
>  struct closure_waitlist {
>  struct llist_headlist;
> @@ -185,13 +186,13 @@ static inline void closure_sync(struct closure *cl)
>  
>  #ifdef CONFIG_BCACHE_CLOSURES_DEBUG
>  
> -void closure_debug_init(void);
> +int closure_debug_init(void);
>  void closure_debug_create(struct closure *cl);
>  void closure_debug_destroy(struct closure *cl);
>  
>  #else
>  
> -static inline void closure_debug_init(void) {}
> +static inline int closure_debug_init(void) { return 0; }
>  static inline void closure_debug_create(struct closure *cl) {}
>  static inline void closure_debug_destroy(struct closure *cl) {}
>  
> diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
> index af89408..5db02de 100644
> --- a/drivers/md/bcache/debug.c
> +++ b/drivers/md/bcache/debug.c
> @@ -17,7 +17,7 @@
>  #include 
>  #include 
>  
> -static struct dentry *debug;
> +struct dentry *debug;
>  
>  #ifdef CONFIG_BCACHE_DEBUG
>  
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 1a9fdab..b784292 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -2133,7 +2133,6 @@ static int __init bcache_init(void)
>  mutex_init(&bch_register_lock);
>  init_waitqueue_head(&unregister_wait);
>  register_reboot_notifier(&reboot);
> -closure_debug_init();
>  
>  bcache_major = register_blkdev(0, "bcache");
>  if (bcache_major < 0) {
> @@ -2145,7 +2144,7 @@ static int __init bcache_init(void)
>  if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
>  !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
>  bch_request_init() ||
> -bch_debug_init(bcache_kobj) ||
> +bch_debug_init(bcache_kobj) || closure_debug_init() ||
>  sysfs_create_files(bcache_kobj, files))
>  goto err;
>  
> -- 
> 1.8.3.1

Thanks
Tang Junhui