Re: Re: [PATCH] bcache: fix error count in memory shrink
Hi Mike--- >>Hi Tang Junhui--- >> >>I'm not really sure about this one. It changes the semantics of the >>amount of work done-- nr_to_scan now means number of things to free >>instead of the number to check. >> >The code seems to be designed as that, sc->nr_to_scan marks how much btree >nodes to scan in c->btree_cache_used, and it doesn't include the btree >nodes in c->btree_cache_freeable. > I am sorry, sc->nr_to_scan should include the btree nodes in c->btree_cache_freeable. I have sended a new patch. >>If the system is under severe memory pressure, and most of the cache is >>essential/actively used, this could greatly increase the amount of work >>done. >> >Yes, I didn't relize this. >>As to the i < btree_cache_used, it's arguably a bug, but it only reduces >>the amount of work done if the cache is very, very small. >> >I think it is a bug, I'll send a new patch to add variable to record >c->btree_cache_used befor for(;;) loop. > >>Mike >> >>On 01/30/2018 06:46 PM, tang.jun...@zte.com.cn wrote: >>> From: Tang Junhui>>> >>> In bch_mca_scan(), nr btree nodes are expected to shrink, so the for(;;) >>> loop need to satisfy the condition freed < nr. >>> And since c->btree_cache_used always decrease after mca_data_free() calling >>> in for(;;) loop, so we need a auto variable to record c->btree_cache_used >>> before the for(;;) loop. 
>>> >>> Signed-off-by: Tang Junhui >>> --- >>> drivers/md/bcache/btree.c | 4 +++- >>> 1 file changed, 3 insertions(+), 1 deletion(-) >>> >>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c >>> index 81e8dc3..7e9ef3d 100644 >>> --- a/drivers/md/bcache/btree.c >>> +++ b/drivers/md/bcache/btree.c >>> @@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker >>> *shrink, >>> struct btree *b, *t; >>> unsigned long i, nr = sc->nr_to_scan; >>> unsigned long freed = 0; >>> +unsigned int btree_cache_used; >>> >>> if (c->shrinker_disabled) >>> return SHRINK_STOP; >>> @@ -700,7 +701,8 @@ static unsigned long bch_mca_scan(struct shrinker >>> *shrink, >>> } >>> } >>> >>> -for (i = 0; (nr--) && i < c->btree_cache_used; i++) { >>> +btree_cache_used = c->btree_cache_used; >>> +for (i = 0; freed < nr && i < btree_cache_used; i++) { >>> if (list_empty(>btree_cache)) >>> goto out; >>> >>> Thanks, Tang Junhui
[PATCH] [v2] bcache: fix using of loop variable in memory shrink
From: Tang JunhuiIn bch_mca_scan(), There are some confusion and logical error in the use of loop variables. In this patch, we clarify them as: 1) nr: the number of btree nodes needs to scan, which will decrease after we scan a btree node, and should not be less than 0; 2) i: the number of btree nodes have scanned, includes both btree_cache_freeable and btree_cache, which should not be bigger than btree_cache_used; 3) freed: the number of btree nodes have freed. Signed-off-by: Tang Junhui --- drivers/md/bcache/btree.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81e8dc3..7487f8e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, struct btree *b, *t; unsigned long i, nr = sc->nr_to_scan; unsigned long freed = 0; + unsigned int btree_cache_used; if (c->shrinker_disabled) return SHRINK_STOP; @@ -688,9 +689,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; + btree_cache_used = c->btree_cache_used; list_for_each_entry_safe(b, t, >btree_cache_freeable, list) { - if (freed >= nr) - break; + if (nr <= 0) + goto out; if (++i > 3 && !mca_reap(b, 0, false)) { @@ -698,9 +700,10 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, rw_unlock(true, b); freed++; } + nr--; } - for (i = 0; (nr--) && i < c->btree_cache_used; i++) { + for (; (nr--) && i < btree_cache_used; i++) { if (list_empty(>btree_cache)) goto out; -- 1.8.3.1
Re: [PATCH] bcache: fix error count in memory shrink
Hi Mike--- >Hi Tang Junhui--- > >I'm not really sure about this one. It changes the semantics of the >amount of work done-- nr_to_scan now means number of things to free >instead of the number to check. > The code seems to be designed as that, sc->nr_to_scan marks how much btree nodes to scan in c->btree_cache_used, and it doesn't include the btree nodes in c->btree_cache_freeable. >If the system is under severe memory pressure, and most of the cache is >essential/actively used, this could greatly increase the amount of work >done. > Yes, I didn't relize this. >As to the i < btree_cache_used, it's arguably a bug, but it only reduces >the amount of work done if the cache is very, very small. > I think it is a bug, I'll send a new patch to add variable to record c->btree_cache_used befor for(;;) loop. >Mike > >On 01/30/2018 06:46 PM, tang.jun...@zte.com.cn wrote: >> From: Tang Junhui>> >> In bch_mca_scan(), nr btree nodes are expected to shrink, so the for(;;) >> loop need to satisfy the condition freed < nr. >> And since c->btree_cache_used always decrease after mca_data_free() calling >> in for(;;) loop, so we need a auto variable to record c->btree_cache_used >> before the for(;;) loop. 
>> >> Signed-off-by: Tang Junhui >> --- >> drivers/md/bcache/btree.c | 4 +++- >> 1 file changed, 3 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c >> index 81e8dc3..7e9ef3d 100644 >> --- a/drivers/md/bcache/btree.c >> +++ b/drivers/md/bcache/btree.c >> @@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker >> *shrink, >> struct btree *b, *t; >> unsigned long i, nr = sc->nr_to_scan; >> unsigned long freed = 0; >> +unsigned int btree_cache_used; >> >> if (c->shrinker_disabled) >> return SHRINK_STOP; >> @@ -700,7 +701,8 @@ static unsigned long bch_mca_scan(struct shrinker >> *shrink, >> } >> } >> >> -for (i = 0; (nr--) && i < c->btree_cache_used; i++) { >> +btree_cache_used = c->btree_cache_used; >> +for (i = 0; freed < nr && i < btree_cache_used; i++) { >> if (list_empty(>btree_cache)) >> goto out; >> >> Thanks, Tang Junhui
[PATCH V2] block: null_blk: fix 'Invalid parameters' when loading module
On ARM64, the default page size has been 64K on some distributions, and we should allow ARM64 people to play null_blk. This patch fixes the issue by extend page bitmap size for supporting other non-4KB PAGE_SIZE. Cc: Bart Van AsscheCc: Shaohua Li Cc: Kyungchan Koh , Cc: weiping zhang Cc: Yi Zhang Reported-by: Yi Zhang Signed-off-by: Ming Lei --- drivers/block/null_blk.c | 46 +- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 51b16249028a..3c5a684de170 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -72,6 +72,7 @@ enum nullb_device_flags { NULLB_DEV_FL_CACHE = 3, }; +#define MAP_SZ ((PAGE_SIZE >> SECTOR_SHIFT) + 2) /* * nullb_page is a page in memory for nullb devices. * @@ -86,10 +87,10 @@ enum nullb_device_flags { */ struct nullb_page { struct page *page; - unsigned long bitmap; + DECLARE_BITMAP(bitmap, MAP_SZ); }; -#define NULLB_PAGE_LOCK (sizeof(unsigned long) * 8 - 1) -#define NULLB_PAGE_FREE (sizeof(unsigned long) * 8 - 2) +#define NULLB_PAGE_LOCK (MAP_SZ - 1) +#define NULLB_PAGE_FREE (MAP_SZ - 2) struct nullb_device { struct nullb *nullb; @@ -732,7 +733,7 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags) if (!t_page->page) goto out_freepage; - t_page->bitmap = 0; + memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); return t_page; out_freepage: kfree(t_page); @@ -742,13 +743,20 @@ static struct nullb_page *null_alloc_page(gfp_t gfp_flags) static void null_free_page(struct nullb_page *t_page) { - __set_bit(NULLB_PAGE_FREE, _page->bitmap); - if (test_bit(NULLB_PAGE_LOCK, _page->bitmap)) + __set_bit(NULLB_PAGE_FREE, t_page->bitmap); + if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap)) return; __free_page(t_page->page); kfree(t_page); } +static bool null_page_empty(struct nullb_page *page) +{ + int size = MAP_SZ - 2; + + return find_first_bit(page->bitmap, size) == size; +} + static void null_free_sector(struct nullb *nullb, sector_t sector, bool is_cache) { @@ 
-763,9 +771,9 @@ static void null_free_sector(struct nullb *nullb, sector_t sector, t_page = radix_tree_lookup(root, idx); if (t_page) { - __clear_bit(sector_bit, _page->bitmap); + __clear_bit(sector_bit, t_page->bitmap); - if (!t_page->bitmap) { + if (null_page_empty(t_page)) { ret = radix_tree_delete_item(root, idx, t_page); WARN_ON(ret != t_page); null_free_page(ret); @@ -836,7 +844,7 @@ static struct nullb_page *__null_lookup_page(struct nullb *nullb, t_page = radix_tree_lookup(root, idx); WARN_ON(t_page && t_page->page->index != idx); - if (t_page && (for_write || test_bit(sector_bit, _page->bitmap))) + if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) return t_page; return NULL; @@ -899,10 +907,10 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); - __clear_bit(NULLB_PAGE_LOCK, _page->bitmap); - if (test_bit(NULLB_PAGE_FREE, _page->bitmap)) { + __clear_bit(NULLB_PAGE_LOCK, c_page->bitmap); + if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) { null_free_page(c_page); - if (t_page && t_page->bitmap == 0) { + if (t_page && null_page_empty(t_page)) { ret = radix_tree_delete_item(>dev->data, idx, t_page); null_free_page(t_page); @@ -918,11 +926,11 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) for (i = 0; i < PAGE_SECTORS; i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { - if (test_bit(i, _page->bitmap)) { + if (test_bit(i, c_page->bitmap)) { offset = (i << SECTOR_SHIFT); memcpy(dst + offset, src + offset, nullb->dev->blocksize); - __set_bit(i, _page->bitmap); + __set_bit(i, t_page->bitmap); } } @@ -959,10 +967,10 @@ static int null_make_cache_space(struct nullb *nullb, unsigned long n) * We found the page which is being flushed to disk by other * threads */ - if (test_bit(NULLB_PAGE_LOCK, _pages[i]->bitmap)) + if (test_bit(NULLB_PAGE_LOCK,
Re: [PATCH] block: null_blk: fix 'Invalid parameters' failure when loading module
On Mon, Mar 05, 2018 at 03:57:07PM +0000, Bart Van Assche wrote: > On Sat, 2018-03-03 at 10:24 +0800, Ming Lei wrote: > > struct nullb_page { > > struct page *page; > > - unsigned long bitmap; > > + unsigned long bitmap[DIV_ROUND_UP(MAP_SZ, sizeof(unsigned long) * 8)]; > > }; > > Could DECLARE_BITMAP() have been used here? Indeed, will do it in V2. Thanks, Ming
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On 05/03/18 05:49 PM, Oliver wrote: It's in arch/powerpc/kernel/io.c as _memcpy_toio() and it has two full barriers! Awesome! Our io.h indicates that our iomem accessors are designed to provide x86ish strong ordering of accesses to MMIO space. The git log indicates arch/powerpc/kernel/io.c has barely been touched in the last decade so odds are most of that code was written in the elder days when people were less aware of ordering issues. It might just be overly conservative by today's standards, but maybe not (see below). Yes, that seems overly conservative. (I'm not going to suggest ditching the lwsync trick. mpe is not going to take that patch without a really good reason) Well, that's pretty gross. Is this not exactly the situation mmiowb() is meant to solve? See [1]. Though, you're right in principle. Even if power was similar to other systems in this way, it's still a risk that if these pages get passed somewhere in the kernel that uses a spin lock like that without an mmiowb() call, then it's going to have a bug. For now, the risk is pretty low as we know exactly where all the p2pmem pages will be used but if it gets into other places, all bets are off. I did do some work trying to make a safe version of io-pages and also trying to change from pages to pfn_t in large areas but neither approach seemed likely to get any traction in the community, at least not in the near term. Logan [1] ACQUIRES VS I/O ACCESSES in https://www.kernel.org/doc/Documentation/memory-barriers.txt
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On Tue, Mar 6, 2018 at 4:10 AM, Logan Gunthorpewrote: > > > On 05/03/18 09:00 AM, Keith Busch wrote: >> >> On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote: >>> >>> On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpe >>> wrote: @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, { u16 tail = nvmeq->sq_tail; >>> >>> - if (nvmeq->sq_cmds_io) - memcpy_toio(>sq_cmds_io[tail], cmd, sizeof(*cmd)); - else - memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); + memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); >>> >>> >>> Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC >>> the _toio() variant enforces alignment, does the copy with 4 byte >>> stores, and has a full barrier after the copy. In comparison our >>> regular memcpy() does none of those things and may use unaligned and >>> vector load/stores. For normal (cacheable) memory that is perfectly >>> fine, but they can cause alignment faults when targeted at MMIO >>> (cache-inhibited) memory. >>> >>> I think in this particular case it might be ok since we know SEQs are >>> aligned to 64 byte boundaries and the copy is too small to use our >>> vectorised memcpy(). I'll assume we don't need explicit ordering >>> between writes of SEQs since the existing code doesn't seem to care >>> unless the doorbell is being rung, so you're probably fine there too. >>> That said, I still think this is a little bit sketchy and at the very >>> least you should add a comment explaining what's going on when the CMB >>> is being used. If someone more familiar with the NVMe driver could >>> chime in I would appreciate it. >> >> >> I may not be understanding the concern, but I'll give it a shot. >> >> You're right, the start of any SQE is always 64-byte aligned, so that >> should satisfy alignment requirements. >> >> The order when writing multiple/successive SQEs in a submission queue >> does matter, and this is currently serialized through the q_lock. 
>> >> The order in which the bytes of a single SQE is written doesn't really >> matter as long as the entire SQE is written into the CMB prior to writing >> that SQ's doorbell register. >> >> The doorbell register is written immediately after copying a command >> entry into the submission queue (ignore "shadow buffer" features), >> so the doorbells written to commands submitted is 1:1. >> >> If a CMB SQE and DB order is not enforced with the memcpy, then we do >> need a barrier after the SQE's memcpy and before the doorbell's writel. > > > > Thanks for the information Keith. > > Adding to this: regular memcpy generally also enforces alignment as > unaligned access to regular memory is typically bad in some way on most > arches. The generic memcpy_toio also does not have any barrier as it is just > a call to memcpy. Arm64 also does not appear to have a barrier in its > implementation and in the short survey I did I could not find any > implementation with a barrier. I also did not find a ppc implementation in > the tree but it would be weird for it to add a barrier when other arches do > not appear to need it. It's in arch/powerpc/kernel/io.c as _memcpy_toio() and it has two full barriers! Awesome! Our io.h indicates that our iomem accessors are designed to provide x86ish strong ordering of accesses to MMIO space. The git log indicates arch/powerpc/kernel/io.c has barely been touched in the last decade so odds are most of that code was written in the elder days when people were less aware of ordering issues. It might just be overly conservative by today's standards, but maybe not (see below). > We've been operating on the assumption that memory mapped by > devm_memremap_pages() can be treated as regular memory. This is emphasized > by the fact that it does not return an __iomem pointer. I think the lack of an __iomem annotation is a bit of a red herring. 
The comment header for devm_memremap_pages() outlines the conditions for when using it is valid, the important one being: > * 4/ res is expected to be a host memory range that could feasibly be > *treated as a "System RAM" range, i.e. not a device mmio range, but > *this is not enforced. Granted, the CMB is a MMIO buffer rather than a register range, but from my perspective it's still a device MMIO range rather than something we could feasibly treat as system RAM. The two big reasons being: a) System RAM is cacheable and coherent while the CMB is neither, and b) On PPC MMIO accesses are in a separate ordering domain to cacheable memory accesses. The latter isn't as terrifying as it sounds since a full barrier will order everything with everything, but it still causes problems. For performance reasons we want to minimise the number of cases where we need to use a sync instruction (full barrier) in favour of a lightweight sync (lwsync) which only orders cacheable accesses. To
Re: EXT4 Oops (Re: [PATCH V15 06/22] mmc: block: Add blk-mq support)
On 01.03.2018 19:04, Theodore Ts'o wrote: > On Thu, Mar 01, 2018 at 10:55:37AM +0200, Adrian Hunter wrote: >> On 27/02/18 11:28, Adrian Hunter wrote: >>> On 26/02/18 23:48, Dmitry Osipenko wrote: But still something is wrong... I've been getting occasional EXT4 Ooops's, like the one below, and __wait_on_bit() is always figuring in the stacktrace. It never happened with blk-mq disabled, though it could be a coincidence and actually unrelated to blk-mq patches. >>> [ 6625.992337] Unable to handle kernel NULL pointer dereference at virtual address 001c [ 6625.993004] pgd = 00b30c03 [ 6625.993257] [001c] *pgd= [ 6625.993594] Internal error: Oops: 5 [#1] PREEMPT SMP ARM [ 6625.994022] Modules linked in: [ 6625.994326] CPU: 1 PID: 19355 Comm: dpkg Not tainted 4.16.0-rc2-next-20180220-00095-ge9c9f5689a84-dirty #2090 [ 6625.995078] Hardware name: NVIDIA Tegra SoC (Flattened Device Tree) [ 6625.995595] PC is aht dx_probe+0x68/0x684 [ 6625.995947] LR is at __wait_on_bit+0xac/0xc8 > > This doesn't seem to make sense; the PC is where we are currently > executing, and LR is the "Link Register" where the flow of control > will be returning after the current function returns, right? Well, > dx_probe should *not* be returning to __wait_on_bit(). So this just > seems weird. > > Ignoring the LR register, this stack trace looks sane... I can't see > which pointer could be NULL and getting dereferenced, though. How > easily can you reproduce the problem? Can you either (a) translate > the PC into a line number, or better yet, if you can reproduce, add a > series of BUG_ON's so we can see what's going on? 
> > + BUG_ON(frame); > memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); > frame->bh = ext4_read_dirblock(dir, 0, INDEX); > if (IS_ERR(frame->bh)) > return (struct dx_frame *) frame->bh; > > + BUG_ON(frame->bh); > + BUG_ON(frame->bh->b_data); > root = (struct dx_root *) frame->bh->b_data; > if (root->info.hash_version != DX_HASH_TEA && > root->info.hash_version != DX_HASH_HALF_MD4 && > root->info.hash_version != DX_HASH_LEGACY) { > > These are "could never" happen scenarios from looking at the code, but > that will help explain what is going on. > > If this is reliably only happening with mq, the only way I could see > that if is something is returning an error when it previously wasn't. > This isn't a problem we're seeing with any of our testing, though. It happened today again, "BUG_ON(!frame->bh->b_data);" has been trapped. kernel BUG at fs/ext4/namei.c:751! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM Modules linked in: CPU: 0 PID: 296 Comm: cron Not tainted 4.16.0-rc2-next-20180220-00095-ge9c9f5689a84-dirty #2100 Hardware name: NVIDIA Tegra SoC (Flattened Device Tree) PC is at dx_probe+0x308/0x694 LR is at __wait_on_bit+0xac/0xc8 pc : []lr : []psr: 60040013 sp : d545bc20 ip : c0170e88 fp : d545bc74 r10: r9 : d545bca0 r8 : d4209300 r7 : r6 : r5 : d656e838 r4 : d545bcbc r3 : 007b r2 : d5830800 r1 : d5831000 r0 : d4209300 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 1552004a DAC: 0051 Process cron (pid: 296, stack limit = 0x4d1ebf14) Stack: (0xd545bc20 to 0xd545c000) bc20: 02ea c0c019d4 60040113 014000c0 c029e640 d6cf3540 d545bc7c d545bc48 bc40: c02797f4 c0152804 d545bca4 0007 d5830800 d656e838 0001 bc60: d545bca0 d545bd0c d545bc78 c033d578 c033b904 c029e714 c029b088 bc80: 0148 c0c01984 d65f6be0 d545be10 d545bd24 d545bd00 d5830800 bca0: d65f6bf8 d65f6c0c 0007 d6547720 8420edbe c029eec8 d4209300 bcc0: bce0: d545bd48 d65f6be0 d656e838 d65f6be0 d6547720 0001 d545be10 bd00: d545bd44 d545bd10 c033d7c0 c033d1c8 
d545bd34 d656e8b8 d656e838 d545be08 bd20: d656e838 d65f6be0 d656e838 d656e8b8 d6547720 d545bd8c d545bd48 bd40: c028ea50 c033d774 dead4ead d545bd58 d545bd58 bd60: d6d7f015 d545be08 d545bee8 d545bee8 d545bf28 bd80: d545bdd4 d545bd90 c028f310 c028e9b0 d545be08 80808080 d545be08 d6d7f010 bda0: d545bdd4 d545bdb0 c028df9c d545be08 d6d7f010 d545bee8 d545bee8 bdc0: d545bf28 d545be04 d545bdd8 c0290e24 c028f160 c0111a1c c0111674 bde0: d545be04 d545bdf0 0001 d6d7f000 d545be08 0001 d545beb4 d545be08 be00: c0293848 c0290da4 d6dd0310 d6547720 8420edbe 0007 d6d7f015 000c be20: d6dd0310 d6547098 d656e838 0001 0002 0fe0 be40: d545be48 c02797f4 0ff0 d6d7f010 c102b4c8 d5522db8 d6d7f000 be60: c130bbdc 004f73f8 0001 d545bf28 d6d7f000 be80: c0293570 0002 ff9c 0001 ff9c 0001 ff9c d545bee8 bea0: ff9c 004f73f8
Re: [PATCH v2 04/10] PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches
On 05/03/18 03:28 PM, Bjorn Helgaas wrote: If you put the #ifdef right here, then it's easier to read because we can see that "oh, this is a special and uncommon case that I can probably ignore". Makes sense. I'll do that. Thanks, Logan
Re: [PATCH v2 04/10] PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches
On Thu, Mar 01, 2018 at 12:13:10PM -0700, Logan Gunthorpe wrote: > > > On 01/03/18 11:02 AM, Bjorn Helgaas wrote: > > > void pci_enable_acs(struct pci_dev *dev) > > > { > > > + if (pci_p2pdma_disable_acs(dev)) > > > + return; > > > > This doesn't read naturally to me. I do see that when > > CONFIG_PCI_P2PDMA is not set, pci_p2pdma_disable_acs() does nothing > > and returns 0, so we'll go ahead and try to enable ACS as before. > > > > But I think it would be clearer to have an #ifdef CONFIG_PCI_P2PDMA > > right here so it's more obvious that we only disable ACS when it's > > selected. > > I could do this... however, I wrote it this way because I've read Linus > dislikes using #ifdef's inside function bodies and I personally agree with > that sentiment. I try to avoid #ifdefs too, but in this case the plain reading of the code makes no sense (we're in a function called "enable_acs()", and the first thing we do is call a function to "disable_acs()". Disabling ACS is scary for all the security reasons mentioned elsewhere, so a reader who knows what ACS does and cares about virtualization and security *has* to go look up the definition of pci_p2pdma_disable_acs(). If you put the #ifdef right here, then it's easier to read because we can see that "oh, this is a special and uncommon case that I can probably ignore". Bjorn
Re: Some bcache patches need review
Hey Tang Junhui-- You're right. I'm sorry, these had slipped through the cracks / I had lost track of them. I also have the rest of Coly's patchset to work through. I was able to apply the first two, and will continue to work on my for-next branch. Mike On 03/05/2018 02:39 AM, tang.jun...@zte.com.cn wrote: > Hello Mike > > I sent some patches some time before, with no response, > > Below patches are very simple, can you or anybody else have a review? > [PATCH] bcache: fix incorrect sysfs output value of strip size > [PATCH] bcache: fix error return value in memory shrink > [PATCH] bcache: fix error count in memory shrink > > Below patches are simple too, and important for me and also important for > bcache, > can you or anybody else have a review? > [PATCH] bcache: finish incremental GC > [PATCH] bcache: calculate the number of incremental GC nodes according to the > total of btree nodes > > Thanks, > Tang Junhui > -- > To unsubscribe from this list: send the line "unsubscribe linux-bcache" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >
Re: [PATCH] bcache: fix error count in memory shrink
Hi Tang Junhui--- I'm not really sure about this one. It changes the semantics of the amount of work done-- nr_to_scan now means number of things to free instead of the number to check. If the system is under severe memory pressure, and most of the cache is essential/actively used, this could greatly increase the amount of work done. As to the i < btree_cache_used, it's arguably a bug, but it only reduces the amount of work done if the cache is very, very small. Mike On 01/30/2018 06:46 PM, tang.jun...@zte.com.cn wrote: > From: Tang Junhui> > In bch_mca_scan(), nr btree nodes are expected to shrink, so the for(;;) > loop need to satisfy the condition freed < nr. > And since c->btree_cache_used always decrease after mca_data_free() calling > in for(;;) loop, so we need a auto variable to record c->btree_cache_used > before the for(;;) loop. > > Signed-off-by: Tang Junhui > --- > drivers/md/bcache/btree.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c > index 81e8dc3..7e9ef3d 100644 > --- a/drivers/md/bcache/btree.c > +++ b/drivers/md/bcache/btree.c > @@ -664,6 +664,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, > struct btree *b, *t; > unsigned long i, nr = sc->nr_to_scan; > unsigned long freed = 0; > + unsigned int btree_cache_used; > > if (c->shrinker_disabled) > return SHRINK_STOP; > @@ -700,7 +701,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, > } > } > > - for (i = 0; (nr--) && i < c->btree_cache_used; i++) { > + btree_cache_used = c->btree_cache_used; > + for (i = 0; freed < nr && i < btree_cache_used; i++) { > if (list_empty(>btree_cache)) > goto out; > >
Re: [PATCH] bcache: fix error return value in memory shrink
LGTM, applied On 01/30/2018 07:30 PM, tang.jun...@zte.com.cn wrote: > From: Tang Junhui> > In bch_mca_scan(), the return value should not be the number of freed btree > nodes, but the number of pages of freed btree nodes. > > Signed-off-by: Tang Junhui > --- > drivers/md/bcache/btree.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c > index 81e8dc3..a5fbccc 100644 > --- a/drivers/md/bcache/btree.c > +++ b/drivers/md/bcache/btree.c > @@ -718,7 +718,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, > } > out: > mutex_unlock(>bucket_lock); > - return freed; > + return freed * c->btree_pages; > } > > static unsigned long bch_mca_count(struct shrinker *shrink, >
Re: [PATCH] bcache: fix incorrect sysfs output value of strip size
LGTM, applied. On 02/10/2018 06:30 PM, tang.jun...@zte.com.cn wrote: > From: Tang Junhui> > Stripe size is shown as zero when no strip in back end device: > [root@ceph132 ~]# cat /sys/block/sdd/bcache/stripe_size > 0.0k > > Actually it should be 1T Bytes (1 << 31 sectors), but in sysfs > interface, stripe_size was changed from sectors to bytes, and move > 9 bits left, so the 32 bits variable overflows. > > This patch change the variable to a 64 bits type before moving bits. > > Signed-off-by: Tang Junhui > --- > drivers/md/bcache/sysfs.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c > index b418409..240cbec 100644 > --- a/drivers/md/bcache/sysfs.c > +++ b/drivers/md/bcache/sysfs.c > @@ -170,7 +170,7 @@ > sysfs_hprint(dirty_data, >bcache_dev_sectors_dirty(>disk) << 9); > > - sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); > + sysfs_hprint(stripe_size,((uint64_t)dc->disk.stripe_size) << 9); > var_printf(partial_stripes_expensive, "%u"); > > var_hprint(sequential_cutoff); >
Re: [PATCH 0/2] Duplicate UUID fixes for 4.16
On 3/5/18 2:41 PM, Michael Lyle wrote: > Jens-- > > Sorry for a couple of last-minute changes. Marc Merlin noticed an issue > with disk imaging where if multiple devices were present with the same > bcache UUID that a kernel panic could result. Tang Junhui fixed this. > I found a related data corruption issue with duplicate backing devices. > > Both are minimal, reviewed, and tested. As always, thanks for your help. Applied, thanks Mike. -- Jens Axboe
Re: [PATCH] bcache: don't attach backing with duplicate UUID
Hi Tang Junhui--- Thanks for your review. I just sent it upstream (with your change) to Jens. Mike On 03/04/2018 05:07 PM, tang.jun...@zte.com.cn wrote: > Hello Mike > > I send the email from my personal mailbox(110950...@qq.com), it may be fail, > so I resend this email from my office mailbox again. bellow is the mail > context I send previous. > > > > I am Tang Junhui(tang.jun...@zte.com.cn), This email comes from my > personal mailbox, since I am not in office today. > >>> From: Tang Junhui>>> >>> Hello, Mike >>> >>> This patch looks good, but has some conflicts with this patch: >>> bcache: fix for data collapse after re-attaching an attached device >>> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?id=73ac105be390c1de42a2f21643c9778a5e002930 >>> Could you modify your fix base on the previous patch? > >> That doesn't make sense. This patch was generated from a current tree >> where it's applied on top of that: (It's based on next when it should >> really be based on Linus's tree, but it doesn't matter for patch >> application because there's no changes in next right now to bcache that >> aren't in Linus's tree). > > Originally, I did not mean merger conflicts, but the > code logical conflicts, since the previous patch add a new input parameter > set_uuid in bch_cached_dev_attach(), and if set_uuid is not NULL, > we use set_uuid as cache set uuid, otherwise, we use dc->sb.set_uuid as > the cache set uuid. > > But now, I read your patch again, and realize that you did not use > dc->sb.set_uuid, but use dc->sb.uuid to judge whether the device is a > duplicate backend device, so it's OK for me. > >> May I add your reviewed-by so I can send this (and your fix) upstream? 
> Reviewed-by: Tang Junhui > > Thanks > Tang Junhui > > Thanks > Tang Junhui > -- > To unsubscribe from this list: send the line "unsubscribe linux-bcache" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >
[PATCH 1/2] bcache: fix crashes in duplicate cache device register
From: Tang JunhuiKernel crashed when register a duplicate cache device, the call trace is bellow: [ 417.643790] CPU: 1 PID: 16886 Comm: bcache-register Tainted: G W OE4.15.5-amd64-preempt-sysrq-20171018 #2 [ 417.643861] Hardware name: LENOVO 20ERCTO1WW/20ERCTO1WW, BIOS N1DET41W (1.15 ) 12/31/2015 [ 417.643870] RIP: 0010:bdevname+0x13/0x1e [ 417.643876] RSP: 0018:a3aa9138fd38 EFLAGS: 00010282 [ 417.643884] RAX: RBX: 8c8f2f2f8000 RCX: d6701f8 c7edf [ 417.643890] RDX: a3aa9138fd88 RSI: a3aa9138fd88 RDI: 000 0 [ 417.643895] RBP: a3aa9138fde0 R08: a3aa9138fae8 R09: 000 1850e [ 417.643901] R10: 8c8eed34b271 R11: 8c8eed34b250 R12: 000 0 [ 417.643906] R13: d6701f78f940 R14: 8c8f38f8 R15: 8c8ea7d 9 [ 417.643913] FS: 7fde7e66f500() GS:8c8f6144() knlGS: [ 417.643919] CS: 0010 DS: ES: CR0: 80050033 [ 417.643925] CR2: 0314 CR3: 0007e6fa0001 CR4: 003 606e0 [ 417.643931] DR0: DR1: DR2: 000 0 [ 417.643938] DR3: DR6: fffe0ff0 DR7: 000 00400 [ 417.643946] Call Trace: [ 417.643978] register_bcache+0x1117/0x1270 [bcache] [ 417.643994] ? slab_pre_alloc_hook+0x15/0x3c [ 417.644001] ? slab_post_alloc_hook.isra.44+0xa/0x1a [ 417.644013] ? kernfs_fop_write+0xf6/0x138 [ 417.644020] kernfs_fop_write+0xf6/0x138 [ 417.644031] __vfs_write+0x31/0xcc [ 417.644043] ? current_kernel_time64+0x10/0x36 [ 417.644115] ? 
__audit_syscall_entry+0xbf/0xe3 [ 417.644124] vfs_write+0xa5/0xe2 [ 417.644133] SyS_write+0x5c/0x9f [ 417.644144] do_syscall_64+0x72/0x81 [ 417.644161] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 417.644169] RIP: 0033:0x7fde7e1c1974 [ 417.644175] RSP: 002b:7fff13009a38 EFLAGS: 0246 ORIG_RAX: 000 1 [ 417.644183] RAX: ffda RBX: 01658280 RCX: 7fde7e1c 1974 [ 417.644188] RDX: 000a RSI: 01658280 RDI: 0001 [ 417.644193] RBP: 000a R08: 0003 R09: 0077 [ 417.644198] R10: 089e R11: 0246 R12: 0001 [ 417.644203] R13: 000a R14: 7fff R15: [ 417.644213] Code: c7 c2 83 6f ee 98 be 20 00 00 00 48 89 df e8 6c 27 3b 0 0 48 89 d8 5b c3 0f 1f 44 00 00 48 8b 47 70 48 89 f2 48 8b bf 80 00 00 00 <8 b> b0 14 03 00 00 e9 73 ff ff ff 0f 1f 44 00 00 48 8b 47 40 39 [ 417.644302] RIP: bdevname+0x13/0x1e RSP: a3aa9138fd38 [ 417.644306] CR2: 0314 When registering duplicate cache device in register_cache(), after failure on calling register_cache_set(), bch_cache_release() will be called, then bdev will be freed, so bdevname(bdev, name) caused kernel crash. Since bch_cache_release() will free bdev, so in this patch we make sure bdev being freed if register_cache() fail, and do not free bdev again in register_bcache() when register_cache() fail. 
Signed-off-by: Tang Junhui Reported-by: Marc MERLIN Tested-by: Michael Lyle Reviewed-by: Michael Lyle Cc: --- drivers/md/bcache/super.c | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 312895788036..9c141a8aaacc 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1204,7 +1204,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, return; err: - pr_notice("error opening %s: %s", bdevname(bdev, name), err); + pr_notice("error %s: %s", bdevname(bdev, name), err); bcache_device_stop(>disk); } @@ -1883,6 +1883,8 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, const char *err = NULL; /* must be set for any error case */ int ret = 0; + bdevname(bdev, name); + memcpy(>sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; ca->bdev->bd_holder = ca; @@ -1891,11 +1893,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, bio_first_bvec_all(>sb_bio)->bv_page = sb_page; get_page(sb_page); - if (blk_queue_discard(bdev_get_queue(ca->bdev))) + if (blk_queue_discard(bdev_get_queue(bdev))) ca->discard = CACHE_DISCARD(>sb); ret = cache_alloc(ca); if (ret != 0) { + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; else @@ -1918,14 +1921,14 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, goto out; } - pr_info("registered cache device %s", bdevname(bdev, name)); + pr_info("registered
[PATCH 2/2] bcache: don't attach backing with duplicate UUID
This can happen e.g. during disk cloning. This is an incomplete fix: it does not catch duplicate UUIDs earlier when things are still unattached. It does not unregister the device. Further changes to cope better with this are planned but conflict with Coly's ongoing improvements to handling device errors. In the meantime, one can manually stop the device after this has happened. Attempts to attach a duplicate device result in: [ 136.372404] loop: module loaded [ 136.424461] bcache: register_bdev() registered backing device loop0 [ 136.424464] bcache: bch_cached_dev_attach() Tried to attach loop0 but duplicate UUID already attached My test procedure is: dd if=/dev/sdb1 of=imgfile bs=1024 count=262144 losetup -f imgfile Signed-off-by: Michael LyleReviewed-by: Tang Junhui Cc: --- drivers/md/bcache/super.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9c141a8aaacc..5cace6892958 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -963,6 +963,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint32_t rtime = cpu_to_le32(get_seconds()); struct uuid_entry *u; char buf[BDEVNAME_SIZE]; + struct cached_dev *exist_dc, *t; bdevname(dc->bdev, buf); @@ -987,6 +988,16 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return -EINVAL; } + /* Check whether already attached */ + list_for_each_entry_safe(exist_dc, t, >cached_devs, list) { + if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) { + pr_err("Tried to attach %s but duplicate UUID already attached", + buf); + + return -EINVAL; + } + } + u = uuid_find(c, dc->sb.uuid); if (u && -- 2.14.1
[PATCH 0/2] Duplicate UUID fixes for 4.16
Jens-- Sorry for a couple of last-minute changes. Marc Merlin noticed an issue with disk imaging where if multiple devices were present with the same bcache UUID that a kernel panic could result. Tang Junhui fixed this. I found a related data corruption issue with duplicate backing devices. Both are minimal, reviewed, and tested. As always, thanks for your help. Mike
Re: Hangs in balance_dirty_pages with arm-32 LPAE + highmem
On 02/26/2018 06:28 AM, Michal Hocko wrote: On Fri 23-02-18 11:51:41, Laura Abbott wrote: Hi, The Fedora arm-32 build VMs have a somewhat long standing problem of hanging when running mkfs.ext4 with a bunch of processes stuck in D state. This has been seen as far back as 4.13 but is still present on 4.14: [...] This looks like everything is blocked on the writeback completing but the writeback has been throttled. According to the infra team, this problem is _not_ seen without LPAE (i.e. only 4G of RAM). I did see https://patchwork.kernel.org/patch/10201593/ but that doesn't seem to quite match since this seems to be completely stuck. Any suggestions to narrow the problem down? How much dirtyable memory does the system have? We do allow only lowmem to be dirtyable by default on 32b highmem systems. Maybe you have the lowmem mostly consumed by the kernel memory. Have you tried to enable highmem_is_dirtyable? Setting highmem_is_dirtyable did fix the problem. The infrastructure people seemed satisfied enough with this (and are happy to have the machines back). I'll see if they are willing to run a few more tests to get some more state information. Thanks, Laura
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On Mon, Mar 05, 2018 at 01:42:12PM -0700, Keith Busch wrote: > On Mon, Mar 05, 2018 at 01:10:53PM -0700, Jason Gunthorpe wrote: > > So when reading the above mlx code, we see the first wmb() being used > > to ensure that CPU stores to cachable memory are visible to the DMA > > triggered by the doorbell ring. > > IIUC, we don't need a similar barrier for NVMe to ensure memory is > visible to DMA since the SQE memory is allocated DMA coherent when the > SQ is not within a CMB. You still need it. DMA coherent just means you don't need to call the DMA API after writing, it says nothing about CPU ordering. eg on x86 DMA coherent is just normal system memory, and you do need the SFENCE between system memory stores and DMA triggering MMIO, apparently. Jason
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On Mon, Mar 05, 2018 at 01:10:53PM -0700, Jason Gunthorpe wrote: > So when reading the above mlx code, we see the first wmb() being used > to ensure that CPU stores to cachable memory are visible to the DMA > triggered by the doorbell ring. IIUC, we don't need a similar barrier for NVMe to ensure memory is visible to DMA since the SQE memory is allocated DMA coherent when the SQ is not within a CMB. > The mmiowb() is used to ensure that DB writes are not combined and not > issued in any order other than implied by the lock that encloses the > whole thing. This is needed because uar_map is WC memory. > > We don't have ordering with respect to two writel's here, so if ARM > performance was a concern the writel could be switched to > writel_relaxed(). > > Presumably nvme has similar requirements, although I guess the DB > register is mapped UC not WC? Yep, the NVMe DB register is required by the spec to be mapped UC.
Re: [PATCH v2 00/10] Copy Offload in NVMe Fabrics with P2P PCI Memory
>Yes i need to document that some more in hmm.txt... Hi Jerome, thanks for the explanation. Can I suggest you update hmm.txt with what you sent out? > I am about to send RFC for nouveau, i am still working out some bugs. Great. I will keep an eye out for it. An example user of hmm will be very helpful. > i will fix the MAINTAINERS as part of those. Awesome, thanks. Stephen
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On 05/03/18 01:10 PM, Jason Gunthorpe wrote: So when reading the above mlx code, we see the first wmb() being used to ensure that CPU stores to cachable memory are visible to the DMA triggered by the doorbell ring. Oh, yes, that makes sense. Disregard my previous email as I was wrong. Logan
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On 05/03/18 12:57 PM, Sagi Grimberg wrote: Keith, while we're on this, regardless of cmb, is SQE memcopy and DB update ordering always guaranteed? If you look at mlx4 (rdma device driver) that works exactly the same as nvme you will find: -- qp->sq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); writel(qp->doorbell_qpn, to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); /* * Make sure doorbells don't leak out of SQ spinlock * and reach the HCA out of order. */ mmiowb(); -- To me, it looks like the wmb() is redundant as writel should guarantee the order. (Indeed, per Sinan's comment, writel on arm64 starts with a wmb() which means, on that platform, there are two wmb() calls in a row.) The mmiowb() call, on the other hand, looks correct per my understanding of it's purpose with respect to the spinlock. Logan
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On Mon, Mar 05, 2018 at 09:57:27PM +0200, Sagi Grimberg wrote: > Keith, while we're on this, regardless of cmb, is SQE memcopy and DB update > ordering always guaranteed? > > If you look at mlx4 (rdma device driver) that works exactly the same as > nvme you will find: > -- > qp->sq.head += nreq; > > /* > * Make sure that descriptors are written before > * doorbell record. > */ > wmb(); > > writel(qp->doorbell_qpn, >to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); > > /* > * Make sure doorbells don't leak out of SQ spinlock > * and reach the HCA out of order. > */ > mmiowb(); > -- > > That seems to explicitly make sure to place a barrier before updating > the doorbell. So as I see it, either ordering is guaranteed and the > above code is redundant, or nvme needs to do the same. A wmb() is always required before operations that can trigger DMA. The reason ARM has a barrier in writel() is not to make it ordered with respect to CPU stores to cachable memory, but to make it ordered with respect to *other* writels. Eg Linux defines this: writel(A, mem); writel(B, mem); To always produce two TLPs on PCI-E when mem is UC BAR memory. And ARM cannot guarantee that without the extra barrier. So now we see stuff like this: writel_relaxed(A, mem); writel_relaxed(B, mem+4); Which says the TLPs to A and B can be issued in any order.. So when reading the above mlx code, we see the first wmb() being used to ensure that CPU stores to cachable memory are visible to the DMA triggered by the doorbell ring. The mmiowb() is used to ensure that DB writes are not combined and not issued in any order other than implied by the lock that encloses the whole thing. This is needed because uar_map is WC memory. We don't have ordering with respect to two writel's here, so if ARM performance was a concern the writel could be switched to writel_relaxed(). Presumably nvme has similar requirements, although I guess the DB register is mapped UC not WC? Jason
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
- if (nvmeq->sq_cmds_io) - memcpy_toio(>sq_cmds_io[tail], cmd, sizeof(*cmd)); - else - memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); + memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC the _toio() variant enforces alignment, does the copy with 4 byte stores, and has a full barrier after the copy. In comparison our regular memcpy() does none of those things and may use unaligned and vector load/stores. For normal (cacheable) memory that is perfectly fine, but they can cause alignment faults when targeted at MMIO (cache-inhibited) memory. I think in this particular case it might be ok since we know SEQs are aligned to 64 byte boundaries and the copy is too small to use our vectorised memcpy(). I'll assume we don't need explicit ordering between writes of SEQs since the existing code doesn't seem to care unless the doorbell is being rung, so you're probably fine there too. That said, I still think this is a little bit sketchy and at the very least you should add a comment explaining what's going on when the CMB is being used. If someone more familiar with the NVMe driver could chime in I would appreciate it. I may not be understanding the concern, but I'll give it a shot. You're right, the start of any SQE is always 64-byte aligned, so that should satisfy alignment requirements. The order when writing multiple/successive SQEs in a submission queue does matter, and this is currently serialized through the q_lock. The order in which the bytes of a single SQE is written doesn't really matter as long as the entire SQE is written into the CMB prior to writing that SQ's doorbell register. The doorbell register is written immediately after copying a command entry into the submission queue (ignore "shadow buffer" features), so the doorbells written to commands submitted is 1:1. 
If a CMB SQE and DB order is not enforced with the memcpy, then we do need a barrier after the SQE's memcpy and before the doorbell's writel. Keith, while we're on this, regardless of cmb, is SQE memcopy and DB update ordering always guaranteed? If you look at mlx4 (rdma device driver) that works exactly the same as nvme you will find: -- qp->sq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); writel(qp->doorbell_qpn, to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); /* * Make sure doorbells don't leak out of SQ spinlock * and reach the HCA out of order. */ mmiowb(); -- That seems to explicitly make sure to place a barrier before updating the doorbell. So as I see it, either ordering is guaranteed and the above code is redundant, or nvme needs to do the same. Thoughts?
Re: [PATCH] lightnvm: pblk: refactor init/exit sequences
On 03/05/2018 03:18 PM, Javier González wrote: On 5 Mar 2018, at 15.16, Matias Bjørlingwrote: On 03/05/2018 02:45 PM, Javier González wrote: On 5 Mar 2018, at 14.38, Matias Bjørling wrote: On 03/01/2018 08:29 PM, Javier González wrote: On 1 Mar 2018, at 19.49, Matias Bjørling wrote: On 03/01/2018 04:59 PM, Javier González wrote: Refactor init and exit sequences to eliminate dependencies among init modules and improve readability. Signed-off-by: Javier González --- drivers/lightnvm/pblk-init.c | 415 +-- 1 file changed, 206 insertions(+), 209 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 25fc70ca07f7..87c390667dd6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk) vfree(pblk->trans_map); } -static int pblk_l2p_init(struct pblk *pblk) +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) +{ + struct pblk_line *line = NULL; + + if (factory_init) { + pblk_setup_uuid(pblk); + } else { + line = pblk_recov_l2p(pblk); + if (IS_ERR(line)) { + pr_err("pblk: could not recover l2p table\n"); + return -EFAULT; + } + } + +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + + if (!line) { + /* Configure next line for user data */ + line = pblk_line_get_first_data(pblk); + if (!line) { + pr_err("pblk: line list corrupted\n"); + return -EFAULT; + } + } + + return 0; +} + +static int pblk_l2p_init(struct pblk *pblk, bool factory_init) { sector_t i; struct ppa_addr ppa; @@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk) for (i = 0; i < pblk->rl.nr_secs; i++) pblk_trans_map_set(pblk, i, ppa); - return 0; + return pblk_l2p_recover(pblk, factory_init); } static void pblk_rwb_free(struct pblk *pblk) @@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk) struct nvm_tgt_dev *dev = 
pblk->dev; struct nvm_geo *geo = >geo; struct nvm_addr_format ppaf = geo->ppaf; - int power_len; + int mod, power_len; + + div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, ); + if (mod) { + pr_err("pblk: bad configuration of sectors/pages\n"); + return -EINVAL; + } /* Re-calculate channel and lun format to adapt to configuration */ power_len = get_count_order(geo->nr_chnls); @@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = >geo; + int max_write_ppas; + + atomic64_set(>user_wa, 0); + atomic64_set(>pad_wa, 0); + atomic64_set(>gc_wa, 0); + pblk->user_rst_wa = 0; + pblk->pad_rst_wa = 0; + pblk->gc_rst_wa = 0; + + atomic64_set(>nr_flush, 0); + pblk->nr_flush_rst = 0; pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * geo->nr_planes * geo->all_luns; + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); + max_write_ppas = pblk->min_write_pgs * geo->all_luns; + pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); + pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pr_err("pblk: vector list too big(%u > %u)\n", + pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); + return -EINVAL; + } + + pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t), + GFP_KERNEL); + if (!pblk->pad_dist) + return -ENOMEM; + if (pblk_init_global_caches(pblk)) - return -ENOMEM; + goto fail_free_pad_dist; /* Internal bios can be at most the sectors signaled by the device. */ pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0); @@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_set_ppaf(pblk)) goto free_r_end_wq; - if (pblk_rwb_init(pblk)) - goto free_r_end_wq; - INIT_LIST_HEAD(>compl_list); + return 0; free_r_end_wq: @@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk) mempool_destroy(pblk->page_bio_pool); free_global_caches: pblk_free_global_caches(pblk);
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On 05/03/18 11:02 AM, Sinan Kaya wrote: writel has a barrier inside on ARM64. https://elixir.bootlin.com/linux/latest/source/arch/arm64/include/asm/io.h#L143 Yes, and no barrier inside memcpy_toio as it uses __raw_writes. This should be sufficient as we are only accessing addresses that look like memory and have no side effects (those enabling doorbell accesses may need to worry about this though). Typically, what could happen, in this case, is the CPU would issue writes to the BAR normally and the next time it programmed the DMA engine it would flush everything via the flush in writel. Why do you need another barrier? We don't. Thanks, Logan
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On 05/03/18 09:00 AM, Keith Busch wrote: On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote: On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpewrote: @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, { u16 tail = nvmeq->sq_tail; - if (nvmeq->sq_cmds_io) - memcpy_toio(>sq_cmds_io[tail], cmd, sizeof(*cmd)); - else - memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); + memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC the _toio() variant enforces alignment, does the copy with 4 byte stores, and has a full barrier after the copy. In comparison our regular memcpy() does none of those things and may use unaligned and vector load/stores. For normal (cacheable) memory that is perfectly fine, but they can cause alignment faults when targeted at MMIO (cache-inhibited) memory. I think in this particular case it might be ok since we know SEQs are aligned to 64 byte boundaries and the copy is too small to use our vectorised memcpy(). I'll assume we don't need explicit ordering between writes of SEQs since the existing code doesn't seem to care unless the doorbell is being rung, so you're probably fine there too. That said, I still think this is a little bit sketchy and at the very least you should add a comment explaining what's going on when the CMB is being used. If someone more familiar with the NVMe driver could chime in I would appreciate it. I may not be understanding the concern, but I'll give it a shot. You're right, the start of any SQE is always 64-byte aligned, so that should satisfy alignment requirements. The order when writing multiple/successive SQEs in a submission queue does matter, and this is currently serialized through the q_lock. The order in which the bytes of a single SQE is written doesn't really matter as long as the entire SQE is written into the CMB prior to writing that SQ's doorbell register. 
The doorbell register is written immediately after copying a command entry into the submission queue (ignore "shadow buffer" features), so the doorbells written to commands submitted is 1:1. If a CMB SQE and DB order is not enforced with the memcpy, then we do need a barrier after the SQE's memcpy and before the doorbell's writel. Thanks for the information Keith. Adding to this: regular memcpy generally also enforces alignment as unaligned access to regular memory is typically bad in some way on most arches. The generic memcpy_toio also does not have any barrier as it is just a call to memcpy. Arm64 also does not appear to have a barrier in its implementation and in the short survey I did I could not find any implementation with a barrier. I also did not find a ppc implementation in the tree but it would be weird for it to add a barrier when other arches do not appear to need it. We've been operating on the assumption that memory mapped by devm_memremap_pages() can be treated as regular memory. This is emphasized by the fact that it does not return an __iomem pointer. If this assumption does not hold for an arch then we cannot support P2P DMA without an overhaul of many kernel interfaces or creating other backend interfaces into the drivers which take different data types (ie. we'd have to bypass the entire block layer when trying to write data in p2pmem to an nvme device. This is very undesirable. Logan
Re: vgdisplay hang on iSCSI session
On Mon, 2018-03-05 at 10:44 +0100, Jean-Louis Dupond wrote: > Maby a long shot, but could it be fixed by > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/drivers/block/loop.c?h=v4.9.86=56bc086358cac1a2949783646eabd57447b9d672 > > ? > Or shouldn't that fix such kind of issues? > > It seems like some kind of race condition, cause it happens so random > and not on a specific action. Hello Jean-Louis, Have you already tested any of the kernel versions shown below? All these kernel versions should include that fix: $ git tag --contains 56bc086358cac1a2949783646eabd57447b9d672 v4.9.80 v4.9.81 v4.9.82 v4.9.83 v4.9.84 v4.9.85 v4.9.86 $ git tag --contains ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 v4.15 v4.15.1 v4.15.2 v4.15.3 v4.15.4 v4.15.5 v4.15.6 v4.15.7 Thanks, Bart.
Re: [PATCH v2 07/10] nvme-pci: Use PCI p2pmem subsystem to manage the CMB
On Mon, Mar 05, 2018 at 12:33:29PM +1100, Oliver wrote: > On Thu, Mar 1, 2018 at 10:40 AM, Logan Gunthorpewrote: > > @@ -429,10 +429,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, > > { > > u16 tail = nvmeq->sq_tail; > > > - if (nvmeq->sq_cmds_io) > > - memcpy_toio(>sq_cmds_io[tail], cmd, sizeof(*cmd)); > > - else > > - memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); > > + memcpy(>sq_cmds[tail], cmd, sizeof(*cmd)); > > Hmm, how safe is replacing memcpy_toio() with regular memcpy()? On PPC > the _toio() variant enforces alignment, does the copy with 4 byte > stores, and has a full barrier after the copy. In comparison our > regular memcpy() does none of those things and may use unaligned and > vector load/stores. For normal (cacheable) memory that is perfectly > fine, but they can cause alignment faults when targeted at MMIO > (cache-inhibited) memory. > > I think in this particular case it might be ok since we know SEQs are > aligned to 64 byte boundaries and the copy is too small to use our > vectorised memcpy(). I'll assume we don't need explicit ordering > between writes of SEQs since the existing code doesn't seem to care > unless the doorbell is being rung, so you're probably fine there too. > That said, I still think this is a little bit sketchy and at the very > least you should add a comment explaining what's going on when the CMB > is being used. If someone more familiar with the NVMe driver could > chime in I would appreciate it. I may not be understanding the concern, but I'll give it a shot. You're right, the start of any SQE is always 64-byte aligned, so that should satisfy alignment requirements. The order when writing multiple/successive SQEs in a submission queue does matter, and this is currently serialized through the q_lock. The order in which the bytes of a single SQE is written doesn't really matter as long as the entire SQE is written into the CMB prior to writing that SQ's doorbell register. 
The doorbell register is written immediately after copying a command entry into the submission queue (ignore "shadow buffer" features), so the doorbells written to commands submitted is 1:1. If a CMB SQE and DB order is not enforced with the memcpy, then we do need a barrier after the SQE's memcpy and before the doorbell's writel.
Re: [PATCH] block: null_blk: fix 'Invalid parameters' failure when loading module
On Sat, 2018-03-03 at 10:24 +0800, Ming Lei wrote: > struct nullb_page { > struct page *page; > - unsigned long bitmap; > + unsigned long bitmap[DIV_ROUND_UP(MAP_SZ, sizeof(unsigned long) * 8)]; > }; Could DECLARE_BITMAP() have been used here? Thanks, Bart.
[PATCH v3] blk-throttle: fix race between blkcg_bio_issue_check() and cgroup_rmdir()
We've triggered a WARNING in blk_throtl_bio() when throttling writeback io, which complains blkg->refcnt is already 0 when calling blkg_get(), and then kernel crashes with invalid page request. After investigating this issue, we've found it is caused by a race between blkcg_bio_issue_check() and cgroup_rmdir(), which is described below: writeback kworker cgroup_rmdir cgroup_destroy_locked kill_css css_killed_ref_fn css_killed_work_fn offline_css blkcg_css_offline blkcg_bio_issue_check rcu_read_lock blkg_lookup spin_trylock(q->queue_lock) blkg_destroy spin_unlock(q->queue_lock) blk_throtl_bio spin_lock_irq(q->queue_lock) ... spin_unlock_irq(q->queue_lock) rcu_read_unlock Since rcu can only prevent blkg from releasing when it is being used, the blkg->refcnt can be decreased to 0 during blkg_destroy() and schedule blkg release. Then trying to blkg_get() in blk_throtl_bio() will complains the WARNING. And then the corresponding blkg_put() will schedule blkg release again, which result in double free. This race is introduced by commit ae1188963611 ("blkcg: consolidate blkg creation in blkcg_bio_issue_check()"). Before this commit, it will lookup first and then try to lookup/create again with queue_lock. Since revive this logic is a bit drastic, so fix it by only offlining pd during blkcg_css_offline(), and move the rest destruction (especially blkg_put()) into blkcg_css_free(), which should be the right way as discussed. 
Fixes: ae1188963611 ("blkcg: consolidate blkg creation in blkcg_bio_issue_check()") Reported-by: Jiufei XueCc: sta...@vger.kernel.org #4.3+ Signed-off-by: Joseph Qi --- block/blk-cgroup.c | 52 +- include/linux/blk-cgroup.h | 2 ++ 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c2033a2..2e9f510 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -307,11 +307,27 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, } } +static void blkg_pd_offline(struct blkcg_gq *blkg) +{ + int i; + + lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(>blkcg->lock); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && !blkg->pd_offline[i] && pol->pd_offline_fn) { + pol->pd_offline_fn(blkg->pd[i]); + blkg->pd_offline[i] = true; + } + } +} + static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; struct blkcg_gq *parent = blkg->parent; - int i; lockdep_assert_held(blkg->q->queue_lock); lockdep_assert_held(>lock); @@ -320,13 +336,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) WARN_ON_ONCE(list_empty(>q_node)); WARN_ON_ONCE(hlist_unhashed(>blkcg_node)); - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg->pd[i]); - } - if (parent) { blkg_rwstat_add_aux(>stat_bytes, >stat_bytes); blkg_rwstat_add_aux(>stat_ios, >stat_ios); @@ -369,6 +378,7 @@ static void blkg_destroy_all(struct request_queue *q) struct blkcg *blkcg = blkg->blkcg; spin_lock(>lock); + blkg_pd_offline(blkg); blkg_destroy(blkg); spin_unlock(>lock); } @@ -1013,7 +1023,7 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) struct request_queue *q = blkg->q; if (spin_trylock(q->queue_lock)) { - blkg_destroy(blkg); + blkg_pd_offline(blkg); spin_unlock(q->queue_lock); } else { spin_unlock_irq(>lock); @@ -1032,6 +1042,26 @@ static void 
blkcg_css_free(struct cgroup_subsys_state *css) struct blkcg *blkcg = css_to_blkcg(css); int i; + spin_lock_irq(>lock); + + while (!hlist_empty(>blkg_list)) { + struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, + struct blkcg_gq, + blkcg_node); + struct request_queue *q = blkg->q; + +
RE: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue
> -Original Message- > From: Kashyap Desai [mailto:kashyap.de...@broadcom.com] > Sent: Monday, March 05, 2018 1:24 AM > To: Laurence Oberman; Don Brace > ; Ming Lei > Cc: Jens Axboe ; linux-block@vger.kernel.org; Christoph > Hellwig ; Mike Snitzer ; linux- > s...@vger.kernel.org; Hannes Reinecke ; Arun Easi > ; Omar Sandoval ; Martin K . > Petersen ; James Bottomley > ; Christoph Hellwig ; > Peter Rivera ; Meelis Roos > Subject: RE: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue > > EXTERNAL EMAIL > > > > -Original Message- > > From: Laurence Oberman [mailto:lober...@redhat.com] > > Sent: Saturday, March 3, 2018 3:23 AM > > To: Don Brace; Ming Lei > > Cc: Jens Axboe; linux-block@vger.kernel.org; Christoph Hellwig; Mike > > Snitzer; > > linux-s...@vger.kernel.org; Hannes Reinecke; Arun Easi; Omar Sandoval; > > Martin K . Petersen; James Bottomley; Christoph Hellwig; Kashyap Desai; > > Peter > > Rivera; Meelis Roos > > Subject: Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue > > > > On Fri, 2018-03-02 at 15:03 +, Don Brace wrote: > > > > -Original Message- > > > > From: Laurence Oberman [mailto:lober...@redhat.com] > > > > Sent: Friday, March 02, 2018 8:09 AM > > > > To: Ming Lei > > > > Cc: Don Brace ; Jens Axboe > > > k>; > > > > linux-block@vger.kernel.org; Christoph Hellwig ; > > > > Mike Snitzer ; linux-s...@vger.kernel.org; > > > > Hannes Reinecke ; Arun Easi ; > > > > Omar Sandoval ; Martin K . 
Petersen > > > > ; James Bottomley > > > > ; Christoph Hellwig > > > > ; Kashyap Desai ; Peter > > > > Rivera ; Meelis Roos > > > > Subject: Re: [PATCH V3 1/8] scsi: hpsa: fix selection of reply queue > > > > > > > > EXTERNAL EMAIL > > > > > > > > > > > > On Fri, 2018-03-02 at 10:16 +0800, Ming Lei wrote: > > > > > On Thu, Mar 01, 2018 at 04:19:34PM -0500, Laurence Oberman wrote: > > > > > > On Thu, 2018-03-01 at 14:01 -0500, Laurence Oberman wrote: > > > > > > > On Thu, 2018-03-01 at 16:18 +, Don Brace wrote: > > > > > > > > > -Original Message- > > > > > > > > > From: Ming Lei [mailto:ming@redhat.com] > > > > > > > > > Sent: Tuesday, February 27, 2018 4:08 AM > > > > > > > > > To: Jens Axboe ; linux-block@vger.kernel > > > > > > > > > .org ; Christoph Hellwig ; Mike Snitzer > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > Cc: linux-s...@vger.kernel.org; Hannes Reinecke > > > > > > > > e.de > > > > > > > > > > ; > > > > > > > > > > > > > > > > > > Arun Easi > > > > > > > > > ; Omar Sandoval ; > > > > > > > > > Martin K . > > > > > > > > > Petersen ; James Bottomley > > > > > > > > > ; Christoph Hellwig > > > > > > > > > ; Don Brace ; > > > > > > > > > Kashyap Desai ; Peter Rivera > > > > > > > > > > > > > > > > > om>; > > > > > > > > > Laurence Oberman ; Ming Lei > > > > > > > > > ; Meelis Roos > > > > > > > > > Subject: [PATCH V3 1/8] scsi: hpsa: fix selection of reply > > > > > > > > > queue > > > > > > > > > > > > > > > > > > > Seems Don run into IO failure without blk-mq, could you run your > > > > > tests again in legacy mode? > > > > > > > > > > Thanks, > > > > > Ming > > > > > > > > Hello Ming > > > > I ran multiple passes on Legacy and still see no issues in my test > > > > bed > > > > Tests ran all weekend without issues. 
> > > > BOOT_IMAGE=/vmlinuz-4.16.0-rc2.ming+ root=UUID=43f86d71-b1bf- > 4789- > > > > a28e- > > > > 21c6ddc90195 ro crashkernel=256M@64M log_buf_len=64M > > > > console=ttyS1,115200n8 > > > > > > > > HEAD of the git kernel I am using > > > > > > > > 694e16f scsi: megaraid: improve scsi_mq performance via .host_tagset > > > > 793686c scsi: hpsa: improve scsi_mq performance via .host_tagset > > > > 60d5b36 block: null_blk: introduce module parameter of 'g_host_tags' > > > > 8847067 scsi: Add template flag 'host_tagset' > > > > a8fbdd6 blk-mq: introduce BLK_MQ_F_HOST_TAGS 4710fab blk-mq: > > > > introduce 'start_tag' field to 'struct blk_mq_tags' > > > > 09bb153 scsi: megaraid_sas: fix selection of reply queue > > > > 52700d8 scsi: hpsa: fix selection of reply queue > > > > > > I
Re: [PATCH] lightnvm: pblk: refactor init/exit sequences
> On 5 Mar 2018, at 15.16, Matias Bjørlingwrote: > > On 03/05/2018 02:45 PM, Javier González wrote: >>> On 5 Mar 2018, at 14.38, Matias Bjørling wrote: >>> >>> On 03/01/2018 08:29 PM, Javier González wrote: > On 1 Mar 2018, at 19.49, Matias Bjørling wrote: > > On 03/01/2018 04:59 PM, Javier González wrote: >> Refactor init and exit sequences to eliminate dependencies among init >> modules and improve readability. >> Signed-off-by: Javier González >> --- >> drivers/lightnvm/pblk-init.c | 415 >> +-- >> 1 file changed, 206 insertions(+), 209 deletions(-) >> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c >> index 25fc70ca07f7..87c390667dd6 100644 >> --- a/drivers/lightnvm/pblk-init.c >> +++ b/drivers/lightnvm/pblk-init.c >> @@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk) >> vfree(pblk->trans_map); >> } >> -static int pblk_l2p_init(struct pblk *pblk) >> +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) >> +{ >> +struct pblk_line *line = NULL; >> + >> +if (factory_init) { >> +pblk_setup_uuid(pblk); >> +} else { >> +line = pblk_recov_l2p(pblk); >> +if (IS_ERR(line)) { >> +pr_err("pblk: could not recover l2p table\n"); >> +return -EFAULT; >> +} >> +} >> + >> +#ifdef CONFIG_NVM_DEBUG >> +pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); >> +#endif >> + >> +/* Free full lines directly as GC has not been started yet */ >> +pblk_gc_free_full_lines(pblk); >> + >> +if (!line) { >> +/* Configure next line for user data */ >> +line = pblk_line_get_first_data(pblk); >> +if (!line) { >> +pr_err("pblk: line list corrupted\n"); >> +return -EFAULT; >> +} >> +} >> + >> +return 0; >> +} >> + >> +static int pblk_l2p_init(struct pblk *pblk, bool factory_init) >> { >> sector_t i; >> struct ppa_addr ppa; >> @@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk) >> for (i = 0; i < pblk->rl.nr_secs; i++) >> pblk_trans_map_set(pblk, i, ppa); >> - return 0; >> +return pblk_l2p_recover(pblk, factory_init); >> } >>static 
void pblk_rwb_free(struct pblk *pblk) >> @@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk) >> struct nvm_tgt_dev *dev = pblk->dev; >> struct nvm_geo *geo = >geo; >> struct nvm_addr_format ppaf = geo->ppaf; >> -int power_len; >> +int mod, power_len; >> + >> +div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, ); >> +if (mod) { >> +pr_err("pblk: bad configuration of sectors/pages\n"); >> +return -EINVAL; >> +} >> /* Re-calculate channel and lun format to adapt to >> configuration */ >> power_len = get_count_order(geo->nr_chnls); >> @@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk) >> { >> struct nvm_tgt_dev *dev = pblk->dev; >> struct nvm_geo *geo = >geo; >> +int max_write_ppas; >> + >> +atomic64_set(>user_wa, 0); >> +atomic64_set(>pad_wa, 0); >> +atomic64_set(>gc_wa, 0); >> +pblk->user_rst_wa = 0; >> +pblk->pad_rst_wa = 0; >> +pblk->gc_rst_wa = 0; >> + >> +atomic64_set(>nr_flush, 0); >> +pblk->nr_flush_rst = 0; >> pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * >> geo->nr_planes * >> geo->all_luns; >> + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / >> PAGE_SIZE); >> +max_write_ppas = pblk->min_write_pgs * geo->all_luns; >> +pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); >> +pblk_set_sec_per_write(pblk, pblk->min_write_pgs); >> + >> +if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { >> +pr_err("pblk: vector list too big(%u > %u)\n", >> +pblk->max_write_pgs, >> PBLK_MAX_REQ_ADDRS); >> +return -EINVAL; >> +} >> + >> +pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * >> sizeof(atomic64_t), >> + >>
Re: [PATCH] lightnvm: pblk: refactor init/exit sequences
On 03/05/2018 02:45 PM, Javier González wrote: On 5 Mar 2018, at 14.38, Matias Bjørlingwrote: On 03/01/2018 08:29 PM, Javier González wrote: On 1 Mar 2018, at 19.49, Matias Bjørling wrote: On 03/01/2018 04:59 PM, Javier González wrote: Refactor init and exit sequences to eliminate dependencies among init modules and improve readability. Signed-off-by: Javier González --- drivers/lightnvm/pblk-init.c | 415 +-- 1 file changed, 206 insertions(+), 209 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 25fc70ca07f7..87c390667dd6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk) vfree(pblk->trans_map); } -static int pblk_l2p_init(struct pblk *pblk) +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) +{ + struct pblk_line *line = NULL; + + if (factory_init) { + pblk_setup_uuid(pblk); + } else { + line = pblk_recov_l2p(pblk); + if (IS_ERR(line)) { + pr_err("pblk: could not recover l2p table\n"); + return -EFAULT; + } + } + +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + + if (!line) { + /* Configure next line for user data */ + line = pblk_line_get_first_data(pblk); + if (!line) { + pr_err("pblk: line list corrupted\n"); + return -EFAULT; + } + } + + return 0; +} + +static int pblk_l2p_init(struct pblk *pblk, bool factory_init) { sector_t i; struct ppa_addr ppa; @@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk) for (i = 0; i < pblk->rl.nr_secs; i++) pblk_trans_map_set(pblk, i, ppa); - return 0; + return pblk_l2p_recover(pblk, factory_init); } static void pblk_rwb_free(struct pblk *pblk) @@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = >geo; struct nvm_addr_format ppaf = geo->ppaf; - int power_len; 
+ int mod, power_len; + + div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, ); + if (mod) { + pr_err("pblk: bad configuration of sectors/pages\n"); + return -EINVAL; + } /* Re-calculate channel and lun format to adapt to configuration */ power_len = get_count_order(geo->nr_chnls); @@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = >geo; + int max_write_ppas; + + atomic64_set(>user_wa, 0); + atomic64_set(>pad_wa, 0); + atomic64_set(>gc_wa, 0); + pblk->user_rst_wa = 0; + pblk->pad_rst_wa = 0; + pblk->gc_rst_wa = 0; + + atomic64_set(>nr_flush, 0); + pblk->nr_flush_rst = 0; pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * geo->nr_planes * geo->all_luns; + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); + max_write_ppas = pblk->min_write_pgs * geo->all_luns; + pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); + pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pr_err("pblk: vector list too big(%u > %u)\n", + pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); + return -EINVAL; + } + + pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t), + GFP_KERNEL); + if (!pblk->pad_dist) + return -ENOMEM; + if (pblk_init_global_caches(pblk)) - return -ENOMEM; + goto fail_free_pad_dist; /* Internal bios can be at most the sectors signaled by the device. */ pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0); @@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_set_ppaf(pblk)) goto free_r_end_wq; - if (pblk_rwb_init(pblk)) - goto free_r_end_wq; - INIT_LIST_HEAD(>compl_list); + return 0; free_r_end_wq: @@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk) mempool_destroy(pblk->page_bio_pool); free_global_caches: pblk_free_global_caches(pblk); +fail_free_pad_dist: + kfree(pblk->pad_dist); return -ENOMEM; } @@ -354,14 +420,8 @@ static void
Re: [PATCH] lightnvm: pblk: refactor init/exit sequences
> On 5 Mar 2018, at 14.38, Matias Bjørlingwrote: > > On 03/01/2018 08:29 PM, Javier González wrote: >>> On 1 Mar 2018, at 19.49, Matias Bjørling wrote: >>> >>> On 03/01/2018 04:59 PM, Javier González wrote: Refactor init and exit sequences to eliminate dependencies among init modules and improve readability. Signed-off-by: Javier González --- drivers/lightnvm/pblk-init.c | 415 +-- 1 file changed, 206 insertions(+), 209 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 25fc70ca07f7..87c390667dd6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk) vfree(pblk->trans_map); } -static int pblk_l2p_init(struct pblk *pblk) +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) +{ + struct pblk_line *line = NULL; + + if (factory_init) { + pblk_setup_uuid(pblk); + } else { + line = pblk_recov_l2p(pblk); + if (IS_ERR(line)) { + pr_err("pblk: could not recover l2p table\n"); + return -EFAULT; + } + } + +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + + if (!line) { + /* Configure next line for user data */ + line = pblk_line_get_first_data(pblk); + if (!line) { + pr_err("pblk: line list corrupted\n"); + return -EFAULT; + } + } + + return 0; +} + +static int pblk_l2p_init(struct pblk *pblk, bool factory_init) { sector_t i; struct ppa_addr ppa; @@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk) for (i = 0; i < pblk->rl.nr_secs; i++) pblk_trans_map_set(pblk, i, ppa); - return 0; + return pblk_l2p_recover(pblk, factory_init); } static void pblk_rwb_free(struct pblk *pblk) @@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = >geo; struct nvm_addr_format ppaf = geo->ppaf; - int power_len; + int mod, power_len; + + 
div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, ); + if (mod) { + pr_err("pblk: bad configuration of sectors/pages\n"); + return -EINVAL; + } /* Re-calculate channel and lun format to adapt to configuration */ power_len = get_count_order(geo->nr_chnls); @@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = >geo; + int max_write_ppas; + + atomic64_set(>user_wa, 0); + atomic64_set(>pad_wa, 0); + atomic64_set(>gc_wa, 0); + pblk->user_rst_wa = 0; + pblk->pad_rst_wa = 0; + pblk->gc_rst_wa = 0; + + atomic64_set(>nr_flush, 0); + pblk->nr_flush_rst = 0; pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * geo->nr_planes * geo->all_luns; + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); + max_write_ppas = pblk->min_write_pgs * geo->all_luns; + pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); + pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pr_err("pblk: vector list too big(%u > %u)\n", + pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); + return -EINVAL; + } + + pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t), + GFP_KERNEL); + if (!pblk->pad_dist) + return -ENOMEM; + if (pblk_init_global_caches(pblk)) - return -ENOMEM; + goto fail_free_pad_dist; /* Internal bios can be at most the sectors signaled by the device. */ pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0); @@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_set_ppaf(pblk)) goto free_r_end_wq; - if (pblk_rwb_init(pblk)) - goto free_r_end_wq; - INIT_LIST_HEAD(>compl_list); + return 0; free_r_end_wq: @@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk)
Re: [PATCH] lightnvm: pblk: refactor init/exit sequences
On 03/01/2018 08:29 PM, Javier González wrote: On 1 Mar 2018, at 19.49, Matias Bjørlingwrote: On 03/01/2018 04:59 PM, Javier González wrote: Refactor init and exit sequences to eliminate dependencies among init modules and improve readability. Signed-off-by: Javier González --- drivers/lightnvm/pblk-init.c | 415 +-- 1 file changed, 206 insertions(+), 209 deletions(-) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 25fc70ca07f7..87c390667dd6 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -103,7 +103,40 @@ static void pblk_l2p_free(struct pblk *pblk) vfree(pblk->trans_map); } -static int pblk_l2p_init(struct pblk *pblk) +static int pblk_l2p_recover(struct pblk *pblk, bool factory_init) +{ + struct pblk_line *line = NULL; + + if (factory_init) { + pblk_setup_uuid(pblk); + } else { + line = pblk_recov_l2p(pblk); + if (IS_ERR(line)) { + pr_err("pblk: could not recover l2p table\n"); + return -EFAULT; + } + } + +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + + if (!line) { + /* Configure next line for user data */ + line = pblk_line_get_first_data(pblk); + if (!line) { + pr_err("pblk: line list corrupted\n"); + return -EFAULT; + } + } + + return 0; +} + +static int pblk_l2p_init(struct pblk *pblk, bool factory_init) { sector_t i; struct ppa_addr ppa; @@ -119,7 +152,7 @@ static int pblk_l2p_init(struct pblk *pblk) for (i = 0; i < pblk->rl.nr_secs; i++) pblk_trans_map_set(pblk, i, ppa); - return 0; + return pblk_l2p_recover(pblk, factory_init); } static void pblk_rwb_free(struct pblk *pblk) @@ -159,7 +192,13 @@ static int pblk_set_ppaf(struct pblk *pblk) struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = >geo; struct nvm_addr_format ppaf = geo->ppaf; - int power_len; + int mod, power_len; + + div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, ); + if (mod) { + 
pr_err("pblk: bad configuration of sectors/pages\n"); + return -EINVAL; + } /* Re-calculate channel and lun format to adapt to configuration */ power_len = get_count_order(geo->nr_chnls); @@ -252,12 +291,39 @@ static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = >geo; + int max_write_ppas; + + atomic64_set(>user_wa, 0); + atomic64_set(>pad_wa, 0); + atomic64_set(>gc_wa, 0); + pblk->user_rst_wa = 0; + pblk->pad_rst_wa = 0; + pblk->gc_rst_wa = 0; + + atomic64_set(>nr_flush, 0); + pblk->nr_flush_rst = 0; pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg * geo->nr_planes * geo->all_luns; + pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE); + max_write_ppas = pblk->min_write_pgs * geo->all_luns; + pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA); + pblk_set_sec_per_write(pblk, pblk->min_write_pgs); + + if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) { + pr_err("pblk: vector list too big(%u > %u)\n", + pblk->max_write_pgs, PBLK_MAX_REQ_ADDRS); + return -EINVAL; + } + + pblk->pad_dist = kzalloc((pblk->min_write_pgs - 1) * sizeof(atomic64_t), + GFP_KERNEL); + if (!pblk->pad_dist) + return -ENOMEM; + if (pblk_init_global_caches(pblk)) - return -ENOMEM; + goto fail_free_pad_dist; /* Internal bios can be at most the sectors signaled by the device. */ pblk->page_bio_pool = mempool_create_page_pool(NVM_MAX_VLBA, 0); @@ -307,10 +373,8 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_set_ppaf(pblk)) goto free_r_end_wq; - if (pblk_rwb_init(pblk)) - goto free_r_end_wq; - INIT_LIST_HEAD(>compl_list); + return 0; free_r_end_wq: @@ -333,6 +397,8 @@ static int pblk_core_init(struct pblk *pblk) mempool_destroy(pblk->page_bio_pool); free_global_caches: pblk_free_global_caches(pblk); +fail_free_pad_dist: + kfree(pblk->pad_dist); return -ENOMEM; } @@ -354,14 +420,8 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); -
Re: [PATCH 01/12] lightnvm: simplify geometry structure.
> On 5 Mar 2018, at 14.07, Matias Bjørlingwrote: > > On 03/02/2018 04:21 PM, Javier González wrote: >> Currently, the device geometry is stored redundantly in the nvm_id and >> nvm_geo structures at a device level. Moreover, when instantiating >> targets on a specific number of LUNs, these structures are replicated >> and manually modified to fit the instance channel and LUN partitioning. >> Instead, create a generic geometry around nvm_geo, which can be used by >> (i) the underlying device to describe the geometry of the whole device, >> and (ii) instances to describe their geometry independently. >> Signed-off-by: Javier González >> --- >> drivers/lightnvm/core.c | 70 +++- >> drivers/lightnvm/pblk-core.c | 16 +- >> drivers/lightnvm/pblk-gc.c | 2 +- >> drivers/lightnvm/pblk-init.c | 113 +++-- >> drivers/lightnvm/pblk-read.c | 2 +- >> drivers/lightnvm/pblk-recovery.c | 14 +- >> drivers/lightnvm/pblk-rl.c | 2 +- >> drivers/lightnvm/pblk-sysfs.c| 35 ++-- >> drivers/lightnvm/pblk-write.c| 2 +- >> drivers/lightnvm/pblk.h | 83 -- >> drivers/nvme/host/lightnvm.c | 339 >> +++ >> include/linux/lightnvm.h | 198 +++ >> 12 files changed, 451 insertions(+), 425 deletions(-) >> diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c >> index 19c46ebb1b91..9a417d9cdf0c 100644 >> --- a/drivers/lightnvm/core.c >> +++ b/drivers/lightnvm/core.c >> @@ -155,7 +155,7 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct >> nvm_dev *dev, >> int blun = lun_begin % dev->geo.nr_luns; >> int lunid = 0; >> int lun_balanced = 1; >> -int prev_nr_luns; >> +int sec_per_lun, prev_nr_luns; >> int i, j; >> nr_chnls = (nr_chnls_mod == 0) ? 
nr_chnls : nr_chnls + 1; >> @@ -215,18 +215,23 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct >> nvm_dev *dev, >> if (!tgt_dev) >> goto err_ch; >> + /* Inherit device geometry from parent */ >> memcpy(_dev->geo, >geo, sizeof(struct nvm_geo)); >> + >> /* Target device only owns a portion of the physical device */ >> tgt_dev->geo.nr_chnls = nr_chnls; >> -tgt_dev->geo.all_luns = nr_luns; >> tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1; >> +tgt_dev->geo.all_luns = nr_luns; >> +tgt_dev->geo.all_chunks = nr_luns * dev->geo.nr_chks; >> + >> tgt_dev->geo.op = op; >> -tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; >> + >> +sec_per_lun = dev->geo.clba * dev->geo.nr_chks; >> +tgt_dev->geo.total_secs = nr_luns * sec_per_lun; >> + >> tgt_dev->q = dev->q; >> tgt_dev->map = dev_map; >> tgt_dev->luns = luns; >> -memcpy(_dev->identity, >identity, sizeof(struct nvm_id)); >> - >> tgt_dev->parent = dev; >> return tgt_dev; >> @@ -296,8 +301,6 @@ static int __nvm_config_simple(struct nvm_dev *dev, >> static int __nvm_config_extended(struct nvm_dev *dev, >> struct nvm_ioctl_create_extended *e) >> { >> -struct nvm_geo *geo = >geo; >> - >> if (e->lun_begin == 0x && e->lun_end == 0x) { >> e->lun_begin = 0; >> e->lun_end = dev->geo.all_luns - 1; >> @@ -311,7 +314,7 @@ static int __nvm_config_extended(struct nvm_dev *dev, >> return -EINVAL; >> } >> - return nvm_config_check_luns(geo, e->lun_begin, e->lun_end); >> +return nvm_config_check_luns(>geo, e->lun_begin, e->lun_end); >> } >>static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create >> *create) >> @@ -406,7 +409,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct >> nvm_ioctl_create *create) >> tqueue->queuedata = targetdata; >> blk_queue_max_hw_sectors(tqueue, >> -(dev->geo.sec_size >> 9) * NVM_MAX_VLBA); >> +(dev->geo.csecs >> 9) * NVM_MAX_VLBA); >> set_capacity(tdisk, tt->capacity(targetdata)); >> add_disk(tdisk); >> @@ -841,40 +844,9 @@ EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); >>static 
int nvm_core_init(struct nvm_dev *dev) >> { >> -struct nvm_id *id = >identity; >> struct nvm_geo *geo = >geo; >> int ret; >> - memcpy(>ppaf, >ppaf, sizeof(struct nvm_addr_format)); >> - >> -if (id->mtype != 0) { >> -pr_err("nvm: memory type not supported\n"); >> -return -EINVAL; >> -} >> - >> -/* Whole device values */ >> -geo->nr_chnls = id->num_ch; >> -geo->nr_luns = id->num_lun; >> - >> -/* Generic device geometry values */ >> -geo->ws_min = id->ws_min; >> -geo->ws_opt = id->ws_opt; >> -geo->ws_seq = id->ws_seq; >> -geo->ws_per_chk = id->ws_per_chk; >> -geo->nr_chks = id->num_chk; >> -geo->mccap = id->mccap; >> - >> -geo->sec_per_chk = id->clba; >> -geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks; >> -geo->all_luns =
Re: [PATCH 01/12] lightnvm: simplify geometry structure.
On 03/02/2018 04:21 PM, Javier González wrote: Currently, the device geometry is stored redundantly in the nvm_id and nvm_geo structures at a device level. Moreover, when instantiating targets on a specific number of LUNs, these structures are replicated and manually modified to fit the instance channel and LUN partitioning. Instead, create a generic geometry around nvm_geo, which can be used by (i) the underlying device to describe the geometry of the whole device, and (ii) instances to describe their geometry independently. Signed-off-by: Javier González--- drivers/lightnvm/core.c | 70 +++- drivers/lightnvm/pblk-core.c | 16 +- drivers/lightnvm/pblk-gc.c | 2 +- drivers/lightnvm/pblk-init.c | 113 +++-- drivers/lightnvm/pblk-read.c | 2 +- drivers/lightnvm/pblk-recovery.c | 14 +- drivers/lightnvm/pblk-rl.c | 2 +- drivers/lightnvm/pblk-sysfs.c| 35 ++-- drivers/lightnvm/pblk-write.c| 2 +- drivers/lightnvm/pblk.h | 83 -- drivers/nvme/host/lightnvm.c | 339 +++ include/linux/lightnvm.h | 198 +++ 12 files changed, 451 insertions(+), 425 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 19c46ebb1b91..9a417d9cdf0c 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -155,7 +155,7 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, int blun = lun_begin % dev->geo.nr_luns; int lunid = 0; int lun_balanced = 1; - int prev_nr_luns; + int sec_per_lun, prev_nr_luns; int i, j; nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; @@ -215,18 +215,23 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, if (!tgt_dev) goto err_ch; + /* Inherit device geometry from parent */ memcpy(_dev->geo, >geo, sizeof(struct nvm_geo)); + /* Target device only owns a portion of the physical device */ tgt_dev->geo.nr_chnls = nr_chnls; - tgt_dev->geo.all_luns = nr_luns; tgt_dev->geo.nr_luns = (lun_balanced) ? 
prev_nr_luns : -1; + tgt_dev->geo.all_luns = nr_luns; + tgt_dev->geo.all_chunks = nr_luns * dev->geo.nr_chks; + tgt_dev->geo.op = op; - tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun; + + sec_per_lun = dev->geo.clba * dev->geo.nr_chks; + tgt_dev->geo.total_secs = nr_luns * sec_per_lun; + tgt_dev->q = dev->q; tgt_dev->map = dev_map; tgt_dev->luns = luns; - memcpy(_dev->identity, >identity, sizeof(struct nvm_id)); - tgt_dev->parent = dev; return tgt_dev; @@ -296,8 +301,6 @@ static int __nvm_config_simple(struct nvm_dev *dev, static int __nvm_config_extended(struct nvm_dev *dev, struct nvm_ioctl_create_extended *e) { - struct nvm_geo *geo = >geo; - if (e->lun_begin == 0x && e->lun_end == 0x) { e->lun_begin = 0; e->lun_end = dev->geo.all_luns - 1; @@ -311,7 +314,7 @@ static int __nvm_config_extended(struct nvm_dev *dev, return -EINVAL; } - return nvm_config_check_luns(geo, e->lun_begin, e->lun_end); + return nvm_config_check_luns(>geo, e->lun_begin, e->lun_end); } static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) @@ -406,7 +409,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) tqueue->queuedata = targetdata; blk_queue_max_hw_sectors(tqueue, - (dev->geo.sec_size >> 9) * NVM_MAX_VLBA); + (dev->geo.csecs >> 9) * NVM_MAX_VLBA); set_capacity(tdisk, tt->capacity(targetdata)); add_disk(tdisk); @@ -841,40 +844,9 @@ EXPORT_SYMBOL(nvm_get_tgt_bb_tbl); static int nvm_core_init(struct nvm_dev *dev) { - struct nvm_id *id = >identity; struct nvm_geo *geo = >geo; int ret; - memcpy(>ppaf, >ppaf, sizeof(struct nvm_addr_format)); - - if (id->mtype != 0) { - pr_err("nvm: memory type not supported\n"); - return -EINVAL; - } - - /* Whole device values */ - geo->nr_chnls = id->num_ch; - geo->nr_luns = id->num_lun; - - /* Generic device geometry values */ - geo->ws_min = id->ws_min; - geo->ws_opt = id->ws_opt; - geo->ws_seq = id->ws_seq; - geo->ws_per_chk = id->ws_per_chk; - geo->nr_chks = id->num_chk; - geo->mccap = 
id->mccap; - - geo->sec_per_chk = id->clba; - geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks; - geo->all_luns = geo->nr_luns * geo->nr_chnls; - - /* 1.2 spec device geometry values */ - geo->plane_mode = 1 << geo->ws_seq; - geo->nr_planes = geo->ws_opt / geo->ws_min; - geo->sec_per_pg = geo->ws_min; -
[PATCH] block, bfq: keep peak_rate estimation within range 1..2^32-1
Rate should never overflow or become zero because it is used as divider. This patch accumulates it with saturation. Signed-off-by: Konstantin Khlebnikov--- block/bfq-iosched.c |8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index aeca22d91101..a236c8d541b5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2546,7 +2546,8 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) { - u32 rate, weight, divisor; + u32 weight, divisor; + u64 rate; /* * For the convergence property to hold (see comments on @@ -2634,9 +2635,10 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) */ bfqd->peak_rate *= divisor-1; bfqd->peak_rate /= divisor; - rate /= divisor; /* smoothing constant alpha = 1/divisor */ + do_div(rate, divisor); /* smoothing constant alpha = 1/divisor */ - bfqd->peak_rate += rate; + /* rate should never overlow or become zero */ + bfqd->peak_rate = clamp_t(u64, rate + bfqd->peak_rate, 1, U32_MAX); update_thr_responsiveness_params(bfqd); reset_computation:
loop: properly declare rotational flag of underlying device
The loop driver has always declared the rotational flag of its device as rotational, even when the device of the mapped file is nonrotational, as is the case with SSDs or on tmpfs. This can confuse filesystem tools which are SSD-aware; in my case I frequently forget to tell mkfs.btrfs that my loop device on tmpfs is nonrotational, and that I really don't need any automatic metadata redundancy. The attached patch fixes this by introspecting the rotational flag of the mapped file's underlying block device, if it exists. If the mapped file's filesystem has no associated block device - as is the case on e.g. tmpfs - we assume nonrotational storage. If there is a better way to identify such non-devices I'd love to hear them. On previous submission (~2 years ago I think) Jens commented that this "changes the default". That's precisely the point: the default of declaring rotational behaviour unconditionally is more likely to be wrong than the other way around. To the best of my knowledge there are no byte- addressable but physically rotating media without associated block device. This is fresh against 4.16-rc4 and has been in production since 4.9, with some minor changes to accommodate initialization order in 4.14. Please consider for 4.17. Signed-off-by: Holger Hoffstätte -- cheers, Holger diff -rup linux-4.16-rc4/drivers/block/loop.c linux-4.16-rc4-loop/drivers/block/loop.c --- linux-4.16-rc4/drivers/block/loop.c 2018-03-04 23:54:11.0 +0100 +++ linux-4.16-rc4-loop/drivers/block/loop.c2018-03-05 00:41:48.013602156 +0100 @@ -829,6 +829,24 @@ static void loop_config_discard(struct l queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } +static void loop_update_rotational(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct inode *file_inode = file->f_mapping->host; + struct block_device *file_bdev = file_inode->i_sb->s_bdev; + struct request_queue *q = lo->lo_queue; + bool nonrot = true; + + /* not all filesystems (e.g. 
tmpfs) have a sb->s_bdev */ + if (file_bdev) + nonrot = blk_queue_nonrot(bdev_get_queue(file_bdev)); + + if (nonrot) + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + else + queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); +} + static void loop_unprepare_queue(struct loop_device *lo) { kthread_flush_worker(>worker); @@ -929,6 +947,7 @@ static int loop_set_fd(struct loop_devic loop_update_dio(lo); set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); + loop_update_rotational(lo); loop_sysfs_init(lo); /* let user-space know about the new size */ kobject_uevent(_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
Some bcache patches need review
Hello Mike I sent some patches some time before, with no response. Below patches are very simple, can you or anybody else have a review? [PATCH] bcache: fix incorrect sysfs output value of strip size [PATCH] bcache: fix error return value in memory shrink [PATCH] bcache: fix error count in memory shrink Below patches are simple too, and important for me and also important for bcache, can you or anybody else have a review? [PATCH] bcache: finish incremental GC [PATCH] bcache: calculate the number of incremental GC nodes according to the total of btree nodes Thanks, Tang Junhui
Re: vgdisplay hang on iSCSI session
Hi, Maybe a long shot, but could it be fixed by https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/drivers/block/loop.c?h=v4.9.86=56bc086358cac1a2949783646eabd57447b9d672 ? Or shouldn't that fix such kind of issues? It seems like some kind of race condition, because it happens so randomly and not on a specific action. Thanks Jean-Louis Op 2018-03-04 20:26, schreef Bart Van Assche: On Sun, 2018-03-04 at 20:01 +0100, Jean-Louis Dupond wrote: I'm running indeed CentOS 6 with the Virt SIG kernels. Already updated to 4.9.75, but recently hit the problem again. The first PID that was in D-state (root 27157 0.0 0.0 127664 5196 ?D06:19 0:00 \_ vgdisplay -c --ignorelockingfailure), had the following stack: # cat /proc/27157/stack [] blk_mq_freeze_queue_wait+0x6f/0xd0 [] blk_freeze_queue+0x1e/0x30 [] blk_mq_freeze_queue+0xe/0x10 [] loop_switch+0x1e/0xd0 [] lo_release+0x7a/0x80 [] __blkdev_put+0x1a7/0x200 [] blkdev_put+0x56/0x140 [] blkdev_close+0x24/0x30 [] __fput+0xc8/0x240 [] fput+0xe/0x10 [] task_work_run+0x68/0xa0 [] exit_to_usermode_loop+0xc6/0xd0 [] do_syscall_64+0x185/0x240 [] entry_SYSCALL64_slow_path+0x25/0x25 [] 0x Other procs show the following: # cat /proc/7803/stack [] __blkdev_get+0x6c/0x3f0 [] blkdev_get+0x5c/0x1c0 [] blkdev_open+0x62/0x80 [] do_dentry_open+0x22a/0x340 [] vfs_open+0x51/0x80 [] do_last+0x435/0x7a0 [] path_openat+0x87/0x1c0 [] do_filp_open+0x85/0xe0 [] do_sys_open+0x11c/0x210 [] SyS_open+0x1e/0x20 [] do_syscall_64+0x7a/0x240 [] entry_SYSCALL64_slow_path+0x25/0x25 [] 0x An strace hangs again on loop0 open: stat("/dev/loop0", {st_mode=S_IFBLK|0660, st_rdev=makedev(7, 0), ...}) = 0 open("/dev/loop0", O_RDONLY|O_DIRECT|O_NOATIME And it seems like indeed a lot is hanging on loop0: # cat /sys/block/loop0/mq/0/queued 5957 Hello Jean-Louis, Is the system still in this state? If so, can you provide the output of the following command (as an attachment): find /sys/kernel/debug/block/ -type f \! 
\( -name poll_stat -o -name dispatched -o -name merged -o -name completed \) Thanks, Bart.
[PATCH v2] bcache: move closure debug file into debug directory
LGTM. Reviewed-by: Tang Junhui> In current code closure debug file is outside of debug directory > and when unloading module there is lack of removing operation > for closure debug file, so it will cause creating error when trying > to reload module. > > This patch move closure debug file into "bcache" debug direcory > so that the file can get deleted properly. > > Signed-off-by: Chengguang Xu > --- > Changes since v1: > - Move closure debug file into "bcache" debug direcory instead of > deleting it individually. > - Change Signed-off-by mail address. > > drivers/md/bcache/closure.c | 9 + > drivers/md/bcache/closure.h | 5 +++-- > drivers/md/bcache/debug.c | 2 +- > drivers/md/bcache/super.c | 3 +-- > 4 files changed, 10 insertions(+), 9 deletions(-) > > diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c > index 7f12920..64b123c 100644 > --- a/drivers/md/bcache/closure.c > +++ b/drivers/md/bcache/closure.c > @@ -157,7 +157,7 @@ void closure_debug_destroy(struct closure *cl) > } > EXPORT_SYMBOL(closure_debug_destroy); > > -static struct dentry *debug; > +static struct dentry *closure_debug; > > static int debug_seq_show(struct seq_file *f, void *data) > { > @@ -199,11 +199,12 @@ static int debug_seq_open(struct inode *inode, struct > file *file) > .release= single_release > }; > > -void __init closure_debug_init(void) > +int __init closure_debug_init(void) > { > -debug = debugfs_create_file("closures", 0400, NULL, NULL, _ops); > +closure_debug = debugfs_create_file("closures", > +0400, debug, NULL, _ops); > +return IS_ERR_OR_NULL(closure_debug); > } > - > #endif > > MODULE_AUTHOR("Kent Overstreet "); > diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h > index 3b9dfc9..0fb704d 100644 > --- a/drivers/md/bcache/closure.h > +++ b/drivers/md/bcache/closure.h > @@ -105,6 +105,7 @@ > struct closure; > struct closure_syncer; > typedef void (closure_fn) (struct closure *); > +extern struct dentry *debug; > > struct closure_waitlist { > 
struct llist_headlist; > @@ -185,13 +186,13 @@ static inline void closure_sync(struct closure *cl) > > #ifdef CONFIG_BCACHE_CLOSURES_DEBUG > > -void closure_debug_init(void); > +int closure_debug_init(void); > void closure_debug_create(struct closure *cl); > void closure_debug_destroy(struct closure *cl); > > #else > > -static inline void closure_debug_init(void) {} > +static inline int closure_debug_init(void) { return 0; } > static inline void closure_debug_create(struct closure *cl) {} > static inline void closure_debug_destroy(struct closure *cl) {} > > diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c > index af89408..5db02de 100644 > --- a/drivers/md/bcache/debug.c > +++ b/drivers/md/bcache/debug.c > @@ -17,7 +17,7 @@ > #include > #include > > -static struct dentry *debug; > +struct dentry *debug; > > #ifdef CONFIG_BCACHE_DEBUG > > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c > index 1a9fdab..b784292 100644 > --- a/drivers/md/bcache/super.c > +++ b/drivers/md/bcache/super.c > @@ -2133,7 +2133,6 @@ static int __init bcache_init(void) > mutex_init(_register_lock); > init_waitqueue_head(_wait); > register_reboot_notifier(); > -closure_debug_init(); > > bcache_major = register_blkdev(0, "bcache"); > if (bcache_major < 0) { > @@ -2145,7 +2144,7 @@ static int __init bcache_init(void) > if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || > !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || > bch_request_init() || > -bch_debug_init(bcache_kobj) || > +bch_debug_init(bcache_kobj) || closure_debug_init() || > sysfs_create_files(bcache_kobj, files)) > goto err; > > -- > 1.8.3.1 Thanks Tang Junhui