Re: [PATCH 1/2] iommu/iova: Flush CPU rcache for when a depot fills
On 25/09/2020 15:34, John Garry wrote: Indeed, I think that the mainline code has a bug: If the initial allocation for the loaded/prev magazines fails (gives NULL) in init_iova_rcaches(), then in __iova_rcache_insert(): if (!iova_magazine_full(cpu_rcache->loaded)) { can_insert = true; If cpu_rcache->loaded == NULL, then can_insert is assigned true -> bang, as I experimented, below. This needs to be fixed... This looks better: Subject: [PATCH] iommu/iova: Avoid double-negatives with magazine helpers Expression !iova_magazine_full(mag) evaluates true when mag == NULL. This falls over in __iova_rcache_insert() when loaded == NULL: if (!iova_magazine_full(cpu_rcache->loaded)) { can_insert = true; ... if (can_insert) iova_magazine_push(cpu_rcache->loaded, iova_pfn); Here, can_insert is evaluated true, which is wrong. Members loaded/prev can possibly be NULL if the initial allocations fail in init_iova_rcaches(). Let's stop using double-negatives, like !iova_magazine_full(), and use iova_magazine_has_space() instead in this case. And similar for !iova_magazine_empty(). 
Signed-off-by: John Garry diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 5b4ffab7140b..42ca9d0f39b7 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -827,14 +827,18 @@ iova_magazine_free_pfns(struct iova_magazine *mag, struct iova_domain *iovad) mag->size = 0; } -static bool iova_magazine_full(struct iova_magazine *mag) +static bool iova_magazine_has_space(struct iova_magazine *mag) { - return (mag && mag->size == IOVA_MAG_SIZE); + if (!mag) + return false; + return mag->size < IOVA_MAG_SIZE; } -static bool iova_magazine_empty(struct iova_magazine *mag) +static bool iova_magazine_has_pfns(struct iova_magazine *mag) { - return (!mag || mag->size == 0); + if (!mag) + return false; + return mag->size; } static unsigned long iova_magazine_pop(struct iova_magazine *mag, @@ -843,7 +847,7 @@ static unsigned long iova_magazine_pop(struct iova_magazine *mag, int i; unsigned long pfn; - BUG_ON(iova_magazine_empty(mag)); + BUG_ON(!iova_magazine_has_pfns(mag)); /* Only fall back to the rbtree if we have no suitable pfns at all */ for (i = mag->size - 1; mag->pfns[i] > limit_pfn; i--) @@ -859,7 +863,7 @@ static unsigned long iova_magazine_pop(struct iova_magazine *mag, static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn) { - BUG_ON(iova_magazine_full(mag)); + BUG_ON(!iova_magazine_has_space(mag)); mag->pfns[mag->size++] = pfn; } @@ -905,9 +909,9 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches); spin_lock_irqsave(_rcache->lock, flags); - if (!iova_magazine_full(cpu_rcache->loaded)) { + if (iova_magazine_has_space(cpu_rcache->loaded)) { can_insert = true; - } else if (!iova_magazine_full(cpu_rcache->prev)) { + } else if (iova_magazine_has_space(cpu_rcache->prev)) { swap(cpu_rcache->prev, cpu_rcache->loaded); can_insert = true; } else { @@ -915,7 +919,8 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, if (new_mag) { spin_lock(>lock); - if 
(rcache->depot_size < MAX_GLOBAL_MAGS) { + if (rcache->depot_size < MAX_GLOBAL_MAGS && + cpu_rcache->loaded) { rcache->depot[rcache->depot_size++] = cpu_rcache->loaded; } else { @@ -968,9 +973,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache, cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches); spin_lock_irqsave(&cpu_rcache->lock, flags); - if (!iova_magazine_empty(cpu_rcache->loaded)) { + if (iova_magazine_has_pfns(cpu_rcache->loaded)) { has_pfn = true; - } else if (!iova_magazine_empty(cpu_rcache->prev)) { + } else if (iova_magazine_has_pfns(cpu_rcache->prev)) { swap(cpu_rcache->prev, cpu_rcache->loaded); has_pfn = true; } else { -- 2.26.2 ___ iommu mailing list iommu@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/iommu
Re: [PATCH 1/2] iommu/iova: Flush CPU rcache for when a depot fills
On 25/09/2020 12:53, Robin Murphy wrote: --- drivers/iommu/iova.c | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 45a251da5453..05e0b462e0d9 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -892,9 +892,8 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, struct iova_rcache *rcache, unsigned long iova_pfn) { - struct iova_magazine *mag_to_free = NULL; struct iova_cpu_rcache *cpu_rcache; - bool can_insert = false; + bool can_insert = false, flush = false; unsigned long flags; cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches); @@ -913,13 +912,19 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, if (rcache->depot_size < MAX_GLOBAL_MAGS) { rcache->depot[rcache->depot_size++] = cpu_rcache->loaded; + can_insert = true; + cpu_rcache->loaded = new_mag; } else { - mag_to_free = cpu_rcache->loaded; + /* + * The depot is full, meaning that a very large + * cache of IOVAs has built up, which slows + * down RB tree accesses significantly + * -> let's flush at this point. + */ + flush = true; + iova_magazine_free(new_mag); } spin_unlock(>lock); - - cpu_rcache->loaded = new_mag; - can_insert = true; } } @@ -928,9 +933,11 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, spin_unlock_irqrestore(_rcache->lock, flags); - if (mag_to_free) { - iova_magazine_free_pfns(mag_to_free, iovad); - iova_magazine_free(mag_to_free); + if (flush) { Do you really need this flag, or is it effectively just mirroring "!can_insert" - in theory if there wasn't enough memory to allocate a new magazine, then freeing some more IOVAs wouldn't necessarily be a bad thing to do anyway. Right, I can reuse can_insert. Other than that, I think this looks reasonable. Every time I look at __iova_rcache_insert() I'm convinced there must be a way to restructure it to be more streamlined overall, but I can never quite see exactly how... 
We could remove the new_mag check, but the code cannot safely handle loaded/prev = NULL. Indeed, I think that the mainline code has a bug: If the initial allocation for the loaded/prev magazines fail (give NULL) in init_iova_rcaches(), then in __iova_rcache_insert(): if (!iova_magazine_full(cpu_rcache->loaded)) { can_insert = true; If cpu_rcache->loaded == NULL, then can_insert is assigned true -> bang, as I experimented, below. This needs to be fixed... Thanks, john ereference at virtual address [ 10.195299] Mem abort info: [ 10.198080] ESR = 0x9604 [ 10.201121] EC = 0x25: DABT (current EL), IL = 32 bits [ 10.206418] SET = 0, FnV = 0 [ 10.209459] EA = 0, S1PTW = 0 [ 10.212585] Data abort info: [ 10.215452] ISV = 0, ISS = 0x0004 [ 10.219274] CM = 0, WnR = 0 [ 10.28] [] user address but active_mm is swapper [ 10.228569] Internal error: Oops: 9604 [#1] PREEMPT SMP [ 10.234127] Modules linked in: [ 10.237170] CPU: 11 PID: 696 Comm: irq/40-hisi_sas Not tainted 5.9.0-rc5-47738-gb1ead657a3fa-dirty #658 [ 10.246548] Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI RC0 - V1.16.01 03/15/2019 [ 10.255058] pstate: 60c00089 (nZCv daIf +PAN +UAO BTYPE=--) [ 10.260620] pc : free_iova_fast+0xfc/0x280 [ 10.264703] lr : free_iova_fast+0x94/0x280 [ 10.268785] sp : 80002477bbb0 [ 10.272086] x29: 80002477bbb0 x28: [ 10.277385] x27: 002bc8fbb940 x26: 002bc727e26c [ 10.282684] x25: x24: 002bc9439008 [ 10.287982] x23: 000fdffe x22: 0080 [ 10.293280] x21: 002bc9439008 x20: [ 10.298579] x19: f403e9ebb700 x18: [ 10.303877] x17: 0001 x16: [ 10.309176] x15: x14: 0040 [ 10.314474] x13: 7fff x12: 0001 [ 10.319772] x11: 000f x10: 6000 [ 10.325070] x9 : x8 : 80002477b768 [ 10.330368] x7 : x6 : 003f [ 10.335666] x5 : 0040 x4 : [ 10.340964] x3 : f403e9ebb700 x2 : [ 10.346262] x1 : x0 : [ 10.351561] Call trace: [ 10.353995]free_iova_fast+0xfc/0x280 [ 10.357731]iommu_dma_free_iova+0x64/0x70 [ 10.361814]__iommu_dma_unmap+0x9c/0xf8 [ 10.365723]iommu_dma_unmap_sg+0xa8/0xc8 [ 
10.369720] dma_unmap_sg_attrs+0x28/0x50 [ 10.373717] cq_thread_v3_hw+0x2dc/0x528 [ 10.377626] irq_thread_fn+0x2c/0xa0 [ 10.381188] irq_thread+0x130/0x1e0 [ 10.384664] kthread+0x154/0x158 [
Re: [PATCH 1/2] iommu/iova: Flush CPU rcache for when a depot fills
On 2020-09-25 10:51, John Garry wrote: Leizhen reported some time ago that IOVA performance may degrade over time [0], but unfortunately his solution to fix this problem was not given attention. To summarize, the issue is that as time goes by, the CPU rcache and depot rcache continue to grow. As such, IOVA RB tree access time also continues to grow. At a certain point, a depot may become full, and also some CPU rcaches may also be full when we try to insert another IOVA. For this scenario, currently we free the "loaded" CPU rcache and create a new one. This free'ing means that we need to free many IOVAs in the RB tree, which makes IO throughput performance fall off a cliff in our storage scenario: Jobs: 12 (f=12): [] [0.0% done] [6314MB/0KB/0KB /s] [1616K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [5669MB/0KB/0KB /s] [1451K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6031MB/0KB/0KB /s] [1544K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6673MB/0KB/0KB /s] [1708K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6705MB/0KB/0KB /s] [1717K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6031MB/0KB/0KB /s] [1544K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6761MB/0KB/0KB /s] [1731K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6705MB/0KB/0KB /s] [1717K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6685MB/0KB/0KB /s] [1711K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6178MB/0KB/0KB /s] [1582K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [6731MB/0KB/0KB /s] [1723K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [2387MB/0KB/0KB /s] [611K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [2689MB/0KB/0KB /s] [688K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [2278MB/0KB/0KB /s] [583K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [1288MB/0KB/0KB /s] [330K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [1632MB/0KB/0KB /s] [418K/0/0 iops] Jobs: 12 (f=12): [] [0.0% done] [1765MB/0KB/0KB /s] [452K/0/0 iops] And continue in this fashion, without recovering. Note that in this example we had to wait 16 hours for this to occur. 
Also note that IO throughput also gradually becomes more unstable leading up to this point. As a solution to this issue, we judge that the IOVA rcaches have grown too big, and just flush all the CPUs rcaches instead. The depot rcaches, however, are not flushed, as they can be used to immediately replenish active CPUs. In future, some IOVA rcache compaction could be implemented to solve the instability issue, which I figure could be quite complex to implement. [0] https://lore.kernel.org/linux-iommu/20190815121104.29140-3-thunder.leizhen@huawei.com/ Reported-by: Xiang Chen Tested-by: Xiang Chen Signed-off-by: John Garry --- drivers/iommu/iova.c | 25 +++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 45a251da5453..05e0b462e0d9 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -892,9 +892,8 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, struct iova_rcache *rcache, unsigned long iova_pfn) { - struct iova_magazine *mag_to_free = NULL; struct iova_cpu_rcache *cpu_rcache; - bool can_insert = false; + bool can_insert = false, flush = false; unsigned long flags; cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches); @@ -913,13 +912,19 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, if (rcache->depot_size < MAX_GLOBAL_MAGS) { rcache->depot[rcache->depot_size++] = cpu_rcache->loaded; + can_insert = true; + cpu_rcache->loaded = new_mag; } else { - mag_to_free = cpu_rcache->loaded; + /* + * The depot is full, meaning that a very large + * cache of IOVAs has built up, which slows + * down RB tree accesses significantly + * -> let's flush at this point. + */ + flush = true; + iova_magazine_free(new_mag); } spin_unlock(&rcache->lock); - - cpu_rcache->loaded = new_mag; - can_insert = true; } } @@ -928,9 +933,11 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, spin_unlock_irqrestore(&cpu_rcache->lock, flags); - if (mag_to_free) { - iova_magazine_free_pfns(mag_to_free, iovad); -