Re: [PATCH 1/2] iommu/iova: Flush CPU rcache for when a depot fills

2020-09-25 Thread John Garry

On 25/09/2020 15:34, John Garry wrote:

Indeed, I think that the mainline code has a bug:

If the initial allocation for the loaded/prev magazines fail (give NULL) 
in init_iova_rcaches(), then in __iova_rcache_insert():


if (!iova_magazine_full(cpu_rcache->loaded)) {
 can_insert = true;

If cpu_rcache->loaded == NULL, then can_insert is assigned true -> bang, 
as I experimented, below. This needs to be fixed...




This looks better:

Subject: [PATCH] iommu/iova: Avoid double-negatives with magazine helpers

Expression !iova_magazine_full(mag) evaluates true when mag == NULL.

This falls over in __iova_rcache_insert() when loaded == NULL:

if (!iova_magazine_full(cpu_rcache->loaded)) {
can_insert = true;

...

if (can_insert)
iova_magazine_push(cpu_rcache->loaded, iova_pfn);

Here, can_insert is evaluated true, which is wrong. Members
loaded/prev can possibly be NULL if the initial allocations fail in
init_iova_rcaches().

Let's stop using double-negatives, like !iova_magazine_full(), and use
iova_magazine_has_space() instead in this case. And similar for
!iova_magazine_empty().

Signed-off-by: John Garry 

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 5b4ffab7140b..42ca9d0f39b7 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -827,14 +827,18 @@ iova_magazine_free_pfns(struct iova_magazine *mag, 
struct iova_domain *iovad)

mag->size = 0;
 }

-static bool iova_magazine_full(struct iova_magazine *mag)
+static bool iova_magazine_has_space(struct iova_magazine *mag)
 {
-   return (mag && mag->size == IOVA_MAG_SIZE);
+   if (!mag)
+   return false;
+   return mag->size < IOVA_MAG_SIZE;
 }

-static bool iova_magazine_empty(struct iova_magazine *mag)
+static bool iova_magazine_has_pfns(struct iova_magazine *mag)
 {
-   return (!mag || mag->size == 0);
+   if (!mag)
+   return false;
+   return mag->size;
 }

 static unsigned long iova_magazine_pop(struct iova_magazine *mag,
@@ -843,7 +847,7 @@ static unsigned long iova_magazine_pop(struct 
iova_magazine *mag,

int i;
unsigned long pfn;

-   BUG_ON(iova_magazine_empty(mag));
+   BUG_ON(!iova_magazine_has_pfns(mag));

/* Only fall back to the rbtree if we have no suitable pfns at all */
for (i = mag->size - 1; mag->pfns[i] > limit_pfn; i--)
@@ -859,7 +863,7 @@ static unsigned long iova_magazine_pop(struct 
iova_magazine *mag,


 static void iova_magazine_push(struct iova_magazine *mag, unsigned 
long pfn)

 {
-   BUG_ON(iova_magazine_full(mag));
+   BUG_ON(!iova_magazine_has_space(mag));

mag->pfns[mag->size++] = pfn;
 }
@@ -905,9 +909,9 @@ static bool __iova_rcache_insert(struct iova_domain 
*iovad,

cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
spin_lock_irqsave(&cpu_rcache->lock, flags);

-   if (!iova_magazine_full(cpu_rcache->loaded)) {
+   if (iova_magazine_has_space(cpu_rcache->loaded)) {
can_insert = true;
-   } else if (!iova_magazine_full(cpu_rcache->prev)) {
+   } else if (iova_magazine_has_space(cpu_rcache->prev)) {
swap(cpu_rcache->prev, cpu_rcache->loaded);
can_insert = true;
} else {
@@ -915,7 +919,8 @@ static bool __iova_rcache_insert(struct iova_domain 
*iovad,


if (new_mag) {
spin_lock(&rcache->lock);
-   if (rcache->depot_size < MAX_GLOBAL_MAGS) {
+   if (rcache->depot_size < MAX_GLOBAL_MAGS &&
+   cpu_rcache->loaded) {
rcache->depot[rcache->depot_size++] =
cpu_rcache->loaded;
} else {
@@ -968,9 +973,9 @@ static unsigned long __iova_rcache_get(struct 
iova_rcache *rcache,

cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
spin_lock_irqsave(&cpu_rcache->lock, flags);

-   if (!iova_magazine_empty(cpu_rcache->loaded)) {
+   if (iova_magazine_has_pfns(cpu_rcache->loaded)) {
has_pfn = true;
-   } else if (!iova_magazine_empty(cpu_rcache->prev)) {
+   } else if (iova_magazine_has_pfns(cpu_rcache->prev)) {
swap(cpu_rcache->prev, cpu_rcache->loaded);
has_pfn = true;
} else {
--
2.26.2

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH 1/2] iommu/iova: Flush CPU rcache for when a depot fills

2020-09-25 Thread John Garry

On 25/09/2020 12:53, Robin Murphy wrote:

---
  drivers/iommu/iova.c | 25 -
  1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 45a251da5453..05e0b462e0d9 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -892,9 +892,8 @@ static bool __iova_rcache_insert(struct 
iova_domain *iovad,

   struct iova_rcache *rcache,
   unsigned long iova_pfn)
  {
-    struct iova_magazine *mag_to_free = NULL;
  struct iova_cpu_rcache *cpu_rcache;
-    bool can_insert = false;
+    bool can_insert = false, flush = false;
  unsigned long flags;
  cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
@@ -913,13 +912,19 @@ static bool __iova_rcache_insert(struct 
iova_domain *iovad,

  if (rcache->depot_size < MAX_GLOBAL_MAGS) {
  rcache->depot[rcache->depot_size++] =
  cpu_rcache->loaded;
+    can_insert = true;
+    cpu_rcache->loaded = new_mag;
  } else {
-    mag_to_free = cpu_rcache->loaded;
+    /*
+ * The depot is full, meaning that a very large
+ * cache of IOVAs has built up, which slows
+ * down RB tree accesses significantly
+ * -> let's flush at this point.
+ */
+    flush = true;
+    iova_magazine_free(new_mag);
  }
  spin_unlock(&rcache->lock);
-
-    cpu_rcache->loaded = new_mag;
-    can_insert = true;
  }
  }
@@ -928,9 +933,11 @@ static bool __iova_rcache_insert(struct 
iova_domain *iovad,

  spin_unlock_irqrestore(&cpu_rcache->lock, flags);
-    if (mag_to_free) {
-    iova_magazine_free_pfns(mag_to_free, iovad);
-    iova_magazine_free(mag_to_free);
+    if (flush) {


Do you really need this flag, or is it effectively just mirroring 
"!can_insert" - in theory if there wasn't enough memory to allocate a 
new magazine, then freeing some more IOVAs wouldn't necessarily be a bad 
thing to do anyway.


Right, I can reuse can_insert.



Other than that, I think this looks reasonable. Every time I look at 
__iova_rcache_insert() I'm convinced there must be a way to restructure 
it to be more streamlined overall, but I can never quite see exactly how...




We could remove the new_mag check, but the code cannot safely handle 
loaded/prev = NULL. Indeed, I think that the mainline code has a bug:


If the initial allocation for the loaded/prev magazines fail (give NULL) 
in init_iova_rcaches(), then in __iova_rcache_insert():


if (!iova_magazine_full(cpu_rcache->loaded)) {
can_insert = true;

If cpu_rcache->loaded == NULL, then can_insert is assigned true -> bang, 
as I experimented, below. This needs to be fixed...


Thanks,
john



Unable to handle kernel NULL pointer dereference at virtual address 
[ 10.195299] Mem abort info:
[ 10.198080] ESR = 0x9604
[ 10.201121] EC = 0x25: DABT (current EL), IL = 32 bits
[ 10.206418] SET = 0, FnV = 0
[ 10.209459] EA = 0, S1PTW = 0
[ 10.212585] Data abort info:
[ 10.215452] ISV = 0, ISS = 0x0004
[ 10.219274] CM = 0, WnR = 0
[ 10.28] [] user address but active_mm is swapper
[ 10.228569] Internal error: Oops: 9604 [#1] PREEMPT SMP
[ 10.234127] Modules linked in:
[ 10.237170] CPU: 11 PID: 696 Comm: irq/40-hisi_sas Not tainted 
5.9.0-rc5-47738-gb1ead657a3fa-dirty #658
[ 10.246548] Hardware name: Huawei D06 /D06, BIOS Hisilicon D06 UEFI RC0 
- V1.16.01 03/15/2019

[ 10.255058] pstate: 60c00089 (nZCv daIf +PAN +UAO BTYPE=--)
[ 10.260620] pc : free_iova_fast+0xfc/0x280
[ 10.264703] lr : free_iova_fast+0x94/0x280
[ 10.268785] sp : 80002477bbb0
[ 10.272086] x29: 80002477bbb0 x28: 
[ 10.277385] x27: 002bc8fbb940 x26: 002bc727e26c
[ 10.282684] x25:  x24: 002bc9439008
[ 10.287982] x23: 000fdffe x22: 0080
[ 10.293280] x21: 002bc9439008 x20: 
[ 10.298579] x19: f403e9ebb700 x18: 
[ 10.303877] x17: 0001 x16: 
[ 10.309176] x15:  x14: 0040
[ 10.314474] x13: 7fff x12: 0001
[ 10.319772] x11: 000f x10: 6000
[ 10.325070] x9 :  x8 : 80002477b768
[ 10.330368] x7 :  x6 : 003f
[ 10.335666] x5 : 0040 x4 : 
[ 10.340964] x3 : f403e9ebb700 x2 : 
[ 10.346262] x1 :  x0 : 
[ 10.351561] Call trace:
[ 10.353995]free_iova_fast+0xfc/0x280
[ 10.357731]iommu_dma_free_iova+0x64/0x70
[ 10.361814]__iommu_dma_unmap+0x9c/0xf8
[ 10.365723]iommu_dma_unmap_sg+0xa8/0xc8
[ 10.369720]dma_unmap_sg_attrs+0x28/0x50
[ 10.373717]cq_thread_v3_hw+0x2dc/0x528
[ 10.377626]irq_thread_fn+0x2c/0xa0
[ 10.381188]irq_thread+0x130/0x1e0
[ 10.384664]kthread+0x154/0x158
[ 

Re: [PATCH 1/2] iommu/iova: Flush CPU rcache for when a depot fills

2020-09-25 Thread Robin Murphy

On 2020-09-25 10:51, John Garry wrote:

Leizhen reported some time ago that IOVA performance may degrade over time
[0], but unfortunately his solution to fix this problem was not given
attention.

To summarize, the issue is that as time goes by, the CPU rcache and depot
rcache continue to grow. As such, IOVA RB tree access time also continues
to grow.

At a certain point, a depot may become full, and also some CPU rcaches may
also be full when we try to insert another IOVA. For this scenario,
currently we free the "loaded" CPU rcache and create a new one. This
free'ing means that we need to free many IOVAs in the RB tree, which
makes IO throughput performance fall off a cliff in our storage scenario:

Jobs: 12 (f=12): [] [0.0% done] [6314MB/0KB/0KB /s] [1616K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [5669MB/0KB/0KB /s] [1451K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6031MB/0KB/0KB /s] [1544K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6673MB/0KB/0KB /s] [1708K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6705MB/0KB/0KB /s] [1717K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6031MB/0KB/0KB /s] [1544K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6761MB/0KB/0KB /s] [1731K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6705MB/0KB/0KB /s] [1717K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6685MB/0KB/0KB /s] [1711K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6178MB/0KB/0KB /s] [1582K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [6731MB/0KB/0KB /s] [1723K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [2387MB/0KB/0KB /s] [611K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [2689MB/0KB/0KB /s] [688K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [2278MB/0KB/0KB /s] [583K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [1288MB/0KB/0KB /s] [330K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [1632MB/0KB/0KB /s] [418K/0/0 iops]
Jobs: 12 (f=12): [] [0.0% done] [1765MB/0KB/0KB /s] [452K/0/0 iops]

And continue in this fashion, without recovering. Note that in this
example we had to wait 16 hours for this to occur. Also note that IO
throughput also gradually becomes more unstable leading up to this
point.

As a solution to this issue, we judge that the IOVA rcaches have grown too
big, and just flush all the CPUs rcaches instead.

The depot rcaches, however, are not flushed, as they can be used to
immediately replenish active CPUs.

In future, some IOVA rcache compaction could be implemented to solve the
instability issue, which I figure could be quite complex to implement.

[0] 
https://lore.kernel.org/linux-iommu/20190815121104.29140-3-thunder.leiz...@huawei.com/

Reported-by: Xiang Chen 
Tested-by: Xiang Chen 
Signed-off-by: John Garry 
---
  drivers/iommu/iova.c | 25 -
  1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 45a251da5453..05e0b462e0d9 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -892,9 +892,8 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 struct iova_rcache *rcache,
 unsigned long iova_pfn)
  {
-   struct iova_magazine *mag_to_free = NULL;
struct iova_cpu_rcache *cpu_rcache;
-   bool can_insert = false;
+   bool can_insert = false, flush = false;
unsigned long flags;
  
  	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);

@@ -913,13 +912,19 @@ static bool __iova_rcache_insert(struct iova_domain 
*iovad,
if (rcache->depot_size < MAX_GLOBAL_MAGS) {
rcache->depot[rcache->depot_size++] =
cpu_rcache->loaded;
+   can_insert = true;
+   cpu_rcache->loaded = new_mag;
} else {
-   mag_to_free = cpu_rcache->loaded;
+   /*
+* The depot is full, meaning that a very large
+* cache of IOVAs has built up, which slows
+* down RB tree accesses significantly
+* -> let's flush at this point.
+*/
+   flush = true;
+   iova_magazine_free(new_mag);
}
spin_unlock(&rcache->lock);
-
-   cpu_rcache->loaded = new_mag;
-   can_insert = true;
}
}
  
@@ -928,9 +933,11 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
  
	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
  
-	if (mag_to_free) {

-   iova_magazine_free_pfns(mag_to_free, iovad);
-