RE: arm-smmu-v3 high cpu usage for NVMe

2020-05-24 Thread Song Bao Hua (Barry Song)
> Subject: Re: arm-smmu-v3 high cpu usage for NVMe
> 
> On 20/03/2020 10:41, John Garry wrote:
> 
> + Barry, Alexandru
> 
> >     PerfTop:   85864 irqs/sec  kernel:89.6%  exact:  0.0% lost: 0/34434 drop: 0/40116 [4000Hz cycles],  (all, 96 CPUs)
> >
> 
> --
> >
> >
> >   27.43%  [kernel]  [k] arm_smmu_cmdq_issue_cmdlist
> >   11.71%  [kernel]  [k] _raw_spin_unlock_irqrestore
> >    6.35%  [kernel]  [k] _raw_spin_unlock_irq
> >    2.65%  [kernel]  [k] get_user_pages_fast
> >    2.03%  [kernel]  [k] __slab_free
> >    1.55%  [kernel]  [k] tick_nohz_idle_exit
> >    1.47%  [kernel]  [k] arm_lpae_map
> >    1.39%  [kernel]  [k] __fget
> >    1.14%  [kernel]  [k] __lock_text_start
> >    1.09%  [kernel]  [k] _raw_spin_lock
> >    1.08%  [kernel]  [k] bio_release_pages.part.42
> >    1.03%  [kernel]  [k] __sbitmap_get_word
> >    0.97%  [kernel]  [k] arm_smmu_atc_inv_domain.constprop.42
> >    0.91%  [kernel]  [k] fput_many
> >    0.88%  [kernel]  [k] __arm_lpae_map
> >
> 
> Hi Will, Robin,
> 
> I'm just getting around to looking at this topic again. Here's the current
> picture for my NVMe test:
> 
> perf top -C 0 *
> Samples: 808 of event 'cycles:ppp', Event count (approx.): 469909024
> Overhead Shared Object Symbol
> 75.91% [kernel] [k] arm_smmu_cmdq_issue_cmdlist
> 3.28% [kernel] [k] arm_smmu_tlb_inv_range
> 2.42% [kernel] [k] arm_smmu_atc_inv_domain.constprop.49
> 2.35% [kernel] [k] _raw_spin_unlock_irqrestore
> 1.32% [kernel] [k] __arm_smmu_cmdq_poll_set_valid_map.isra.41
> 1.20% [kernel] [k] aio_complete_rw
> 0.96% [kernel] [k] enqueue_task_fair
> 0.93% [kernel] [k] gic_handle_irq
> 0.86% [kernel] [k] _raw_spin_lock_irqsave
> 0.72% [kernel] [k] put_reqs_available
> 0.72% [kernel] [k] sbitmap_queue_clear
> 
> * only certain CPUs run the dma unmap for my scenario, cpu0 being one of
> them.
> 
> Colleague Barry has similar findings for some other scenarios.

I wrote a test module and used the parameter "ways" to simulate how busy the SMMU
is, and compared the latency under different degrees of contention.
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/dma-mapping.h>

static int ways = 16;
module_param(ways, int, S_IRUGO);

static int seconds = 120;
module_param(seconds, int, S_IRUGO);

/* helper exported elsewhere that returns a device attached to the SMMU */
extern struct device *get_zip_dev(void);

static noinline void test_mapsingle(struct device *dev, void *buf, int size)
{
	dma_addr_t dma_addr = dma_map_single(dev, buf, size, DMA_TO_DEVICE);

	dma_unmap_single(dev, dma_addr, size, DMA_TO_DEVICE);
}

static noinline void test_memcpy(void *out, void *in, int size)
{
	memcpy(out, in, size);
}

static int testthread(void *data)
{
	unsigned long stop = jiffies + seconds * HZ;
	struct device *dev = get_zip_dev();
	char *input, *output;

	input = kzalloc(4096, GFP_KERNEL);
	if (!input)
		return -ENOMEM;

	output = kzalloc(4096, GFP_KERNEL);
	if (!output) {
		kfree(input);
		return -ENOMEM;
	}

	while (time_before(jiffies, stop)) {
		test_mapsingle(dev, input, 4096);
		test_memcpy(output, input, 4096);
	}

	kfree(output);
	kfree(input);

	return 0;
}

static int __init test_init(void)
{
	struct task_struct *tsk;
	int i;

	for (i = 0; i < ways; i++) {
		tsk = kthread_run(testthread, NULL, "map_test-%d", i);
		if (IS_ERR(tsk))
			pr_err("create test thread failed\n");
	}

	return 0;
}

When ways=16, a large share of the time is already spent on:
cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val)

When ways=64, more than 60% of the time is spent on:
cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val)
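
That cmpxchg_relaxed() is the lock-free allocation of command-queue space in
arm_smmu_cmdq_issue_cmdlist(): every CPU retries a compare-and-swap on the same
queue head, so the cycles burnt there grow with the number of contenders. Below
is a stand-alone user-space sketch of the same retry pattern (C11 atomics plus
pthreads, purely illustrative and not the driver code; the thread count plays
the role of "ways" and the op count is made up):

/* cas_contention.c - illustrative sketch, not the SMMU driver code.
 * Build: gcc -O2 -pthread cas_contention.c -o cas_contention
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define OPS_PER_THREAD 1000000UL

static _Atomic unsigned long head;     /* shared "queue head", like cmdq->q.llq */
static _Atomic unsigned long retries;  /* failed cmpxchg attempts */

static void *worker(void *arg)
{
	(void)arg;
	for (unsigned long i = 0; i < OPS_PER_THREAD; i++) {
		unsigned long old = atomic_load_explicit(&head, memory_order_relaxed);

		/* claim one slot: new head = old + 1, retry on collision
		 * (weak CAS may also fail spuriously, which just counts as a retry) */
		while (!atomic_compare_exchange_weak_explicit(&head, &old, old + 1,
				memory_order_relaxed, memory_order_relaxed))
			atomic_fetch_add_explicit(&retries, 1, memory_order_relaxed);
	}
	return NULL;
}

int main(int argc, char **argv)
{
	int ways = argc > 1 ? atoi(argv[1]) : 16;   /* like the "ways" module param */
	pthread_t *t;
	int i;

	if (ways < 1)
		ways = 1;
	t = calloc(ways, sizeof(*t));

	for (i = 0; i < ways; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (i = 0; i < ways; i++)
		pthread_join(t[i], NULL);

	printf("ways=%d ops=%lu failed cmpxchg=%lu\n",
	       ways, (unsigned long)ways * OPS_PER_THREAD, atomic_load(&retries));
	free(t);
	return 0;
}

Running it with different thread counts gives a rough feel for how quickly the
retry count climbs as more CPUs pile onto the same cache line.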

Here is a table of the whole dma_unmap, arm_smmu_cmdq_issue_cmdlist() and the
wait for CMD_SYNC with different ways:

         whole unmap (ns)   arm_smmu_cmdq_issue_cmdlist() (ns)   wait CMD_SYNC (ns)
Ways=1         1956                     1328                            883
Ways=16        8891                     7474                           4000
Ways=32       22043                    19519                           6879
Ways=64       60842                    55895                          16746
Ways=96      101880                    93649                          24429

As you can see, even with ways=1 we still need about 2us to unmap;
arm_smmu_cmdq_issue_cmdlist() takes more than 60% of the dma_unmap time, and
waiting for CMD_SYNC takes more than 60% of the arm_smmu_cmdq_issue_cmdlist() time.
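
(Worked out from the Ways=1 row: 1328/1956 ≈ 68% of the unmap time is spent in
arm_smmu_cmdq_issue_cmdlist(), and 883/1328 ≈ 66% of that is the wait for
CMD_SYNC, so both "more than 60%" figures line up with the table.)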

When the SMMU is very busy, dma_unmap latency can become very large because of
this contention: more than 100us in the Ways=96 case.
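
For reference, per-call latencies like those in the table can be collected by
wrapping the calls with ktime_get(); below is a minimal sketch of such a timing
helper (illustrative only, not part of the module as posted; the name and loop
count are made up):

#include <linux/ktime.h>
#include <linux/dma-mapping.h>

/* Illustrative helper: average dma_unmap_single() latency in ns over 'loops'
 * iterations. Error checking (dma_mapping_error) omitted for brevity. */
static u64 time_unmap_ns(struct device *dev, void *buf, int size, int loops)
{
	u64 total = 0;
	int i;

	for (i = 0; i < loops; i++) {
		dma_addr_t dma_addr = dma_map_single(dev, buf, size, DMA_TO_DEVICE);
		ktime_t start = ktime_get();

		dma_unmap_single(dev, dma_addr, size, DMA_TO_DEVICE);
		total += ktime_to_ns(ktime_sub(ktime_get(), start));
	}

	return total / loops;
}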


Thanks
Barry

> 
> So we tried the latest perf NMI 

[PATCH v1] iommu/tegra-smmu: Add missing locks around mapping operations

2020-05-24 Thread Dmitry Osipenko
The mapping operations of the Tegra SMMU driver are subject to a race
condition because the SMMU Address Space isn't allocated and freed
atomically, while it should be. This patch makes the mapping operations
atomic; it fixes an accidentally released Host1x Address Space problem
which happens while running multiple graphics tests in parallel on
Tegra30, i.e. with multiple threads racing with each other in the
Host1x submission and completion code paths, performing IOVA mappings
and unmappings in parallel.

Cc: 
Signed-off-by: Dmitry Osipenko 
---
 drivers/iommu/tegra-smmu.c | 43 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 7426b7666e2b..4f956a797838 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -12,6 +12,7 @@
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/dma-mapping.h>
 
 #include <soc/tegra/ahb.h>
@@ -49,6 +50,7 @@ struct tegra_smmu_as {
 	struct iommu_domain domain;
 	struct tegra_smmu *smmu;
 	unsigned int use_count;
+	spinlock_t lock;
 	u32 *count;
 	struct page **pts;
 	struct page *pd;
@@ -308,6 +310,8 @@ static struct iommu_domain *tegra_smmu_domain_alloc(unsigned type)
 		return NULL;
 	}
 
+	spin_lock_init(&as->lock);
+
 	/* setup aperture */
 	as->domain.geometry.aperture_start = 0;
 	as->domain.geometry.aperture_end = 0xffffffff;
@@ -578,7 +582,7 @@ static u32 *as_get_pte(struct tegra_smmu_as *as, dma_addr_t iova,
 	struct page *page;
 	dma_addr_t dma;
 
-	page = alloc_page(GFP_KERNEL | __GFP_DMA | __GFP_ZERO);
+	page = alloc_page(GFP_ATOMIC | __GFP_DMA | __GFP_ZERO);
 	if (!page)
 		return NULL;
 
@@ -655,8 +659,9 @@ static void tegra_smmu_set_pte(struct tegra_smmu_as *as, unsigned long iova,
 	smmu_flush(smmu);
 }
 
-static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
-			  phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+static int
+tegra_smmu_map_locked(struct iommu_domain *domain, unsigned long iova,
+		      phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
 {
 	struct tegra_smmu_as *as = to_smmu_as(domain);
 	dma_addr_t pte_dma;
@@ -685,8 +690,9 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
 	return 0;
 }
 
-static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
-			       size_t size, struct iommu_iotlb_gather *gather)
+static size_t
+tegra_smmu_unmap_locked(struct iommu_domain *domain, unsigned long iova,
+			size_t size, struct iommu_iotlb_gather *gather)
 {
 	struct tegra_smmu_as *as = to_smmu_as(domain);
 	dma_addr_t pte_dma;
@@ -702,6 +708,33 @@ static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 	return size;
 }
 
+static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova,
+			  phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&as->lock, flags);
+	ret = tegra_smmu_map_locked(domain, iova, paddr, size, prot, gfp);
+	spin_unlock_irqrestore(&as->lock, flags);
+
+	return ret;
+}
+
+static size_t tegra_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
+			       size_t size, struct iommu_iotlb_gather *gather)
+{
+	struct tegra_smmu_as *as = to_smmu_as(domain);
+	unsigned long flags;
+
+	spin_lock_irqsave(&as->lock, flags);
+	size = tegra_smmu_unmap_locked(domain, iova, size, gather);
+	spin_unlock_irqrestore(&as->lock, flags);
+
+	return size;
+}
+
 static phys_addr_t tegra_smmu_iova_to_phys(struct iommu_domain *domain,
 					   dma_addr_t iova)
 {
-- 
2.26.0
