Re: [PATCH v4 2/3] mm: Add support for kmem caches in DMA32 zone

2018-12-04 Thread Nicolas Boichat
On Wed, Dec 5, 2018 at 3:25 PM Wei Yang  wrote:
>
> On Wed, Dec 05, 2018 at 01:48:27PM +0800, Nicolas Boichat wrote:
> >In some cases (e.g. IOMMU ARMv7s page allocator), we need to allocate
> >data structures smaller than a page with GFP_DMA32 flag.
> >
> >This change makes it possible to create a custom cache in DMA32 zone
> >using kmem_cache_create, then allocate memory using kmem_cache_alloc.
> >
> >We do not create a DMA32 kmalloc cache array, as there are currently
> >no users of kmalloc(..., GFP_DMA32). The new test in check_slab_flags
> >ensures that such calls still fail (as they do before this change).
> >
> >Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
> >Signed-off-by: Nicolas Boichat 
> >---
> >
> >Changes since v2:
> > - Clarified commit message
> > - Add entry in sysfs-kernel-slab to document the new sysfs file
> >
> >(v3 used the page_frag approach)
> >
> >Documentation/ABI/testing/sysfs-kernel-slab |  9 +
> > include/linux/slab.h|  2 ++
> > mm/internal.h   |  8 ++--
> > mm/slab.c   |  4 +++-
> > mm/slab.h   |  3 ++-
> > mm/slab_common.c|  2 +-
> > mm/slub.c   | 18 +-
> > 7 files changed, 40 insertions(+), 6 deletions(-)
> >
> >diff --git a/Documentation/ABI/testing/sysfs-kernel-slab 
> >b/Documentation/ABI/testing/sysfs-kernel-slab
> >index 29601d93a1c2ea..d742c6cfdffbe9 100644
> >--- a/Documentation/ABI/testing/sysfs-kernel-slab
> >+++ b/Documentation/ABI/testing/sysfs-kernel-slab
> >@@ -106,6 +106,15 @@ Description:
> >   are from ZONE_DMA.
> >   Available when CONFIG_ZONE_DMA is enabled.
> >
> >+What: /sys/kernel/slab/cache/cache_dma32
> >+Date: December 2018
> >+KernelVersion:4.21
> >+Contact:  Nicolas Boichat 
> >+Description:
> >+  The cache_dma32 file is read-only and specifies whether 
> >objects
> >+  are from ZONE_DMA32.
> >+  Available when CONFIG_ZONE_DMA32 is enabled.
> >+
> > What: /sys/kernel/slab/cache/cpu_slabs
> > Date: May 2007
> > KernelVersion:2.6.22
> >diff --git a/include/linux/slab.h b/include/linux/slab.h
> >index 11b45f7ae4057c..9449b19c5f107a 100644
> >--- a/include/linux/slab.h
> >+++ b/include/linux/slab.h
> >@@ -32,6 +32,8 @@
> > #define SLAB_HWCACHE_ALIGN((slab_flags_t __force)0x2000U)
> > /* Use GFP_DMA memory */
> > #define SLAB_CACHE_DMA((slab_flags_t __force)0x4000U)
> >+/* Use GFP_DMA32 memory */
> >+#define SLAB_CACHE_DMA32  ((slab_flags_t __force)0x8000U)
> > /* DEBUG: Store the last owner for bug hunting */
> > #define SLAB_STORE_USER   ((slab_flags_t __force)0x0001U)
> > /* Panic if kmem_cache_create() fails */
> >diff --git a/mm/internal.h b/mm/internal.h
> >index a2ee82a0cd44ae..fd244ad716eaf8 100644
> >--- a/mm/internal.h
> >+++ b/mm/internal.h
> >@@ -14,6 +14,7 @@
> > #include 
> > #include 
> > #include 
> >+#include 
> > #include 
> >
> > /*
> >@@ -34,9 +35,12 @@
> > #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
> >
> > /* Check for flags that must not be used with a slab allocator */
> >-static inline gfp_t check_slab_flags(gfp_t flags)
> >+static inline gfp_t check_slab_flags(gfp_t flags, slab_flags_t slab_flags)
> > {
> >-  gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK;
> >+  gfp_t bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK;
> >+
> >+  if (!IS_ENABLED(CONFIG_ZONE_DMA32) || !(slab_flags & 
> >SLAB_CACHE_DMA32))
> >+  bug_mask |= __GFP_DMA32;
>
> The original version doesn't check CONFIG_ZONE_DMA32.
>
> Do we need to add this condition here?
> Could we just decide the bug_mask based on slab_flags?

We can. The reason I did it this way is that when we don't have
CONFIG_ZONE_DMA32, the compiler should be able to simplify to:

bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK;
if (true || ..) => if (true)
   bug_mask |= __GFP_DMA32;

Then just
bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK | __GFP_DMA32;

And since the function is inline, slab_flags would not even need to be
accessed at all.
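
Concretely, a sketch of what the helper folds down to with CONFIG_ZONE_DMA32
disabled (illustration only, not a hunk from the patch):

/* IS_ENABLED(CONFIG_ZONE_DMA32) is a compile-time 0 here, so the branch is
 * always taken, __GFP_DMA32 is merged into bug_mask at compile time, and
 * slab_flags is never read. */
static inline gfp_t check_slab_flags(gfp_t flags, slab_flags_t slab_flags)
{
	gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK;

	if (unlikely(flags & bug_mask)) {
		gfp_t invalid_mask = flags & bug_mask;

		flags &= ~bug_mask;
		pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
			invalid_mask, &invalid_mask, flags, &flags);
		dump_stack();
	}

	return flags;
}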

> >
> >   if (unlikely(flags & bug_mask)) {
> >   gfp_t invalid_mask = flags & bug_mask;
> >diff --git a/mm/slab.c b/mm/slab.c
> >index 65a774f05e7836..2fd3b9a996cbe6 100644
> >--- a/mm/slab.c
> >+++ b/mm/slab.c
> >@@ -2109,6 +2109,8 @@ int __kmem_cache_create(struct kmem_cache *cachep, 
> >slab_flags_t flags)
> >   cachep->allocflags = __GFP_COMP;
> >   if (flags & SLAB_CACHE_DMA)
> >   cachep->allocflags |= GFP_DMA;
> >+  if (flags & SLAB_CACHE_DMA32)
> >+  cachep->allocflags |= GFP_DMA32;
> >   if (flags & SLAB_RECLAIM_ACCOUNT)
> >   cachep->allocflags |= __GFP_RECLAIMABLE;
> >   cachep->size = size;
> >@@ -2643,7 +2645,7 @@ static struct page 

Re: [PATCH v4 2/3] mm: Add support for kmem caches in DMA32 zone

2018-12-04 Thread Wei Yang
On Wed, Dec 05, 2018 at 01:48:27PM +0800, Nicolas Boichat wrote:
>In some cases (e.g. IOMMU ARMv7s page allocator), we need to allocate
>data structures smaller than a page with GFP_DMA32 flag.
>
>This change makes it possible to create a custom cache in DMA32 zone
>using kmem_cache_create, then allocate memory using kmem_cache_alloc.
>
>We do not create a DMA32 kmalloc cache array, as there are currently
>no users of kmalloc(..., GFP_DMA32). The new test in check_slab_flags
>ensures that such calls still fail (as they do before this change).
>
>Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
>Signed-off-by: Nicolas Boichat 
>---
>
>Changes since v2:
> - Clarified commit message
> - Add entry in sysfs-kernel-slab to document the new sysfs file
>
>(v3 used the page_frag approach)
>
>Documentation/ABI/testing/sysfs-kernel-slab |  9 +
> include/linux/slab.h|  2 ++
> mm/internal.h   |  8 ++--
> mm/slab.c   |  4 +++-
> mm/slab.h   |  3 ++-
> mm/slab_common.c|  2 +-
> mm/slub.c   | 18 +-
> 7 files changed, 40 insertions(+), 6 deletions(-)
>
>diff --git a/Documentation/ABI/testing/sysfs-kernel-slab 
>b/Documentation/ABI/testing/sysfs-kernel-slab
>index 29601d93a1c2ea..d742c6cfdffbe9 100644
>--- a/Documentation/ABI/testing/sysfs-kernel-slab
>+++ b/Documentation/ABI/testing/sysfs-kernel-slab
>@@ -106,6 +106,15 @@ Description:
>   are from ZONE_DMA.
>   Available when CONFIG_ZONE_DMA is enabled.
> 
>+What: /sys/kernel/slab/cache/cache_dma32
>+Date: December 2018
>+KernelVersion:4.21
>+Contact:  Nicolas Boichat 
>+Description:
>+  The cache_dma32 file is read-only and specifies whether objects
>+  are from ZONE_DMA32.
>+  Available when CONFIG_ZONE_DMA32 is enabled.
>+
> What: /sys/kernel/slab/cache/cpu_slabs
> Date: May 2007
> KernelVersion:2.6.22
>diff --git a/include/linux/slab.h b/include/linux/slab.h
>index 11b45f7ae4057c..9449b19c5f107a 100644
>--- a/include/linux/slab.h
>+++ b/include/linux/slab.h
>@@ -32,6 +32,8 @@
> #define SLAB_HWCACHE_ALIGN((slab_flags_t __force)0x2000U)
> /* Use GFP_DMA memory */
> #define SLAB_CACHE_DMA((slab_flags_t __force)0x4000U)
>+/* Use GFP_DMA32 memory */
>+#define SLAB_CACHE_DMA32  ((slab_flags_t __force)0x8000U)
> /* DEBUG: Store the last owner for bug hunting */
> #define SLAB_STORE_USER   ((slab_flags_t __force)0x0001U)
> /* Panic if kmem_cache_create() fails */
>diff --git a/mm/internal.h b/mm/internal.h
>index a2ee82a0cd44ae..fd244ad716eaf8 100644
>--- a/mm/internal.h
>+++ b/mm/internal.h
>@@ -14,6 +14,7 @@
> #include 
> #include 
> #include 
>+#include 
> #include 
> 
> /*
>@@ -34,9 +35,12 @@
> #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
> 
> /* Check for flags that must not be used with a slab allocator */
>-static inline gfp_t check_slab_flags(gfp_t flags)
>+static inline gfp_t check_slab_flags(gfp_t flags, slab_flags_t slab_flags)
> {
>-  gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK;
>+  gfp_t bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK;
>+
>+  if (!IS_ENABLED(CONFIG_ZONE_DMA32) || !(slab_flags & SLAB_CACHE_DMA32))
>+  bug_mask |= __GFP_DMA32;

The original version doesn't check CONFIG_ZONE_DMA32.

Do we need to add this condition here?
Could we just decide the bug_mask based on slab_flags?

> 
>   if (unlikely(flags & bug_mask)) {
>   gfp_t invalid_mask = flags & bug_mask;
>diff --git a/mm/slab.c b/mm/slab.c
>index 65a774f05e7836..2fd3b9a996cbe6 100644
>--- a/mm/slab.c
>+++ b/mm/slab.c
>@@ -2109,6 +2109,8 @@ int __kmem_cache_create(struct kmem_cache *cachep, 
>slab_flags_t flags)
>   cachep->allocflags = __GFP_COMP;
>   if (flags & SLAB_CACHE_DMA)
>   cachep->allocflags |= GFP_DMA;
>+  if (flags & SLAB_CACHE_DMA32)
>+  cachep->allocflags |= GFP_DMA32;
>   if (flags & SLAB_RECLAIM_ACCOUNT)
>   cachep->allocflags |= __GFP_RECLAIMABLE;
>   cachep->size = size;
>@@ -2643,7 +2645,7 @@ static struct page *cache_grow_begin(struct kmem_cache 
>*cachep,
>* Be lazy and only check for valid flags here,  keeping it out of the
>* critical path in kmem_cache_alloc().
>*/
>-  flags = check_slab_flags(flags);
>+  flags = check_slab_flags(flags, cachep->flags);
>   WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
>   local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
> 
>diff --git a/mm/slab.h b/mm/slab.h
>index 4190c24ef0e9df..fcf717e12f0a86 100644
>--- a/mm/slab.h
>+++ b/mm/slab.h
>@@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int 
>object_size,
> 
> 
> /* Legal flag mask for kmem_cache_create(), 

Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables

2018-12-04 Thread Nicolas Boichat
On Wed, Dec 5, 2018 at 10:04 AM Nicolas Boichat  wrote:
>
> On Tue, Dec 4, 2018 at 10:35 PM Vlastimil Babka  wrote:
> >
> > On 12/4/18 10:37 AM, Nicolas Boichat wrote:
> > > On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat  
> > > wrote:
> > >>
> > >> This is a follow-up to the discussion in [1], to make sure that the page
> > >> tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit
> > >> physical address space.
> > >>
> > >> [1] 
> > >> https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html
> > >
> > > Hi everyone,
> > >
> > > Let's try to summarize here.
> > >
> > > First, we confirmed that this is a regression, and IOMMU errors happen
> > > on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13).
> > > The issue most likely starts from ad67f5a6545f ("arm64: replace
> > > ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number
> > > of Mediatek platforms (and maybe others?).
> > >
> > > We have a few options here:
> > > 1. This series [2], that adds support for GFP_DMA32 slab caches,
> > > _without_ adding kmalloc caches (since there are no users of
> > > kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on
> > > the 3 patches, and AFAICT this solution works fine.
> > > 2. genalloc. That works, but unless we preallocate 4MB for L2 tables
> > > (which is wasteful as we usually only need a handful of L2 tables),
> > > we'll need changes in the core (use GFP_ATOMIC) to allow allocating on
> > > demand, and as it stands we'd have no way to shrink the allocation.
> > > 3. page_frag [3]. That works fine, and the code is quite simple. One
> > > drawback is that fragments in partially freed pages cannot be reused
> > > (from limited experiments, I see that IOMMU L2 tables are rarely
> > > freed, so it's unlikely a whole page would get freed). But given the
> > > low number of L2 tables, maybe we can live with that.
> > >
> > > I think 2 is out. Any preference between 1 and 3? I think 1 makes
> > > better use of the memory, so that'd be my preference. But I'm probably
> > > missing something.
> >
> > I would prefer 1 as well. IIRC you already confirmed that alignment
> > requirements are not broken for custom kmem caches even in presence of
> > SLUB debug options (and I would say it's a bug to be fixed if they
> > weren't).
>
> > I just asked (and didn't get a reply I think) about your
> > ability to handle the GFP_ATOMIC allocation failures. They should be
> > rare when only single page allocations are needed for the kmem cache.
> > But in case they are not an option, then preallocating would be needed,
> > thus probably option 2.
>
> Oh, sorry, I missed your question.
>
> I don't have a full answer, but:
> - The allocations themselves are rare (I count a few 10s of L2 tables
> at most on my system, I assume we rarely have >100), and yes, we only
> need a single page, so the failures should be exceptional.
> - My change is probably not making anything worse: I assume that even
> with the current approach using GFP_DMA slab caches on older kernels,
> failures could potentially happen. I don't think we've seen those. If
> we are really concerned about this, maybe we'd need to modify
> mtk_iommu_map to not hold a spinlock (if that's possible), so we don't
> need to use GFP_ATOMIC. I suggest we just keep an eye on such issues,
> and address them if they show up (we can even revisit genalloc at that
> stage).
>
> Anyway, I'll clean up patches for 1 (mostly commit message changes
> based on the comments in the threads) and resend.

Done here: https://patchwork.kernel.org/cover/10713019/ .

> Thanks,
>
> > > [2] https://patchwork.kernel.org/cover/10677529/, 3 patches
> > > [3] https://patchwork.codeaurora.org/patch/671639/
> > >
> > > Thanks,
> > >
> > > Nicolas
> > >
> >


[PATCH v4 3/3] iommu/io-pgtable-arm-v7s: Request DMA32 memory, and improve debugging

2018-12-04 Thread Nicolas Boichat
IOMMUs using ARMv7 short-descriptor format require page tables
(level 1 and 2) to be allocated within the first 4GB of RAM, even
on 64-bit systems.

For level 1/2 pages, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32
is defined (e.g. on arm64 platforms).

For level 2 pages, allocate a slab cache in SLAB_CACHE_DMA32.

Also, print an error when the physical address does not fit in
32-bit, to make debugging easier in the future.
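
(Illustration only, not part of the patch: arm_v7s_iopte is a u32, so a
round-trip cast is enough to detect a table that landed above 4GB.)

/* e.g. phys = 0x123456000 truncates to 0x23456000, so the comparison fails
 * and the new dev_err() below reports it instead of silently installing a
 * bogus PTE. */
static bool arm_v7s_table_fits_pte(phys_addr_t phys)
{
	return phys == (arm_v7s_iopte)phys;
}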

Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
Signed-off-by: Nicolas Boichat 
---

Changes since v2:
 - Commit message

(v3 used the page_frag approach)

 drivers/iommu/io-pgtable-arm-v7s.c | 20 
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c 
b/drivers/iommu/io-pgtable-arm-v7s.c
index 445c3bde04800c..996f7b6d00b44a 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -161,6 +161,14 @@
 
 #define ARM_V7S_TCR_PD1BIT(5)
 
+#ifdef CONFIG_ZONE_DMA32
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32
+#define ARM_V7S_TABLE_SLAB_CACHE SLAB_CACHE_DMA32
+#else
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA
+#define ARM_V7S_TABLE_SLAB_CACHE SLAB_CACHE_DMA
+#endif
+
 typedef u32 arm_v7s_iopte;
 
 static bool selftest_running;
@@ -198,13 +206,17 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
void *table = NULL;
 
if (lvl == 1)
-   table = (void *)__get_dma_pages(__GFP_ZERO, get_order(size));
+   table = (void *)__get_free_pages(
+   __GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size));
else if (lvl == 2)
-   table = kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA);
+   table = kmem_cache_zalloc(data->l2_tables,
+ gfp | ARM_V7S_TABLE_GFP_DMA);
phys = virt_to_phys(table);
-   if (phys != (arm_v7s_iopte)phys)
+   if (phys != (arm_v7s_iopte)phys) {
/* Doesn't fit in PTE */
+   dev_err(dev, "Page table does not fit in PTE: %pa", &phys);
goto out_free;
+   }
if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) {
dma = dma_map_single(dev, table, size, DMA_TO_DEVICE);
if (dma_mapping_error(dev, dma))
@@ -737,7 +749,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct 
io_pgtable_cfg *cfg,
data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2",
ARM_V7S_TABLE_SIZE(2),
ARM_V7S_TABLE_SIZE(2),
-   SLAB_CACHE_DMA, NULL);
+   ARM_V7S_TABLE_SLAB_CACHE, NULL);
if (!data->l2_tables)
goto out_free_data;
 
-- 
2.20.0.rc1.387.gf8505762e3-goog



[PATCH v4 1/3] mm: slab/slub: Add check_slab_flags function to check for valid flags

2018-12-04 Thread Nicolas Boichat
Remove duplicated code between slab and slub; this will also make it
easier to extend the check in the next commits.

Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
Signed-off-by: Nicolas Boichat 
---
 mm/internal.h | 18 --
 mm/slab.c |  8 +---
 mm/slub.c |  8 +---
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index f0c9ccde3bdb9e..a2ee82a0cd44ae 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -33,8 +33,22 @@
 /* Control allocation cpuset and node placement constraints */
 #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
 
-/* Do not use these with a slab allocator */
-#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+/* Check for flags that must not be used with a slab allocator */
+static inline gfp_t check_slab_flags(gfp_t flags)
+{
+   gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK;
+
+   if (unlikely(flags & bug_mask)) {
+   gfp_t invalid_mask = flags & bug_mask;
+
+   flags &= ~bug_mask;
+   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+   invalid_mask, &invalid_mask, flags, &flags);
+   dump_stack();
+   }
+
+   return flags;
+}
 
 void page_writeback_init(void);
 
diff --git a/mm/slab.c b/mm/slab.c
index 73fe23e649c91a..65a774f05e7836 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2643,13 +2643,7 @@ static struct page *cache_grow_begin(struct kmem_cache 
*cachep,
 * Be lazy and only check for valid flags here,  keeping it out of the
 * critical path in kmem_cache_alloc().
 */
-   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-   flags &= ~GFP_SLAB_BUG_MASK;
-   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
-   invalid_mask, &invalid_mask, flags, &flags);
-   dump_stack();
-   }
+   flags = check_slab_flags(flags);
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
diff --git a/mm/slub.c b/mm/slub.c
index c229a9b7dd5448..21a3f6866da472 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1685,13 +1685,7 @@ static struct page *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
-   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
-   gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
-   flags &= ~GFP_SLAB_BUG_MASK;
-   pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
-   invalid_mask, &invalid_mask, flags, &flags);
-   dump_stack();
-   }
+   flags = check_slab_flags(flags);
 
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
-- 
2.20.0.rc1.387.gf8505762e3-goog



[PATCH v4 2/3] mm: Add support for kmem caches in DMA32 zone

2018-12-04 Thread Nicolas Boichat
In some cases (e.g. IOMMU ARMv7s page allocator), we need to allocate
data structures smaller than a page with GFP_DMA32 flag.

This change makes it possible to create a custom cache in DMA32 zone
using kmem_cache_create, then allocate memory using kmem_cache_alloc.

We do not create a DMA32 kmalloc cache array, as there are currently
no users of kmalloc(..., GFP_DMA32). The new test in check_slab_flags
ensures that such calls still fail (as they do before this change).

Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
Signed-off-by: Nicolas Boichat 
---

Changes since v2:
 - Clarified commit message
 - Add entry in sysfs-kernel-slab to document the new sysfs file

(v3 used the page_frag approach)

Documentation/ABI/testing/sysfs-kernel-slab |  9 +
 include/linux/slab.h|  2 ++
 mm/internal.h   |  8 ++--
 mm/slab.c   |  4 +++-
 mm/slab.h   |  3 ++-
 mm/slab_common.c|  2 +-
 mm/slub.c   | 18 +-
 7 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-kernel-slab 
b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d93a1c2ea..d742c6cfdffbe9 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -106,6 +106,15 @@ Description:
are from ZONE_DMA.
Available when CONFIG_ZONE_DMA is enabled.
 
+What:  /sys/kernel/slab/cache/cache_dma32
+Date:  December 2018
+KernelVersion: 4.21
+Contact:   Nicolas Boichat 
+Description:
+   The cache_dma32 file is read-only and specifies whether objects
+   are from ZONE_DMA32.
+   Available when CONFIG_ZONE_DMA32 is enabled.
+
 What:  /sys/kernel/slab/cache/cpu_slabs
 Date:  May 2007
 KernelVersion: 2.6.22
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 11b45f7ae4057c..9449b19c5f107a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -32,6 +32,8 @@
 #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x2000U)
 /* Use GFP_DMA memory */
 #define SLAB_CACHE_DMA ((slab_flags_t __force)0x4000U)
+/* Use GFP_DMA32 memory */
+#define SLAB_CACHE_DMA32   ((slab_flags_t __force)0x8000U)
 /* DEBUG: Store the last owner for bug hunting */
 #define SLAB_STORE_USER((slab_flags_t __force)0x0001U)
 /* Panic if kmem_cache_create() fails */
diff --git a/mm/internal.h b/mm/internal.h
index a2ee82a0cd44ae..fd244ad716eaf8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /*
@@ -34,9 +35,12 @@
 #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
 
 /* Check for flags that must not be used with a slab allocator */
-static inline gfp_t check_slab_flags(gfp_t flags)
+static inline gfp_t check_slab_flags(gfp_t flags, slab_flags_t slab_flags)
 {
-   gfp_t bug_mask = __GFP_DMA32 | __GFP_HIGHMEM | ~__GFP_BITS_MASK;
+   gfp_t bug_mask = __GFP_HIGHMEM | ~__GFP_BITS_MASK;
+
+   if (!IS_ENABLED(CONFIG_ZONE_DMA32) || !(slab_flags & SLAB_CACHE_DMA32))
+   bug_mask |= __GFP_DMA32;
 
if (unlikely(flags & bug_mask)) {
gfp_t invalid_mask = flags & bug_mask;
diff --git a/mm/slab.c b/mm/slab.c
index 65a774f05e7836..2fd3b9a996cbe6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2109,6 +2109,8 @@ int __kmem_cache_create(struct kmem_cache *cachep, 
slab_flags_t flags)
cachep->allocflags = __GFP_COMP;
if (flags & SLAB_CACHE_DMA)
cachep->allocflags |= GFP_DMA;
+   if (flags & SLAB_CACHE_DMA32)
+   cachep->allocflags |= GFP_DMA32;
if (flags & SLAB_RECLAIM_ACCOUNT)
cachep->allocflags |= __GFP_RECLAIMABLE;
cachep->size = size;
@@ -2643,7 +2645,7 @@ static struct page *cache_grow_begin(struct kmem_cache 
*cachep,
 * Be lazy and only check for valid flags here,  keeping it out of the
 * critical path in kmem_cache_alloc().
 */
-   flags = check_slab_flags(flags);
+   flags = check_slab_flags(flags, cachep->flags);
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
diff --git a/mm/slab.h b/mm/slab.h
index 4190c24ef0e9df..fcf717e12f0a86 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int 
object_size,
 
 
 /* Legal flag mask for kmem_cache_create(), for various configurations */
-#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
+SLAB_CACHE_DMA32 | SLAB_PANIC | \
 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
 
 #if 

[PATCH v4 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables

2018-12-04 Thread Nicolas Boichat
This is a follow-up to the discussion in [1], [2].

IOMMUs using ARMv7 short-descriptor format require page tables
(level 1 and 2) to be allocated within the first 4GB of RAM, even
on 64-bit systems.

For L1 tables that are bigger than a page, we can just use __get_free_pages
with GFP_DMA32 (on arm64 systems only, arm would still use GFP_DMA).

For L2 tables that only take 1KB, it would be a waste to allocate a full
page, so we considered 3 approaches:
 1. This series, adding support for GFP_DMA32 slab caches.
 2. genalloc, which requires pre-allocating the maximum number of L2 page
tables (4096, so 4MB of memory).
 3. page_frag, which is not very memory-efficient as it is unable to reuse
freed fragments until the whole page is freed. [3]

This series is the most memory-efficient approach.

[1] https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html
[2] https://lists.linuxfoundation.org/pipermail/iommu/2018-December/031696.html
[3] https://patchwork.codeaurora.org/patch/671639/
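
For readers skimming the archive, a minimal usage sketch of approach 1; the
function and cache names are made up for illustration, only SLAB_CACHE_DMA32
and the standard slab API come from this series:

#include <linux/slab.h>

static struct kmem_cache *example_l2_cache;

static int example_init(void)
{
	/* Objects from this cache are guaranteed to come from ZONE_DMA32. */
	example_l2_cache = kmem_cache_create("example_l2", 1024, 1024,
					     SLAB_CACHE_DMA32, NULL);
	return example_l2_cache ? 0 : -ENOMEM;
}

static void *example_alloc_table(gfp_t gfp)
{
	/* GFP_DMA32 is only accepted because the cache was created with
	 * SLAB_CACHE_DMA32; plain kmalloc(..., GFP_DMA32) still fails. */
	return kmem_cache_zalloc(example_l2_cache, gfp | GFP_DMA32);
}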

Changes since v1:
 - Add support for SLAB_CACHE_DMA32 in slab and slub (patches 1/2)
 - iommu/io-pgtable-arm-v7s (patch 3):
   - Changed approach to use SLAB_CACHE_DMA32 added by the previous
 commit.
   - Use DMA or DMA32 depending on the architecture (DMA for arm,
 DMA32 for arm64).

Changes since v2:
 - Reworded and expanded commit messages
 - Added cache_dma32 documentation in PATCH 2/3.

v3 used the page_frag approach, see [3].

Nicolas Boichat (3):
  mm: slab/slub: Add check_slab_flags function to check for valid flags
  mm: Add support for kmem caches in DMA32 zone
  iommu/io-pgtable-arm-v7s: Request DMA32 memory, and improve debugging

 Documentation/ABI/testing/sysfs-kernel-slab |  9 
 drivers/iommu/io-pgtable-arm-v7s.c  | 20 +
 include/linux/slab.h|  2 ++
 mm/internal.h   | 22 +--
 mm/slab.c   | 10 +++--
 mm/slab.h   |  3 ++-
 mm/slab_common.c|  2 +-
 mm/slub.c   | 24 +++--
 8 files changed, 70 insertions(+), 22 deletions(-)

-- 
2.20.0.rc1.387.gf8505762e3-goog



Re: [PATCH 1/4] dma-debug: Use pr_fmt()

2018-12-04 Thread Joe Perches
On Mon, 2018-12-03 at 17:28 +, Robin Murphy wrote:
> Use pr_fmt() to generate the "DMA-API: " prefix consistently. This
> results in it being added to a couple of pr_*() messages which were
> missing it before, and for the err_printk() calls moves it to the actual
> start of the message instead of somewhere in the middle.
> 
> Signed-off-by: Robin Murphy 
> ---
> 
> I chose not to refactor the existing split strings for minimal churn here.
> 
>  kernel/dma/debug.c | 74 --
>  1 file changed, 38 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
> index 231ca4628062..91b84140e4a5 100644
> --- a/kernel/dma/debug.c
> +++ b/kernel/dma/debug.c
> @@ -17,6 +17,8 @@
>   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
>   */
>  
> +#define pr_fmt(fmt)  "DMA-API: " fmt
> +
>  #include 
>  #include 
>  #include 
> @@ -234,7 +236,7 @@ static bool driver_filter(struct device *dev)
>   error_count += 1;   \
>   if (driver_filter(dev) &&   \
>   (show_all_errors || show_num_errors > 0)) { \
> - WARN(1, "%s %s: " format,   \
> + WARN(1, pr_fmt("%s %s: ") format,   \
>dev ? dev_driver_string(dev) : "NULL", \
>dev ? dev_name(dev) : "NULL", ## arg); \
>   dump_entry_trace(entry);\

I think converting this WARN to

dev_err(dev, format, ##__VA_ARGS__);
dump_stack();

would look better and be more intelligible.

Perhaps add a #define for dev_fmt if really necessary.
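
A rough sketch of that suggestion, untested and only meant to show the shape
(dev_err() already copes with a NULL dev, and dev_fmt() would supply the
prefix):

/* Like pr_fmt(), define before the dev_*() helpers are pulled in. */
#define dev_fmt(fmt)	"DMA-API: " fmt

#define err_printk(dev, entry, format, arg...)			\
	do {							\
		error_count += 1;				\
		if (driver_filter(dev) &&			\
		    (show_all_errors || show_num_errors > 0)) {	\
			dev_err(dev, format, ## arg);		\
			dump_stack();				\
			dump_entry_trace(entry);		\
		}						\
		if (!show_all_errors && show_num_errors > 0)	\
			show_num_errors -= 1;			\
	} while (0)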




Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables

2018-12-04 Thread Nicolas Boichat
On Tue, Dec 4, 2018 at 10:35 PM Vlastimil Babka  wrote:
>
> On 12/4/18 10:37 AM, Nicolas Boichat wrote:
> > On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat  
> > wrote:
> >>
> >> This is a follow-up to the discussion in [1], to make sure that the page
> >> tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit
> >> physical address space.
> >>
> >> [1] 
> >> https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html
> >
> > Hi everyone,
> >
> > Let's try to summarize here.
> >
> > First, we confirmed that this is a regression, and IOMMU errors happen
> > on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13).
> > The issue most likely starts from ad67f5a6545f ("arm64: replace
> > ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number
> > of Mediatek platforms (and maybe others?).
> >
> > We have a few options here:
> > 1. This series [2], that adds support for GFP_DMA32 slab caches,
> > _without_ adding kmalloc caches (since there are no users of
> > kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on
> > the 3 patches, and AFAICT this solution works fine.
> > 2. genalloc. That works, but unless we preallocate 4MB for L2 tables
> > (which is wasteful as we usually only need a handful of L2 tables),
> > we'll need changes in the core (use GFP_ATOMIC) to allow allocating on
> > demand, and as it stands we'd have no way to shrink the allocation.
> > 3. page_frag [3]. That works fine, and the code is quite simple. One
> > drawback is that fragments in partially freed pages cannot be reused
> > (from limited experiments, I see that IOMMU L2 tables are rarely
> > freed, so it's unlikely a whole page would get freed). But given the
> > low number of L2 tables, maybe we can live with that.
> >
> > I think 2 is out. Any preference between 1 and 3? I think 1 makes
> > better use of the memory, so that'd be my preference. But I'm probably
> > missing something.
>
> I would prefer 1 as well. IIRC you already confirmed that alignment
> requirements are not broken for custom kmem caches even in presence of
> SLUB debug options (and I would say it's a bug to be fixed if they
> weren't).

> I just asked (and didn't get a reply I think) about your
> ability to handle the GFP_ATOMIC allocation failures. They should be
> rare when only single page allocations are needed for the kmem cache.
> But in case they are not an option, then preallocating would be needed,
> thus probably option 2.

Oh, sorry, I missed your question.

I don't have a full answer, but:
- The allocations themselves are rare (I count a few 10s of L2 tables
at most on my system, I assume we rarely have >100), and yes, we only
need a single page, so the failures should be exceptional.
- My change is probably not making anything worse: I assume that even
with the current approach using GFP_DMA slab caches on older kernels,
failures could potentially happen. I don't think we've seen those. If
we are really concerned about this, maybe we'd need to modify
mtk_iommu_map to not hold a spinlock (if that's possible), so we don't
need to use GFP_ATOMIC. I suggest we just keep an eye on such issues,
and address them if they show up (we can even revisit genalloc at that
stage).

Anyway, I'll clean up patches for 1 (mostly commit message changes
based on the comments in the threads) and resend.

Thanks,

> > [2] https://patchwork.kernel.org/cover/10677529/, 3 patches
> > [3] https://patchwork.codeaurora.org/patch/671639/
> >
> > Thanks,
> >
> > Nicolas
> >
>


[PATCH] iommu: fix amd_iommu=force_isolation

2018-12-04 Thread Yu Zhao via iommu
The parameter is still there but it's ignored. We need to check its
value before deciding to go into passthrough mode for AMD IOMMU.

Fixes: aafd8ba0ca74 ("iommu/amd: Implement add_device and remove_device")

Signed-off-by: Yu Zhao 
---
 drivers/iommu/amd_iommu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 1167ff0416cf..3e4219e6cff0 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -2195,7 +2195,8 @@ static int amd_iommu_add_device(struct device *dev)
 
BUG_ON(!dev_data);
 
-   if (iommu_pass_through || dev_data->iommu_v2)
+   if (iommu_pass_through ||
+   (!amd_iommu_force_isolation && dev_data->iommu_v2))
iommu_request_dm_for_dev(dev);
 
/* Domains are initialized for this device - have a look what we ended 
up with */
-- 
2.20.0.rc1.387.gf8505762e3-goog



Re: [PATCH 01/23] dma-mapping: provide a generic DMA_MAPPING_ERROR

2018-12-04 Thread Christoph Hellwig
On Tue, Dec 04, 2018 at 04:41:34PM +, Robin Murphy wrote:
> I'd have been inclined to put the default check here, i.e.
>
> - return 0
> + return dma_addr == DMA_MAPPING_ERROR
>
> such that the callback retains full precedence and we don't have to deal 
> with the non-trivial removals immediately if it comes to it. Not that it 
> makes a vast difference though, so either way,

Ok, I've switched it around.
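
For readers following along, the shape being discussed is roughly this (a
sketch of the generic helper, not the final committed form):

static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);

	debug_dma_mapping_error(dev, dma_addr);
	/* Per-arch/IOMMU callbacks keep full precedence for now... */
	if (ops->mapping_error)
		return ops->mapping_error(dev, dma_addr);

	/* ...and everyone else falls back to the new generic sentinel. */
	return dma_addr == DMA_MAPPING_ERROR;
}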


Re: [PATCH] of/device: add blacklist for iommu dma_ops

2018-12-04 Thread Rob Herring
On Sat, Dec 1, 2018 at 10:54 AM Rob Clark  wrote:
>
> This solves a problem we see with drm/msm, caused by getting
> iommu_dma_ops while we attach our own domain and manage it directly at
> the iommu API level:
>
>   [0038] user address but active_mm is swapper
>   Internal error: Oops: 9605 [#1] PREEMPT SMP
>   Modules linked in:
>   CPU: 7 PID: 70 Comm: kworker/7:1 Tainted: GW 4.19.3 #90
>   Hardware name: xxx (DT)
>   Workqueue: events deferred_probe_work_func
>   pstate: 80c9 (Nzcv daif +PAN +UAO)
>   pc : iommu_dma_map_sg+0x7c/0x2c8
>   lr : iommu_dma_map_sg+0x40/0x2c8
>   sp : ff80095eb4f0
>   x29: ff80095eb4f0 x28: 
>   x27: ffc0f9431578 x26: 
>   x25:  x24: 0003
>   x23: 0001 x22: ffc0fa9ac010
>   x21:  x20: ffc0fab40980
>   x19: ffc0fab40980 x18: 0003
>   x17: 01c4 x16: 0007
>   x15: 000e x14: 
>   x13:  x12: 0028
>   x11: 0101010101010101 x10: 7f7f7f7f7f7f7f7f
>   x9 :  x8 : ffc0fab409a0
>   x7 :  x6 : 0002
>   x5 : 0001 x4 : 
>   x3 : 0001 x2 : 0002
>   x1 : ffc0f9431578 x0 : 
>   Process kworker/7:1 (pid: 70, stack limit = 0x17d08ffb)
>   Call trace:
>iommu_dma_map_sg+0x7c/0x2c8
>__iommu_map_sg_attrs+0x70/0x84
>get_pages+0x170/0x1e8
>msm_gem_get_iova+0x8c/0x128
>_msm_gem_kernel_new+0x6c/0xc8
>msm_gem_kernel_new+0x4c/0x58
>dsi_tx_buf_alloc_6g+0x4c/0x8c
>msm_dsi_host_modeset_init+0xc8/0x108
>msm_dsi_modeset_init+0x54/0x18c
>_dpu_kms_drm_obj_init+0x430/0x474
>dpu_kms_hw_init+0x5f8/0x6b4
>msm_drm_bind+0x360/0x6c8
>try_to_bring_up_master.part.7+0x28/0x70
>component_master_add_with_match+0xe8/0x124
>msm_pdev_probe+0x294/0x2b4
>platform_drv_probe+0x58/0xa4
>really_probe+0x150/0x294
>driver_probe_device+0xac/0xe8
>__device_attach_driver+0xa4/0xb4
>bus_for_each_drv+0x98/0xc8
>__device_attach+0xac/0x12c
>device_initial_probe+0x24/0x30
>bus_probe_device+0x38/0x98
>deferred_probe_work_func+0x78/0xa4
>process_one_work+0x24c/0x3dc
>worker_thread+0x280/0x360
>kthread+0x134/0x13c
>ret_from_fork+0x10/0x18
>   Code: d284 91000725 6b17039f 5400048a (f9401f40)
>   ---[ end trace f22dda57f3648e2c ]---
>   Kernel panic - not syncing: Fatal exception
>   SMP: stopping secondary CPUs
>   Kernel Offset: disabled
>   CPU features: 0x0,22802a18
>   Memory Limit: none
>
> The problem is that when drm/msm does its own iommu_attach_device(),
> now the domain returned by iommu_get_domain_for_dev() is drm/msm's
> domain, and it doesn't have domain->iova_cookie.
>
> We kind of avoided this problem prior to sdm845/dpu because the iommu
> was attached to the mdp node in dt, which is a child of the toplevel
> mdss node (which corresponds to the dev passed in dma_map_sg()).  But
> with sdm845, now the iommu is attached at the mdss level so we hit the
> iommu_dma_ops in dma_map_sg().
>
> But auto allocating/attaching a domain before the driver is probed was
> already a blocking problem for enabling per-context pagetables for the
> GPU.  This problem is also now solved with this patch.
>
> Fixes: 97890ba9289c dma-mapping: detect and configure IOMMU in 
> of_dma_configure
> Tested-by: Douglas Anderson 
> Signed-off-by: Rob Clark 
> ---
> This is an alternative/replacement for [1].  What it lacks in elegance
> it makes up for in practicality ;-)
>
> [1] https://patchwork.freedesktop.org/patch/264930/
>
>  drivers/of/device.c | 22 ++
>  1 file changed, 22 insertions(+)
>
> diff --git a/drivers/of/device.c b/drivers/of/device.c
> index 5957cd4fa262..15ffee00fb22 100644
> --- a/drivers/of/device.c
> +++ b/drivers/of/device.c
> @@ -72,6 +72,14 @@ int of_device_add(struct platform_device *ofdev)
> return device_add(&ofdev->dev);
>  }
>
> +static const struct of_device_id iommu_blacklist[] = {
> +   { .compatible = "qcom,mdp4" },
> +   { .compatible = "qcom,mdss" },
> +   { .compatible = "qcom,sdm845-mdss" },
> +   { .compatible = "qcom,adreno" },
> +   {}
> +};

Not completely clear to whether this is still needed or not, but this
really won't scale. Why can't the driver for these devices override
whatever has been setup by default?

Rob
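
For context, a hypothetical illustration of how a compatible blacklist like
the one quoted above could be consulted before installing the IOMMU dma_ops;
the hunk in the patch that actually does this is not shown here:

/* Hypothetical helper, e.g. called from of_dma_configure(): devices that
 * manage their own IOMMU domain directly skip the default dma_ops setup. */
static bool of_iommu_dma_ops_blacklisted(struct device *dev)
{
	return dev->of_node &&
	       of_match_node(iommu_blacklist, dev->of_node);
}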


Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Andy Shevchenko
On Tue, Dec 4, 2018 at 11:26 PM Tony Battersby  wrote:
>
> On 12/4/18 3:30 PM, Andy Shevchenko wrote:
> > On Tue, Dec 4, 2018 at 10:18 PM Matthew Wilcox  wrote:
> >> On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote:
> >>> Also, Andy had issues with the v2 series so it would be good to hear an
> >>> update from him?
> >> Certainly.
> > Hmm... I certainly forgot what was long time ago.
> > If I _was_ in Cc list and didn't comment, I'm fine with it.
> >
> v4 of the patchset is the same as v3 but with the last patch dropped.
> Andy had only one minor comment on v3 about the use of division in patch
> #8, to which I replied.  That was back on August 8.

Seems I'm fine with the last version then.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH V2] mm: Replace all open encodings for NUMA_NO_NODE

2018-12-04 Thread Lubomir Rintel
On Mon, 2018-11-26 at 17:56 +0530, Anshuman Khandual wrote:
> At present there are multiple places where an invalid node number is encoded
> as -1. Even though this is implicitly understood, it is always better to have
> macros for it. Replace these open encodings for an invalid node number with the
> global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like
> 'invalid node' from various places redirecting them to a common definition.
> 
> Signed-off-by: Anshuman Khandual 
> ---
> Changes in V2:
> 
> - Added inclusion of 'numa.h' header at various places per Andrew
> - Updated 'dev_to_node' to use NUMA_NO_NODE instead per Vinod
> 
> Changes in V1: (https://lkml.org/lkml/2018/11/23/485)
> 
> - Dropped OCFS2 changes per Joseph
> - Dropped media/video drivers changes per Hans
> 
> RFC - https://patchwork.kernel.org/patch/10678035/
> 
> Build tested this with multiple cross compiler options like alpha, sparc,
> arm64, x86, powerpc, powerpc64le etc with their default config which might
> not have compiled tested all driver related changes. I will appreciate
> folks giving this a test in their respective build environment.
> 
> All these places for replacement were found by running the following grep
> patterns on the entire kernel code. Please let me know if this might have
> missed some instances. This might also have replaced some false positives.
> I will appreciate suggestions, inputs and review.
> 
> 1. git grep "nid == -1"
> 2. git grep "node == -1"
> 3. git grep "nid = -1"
> 4. git grep "node = -1"
> 
>  arch/alpha/include/asm/topology.h |  3 ++-
>  arch/ia64/kernel/numa.c   |  2 +-
>  arch/ia64/mm/discontig.c  |  6 +++---
>  arch/ia64/sn/kernel/io_common.c   |  3 ++-
>  arch/powerpc/include/asm/pci-bridge.h |  3 ++-
>  arch/powerpc/kernel/paca.c|  3 ++-
>  arch/powerpc/kernel/pci-common.c  |  3 ++-
>  arch/powerpc/mm/numa.c| 14 +++---
>  arch/powerpc/platforms/powernv/memtrace.c |  5 +++--
>  arch/sparc/kernel/auxio_32.c  |  3 ++-
>  arch/sparc/kernel/pci_fire.c  |  3 ++-
>  arch/sparc/kernel/pci_schizo.c|  3 ++-
>  arch/sparc/kernel/pcic.c  |  7 ---
>  arch/sparc/kernel/psycho_common.c |  3 ++-
>  arch/sparc/kernel/sbus.c  |  3 ++-
>  arch/sparc/mm/init_64.c   |  6 +++---
>  arch/sparc/prom/init_32.c |  3 ++-
>  arch/sparc/prom/init_64.c |  5 +++--
>  arch/sparc/prom/tree_32.c | 13 +++--
>  arch/sparc/prom/tree_64.c | 19 ++-
>  arch/x86/include/asm/pci.h|  3 ++-
>  arch/x86/kernel/apic/x2apic_uv_x.c|  7 ---
>  arch/x86/kernel/smpboot.c |  3 ++-
>  arch/x86/platform/olpc/olpc_dt.c  | 17 +
>  drivers/block/mtip32xx/mtip32xx.c |  5 +++--
>  drivers/dma/dmaengine.c   |  4 +++-
>  drivers/infiniband/hw/hfi1/affinity.c |  3 ++-
>  drivers/infiniband/hw/hfi1/init.c |  3 ++-
>  drivers/iommu/dmar.c  |  5 +++--
>  drivers/iommu/intel-iommu.c   |  3 ++-
>  drivers/misc/sgi-xp/xpc_uv.c  |  3 ++-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  5 +++--
>  include/linux/device.h|  2 +-
>  init/init_task.c  |  3 ++-
>  kernel/kthread.c  |  3 ++-
>  kernel/sched/fair.c   | 15 ---
>  lib/cpumask.c |  3 ++-
>  mm/huge_memory.c  | 13 +++--
>  mm/hugetlb.c  |  3 ++-
>  mm/ksm.c  |  2 +-
>  mm/memory.c   |  7 ---
>  mm/memory_hotplug.c   | 12 ++--
>  mm/mempolicy.c|  2 +-
>  mm/page_alloc.c   |  4 ++--
>  mm/page_ext.c |  2 +-
>  net/core/pktgen.c |  3 ++-
>  net/qrtr/qrtr.c   |  3 ++-
>  tools/perf/bench/numa.c   |  6 +++---
>  48 files changed, 146 insertions(+), 108 deletions(-)

Thanks for the patch. It seems to me that you've got a fairly large
amount of it wrong though -- perhaps relying just on "git grep" alone
is not the best idea.

The diffstat is not all that big, it is entirely plausible to just
review each hunk manually: just do a "git show -U20" to get some
context.

You get a NAK from me for the OLPC DT part, but I think at least the
sparc/prom part also deals with device tree nodes and not NUMA nodes.

More inline.

Lubo

> 
> diff --git 

Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Tony Battersby
On 12/4/18 3:30 PM, Andy Shevchenko wrote:
> On Tue, Dec 4, 2018 at 10:18 PM Matthew Wilcox  wrote:
>> On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote:
>>> Also, Andy had issues with the v2 series so it would be good to hear an
>>> update from him?
>> Certainly.
> Hmm... I certainly forgot what was long time ago.
> If I _was_ in Cc list and didn't comment, I'm fine with it.
>
v4 of the patchset is the same as v3 but with the last patch dropped. 
Andy had only one minor comment on v3 about the use of division in patch
#8, to which I replied.  That was back on August 8.

Tony


Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Matthew Wilcox
On Tue, Dec 04, 2018 at 12:28:54PM -0800, Andrew Morton wrote:
> On Tue, 4 Dec 2018 12:18:01 -0800 Matthew Wilcox  wrote:
> > I only had a review comment on 8/9, which I then withdrew during my review
> > of patch 9/9.  Unless I missed something during my re-review of my 
> > responses?
> 
> And in 0/9, that 1.3MB allocation.
> 
> Maybe it's using kvmalloc, I didn't look.

Oh!  That's the mptsas driver doing something utterly awful.  Not the
fault of this patchset, in any way.


Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Andy Shevchenko
On Tue, Dec 4, 2018 at 10:18 PM Matthew Wilcox  wrote:
> On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote:

> > Also, Andy had issues with the v2 series so it would be good to hear an
> > update from him?
>
> Certainly.

Hmm... I certainly forgot what was long time ago.
If I _was_ in Cc list and didn't comment, I'm fine with it.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Andrew Morton
On Tue, 4 Dec 2018 12:18:01 -0800 Matthew Wilcox  wrote:

> On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote:
> > On Tue, 4 Dec 2018 11:22:34 -0500 Tony Battersby  
> > wrote:
> > 
> > > On 11/13/18 1:36 AM, Matthew Wilcox wrote:
> > > > On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote:
> > > >> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy
> > > >> driver corrupts DMA pool memory.
> > > >>
> > > >> Signed-off-by: Tony Battersby 
> > > > I like it!  Also, here you're using blks_per_alloc in a way which isn't
> > > > normally in the performance path, but might be with the right config
> > > > options.  With that, I withdraw my objection to the previous patch and
> > > >
> > > > Acked-by: Matthew Wilcox 
> > > >
> > > > Andrew, can you funnel these in through your tree?  If you'd rather not,
> > > > I don't mind stuffing them into a git tree and asking Linus to pull
> > > > for 4.21.
> > > >
> > > No reply for 3 weeks, so adding Andrew Morton to recipient list.
> > > 
> > > Andrew, I have 9 dmapool patches ready for merging in 4.21.  See Matthew
> > > Wilcox's request above.
> > > 
> > 
> > I'll take a look, but I see that this v4 series has several review
> > comments from Matthew which remain unresponded to.  Please attend to
> > that.
> 
> I only had a review comment on 8/9, which I then withdrew during my review
> of patch 9/9.  Unless I missed something during my re-review of my responses?

And in 0/9, that 1.3MB allocation.

Maybe it's using kvmalloc, I didn't look.




Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Matthew Wilcox
On Tue, Dec 04, 2018 at 12:14:43PM -0800, Andrew Morton wrote:
> On Tue, 4 Dec 2018 11:22:34 -0500 Tony Battersby  
> wrote:
> 
> > On 11/13/18 1:36 AM, Matthew Wilcox wrote:
> > > On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote:
> > >> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy
> > >> driver corrupts DMA pool memory.
> > >>
> > >> Signed-off-by: Tony Battersby 
> > > I like it!  Also, here you're using blks_per_alloc in a way which isn't
> > > normally in the performance path, but might be with the right config
> > > options.  With that, I withdraw my objection to the previous patch and
> > >
> > > Acked-by: Matthew Wilcox 
> > >
> > > Andrew, can you funnel these in through your tree?  If you'd rather not,
> > > I don't mind stuffing them into a git tree and asking Linus to pull
> > > for 4.21.
> > >
> > No reply for 3 weeks, so adding Andrew Morton to recipient list.
> > 
> > Andrew, I have 9 dmapool patches ready for merging in 4.21.  See Matthew
> > Wilcox's request above.
> > 
> 
> I'll take a look, but I see that this v4 series has several review
> comments from Matthew which remain unresponded to.  Please attend to
> that.

I only had a review comment on 8/9, which I then withdrew during my review
of patch 9/9.  Unless I missed something during my re-review of my responses?

> Also, Andy had issues with the v2 series so it would be good to hear an
> update from him?

Certainly.


Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Andrew Morton
On Tue, 4 Dec 2018 11:22:34 -0500 Tony Battersby  wrote:

> On 11/13/18 1:36 AM, Matthew Wilcox wrote:
> > On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote:
> >> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy
> >> driver corrupts DMA pool memory.
> >>
> >> Signed-off-by: Tony Battersby 
> > I like it!  Also, here you're using blks_per_alloc in a way which isn't
> > normally in the performance path, but might be with the right config
> > options.  With that, I withdraw my objection to the previous patch and
> >
> > Acked-by: Matthew Wilcox 
> >
> > Andrew, can you funnel these in through your tree?  If you'd rather not,
> > I don't mind stuffing them into a git tree and asking Linus to pull
> > for 4.21.
> >
> No reply for 3 weeks, so adding Andrew Morton to recipient list.
> 
> Andrew, I have 9 dmapool patches ready for merging in 4.21.  See Matthew
> Wilcox's request above.
> 

I'll take a look, but I see that this v4 series has several review
comments from Matthew which remain unresponded to.  Please attend to
that.

Also, Andy had issues with the v2 series so it would be good to hear an
update from him?



[PATCH] Fix typo. Change tlb_range_add to iotlb_range_add and tlb_sync to iotlb_sync

2018-12-04 Thread Tom Murphy
From: tom 

Someone forgot to update this comment.

Signed-off-by: Tom Murphy 
---
 include/linux/iommu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a1d28f42cb77..11db18b9ffe8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -168,8 +168,8 @@ struct iommu_resv_region {
  * @map: map a physically contiguous memory region to an iommu domain
  * @unmap: unmap a physically contiguous memory region from an iommu domain
  * @flush_tlb_all: Synchronously flush all hardware TLBs for this domain
- * @tlb_range_add: Add a given iova range to the flush queue for this domain
- * @tlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
+ * @iotlb_range_add: Add a given iova range to the flush queue for this domain
+ * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush
  *queue
  * @iova_to_phys: translate iova to physical address
  * @add_device: add device to iommu grouping
-- 
2.11.0



Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread John Garry






> In fact, having got this far in, what I'd quite like to do is to get rid
> of dma_debug_resize_entries() such that we never need to free things at
> all, since then we could allocate whole pages as blocks of entries to
> save on masses of individual slab allocations.
>
>> On a related topic, is it possible for the user to learn the total
>> entries created at a given point in time? If not, could we add a file
>> in the debugfs folder for this?

Hi Robin,

> I did get as far as pondering that you effectively lose track of
> utilisation once the low-water-mark of min_free_entries hits 0 and stays

I did try your patches and I noticed this, i.e. I was hitting the point
at which we start to alloc more entries.

> there - AFAICS it should be sufficient to just expose nr_total_entries
> as-is, since users can then calculate current and maximum occupancy
> based on *_free_entries. Does that sound reasonable to you?

Sounds ok. I am just interested to know roughly how many DMA buffers
we're using in our system.

> That also indirectly reminds me that this lot is documented in
> DMA_API.txt, so I should be good and update that too...

Thanks,
John

> Cheers,
> Robin.

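A minimal sketch of the debugfs exposure being discussed, assuming the
existing dma_debug_dent directory and nr_total_entries counter in
kernel/dma/debug.c (names as of v4.20; not an actual patch):

	/* e.g. in dma_debug_init(), next to the other debugfs files */
	debugfs_create_u32("nr_total_entries", 0444, dma_debug_dent,
			   &nr_total_entries);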





[PATCH 5/5] xhci: Use device_iommu_mapped()

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Replace the dev->iommu_group check with a proper function
call that better represents its purpose.

Cc: Mathias Nyman 
Signed-off-by: Joerg Roedel 
---
 drivers/usb/host/xhci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index c928dbbff881..5ab97e54d070 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -244,7 +244,7 @@ static void xhci_zero_64b_regs(struct xhci_hcd *xhci)
 * an iommu. Doing anything when there is no iommu is definitely
 * unsafe...
 */
-   if (!(xhci->quirks & XHCI_ZERO_64B_REGS) || !dev->iommu_group)
+   if (!(xhci->quirks & XHCI_ZERO_64B_REGS) || !device_iommu_mapped(dev))
return;
 
xhci_info(xhci, "Zeroing 64bit base registers, expecting fault\n");
-- 
2.17.1



[PATCH 4/5] powerpc/iommu: Use device_iommu_mapped()

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new function to replace the open-coded iommu check.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Russell Currey 
Cc: Sam Bobroff 
Signed-off-by: Joerg Roedel 
---
 arch/powerpc/kernel/eeh.c   | 2 +-
 arch/powerpc/kernel/iommu.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 6cae6b56ffd6..23fe62f11486 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1472,7 +1472,7 @@ static int dev_has_iommu_table(struct device *dev, void 
*data)
if (!dev)
return 0;
 
-   if (dev->iommu_group) {
+   if (device_iommu_mapped(dev)) {
*ppdev = pdev;
return 1;
}
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index f0dc680e659a..48d58d1dcac2 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1086,7 +1086,7 @@ int iommu_add_device(struct device *dev)
if (!device_is_registered(dev))
return -ENOENT;
 
-   if (dev->iommu_group) {
+   if (device_iommu_mapped(dev)) {
pr_debug("%s: Skipping device %s with iommu group %d\n",
 __func__, dev_name(dev),
 iommu_group_id(dev->iommu_group));
@@ -1129,7 +1129,7 @@ void iommu_del_device(struct device *dev)
 * and we needn't detach them from the associated
 * IOMMU groups
 */
-   if (!dev->iommu_group) {
+   if (!device_iommu_mapped(dev)) {
pr_debug("iommu_tce: skipping device %s with no tbl\n",
 dev_name(dev));
return;
@@ -1148,7 +1148,7 @@ static int tce_iommu_bus_notifier(struct notifier_block 
*nb,
 case BUS_NOTIFY_ADD_DEVICE:
 return iommu_add_device(dev);
 case BUS_NOTIFY_DEL_DEVICE:
-if (dev->iommu_group)
+if (device_iommu_mapped(dev))
 iommu_del_device(dev);
 return 0;
 default:
-- 
2.17.1



[PATCH 2/5] iommu/of: Use device_iommu_mapped()

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use device_iommu_mapped() to check whether the device is
already mapped by an IOMMU.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/of_iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index c5dd63072529..bfcf139503f0 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -220,7 +220,7 @@ const struct iommu_ops *of_iommu_configure(struct device 
*dev,
 * If we have reason to believe the IOMMU driver missed the initial
 * add_device callback for dev, replay it to get things in order.
 */
-   if (ops && ops->add_device && dev->bus && !dev->iommu_group)
+   if (ops && ops->add_device && dev->bus && !device_iommu_mapped(dev))
err = ops->add_device(dev);
 
/* Ignore all other errors apart from EPROBE_DEFER */
-- 
2.17.1



[PATCH 3/5] ACPI/IORT: Use device_iommu_mapped()

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Replace the iommu-check with a proper and readable function
call.

Cc: Lorenzo Pieralisi 
Signed-off-by: Joerg Roedel 
---
 drivers/acpi/arm64/iort.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 70f4e80b9246..0125c8eb9e81 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -805,7 +805,7 @@ static inline int iort_add_device_replay(const struct 
iommu_ops *ops,
 {
int err = 0;
 
-   if (ops->add_device && dev->bus && !dev->iommu_group)
+   if (ops->add_device && dev->bus && !device_iommu_mapped(dev))
err = ops->add_device(dev);
 
return err;
-- 
2.17.1



[PATCH 1/5] driver core: Introduce device_iommu_mapped() function

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Some places in the kernel check the iommu_group pointer in
'struct device' in order to find out whether a device is
mapped by an IOMMU.

This is not a good way to make this check, as the pointer will
be moved to 'struct dev_iommu_data'. This way of making the
check is also not very readable.

Introduce an explicit function to perform this check.

Cc: Greg Kroah-Hartman 
Signed-off-by: Joerg Roedel 
---
 include/linux/device.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/include/linux/device.h b/include/linux/device.h
index 1b25c7a43f4c..6cb4640b6160 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1058,6 +1058,16 @@ static inline struct device *kobj_to_dev(struct kobject 
*kobj)
return container_of(kobj, struct device, kobj);
 }
 
+/**
+ * device_iommu_mapped - Returns true when the device DMA is translated
+ *  by an IOMMU
+ * @dev: Device to perform the check on
+ */
+static inline bool device_iommu_mapped(struct device *dev)
+{
+   return (dev->iommu_group != NULL);
+}
+
 /* Get the wakeup routines, which depend on struct device */
 #include 
 
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 0/5] Introduce device_iommu_mapped() function

2018-12-04 Thread Joerg Roedel
Hi,

here is a patch-set to replace the dev->iommu_group checks
in the source tree by a proper function call. The pointer
checks mostly happen to check whether a device is managed by
an IOMMU. For that purpose a pointer check is not very
descriptive, so replace it by a function call that makes its
purpose readable.
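
A minimal before/after sketch of what the conversion looks like
(illustrative only -- do_something() is just a placeholder, not one of
the actual call-sites touched by this series):

	/* before */
	if (dev->iommu_group)
		do_something(dev);

	/* after */
	if (device_iommu_mapped(dev))
		do_something(dev);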

This also starts to remove direct access to the
dev->iommu_group pointer outside of iommu-code. This is
another move towards consolidating the various
iommu-related pointers in 'struct device' into one pointer
only.

Please review.

Thanks,

Joerg

Joerg Roedel (5):
  driver core: Introduce device_iommu_mapped() function
  iommu/of: Use device_iommu_mapped()
  ACPI/IORT: Use device_iommu_mapped()
  powerpc/iommu: Use device_iommu_mapped()
  xhci: Use device_iommu_mapped()

 arch/powerpc/kernel/eeh.c   |  2 +-
 arch/powerpc/kernel/iommu.c |  6 +++---
 drivers/acpi/arm64/iort.c   |  2 +-
 drivers/iommu/of_iommu.c|  2 +-
 drivers/usb/host/xhci.c |  2 +-
 include/linux/device.h  | 10 ++
 6 files changed, 17 insertions(+), 7 deletions(-)

-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread Robin Murphy

On 04/12/2018 16:30, John Garry wrote:

On 04/12/2018 13:11, Robin Murphy wrote:

Hi John,

On 03/12/2018 18:23, John Garry wrote:

On 03/12/2018 17:28, Robin Murphy wrote:
Certain drivers such as large multi-queue network adapters can use pools
of mapped DMA buffers larger than the default dma_debug_entry pool of
65536 entries, with the result that merely probing such a device can
cause DMA debug to disable itself during boot unless explicitly given an
appropriate "dma_debug_entries=..." option.

Developers trying to debug some other driver on such a system may not be
immediately aware of this, and at worst it can hide bugs if they fail to
realise that dma-debug has already disabled itself unexpectedly by the
time the code of interest gets to run. Even once they do realise, it can
be a bit of a pain to empirically determine a suitable number of
preallocated entries to configure without massively over-allocating.

There's really no need for such a static limit, though, since we can
quite easily expand the pool at runtime in those rare cases that the
preallocated entries are insufficient, which is arguably the least
surprising and most useful behaviour.


Hi Robin,

Do you have an idea on shrinking the pool again when the culprit
driver is removed, i.e. we have so many unused debug entries now
available?


I honestly don't believe it's worth the complication. This is a
development feature with significant overheads already, so there's not
an awful lot to gain by trying to optimise memory usage. If a system can
ever load a driver that makes hundreds of thousands of simultaneous
mappings, it can almost certainly spare 20-odd megabytes of RAM for the
corresponding debug entries in perpetuity. Sure, it does mean you'd need
to reboot to recover memory from a major leak, but that's mostly true of
the current behaviour too, and rebooting during driver development is
hardly an unacceptable inconvenience.



ok, I just thought that it would not be too difficult to implement this 
on the dma entry free path.


True, in the current code it wouldn't be all that hard, but it feels 
more worthwhile to optimise for allocation rather than freeing, and as 
soon as we start allocating memory for multiple entries at once, trying 
to free anything becomes extremely challenging.



In fact, having got this far in, what I'd quite like to do is to get rid
of dma_debug_resize_entries() such that we never need to free things at
all, since then we could allocate whole pages as blocks of entries to
save on masses of individual slab allocations.



On a related topic, is it possible for the user to learn the total 
entries created at a given point in time? If not, could we add a file in 
the debugfs folder for this?


I did get as far as pondering that you effectively lose track of 
utilisation once the low-water-mark of min_free_entries hits 0 and stays 
there - AFAICS it should be sufficient to just expose nr_total_entries 
as-is, since users can then calculate current and maximum occupancy 
based on *_free_entries. Does that sound reasonable to you?
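
For reference, exposing it would be about one more line in
dma_debug_fs_init() (sketch only, assuming the existing "dma-api"
debugfs directory dentry and the current nr_total_entries counter):

	debugfs_create_u32("nr_total_entries", 0444, dma_debug_dent,
			   &nr_total_entries);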


That also indirectly reminds me that this lot is documented in 
DMA_API.txt, so I should be good and update that too...


Cheers,
Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 01/23] dma-mapping: provide a generic DMA_MAPPING_ERROR

2018-12-04 Thread Robin Murphy

On 30/11/2018 13:22, Christoph Hellwig wrote:

Error handling of the dma_map_single and dma_map_page APIs is a little
problematic at the moment, in that we use different encodings in the
returned dma_addr_t to indicate an error.  That means we require an
additional indirect call to figure out if a dma mapping call returned
an error, and a lot of boilerplate code to implement these semantics.

Instead return the maximum addressable value as the error.  As long
as we don't allow mapping single-byte ranges with single-byte alignment
this value can never be a valid return.  Additionally, if drivers do
not check the return value from the dma_map* routines, this value means
they will generally not be pointed to actual memory.

Once the default value is added here we can start removing the
various mapping_error methods and just rely on this generic check.

Signed-off-by: Christoph Hellwig 
---
  include/linux/dma-mapping.h | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 0f81c713f6e9..46bd612d929e 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -133,6 +133,8 @@ struct dma_map_ops {
u64 (*get_required_mask)(struct device *dev);
  };
  
+#define DMA_MAPPING_ERROR		(~(dma_addr_t)0)

+
  extern const struct dma_map_ops dma_direct_ops;
  extern const struct dma_map_ops dma_virt_ops;
  
@@ -576,6 +578,10 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)

const struct dma_map_ops *ops = get_dma_ops(dev);
  
  	debug_dma_mapping_error(dev, dma_addr);

+
+   if (dma_addr == DMA_MAPPING_ERROR)
+   return 1;
+
if (ops->mapping_error)
return ops->mapping_error(dev, dma_addr);
return 0;


I'd have been inclined to put the default check here, i.e.

-   return 0
+   return dma_addr == DMA_MAPPING_ERROR
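
i.e. the whole helper would then read roughly like this (sketch only):

	static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
	{
		const struct dma_map_ops *ops = get_dma_ops(dev);

		debug_dma_mapping_error(dev, dma_addr);
		if (ops->mapping_error)
			return ops->mapping_error(dev, dma_addr);
		return dma_addr == DMA_MAPPING_ERROR;
	}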

such that the callback retains full precedence and we don't have to deal 
with the non-trivial removals immediately if it comes to it. Not that it 
makes a vast difference though, so either way,


Reviewed-by: Robin Murphy 
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v3, RFC] iommu/io-pgtable-arm-v7s: Use page_frag to request DMA32 memory

2018-12-04 Thread Will Deacon
On Tue, Dec 04, 2018 at 04:23:00PM +0800, Nicolas Boichat wrote:
> IOMMUs using ARMv7 short-descriptor format require page tables
> (level 1 and 2) to be allocated within the first 4GB of RAM, even
> on 64-bit systems.
> 
> For level 1/2 tables, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32
> is defined (e.g. on arm64 platforms).
> 
> For level 2 tables (1 KB), we use page_frag to allocate these pages,
> as we cannot directly use kmalloc (no slab cache for GFP_DMA32) or
> kmem_cache (mm/ code treats GFP_DMA32 as an invalid flag).
> 
> One downside is that we only free the allocated page if all the
> 4 fragments (4 IOMMU L2 tables) are freed, but given that we
> usually only allocate limited number of IOMMU L2 tables, this
> should not have too much impact on memory usage: In the absolute
> worst case (4096 L2 page tables, each on their own 4K page),
> we would use 16 MB of memory for 4 MB of L2 tables.
> 
> Also, print an error when the physical address does not fit in
> 32-bit, to make debugging easier in the future.
> 
> Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
> Signed-off-by: Nicolas Boichat 
> ---
> 
> As an alternative to the series [1], which adds support for GFP_DMA32
> to kmem_cache in mm/. IMHO the solution in [1] is cleaner and more
> efficient, as it allows freed fragments (L2 tables) to be reused, but
> this approach does not require any core change.
> 
> [1] https://patchwork.kernel.org/cover/10677529/, 3 patches
> 
>  drivers/iommu/io-pgtable-arm-v7s.c | 32 --
>  1 file changed, 17 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/iommu/io-pgtable-arm-v7s.c 
> b/drivers/iommu/io-pgtable-arm-v7s.c
> index 445c3bde04800c..0de6a51eb6755f 100644
> --- a/drivers/iommu/io-pgtable-arm-v7s.c
> +++ b/drivers/iommu/io-pgtable-arm-v7s.c
> @@ -161,6 +161,12 @@
>  
>  #define ARM_V7S_TCR_PD1  BIT(5)
>  
> +#ifdef CONFIG_ZONE_DMA32
> +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32
> +#else
> +#define ARM_V7S_TABLE_GFP_DMA GFP_DMA
> +#endif

We may as well include __GFP_ZERO in here too.
Anyway, this looks alright to me:

Acked-by: Will Deacon 

But it sounds like you're still on the fence about this patch, so I won't
pick it up unless you ask explicitly.

Will
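
For reference, the page_frag approach boils down to something like the
following (simplified sketch, not the exact hunks of the patch; the
helper names are illustrative and the per-pgtable cache state is elided):

	static struct page_frag_cache l2_tables;	/* per io_pgtable in the real code */

	static void *alloc_l2_table(gfp_t gfp)
	{
		/* 1 KB level-2 tables, constrained to 32-bit physical memory */
		return page_frag_alloc(&l2_tables, ARM_V7S_TABLE_SIZE(2),
				       gfp | ARM_V7S_TABLE_GFP_DMA | __GFP_ZERO);
	}

	static void free_l2_table(void *table)
	{
		/* the backing page is only released once all its fragments are freed */
		page_frag_free(table);
	}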
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread Robin Murphy

On 04/12/2018 14:29, Christoph Hellwig wrote:

+	for (retry_count = 0; ; retry_count++) {
+		spin_lock_irqsave(&free_entries_lock, flags);
+
+		if (num_free_entries > 0)
+			break;
  
  		spin_unlock_irqrestore(&free_entries_lock, flags);


Taking a spinlock just to read a single integer value doesn't really
help anything.


If the freelist is non-empty we break out with the lock still held in 
order to actually allocate our entry - only if there are no free entries 
left do we drop the lock in order to handle the failure. This much is 
just the original logic shuffled around a bit (with the tweak that 
testing num_free_entries seemed justifiably simpler than the original 
list_empty() check).



+
+   if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES &&
+   !prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES))


Don't we need GFP_ATOMIC here?  Also why do we need the retries?


Ah, right, we may be outside our own spinlock, but of course the whole 
DMA API call which got us here might be under someone else's and/or in a 
non-sleeping context - I'll fix that.


The number of retries is just to bound the loop due to its inherent 
raciness - since we drop the lock to create more entries, under 
pathological conditions by the time we get back in to grab one they 
could have all gone. 2 retries (well, strictly it's 1 try and 1 retry) 
was an entirely arbitrary choice just to accommodate that happening very 
occasionally by chance.


However, if the dynamic allocations need GFP_ATOMIC for external reasons 
anyway, then I don't need the lock-juggling that invites that race in 
the first place, and the whole loop disappears again. Neat!
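
Something along these lines (sketch only, assuming prealloc_memory()
grows a gfp_t argument and can be called with free_entries_lock held,
since GFP_ATOMIC allocations never sleep):

	spin_lock_irqsave(&free_entries_lock, flags);

	if (num_free_entries == 0 &&
	    prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES, GFP_ATOMIC)) {
		global_disable = true;
		spin_unlock_irqrestore(&free_entries_lock, flags);
		pr_err("debugging out of memory - disabling\n");
		return NULL;
	}

	entry = __dma_entry_alloc();
	spin_unlock_irqrestore(&free_entries_lock, flags);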


Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread John Garry

On 04/12/2018 13:11, Robin Murphy wrote:

Hi John,

On 03/12/2018 18:23, John Garry wrote:

On 03/12/2018 17:28, Robin Murphy wrote:

Certain drivers such as large multi-queue network adapters can use pools
of mapped DMA buffers larger than the default dma_debug_entry pool of
65536 entries, with the result that merely probing such a device can
cause DMA debug to disable itself during boot unless explicitly given an
appropriate "dma_debug_entries=..." option.

Developers trying to debug some other driver on such a system may not be
immediately aware of this, and at worst it can hide bugs if they fail to
realise that dma-debug has already disabled itself unexpectedly by the
time the code of interest gets to run. Even once they do realise, it can
be a bit of a pain to empirically determine a suitable number of
preallocated entries to configure without massively over-allocating.

There's really no need for such a static limit, though, since we can
quite easily expand the pool at runtime in those rare cases that the
preallocated entries are insufficient, which is arguably the least
surprising and most useful behaviour.


Hi Robin,

Do you have an idea on shrinking the pool again when the culprit
driver is removed, i.e. we have so many unused debug entries now
available?


I honestly don't believe it's worth the complication. This is a
development feature with significant overheads already, so there's not
an awful lot to gain by trying to optimise memory usage. If a system can
ever load a driver that makes hundreds of thousands of simultaneous
mappings, it can almost certainly spare 20-odd megabytes of RAM for the
corresponding debug entries in perpetuity. Sure, it does mean you'd need
to reboot to recover memory from a major leak, but that's mostly true of
the current behaviour too, and rebooting during driver development is
hardly an unacceptable inconvenience.



ok, I just thought that it would not be too difficult to implement this 
on the dma entry free path.



In fact, having got this far in, what I'd quite like to do is to get rid
of dma_debug_resize_entries() such that we never need to free things at
all, since then we could allocate whole pages as blocks of entries to
save on masses of individual slab allocations.



On a related topic, is it possible for the user to learn the total 
entries created at a given point in time? If not, could we add a file in 
the debugfs folder for this?


Thanks,
John


Robin.



Thanks,
John



Signed-off-by: Robin Murphy 
---
 kernel/dma/debug.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index de5db800dbfc..46cc075aec99 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -47,6 +47,9 @@
 #ifndef PREALLOC_DMA_DEBUG_ENTRIES
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
 #endif
+/* If the pool runs out, try this many times to allocate this many
new entries */
+#define DMA_DEBUG_DYNAMIC_ENTRIES 256
+#define DMA_DEBUG_DYNAMIC_RETRIES 2

 enum {
 dma_debug_single,
@@ -702,12 +705,21 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 {
 	struct dma_debug_entry *entry;
 	unsigned long flags;
+	int retry_count;

-	spin_lock_irqsave(&free_entries_lock, flags);
+	for (retry_count = 0; ; retry_count++) {
+		spin_lock_irqsave(&free_entries_lock, flags);
+
+		if (num_free_entries > 0)
+			break;

-	if (list_empty(&free_entries)) {
-		global_disable = true;
 		spin_unlock_irqrestore(&free_entries_lock, flags);
+
+		if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES &&
+		    !prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES))
+			continue;
+
+		global_disable = true;
 		pr_err("debugging out of memory - disabling\n");
 		return NULL;
 	}










___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 9/9] iommu/tegra: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Cc: Thierry Reding 
Signed-off-by: Joerg Roedel 
---
 drivers/iommu/tegra-smmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c
index 0d03341317c4..0d026cb2dfff 100644
--- a/drivers/iommu/tegra-smmu.c
+++ b/drivers/iommu/tegra-smmu.c
@@ -846,7 +846,7 @@ static struct iommu_group *tegra_smmu_group_get(struct 
tegra_smmu *smmu,
 
 static struct iommu_group *tegra_smmu_device_group(struct device *dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct tegra_smmu *smmu = dev->archdata.iommu;
struct iommu_group *group;
 
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 7/9] iommu/of: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/of_iommu.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index c5dd63072529..38232250b5f4 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -164,7 +164,7 @@ const struct iommu_ops *of_iommu_configure(struct device 
*dev,
   struct device_node *master_np)
 {
const struct iommu_ops *ops = NULL;
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
int err = NO_IOMMU;
 
if (!master_np)
@@ -208,6 +208,9 @@ const struct iommu_ops *of_iommu_configure(struct device 
*dev,
}
}
 
+   /* The fwspec pointer changed, read it again */
+   fwspec = dev_iommu_fwspec_get(dev);
+
/*
 * Two success conditions can be represented by non-negative err here:
 * >0 : there is no IOMMU, or one was unavailable for non-fatal reasons
@@ -215,7 +218,7 @@ const struct iommu_ops *of_iommu_configure(struct device 
*dev,
 * <0 : any actual error
 */
if (!err)
-   ops = dev->iommu_fwspec->ops;
+   ops = fwspec->ops;
/*
 * If we have reason to believe the IOMMU driver missed the initial
 * add_device callback for dev, replay it to get things in order.
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 2/9] ACPI/IORT: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Cc: Lorenzo Pieralisi 
Signed-off-by: Joerg Roedel 
---
 drivers/acpi/arm64/iort.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 70f4e80b9246..754a67ba49e5 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -779,7 +779,7 @@ static inline bool iort_iommu_driver_enabled(u8 type)
 static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
 {
struct acpi_iort_node *iommu;
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 
iommu = iort_get_iort_node(fwspec->iommu_fwnode);
 
@@ -824,6 +824,7 @@ static inline int iort_add_device_replay(const struct 
iommu_ops *ops,
  */
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 {
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct acpi_iort_its_group *its;
struct acpi_iort_node *iommu_node, *its_node = NULL;
int i, resv = 0;
@@ -841,9 +842,9 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, 
struct list_head *head)
 * a given PCI or named component may map IDs to.
 */
 
-   for (i = 0; i < dev->iommu_fwspec->num_ids; i++) {
+   for (i = 0; i < fwspec->num_ids; i++) {
its_node = iort_node_map_id(iommu_node,
-   dev->iommu_fwspec->ids[i],
+   fwspec->ids[i],
NULL, IORT_MSI_TYPE);
if (its_node)
break;
@@ -1036,6 +1037,7 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, 
u64 *dma_size)
  */
 const struct iommu_ops *iort_iommu_configure(struct device *dev)
 {
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct acpi_iort_node *node, *parent;
const struct iommu_ops *ops;
u32 streamid = 0;
@@ -1045,7 +1047,7 @@ const struct iommu_ops *iort_iommu_configure(struct 
device *dev)
 * If we already translated the fwspec there
 * is nothing left to do, return the iommu_ops.
 */
-   ops = iort_fwspec_iommu_ops(dev->iommu_fwspec);
+   ops = iort_fwspec_iommu_ops(fwspec);
if (ops)
return ops;
 
@@ -1084,7 +1086,7 @@ const struct iommu_ops *iort_iommu_configure(struct 
device *dev)
 * add_device callback for dev, replay it to get things in order.
 */
if (!err) {
-   ops = iort_fwspec_iommu_ops(dev->iommu_fwspec);
+   ops = iort_fwspec_iommu_ops(fwspec);
err = iort_add_device_replay(ops, dev);
}
 
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 8/9] iommu/qcom: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Cc: Rob Clark 
Signed-off-by: Joerg Roedel 
---
 drivers/iommu/qcom_iommu.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c
index ee70e9921cf1..f437bf9c5ebd 100644
--- a/drivers/iommu/qcom_iommu.c
+++ b/drivers/iommu/qcom_iommu.c
@@ -354,7 +354,8 @@ static void qcom_iommu_domain_free(struct iommu_domain 
*domain)
 
 static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device 
*dev)
 {
-   struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec);
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+   struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
int ret;
 
@@ -365,7 +366,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain 
*domain, struct device *dev
 
/* Ensure that the domain is finalized */
pm_runtime_get_sync(qcom_iommu->dev);
-   ret = qcom_iommu_init_domain(domain, qcom_iommu, dev->iommu_fwspec);
+   ret = qcom_iommu_init_domain(domain, qcom_iommu, fwspec);
pm_runtime_put_sync(qcom_iommu->dev);
if (ret < 0)
return ret;
@@ -387,7 +388,7 @@ static int qcom_iommu_attach_dev(struct iommu_domain 
*domain, struct device *dev
 
 static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device 
*dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct qcom_iommu_dev *qcom_iommu = to_iommu(fwspec);
struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain);
unsigned i;
@@ -500,7 +501,7 @@ static bool qcom_iommu_capable(enum iommu_cap cap)
 
 static int qcom_iommu_add_device(struct device *dev)
 {
-   struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec);
+   struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev));
struct iommu_group *group;
struct device_link *link;
 
@@ -531,7 +532,7 @@ static int qcom_iommu_add_device(struct device *dev)
 
 static void qcom_iommu_remove_device(struct device *dev)
 {
-   struct qcom_iommu_dev *qcom_iommu = to_iommu(dev->iommu_fwspec);
+   struct qcom_iommu_dev *qcom_iommu = to_iommu(dev_iommu_fwspec_get(dev));
 
if (!qcom_iommu)
return;
@@ -543,6 +544,7 @@ static void qcom_iommu_remove_device(struct device *dev)
 
 static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args 
*args)
 {
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct qcom_iommu_dev *qcom_iommu;
struct platform_device *iommu_pdev;
unsigned asid = args->args[0];
@@ -568,14 +570,14 @@ static int qcom_iommu_of_xlate(struct device *dev, struct 
of_phandle_args *args)
WARN_ON(asid > qcom_iommu->num_ctxs))
return -EINVAL;
 
-   if (!dev->iommu_fwspec->iommu_priv) {
-   dev->iommu_fwspec->iommu_priv = qcom_iommu;
+   if (!fwspec->iommu_priv) {
+   fwspec->iommu_priv = qcom_iommu;
} else {
/* make sure devices iommus dt node isn't referring to
 * multiple different iommu devices.  Multiple context
 * banks are ok, but multiple devices are not:
 */
-   if (WARN_ON(qcom_iommu != dev->iommu_fwspec->iommu_priv))
+   if (WARN_ON(qcom_iommu != fwspec->iommu_priv))
return -EINVAL;
}
 
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 0/9] Access dev->iommu_fwspec through functions

2018-12-04 Thread Joerg Roedel
Hi,

here is a patch-set to wrap accesses to dev->iommu_fwspec
into functions. This will make it easier to move the pointer
into a separate struct and consolidate the iommu-related
pointers in 'struct device'.
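
The conversion itself is mechanical; a typical hunk in the follow-up
patches looks like this (illustrative):

	-	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
	+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);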

Regards,

Joerg

Joerg Roedel (9):
  iommu: Introduce wrappers around dev->iommu_fwspec
  ACPI/IORT: Use helper functions to access dev->iommu_fwspec
  iommu/arm-smmu: Use helper functions to access dev->iommu_fwspec
  iommu/dma: Use helper functions to access dev->iommu_fwspec
  iommu/ipmmu-vmsa: Use helper functions to access dev->iommu_fwspec
  iommu/mediatek: Use helper functions to access dev->iommu_fwspec
  iommu/of: Use helper functions to access dev->iommu_fwspec
  iommu/qcom: Use helper functions to access dev->iommu_fwspec
  iommu/tegra: Use helper functions to access dev->iommu_fwspec

 drivers/acpi/arm64/iort.c| 12 +++-
 drivers/iommu/arm-smmu-v3.c  | 16 +---
 drivers/iommu/arm-smmu.c | 12 ++--
 drivers/iommu/dma-iommu.c|  2 +-
 drivers/iommu/iommu.c| 14 +++---
 drivers/iommu/ipmmu-vmsa.c   | 12 
 drivers/iommu/mtk_iommu.c| 21 -
 drivers/iommu/mtk_iommu_v1.c | 28 
 drivers/iommu/of_iommu.c |  7 +--
 drivers/iommu/qcom_iommu.c   | 18 ++
 drivers/iommu/tegra-smmu.c   |  2 +-
 include/linux/iommu.h| 11 +++
 12 files changed, 93 insertions(+), 62 deletions(-)

-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 5/9] iommu/ipmmu-vmsa: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/ipmmu-vmsa.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c
index ddf3a492e1d5..679f18bf0634 100644
--- a/drivers/iommu/ipmmu-vmsa.c
+++ b/drivers/iommu/ipmmu-vmsa.c
@@ -81,7 +81,9 @@ static struct ipmmu_vmsa_domain *to_vmsa_domain(struct 
iommu_domain *dom)
 
 static struct ipmmu_vmsa_device *to_ipmmu(struct device *dev)
 {
-   return dev->iommu_fwspec ? dev->iommu_fwspec->iommu_priv : NULL;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+   return fwspec ? fwspec->iommu_priv : NULL;
 }
 
 #define TLB_LOOP_TIMEOUT   100 /* 100us */
@@ -643,7 +645,7 @@ static void ipmmu_domain_free(struct iommu_domain 
*io_domain)
 static int ipmmu_attach_device(struct iommu_domain *io_domain,
   struct device *dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_device *mmu = to_ipmmu(dev);
struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
unsigned int i;
@@ -692,7 +694,7 @@ static int ipmmu_attach_device(struct iommu_domain 
*io_domain,
 static void ipmmu_detach_device(struct iommu_domain *io_domain,
struct device *dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct ipmmu_vmsa_domain *domain = to_vmsa_domain(io_domain);
unsigned int i;
 
@@ -744,13 +746,15 @@ static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain 
*io_domain,
 static int ipmmu_init_platform_device(struct device *dev,
  struct of_phandle_args *args)
 {
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct platform_device *ipmmu_pdev;
 
ipmmu_pdev = of_find_device_by_node(args->np);
if (!ipmmu_pdev)
return -ENODEV;
 
-   dev->iommu_fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev);
+   fwspec->iommu_priv = platform_get_drvdata(ipmmu_pdev);
+
return 0;
 }
 
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 6/9] iommu/mediatek: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Cc: Matthias Brugger 
Signed-off-by: Joerg Roedel 
---
 drivers/iommu/mtk_iommu.c| 21 -
 drivers/iommu/mtk_iommu_v1.c | 28 
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
index 44bd5b9166bb..0783fba05d19 100644
--- a/drivers/iommu/mtk_iommu.c
+++ b/drivers/iommu/mtk_iommu.c
@@ -244,7 +244,7 @@ static void mtk_iommu_config(struct mtk_iommu_data *data,
 {
struct mtk_smi_larb_iommu*larb_mmu;
unsigned int larbid, portid;
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
int i;
 
for (i = 0; i < fwspec->num_ids; ++i) {
@@ -336,7 +336,7 @@ static int mtk_iommu_attach_device(struct iommu_domain 
*domain,
   struct device *dev)
 {
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
-   struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv;
+   struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
 
if (!data)
return -ENODEV;
@@ -355,7 +355,7 @@ static int mtk_iommu_attach_device(struct iommu_domain 
*domain,
 static void mtk_iommu_detach_device(struct iommu_domain *domain,
struct device *dev)
 {
-   struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv;
+   struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
 
if (!data)
return;
@@ -417,13 +417,14 @@ static phys_addr_t mtk_iommu_iova_to_phys(struct 
iommu_domain *domain,
 
 static int mtk_iommu_add_device(struct device *dev)
 {
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct mtk_iommu_data *data;
struct iommu_group *group;
 
-   if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops)
+   if (!fwspec || fwspec->ops != &mtk_iommu_ops)
return -ENODEV; /* Not a iommu client device */
 
-   data = dev->iommu_fwspec->iommu_priv;
+   data = fwspec->iommu_priv;
iommu_device_link(&data->iommu, dev);
 
group = iommu_group_get_for_dev(dev);
@@ -436,12 +437,13 @@ static int mtk_iommu_add_device(struct device *dev)
 
 static void mtk_iommu_remove_device(struct device *dev)
 {
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct mtk_iommu_data *data;
 
-   if (!dev->iommu_fwspec || dev->iommu_fwspec->ops != &mtk_iommu_ops)
+   if (!fwspec || fwspec->ops != &mtk_iommu_ops)
return;
 
-   data = dev->iommu_fwspec->iommu_priv;
+   data = fwspec->iommu_priv;
iommu_device_unlink(&data->iommu, dev);
 
iommu_group_remove_device(dev);
@@ -468,6 +470,7 @@ static struct iommu_group *mtk_iommu_device_group(struct 
device *dev)
 
 static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args)
 {
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct platform_device *m4updev;
 
if (args->args_count != 1) {
@@ -476,13 +479,13 @@ static int mtk_iommu_of_xlate(struct device *dev, struct 
of_phandle_args *args)
return -EINVAL;
}
 
-   if (!dev->iommu_fwspec->iommu_priv) {
+   if (!fwspec->iommu_priv) {
/* Get the m4u device */
m4updev = of_find_device_by_node(args->np);
if (WARN_ON(!m4updev))
return -EINVAL;
 
-   dev->iommu_fwspec->iommu_priv = platform_get_drvdata(m4updev);
+   fwspec->iommu_priv = platform_get_drvdata(m4updev);
}
 
return iommu_fwspec_add_ids(dev, args->args, 1);
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 0e780848f59b..d22a2a145bd2 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -206,7 +206,7 @@ static void mtk_iommu_config(struct mtk_iommu_data *data,
 {
struct mtk_smi_larb_iommu*larb_mmu;
unsigned int larbid, portid;
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
int i;
 
for (i = 0; i < fwspec->num_ids; ++i) {
@@ -271,7 +271,7 @@ static int mtk_iommu_attach_device(struct iommu_domain 
*domain,
   struct device *dev)
 {
struct mtk_iommu_domain *dom = to_mtk_domain(domain);
-   struct mtk_iommu_data *data = dev->iommu_fwspec->iommu_priv;
+   struct mtk_iommu_data *data = dev_iommu_fwspec_get(dev)->iommu_priv;
int ret;
 
if (!data)
@@ -293,7 +293,7 @@ static int mtk_iommu_attach_device(struct iommu_domain 
*domain,
 static void mtk_iommu_detach_device(struct iommu_domain *domain,
   

[PATCH 3/9] iommu/arm-smmu: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Cc: Will Deacon 
Cc: Robin Murphy 
Signed-off-by: Joerg Roedel 
---
 drivers/iommu/arm-smmu-v3.c | 16 +---
 drivers/iommu/arm-smmu.c| 12 ++--
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 6947ccf26512..8f2d3a30d090 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1691,24 +1691,26 @@ static void arm_smmu_install_ste_for_dev(struct 
iommu_fwspec *fwspec)
 
 static void arm_smmu_detach_dev(struct device *dev)
 {
-   struct arm_smmu_master_data *master = dev->iommu_fwspec->iommu_priv;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+   struct arm_smmu_master_data *master = fwspec->iommu_priv;
 
master->ste.assigned = false;
-   arm_smmu_install_ste_for_dev(dev->iommu_fwspec);
+   arm_smmu_install_ste_for_dev(fwspec);
 }
 
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
int ret = 0;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct arm_smmu_device *smmu;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
struct arm_smmu_master_data *master;
struct arm_smmu_strtab_ent *ste;
 
-   if (!dev->iommu_fwspec)
+   if (!fwspec)
return -ENOENT;
 
-   master = dev->iommu_fwspec->iommu_priv;
+   master = fwspec->iommu_priv;
smmu = master->smmu;
ste = &master->ste;
 
@@ -1748,7 +1750,7 @@ static int arm_smmu_attach_dev(struct iommu_domain 
*domain, struct device *dev)
ste->s2_cfg = &smmu_domain->s2_cfg;
}
 
-   arm_smmu_install_ste_for_dev(dev->iommu_fwspec);
+   arm_smmu_install_ste_for_dev(fwspec);
 out_unlock:
mutex_unlock(_domain->init_mutex);
return ret;
@@ -1839,7 +1841,7 @@ static int arm_smmu_add_device(struct device *dev)
int i, ret;
struct arm_smmu_device *smmu;
struct arm_smmu_master_data *master;
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct iommu_group *group;
 
if (!fwspec || fwspec->ops != &arm_smmu_ops)
@@ -1890,7 +1892,7 @@ static int arm_smmu_add_device(struct device *dev)
 
 static void arm_smmu_remove_device(struct device *dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct arm_smmu_master_data *master;
struct arm_smmu_device *smmu;
 
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 5a28ae892504..988d0362cd03 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -1103,7 +1103,7 @@ static bool arm_smmu_free_sme(struct arm_smmu_device 
*smmu, int idx)
 
 static int arm_smmu_master_alloc_smes(struct device *dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct arm_smmu_master_cfg *cfg = fwspec->iommu_priv;
struct arm_smmu_device *smmu = cfg->smmu;
struct arm_smmu_smr *smrs = smmu->smrs;
@@ -1206,7 +1206,7 @@ static int arm_smmu_domain_add_master(struct 
arm_smmu_domain *smmu_domain,
 static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 {
int ret;
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct arm_smmu_device *smmu;
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 
@@ -1380,7 +1380,7 @@ static int arm_smmu_add_device(struct device *dev)
 {
struct arm_smmu_device *smmu;
struct arm_smmu_master_cfg *cfg;
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
int i, ret;
 
if (using_legacy_binding) {
@@ -1391,7 +1391,7 @@ static int arm_smmu_add_device(struct device *dev)
 * will allocate/initialise a new one. Thus we need to update 
fwspec for
 * later use.
 */
-   fwspec = dev->iommu_fwspec;
+   fwspec = dev_iommu_fwspec_get(dev);
if (ret)
goto out_free;
} else if (fwspec && fwspec->ops == &arm_smmu_ops) {
@@ -1445,7 +1445,7 @@ static int arm_smmu_add_device(struct device *dev)
 
 static void arm_smmu_remove_device(struct device *dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
struct arm_smmu_master_cfg *cfg;
struct arm_smmu_device *smmu;
 
@@ -1465,7 +1465,7 @@ static void arm_smmu_remove_device(struct device *dev)
 
 static struct iommu_group 

[PATCH 4/9] iommu/dma: Use helper functions to access dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

Use the new helpers dev_iommu_fwspec_get()/set() to access
the dev->iommu_fwspec pointer. This makes it easier to move
that pointer later into another struct.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/dma-iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d1b04753b204..2b1c6912fbcc 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -177,7 +177,7 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
 
-   if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
+   if (!is_of_node(dev_iommu_fwspec_get(dev)->iommu_fwnode))
iort_iommu_msi_get_resv_regions(dev, list);
 
 }
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 1/9] iommu: Introduce wrappers around dev->iommu_fwspec

2018-12-04 Thread Joerg Roedel
From: Joerg Roedel 

These wrappers will be used to easily change the location of
the field later when all users are converted.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/iommu.c | 14 +++---
 include/linux/iommu.h | 11 +++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index edbdf5d6962c..4cfd407b972c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1976,7 +1976,7 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct 
fwnode_handle *fwnode)
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
  const struct iommu_ops *ops)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 
if (fwspec)
return ops == fwspec->ops ? 0 : -EINVAL;
@@ -1988,26 +1988,26 @@ int iommu_fwspec_init(struct device *dev, struct 
fwnode_handle *iommu_fwnode,
of_node_get(to_of_node(iommu_fwnode));
fwspec->iommu_fwnode = iommu_fwnode;
fwspec->ops = ops;
-   dev->iommu_fwspec = fwspec;
+   dev_iommu_fwspec_set(dev, fwspec);
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_fwspec_init);
 
 void iommu_fwspec_free(struct device *dev)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 
if (fwspec) {
fwnode_handle_put(fwspec->iommu_fwnode);
kfree(fwspec);
-   dev->iommu_fwspec = NULL;
+   dev_iommu_fwspec_set(dev, NULL);
}
 }
 EXPORT_SYMBOL_GPL(iommu_fwspec_free);
 
 int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids)
 {
-   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
size_t size;
int i;
 
@@ -2016,11 +2016,11 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, 
int num_ids)
 
size = offsetof(struct iommu_fwspec, ids[fwspec->num_ids + num_ids]);
if (size > sizeof(*fwspec)) {
-   fwspec = krealloc(dev->iommu_fwspec, size, GFP_KERNEL);
+   fwspec = krealloc(fwspec, size, GFP_KERNEL);
if (!fwspec)
return -ENOMEM;
 
-   dev->iommu_fwspec = fwspec;
+   dev_iommu_fwspec_set(dev, fwspec);
}
 
for (i = 0; i < num_ids; i++)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a1d28f42cb77..f93fdcf7d130 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -398,6 +398,17 @@ void iommu_fwspec_free(struct device *dev);
 int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids);
 const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode);
 
+static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev)
+{
+   return dev->iommu_fwspec;
+}
+
+static inline void dev_iommu_fwspec_set(struct device *dev,
+   struct iommu_fwspec *fwspec)
+{
+   dev->iommu_fwspec = fwspec;
+}
+
 #else /* CONFIG_IOMMU_API */
 
 struct iommu_ops {};
-- 
2.17.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables

2018-12-04 Thread Will Deacon
On Tue, Dec 04, 2018 at 05:37:13PM +0800, Nicolas Boichat wrote:
> On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat  wrote:
> >
> > This is a follow-up to the discussion in [1], to make sure that the page
> > tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit
> > physical address space.
> >
> > [1] 
> > https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html
> 
> Hi everyone,
> 
> Let's try to summarize here.
> 
> First, we confirmed that this is a regression, and IOMMU errors happen
> on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13).
> The issue most likely starts from ad67f5a6545f ("arm64: replace
> ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number
> of Mediatek platforms (and maybe others?).
> 
> We have a few options here:
> 1. This series [2], that adds support for GFP_DMA32 slab caches,
> _without_ adding kmalloc caches (since there are no users of
> kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on
> the 3 patches, and AFAICT this solution works fine.
> 2. genalloc. That works, but unless we preallocate 4MB for L2 tables
> (which is wasteful as we usually only need a handful of L2 tables),
> we'll need changes in the core (use GFP_ATOMIC) to allow allocating on
> demand, and as it stands we'd have no way to shrink the allocation.
> 3. page_frag [3]. That works fine, and the code is quite simple. One
> drawback is that fragments in partially freed pages cannot be reused
> (from limited experiments, I see that IOMMU L2 tables are rarely
> freed, so it's unlikely a whole page would get freed). But given the
> low number of L2 tables, maybe we can live with that.
> 
> I think 2 is out. Any preference between 1 and 3? I think 1 makes
> better use of the memory, so that'd be my preference. But I'm probably
> missing something.

FWIW, I'm open to any solution at this point, since I'd like to see this
regression fixed. (1) does sound better longer-term, but (3) looks pretty
much ready to do afaict.

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v4 9/9] dmapool: debug: prevent endless loop in case of corruption

2018-12-04 Thread Tony Battersby
On 11/13/18 1:36 AM, Matthew Wilcox wrote:
> On Mon, Nov 12, 2018 at 10:46:35AM -0500, Tony Battersby wrote:
>> Prevent a possible endless loop with DMAPOOL_DEBUG enabled if a buggy
>> driver corrupts DMA pool memory.
>>
>> Signed-off-by: Tony Battersby 
> I like it!  Also, here you're using blks_per_alloc in a way which isn't
> normally in the performance path, but might be with the right config
> options.  With that, I withdraw my objection to the previous patch and
>
> Acked-by: Matthew Wilcox 
>
> Andrew, can you funnel these in through your tree?  If you'd rather not,
> I don't mind stuffing them into a git tree and asking Linus to pull
> for 4.21.
>
No reply for 3 weeks, so adding Andrew Morton to recipient list.

Andrew, I have 9 dmapool patches ready for merging in 4.21.  See Matthew
Wilcox's request above.

Tony Battersby

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

merge swiotlb support into dma_direct_ops

2018-12-04 Thread Christoph Hellwig
Hi Konrad and others,

can you review this series?  It merges the swiotlb support into the
DMA direct ops so that we don't have to duplicate the dma mapping logic
in multiple places.

Note that this is based on the dma_mapping_error series for which I'd
still like to collect a few more reviews, so pulling the git tree might
be easiest for testing.

The git tree is available here:

git://git.infradead.org/users/hch/misc.git swiotlb-merge

Gitweb:


http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/swiotlb-merge
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 1/5] swiotlb: remove SWIOTLB_MAP_ERROR

2018-12-04 Thread Christoph Hellwig
We can use DMA_MAPPING_ERROR instead, which already maps to the same
value.

Signed-off-by: Christoph Hellwig 
---
 drivers/xen/swiotlb-xen.c | 4 ++--
 include/linux/swiotlb.h   | 3 ---
 kernel/dma/swiotlb.c  | 4 ++--
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 6dc969d5ea2f..833e80b46eb2 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -403,7 +403,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, 
struct page *page,
 
map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir,
 attrs);
-   if (map == SWIOTLB_MAP_ERROR)
+   if (map == DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
 
dev_addr = xen_phys_to_bus(map);
@@ -572,7 +572,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct 
scatterlist *sgl,
 sg_phys(sg),
 sg->length,
 dir, attrs);
-   if (map == SWIOTLB_MAP_ERROR) {
+   if (map == DMA_MAPPING_ERROR) {
dev_warn(hwdev, "swiotlb buffer is full\n");
/* Don't panic here, we expect map_sg users
   to do proper error handling. */
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index a387b59640a4..14aec0b70dd9 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -46,9 +46,6 @@ enum dma_sync_target {
SYNC_FOR_DEVICE = 1,
 };
 
-/* define the last possible byte of physical address space as a mapping error 
*/
-#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0)
-
 extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
  dma_addr_t tbl_dma_addr,
  phys_addr_t phys, size_t size,
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index ff1ce81bb623..19ba8e473d71 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -526,7 +526,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", 
size);
-   return SWIOTLB_MAP_ERROR;
+   return DMA_MAPPING_ERROR;
 found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
 
@@ -637,7 +637,7 @@ static dma_addr_t swiotlb_bounce_page(struct device *dev, 
phys_addr_t *phys,
/* Oh well, have to allocate and map a bounce buffer. */
*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
*phys, size, dir, attrs);
-   if (*phys == SWIOTLB_MAP_ERROR)
+   if (*phys == DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
 
/* Ensure that the address returned is DMA'ble */
-- 
2.19.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 5/5] dma-direct: merge swiotlb_dma_ops into the dma_direct code

2018-12-04 Thread Christoph Hellwig
While the dma-direct code is (relatively) clean and simple we actually
have to use the swiotlb ops for the mapping on many architectures due
to devices with addressing limits.  Instead of keeping two
implementations around this commit allows the dma-direct
implementation to call the swiotlb bounce buffering functions and
thus share the guts of the mapping implementation.  This also
simplifies the dma-mapping setup on a few architectures where we
don't have to differentiate which implementation to use.

Signed-off-by: Christoph Hellwig 
---
 arch/arm64/mm/dma-mapping.c  |   2 +-
 arch/ia64/hp/common/hwsw_iommu.c |   2 +-
 arch/ia64/hp/common/sba_iommu.c  |   6 +-
 arch/ia64/kernel/dma-mapping.c   |   2 +-
 arch/mips/include/asm/dma-mapping.h  |   2 -
 arch/powerpc/kernel/dma-swiotlb.c|  16 +-
 arch/riscv/include/asm/dma-mapping.h |  15 --
 arch/x86/kernel/pci-swiotlb.c|   4 +-
 arch/x86/mm/mem_encrypt.c|   7 -
 arch/x86/pci/sta2x11-fixup.c |   1 -
 include/linux/dma-direct.h   |  12 ++
 include/linux/swiotlb.h  |  68 +++-
 kernel/dma/direct.c  | 113 +
 kernel/dma/swiotlb.c | 227 ++-
 14 files changed, 144 insertions(+), 333 deletions(-)
 delete mode 100644 arch/riscv/include/asm/dma-mapping.h

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 4c0f498069e8..e4effbb243b1 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -549,7 +549,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, 
u64 size,
const struct iommu_ops *iommu, bool coherent)
 {
if (!dev->dma_ops)
-   dev->dma_ops = &swiotlb_dma_ops;
+   dev->dma_ops = &dma_direct_ops;
 
dev->dma_coherent = coherent;
__iommu_setup_dma_ops(dev, dma_base, size, iommu);
diff --git a/arch/ia64/hp/common/hwsw_iommu.c b/arch/ia64/hp/common/hwsw_iommu.c
index 58969039bed2..f40ca499b246 100644
--- a/arch/ia64/hp/common/hwsw_iommu.c
+++ b/arch/ia64/hp/common/hwsw_iommu.c
@@ -38,7 +38,7 @@ static inline int use_swiotlb(struct device *dev)
 const struct dma_map_ops *hwsw_dma_get_ops(struct device *dev)
 {
if (use_swiotlb(dev))
-   return &swiotlb_dma_ops;
+   return &dma_direct_ops;
 return &sba_dma_ops;
 }
 EXPORT_SYMBOL(hwsw_dma_get_ops);
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 0d21c0b5b23d..5ee74820a0f6 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2065,8 +2065,6 @@ static int __init acpi_sba_ioc_init_acpi(void)
 /* This has to run before acpi_scan_init(). */
 arch_initcall(acpi_sba_ioc_init_acpi);
 
-extern const struct dma_map_ops swiotlb_dma_ops;
-
 static int __init
 sba_init(void)
 {
@@ -2080,7 +2078,7 @@ sba_init(void)
 * a successful kdump kernel boot is to use the swiotlb.
 */
if (is_kdump_kernel()) {
-   dma_ops = &swiotlb_dma_ops;
+   dma_ops = &dma_direct_ops;
if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
panic("Unable to initialize software I/O TLB:"
  " Try machvec=dig boot option");
@@ -2102,7 +2100,7 @@ sba_init(void)
 * If we didn't find something sba_iommu can claim, we
 * need to setup the swiotlb and switch to the dig machvec.
 */
-   dma_ops = &swiotlb_dma_ops;
+   dma_ops = &dma_direct_ops;
if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
panic("Unable to find SBA IOMMU or initialize "
  "software I/O TLB: Try machvec=dig boot option");
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index 36dd6aa6d759..80cd3e1ea95a 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -36,7 +36,7 @@ long arch_dma_coherent_to_pfn(struct device *dev, void 
*cpu_addr,
 
 void __init swiotlb_dma_init(void)
 {
-   dma_ops = &swiotlb_dma_ops;
+   dma_ops = &dma_direct_ops;
swiotlb_init(1);
 }
 #endif
diff --git a/arch/mips/include/asm/dma-mapping.h 
b/arch/mips/include/asm/dma-mapping.h
index b4c477eb46ce..69f914667f3e 100644
--- a/arch/mips/include/asm/dma-mapping.h
+++ b/arch/mips/include/asm/dma-mapping.h
@@ -10,8 +10,6 @@ static inline const struct dma_map_ops 
*get_arch_dma_ops(struct bus_type *bus)
 {
 #if defined(CONFIG_MACH_JAZZ)
 return &jazz_dma_ops;
-#elif defined(CONFIG_SWIOTLB)
-   return &swiotlb_dma_ops;
 #else
 return &dma_direct_ops;
 #endif
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index 3d8df2cf8be9..21a869fb9734 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -50,15 +50,15 @@ const struct dma_map_ops powerpc_swiotlb_dma_ops = {
.alloc = __dma_nommu_alloc_coherent,
   

[PATCH 4/5] dma-direct: use dma_direct_map_page to implement dma_direct_map_sg

2018-12-04 Thread Christoph Hellwig
No need to duplicate the mapping logic.

Signed-off-by: Christoph Hellwig 
---
 kernel/dma/direct.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index edb24f94ea1e..d45306473c90 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -217,6 +217,7 @@ static void dma_direct_sync_single_for_device(struct device 
*dev,
arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir);
 }
 
+#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
 static void dma_direct_sync_sg_for_device(struct device *dev,
struct scatterlist *sgl, int nents, enum dma_data_direction dir)
 {
@@ -229,6 +230,7 @@ static void dma_direct_sync_sg_for_device(struct device 
*dev,
for_each_sg(sgl, sg, nents, i)
arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
 }
+#endif
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
 defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
@@ -294,19 +296,13 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
struct scatterlist *sg;
 
for_each_sg(sgl, sg, nents, i) {
-   BUG_ON(!sg_page(sg));
-
-   sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
-   if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg),
-   sg->length))) {
-   report_addr(dev, sg_dma_address(sg), sg->length);
+   sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
+   sg->offset, sg->length, dir, attrs);
+   if (sg->dma_address == DMA_MAPPING_ERROR)
return 0;
-   }
sg_dma_len(sg) = sg->length;
}
 
-   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
-   dma_direct_sync_sg_for_device(dev, sgl, nents, dir);
return nents;
 }
 
-- 
2.19.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 3/5] dma-direct: improve addressability error reporting

2018-12-04 Thread Christoph Hellwig
Only report a DMA addressability problem once to avoid spewing the
kernel log with repeated messages.  Also provide a stack trace to make it
easy to find the actual caller that caused the problem.

Last but not least move the actual check into the fast path and only
leave the error reporting in a helper.

Signed-off-by: Christoph Hellwig 
---
 kernel/dma/direct.c | 36 +++-
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 308f88a750c8..edb24f94ea1e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -30,27 +30,16 @@ static inline bool force_dma_unencrypted(void)
return sev_active();
 }
 
-static bool
-check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
-   const char *caller)
+static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
 {
-   if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
-   if (!dev->dma_mask) {
-   dev_err(dev,
-   "%s: call on device without dma_mask\n",
-   caller);
-   return false;
-   }
-
-   if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
-   dev_err(dev,
-   "%s: overflow %pad+%zu of device mask %llx bus 
mask %llx\n",
-   caller, _addr, size,
-   *dev->dma_mask, dev->bus_dma_mask);
-   }
-   return false;
+   if (!dev->dma_mask) {
+   dev_err_once(dev, "DMA map on device without dma_mask\n");
+   } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
+   dev_err_once(dev,
+   "overflow %pad+%zu of DMA mask %llx bus mask %llx\n",
+   &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask);
}
-   return true;
+   WARN_ON_ONCE(1);
 }
 
 static inline dma_addr_t phys_to_dma_direct(struct device *dev,
@@ -288,8 +277,10 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct 
page *page,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
 
-   if (!check_addr(dev, dma_addr, size, __func__))
+   if (unlikely(dev && !dma_capable(dev, dma_addr, size))) {
+   report_addr(dev, dma_addr, size);
return DMA_MAPPING_ERROR;
+   }
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
dma_direct_sync_single_for_device(dev, dma_addr, size, dir);
@@ -306,8 +297,11 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
BUG_ON(!sg_page(sg));
 
sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg));
-   if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__))
+   if (unlikely(dev && !dma_capable(dev, sg_dma_address(sg),
+   sg->length))) {
+   report_addr(dev, sg_dma_address(sg), sg->length);
return 0;
+   }
sg_dma_len(sg) = sg->length;
}
 
-- 
2.19.1



[PATCH 2/5] swiotlb: remove dma_mark_clean

2018-12-04 Thread Christoph Hellwig
Instead of providing a special dma_mark_clean hook just for ia64, switch
ia64 over to the normal arch_sync_dma_for_cpu hooks.

This means that we now also set the PG_arch_1 bit for pages in the
swiotlb buffer, which isn't strictly needed as we will never execute code
out of the swiotlb buffer, but otherwise harmless.

Signed-off-by: Christoph Hellwig 
---
 arch/ia64/Kconfig  |  3 ++-
 arch/ia64/kernel/dma-mapping.c | 20 +++-
 arch/ia64/mm/init.c| 18 +++---
 drivers/xen/swiotlb-xen.c  | 20 +---
 include/linux/dma-direct.h |  8 
 kernel/dma/swiotlb.c   | 18 +-
 6 files changed, 30 insertions(+), 57 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 36773def6920..39724df0de29 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -28,8 +28,9 @@ config IA64
select HAVE_ARCH_TRACEHOOK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
-   select ARCH_HAS_DMA_MARK_CLEAN
+   select ARCH_HAS_DMA_COHERENT_TO_PFN
select ARCH_HAS_SG_CHAIN
+   select ARCH_HAS_SYNC_DMA_FOR_CPU
select VIRT_TO_BUS
select ARCH_DISCARD_MEMBLOCK
select GENERIC_IRQ_PROBE
diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c
index 7a471d8d67d4..36dd6aa6d759 100644
--- a/arch/ia64/kernel/dma-mapping.c
+++ b/arch/ia64/kernel/dma-mapping.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-#include 
+#include 
 #include 
 #include 
 
@@ -16,6 +16,24 @@ const struct dma_map_ops *dma_get_ops(struct device *dev)
 EXPORT_SYMBOL(dma_get_ops);
 
 #ifdef CONFIG_SWIOTLB
+void *arch_dma_alloc(struct device *dev, size_t size,
+   dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
+{
+   return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
+}
+
+void arch_dma_free(struct device *dev, size_t size, void *cpu_addr,
+   dma_addr_t dma_addr, unsigned long attrs)
+{
+   dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
+}
+
+long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
+   dma_addr_t dma_addr)
+{
+   return page_to_pfn(virt_to_page(cpu_addr));
+}
+
 void __init swiotlb_dma_init(void)
 {
dma_ops = &swiotlb_dma_ops;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d5e12ff1d73c..2c51733f1dfd 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -71,18 +71,14 @@ __ia64_sync_icache_dcache (pte_t pte)
  * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
  * flush them when they get mapped into an executable vm-area.
  */
-void
-dma_mark_clean(void *addr, size_t size)
+void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr,
+   size_t size, enum dma_data_direction dir)
 {
-   unsigned long pg_addr, end;
-
-   pg_addr = PAGE_ALIGN((unsigned long) addr);
-   end = (unsigned long) addr + size;
-   while (pg_addr + PAGE_SIZE <= end) {
-   struct page *page = virt_to_page(pg_addr);
-   set_bit(PG_arch_1, &page->flags);
-   pg_addr += PAGE_SIZE;
-   }
+   unsigned long pfn = __phys_to_pfn(paddr);
+
+   do {
+   set_bit(PG_arch_1, &pfn_to_page(pfn)->flags);
+   } while (++pfn <= __phys_to_pfn(paddr + size - 1));
 }
 
 inline void
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 833e80b46eb2..989cf872b98c 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -441,21 +441,8 @@ static void xen_unmap_single(struct device *hwdev, 
dma_addr_t dev_addr,
xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs);
 
/* NOTE: We use dev_addr here, not paddr! */
-   if (is_xen_swiotlb_buffer(dev_addr)) {
+   if (is_xen_swiotlb_buffer(dev_addr))
swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
-   return;
-   }
-
-   if (dir != DMA_FROM_DEVICE)
-   return;
-
-   /*
-* phys_to_virt doesn't work with hihgmem page but we could
-* call dma_mark_clean() with hihgmem page here. However, we
-* are fine since dma_mark_clean() is null on POWERPC. We can
-* make dma_mark_clean() take a physical address if necessary.
-*/
-   dma_mark_clean(phys_to_virt(paddr), size);
 }
 
 static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
@@ -493,11 +480,6 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t 
dev_addr,
 
if (target == SYNC_FOR_DEVICE)
xen_dma_sync_single_for_device(hwdev, dev_addr, size, dir);
-
-   if (dir != DMA_FROM_DEVICE)
-   return;
-
-   dma_mark_clean(phys_to_virt(paddr), size);
 }
 
 void
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 6e5a47ae7d64..1aa73f4907ae 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -48,14 +48,6 

Re: [PATCH 2/4] dma-debug: Refactor dma_debug_entry allocation

2018-12-04 Thread Robin Murphy

On 04/12/2018 14:27, Christoph Hellwig wrote:

On Mon, Dec 03, 2018 at 05:28:07PM +, Robin Murphy wrote:

Make prealloc_memory() a little more general and robust so that it
serves for runtime reallocations too. The first thing we can do with
that is clean up dma_debug_resize_entries() quite a bit.


Maybe also renamed it to dma_debug_alloc_entries or something like
that?


Yes, that's definitely nicer.


Otherwise this looks fine to me.


Thanks (and for the review on #1)

Robin.


Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread Robin Murphy

On 04/12/2018 14:17, Christoph Hellwig wrote:

On Tue, Dec 04, 2018 at 01:11:37PM +, Robin Murphy wrote:

In fact, having got this far in, what I'd quite like to do is to get rid of
dma_debug_resize_entries() such that we never need to free things at all,
since then we could allocate whole pages as blocks of entries to save on
masses of individual slab allocations.


Yes, we should definitely kill dma_debug_resize_entries.  Allocating
page batches might sound nice, but is that going to introduce additional
complexity?


OK, looking at what the weird AMD GART code does I reckon it should be 
happy enough with on-demand expansion, and that no tears will be shed if 
it can no longer actually trim the pool to the size it thinks is 
necessary. I'll add a patch to clean that up.


Page-based allocation, at least the way I'm thinking of it, shouldn't do 
much more than add an extra loop in one place, which should be more than 
made up for by removing all the freeing code :)
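
As a rough illustration (not the actual patch), that extra loop could look
something like the sketch below, reusing the existing free_entries list and
locking in kernel/dma/debug.c; the helper name here is made up:

static int dma_debug_add_entry_page(gfp_t gfp)
{
	/* grab a whole page of entries at once and splice them onto the free list */
	struct dma_debug_entry *entry = (void *)get_zeroed_page(gfp);
	int i, n = PAGE_SIZE / sizeof(*entry);
	unsigned long flags;

	if (!entry)
		return -ENOMEM;

	spin_lock_irqsave(&free_entries_lock, flags);
	for (i = 0; i < n; i++)
		list_add_tail(&entry[i].list, &free_entries);
	num_free_entries += n;
	nr_total_entries += n;
	spin_unlock_irqrestore(&free_entries_lock, flags);

	return 0;
}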


Robin.


[PATCH] dma-mapping: remove a pointless memset in dma_atomic_pool_init

2018-12-04 Thread Christoph Hellwig
We already zero the memory after allocating it from the pool that
this function fills, and having the memset here in this form means
we can't support CMA highmem allocations.

Signed-off-by: Christoph Hellwig 
Reported-by: Russell King - ARM Linux 
---
 kernel/dma/remap.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 68a64e3ff6a1..6e784824326f 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -121,7 +121,6 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
if (!page)
goto out;
 
-   memset(page_address(page), 0, atomic_pool_size);
arch_dma_prep_coherent(page, atomic_pool_size);
 
atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
-- 
2.19.1



[PATCH] dma-mapping: simplify the dma_sync_single_range_for_{cpu, device} implementation

2018-12-04 Thread Christoph Hellwig
We can just call the regular helpers after adding the offset to the address
instead of reimplementing them.

Signed-off-by: Christoph Hellwig 
---
 include/linux/dma-debug.h   | 27 
 include/linux/dma-mapping.h | 34 +-
 kernel/dma/debug.c  | 42 -
 3 files changed, 10 insertions(+), 93 deletions(-)

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 30213adbb6b9..c85e097a984c 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -72,17 +72,6 @@ extern void debug_dma_sync_single_for_device(struct device 
*dev,
 dma_addr_t dma_handle,
 size_t size, int direction);
 
-extern void debug_dma_sync_single_range_for_cpu(struct device *dev,
-   dma_addr_t dma_handle,
-   unsigned long offset,
-   size_t size,
-   int direction);
-
-extern void debug_dma_sync_single_range_for_device(struct device *dev,
-  dma_addr_t dma_handle,
-  unsigned long offset,
-  size_t size, int direction);
-
 extern void debug_dma_sync_sg_for_cpu(struct device *dev,
  struct scatterlist *sg,
  int nelems, int direction);
@@ -174,22 +163,6 @@ static inline void debug_dma_sync_single_for_device(struct 
device *dev,
 {
 }
 
-static inline void debug_dma_sync_single_range_for_cpu(struct device *dev,
-  dma_addr_t dma_handle,
-  unsigned long offset,
-  size_t size,
-  int direction)
-{
-}
-
-static inline void debug_dma_sync_single_range_for_device(struct device *dev,
- dma_addr_t dma_handle,
- unsigned long offset,
- size_t size,
- int direction)
-{
-}
-
 static inline void debug_dma_sync_sg_for_cpu(struct device *dev,
 struct scatterlist *sg,
 int nelems, int direction)
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 634c0ab39074..1176e6adb035 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -360,6 +360,13 @@ static inline void dma_sync_single_for_cpu(struct device 
*dev, dma_addr_t addr,
debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
 
+static inline void dma_sync_single_range_for_cpu(struct device *dev,
+   dma_addr_t addr, unsigned long offset, size_t size,
+   enum dma_data_direction dir)
+{
+   return dma_sync_single_for_cpu(dev, addr + offset, size, dir);
+}
+
 static inline void dma_sync_single_for_device(struct device *dev,
  dma_addr_t addr, size_t size,
  enum dma_data_direction dir)
@@ -372,32 +379,11 @@ static inline void dma_sync_single_for_device(struct 
device *dev,
debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
 
-static inline void dma_sync_single_range_for_cpu(struct device *dev,
-dma_addr_t addr,
-unsigned long offset,
-size_t size,
-enum dma_data_direction dir)
-{
-   const struct dma_map_ops *ops = get_dma_ops(dev);
-
-   BUG_ON(!valid_dma_direction(dir));
-   if (ops->sync_single_for_cpu)
-   ops->sync_single_for_cpu(dev, addr + offset, size, dir);
-   debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir);
-}
-
 static inline void dma_sync_single_range_for_device(struct device *dev,
-   dma_addr_t addr,
-   unsigned long offset,
-   size_t size,
-   enum dma_data_direction dir)
+   dma_addr_t addr, unsigned long offset, size_t size,
+   enum dma_data_direction dir)
 {
-   const struct dma_map_ops *ops = get_dma_ops(dev);
-
-   BUG_ON(!valid_dma_direction(dir));
-   if (ops->sync_single_for_device)
-   ops->sync_single_for_device(dev, addr 

Re: [PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache

2018-12-04 Thread Robin Murphy

On 04/12/2018 11:01, Vivek Gautam wrote:

Qualcomm SoCs have an additional level of cache called the
System cache, aka. Last level cache (LLC). This cache sits right
before the DDR, and is tightly coupled with the memory controller.
The cache is available to all the clients present in the SoC system.
The clients request their slices from this system cache, make it
active, and can then start using it.
For these clients with smmu, to start using the system cache for
buffers and, related page tables [1], memory attributes need to be
set accordingly.
This change updates the MAIR and TCR configurations with correct
attributes to use this system cache.

To explain a little about memory attribute requirements here:

Non-coherent I/O devices can't look-up into inner caches. However,
coherent I/O devices can. But both can allocate in the system cache
based on system policy and configured memory attributes in page
tables.
CPUs can access both inner and outer caches (including system cache,
aka. Last level cache), and can allocate into system cache too
based on memory attributes, and system policy.

Further looking at memory types, we have the following -
a) Normal uncached :- MAIR 0x44, inner non-cacheable,
   outer non-cacheable;
b) Normal cached :-   MAIR 0xff, inner read write-back non-transient,
   outer read write-back non-transient;
   attribute setting for coherent I/O devices.

and, for non-coherent i/o devices that can allocate in system cache
another type gets added -
c) Normal sys-cached/non-inner-cached :-
   MAIR 0xf4, inner non-cacheable,
   outer read write-back non-transient

So, CPU will automatically use the system cache for memory marked as
normal cached. The normal sys-cached is downgraded to normal non-cached
memory for CPUs.
Coherent I/O devices can use system cache by marking the memory as
normal cached.
Non-coherent I/O devices, to use system cache, should mark the memory as
normal sys-cached in page tables.

This change is a realisation of the following changes
from downstream msm-4.9:
iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2]
iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3]

[1] https://patchwork.kernel.org/patch/10302791/
[2] 
https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9=bf762276796e79ca90014992f4d9da5593fa7d51
[3] 
https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9=d4c72c413ea27c43f60825193d4de9cb8ffd9602

Signed-off-by: Vivek Gautam 
---

Changes since v1:
  - Addressed Tomasz's comments for basing the change on
"NO_INNER_CACHE" concept for non-coherent I/O devices
rather than capturing "SYS_CACHE". This is to indicate
clearly the intent of non-coherent I/O devices that
can't access inner caches.


That seems backwards to me - there is already a fundamental assumption 
that non-coherent devices can't access caches. What we're adding here is 
a weird exception where they *can* use some level of cache despite still 
being non-coherent overall.


In other words, it's not a case of downgrading coherent devices' 
accesses to bypass inner caches, it's upgrading non-coherent devices' 
accesses to hit the outer cache. That's certainly the understanding I 
got from talking with Pratik at Plumbers, and it does appear to fit with 
your explanation above despite the final conclusion you draw being 
different.


I do see what Tomasz meant in terms of the TCR attributes, but what we 
currently do there is a little unintuitive and not at all representative 
of actual mapping attributes - I'll come back to that inline.



  drivers/iommu/arm-smmu.c   | 15 +++
  drivers/iommu/dma-iommu.c  |  3 +++
  drivers/iommu/io-pgtable-arm.c | 22 +-
  drivers/iommu/io-pgtable.h |  5 +
  include/linux/iommu.h  |  3 +++
  5 files changed, 43 insertions(+), 5 deletions(-)


As a minor nit, I'd prefer this as at least two patches to separate the 
io-pgtable changes and arm-smmu changes - basically I'd expect it to 
look much the same as the non-strict mode support did.



diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index ba18d89d4732..047f7ff95b0d 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -255,6 +255,7 @@ struct arm_smmu_domain {
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
+   boolno_inner_cache;


Can we keep all the domain flags together please? In fact, I'd be 
inclined to implement an options bitmap as we do elsewhere rather than 
proliferate multiple bools.
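
A minimal sketch of that suggestion (the flag names here are made up;
IO_PGTABLE_QUIRK_NO_IC is the quirk added by this patch):

/* sketch: a single options bitmap instead of per-feature bools */
#define ARM_SMMU_DOMAIN_OPT_NON_STRICT	BIT(0)
#define ARM_SMMU_DOMAIN_OPT_NO_IC	BIT(1)

/* in struct arm_smmu_domain, replacing the bool members: */
	unsigned long		options;

/* in arm_smmu_init_domain_context(): */
	if (smmu_domain->options & ARM_SMMU_DOMAIN_OPT_NO_IC)
		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NO_IC;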



  };
  
  struct arm_smmu_option_prop {

@@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if 

Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables

2018-12-04 Thread Vlastimil Babka
On 12/4/18 10:37 AM, Nicolas Boichat wrote:
> On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat  wrote:
>>
>> This is a follow-up to the discussion in [1], to make sure that the page
>> tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit
>> physical address space.
>>
>> [1] 
>> https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html
> 
> Hi everyone,
> 
> Let's try to summarize here.
> 
> First, we confirmed that this is a regression, and IOMMU errors happen
> on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13).
> The issue most likely starts from ad67f5a6545f ("arm64: replace
> ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number
> of Mediatek platforms (and maybe others?).
> 
> We have a few options here:
> 1. This series [2], that adds support for GFP_DMA32 slab caches,
> _without_ adding kmalloc caches (since there are no users of
> kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on
> the 3 patches, and AFAICT this solution works fine.
> 2. genalloc. That works, but unless we preallocate 4MB for L2 tables
> (which is wasteful as we usually only need a handful of L2 tables),
> we'll need changes in the core (use GFP_ATOMIC) to allow allocating on
> demand, and as it stands we'd have no way to shrink the allocation.
> 3. page_frag [3]. That works fine, and the code is quite simple. One
> drawback is that fragments in partially freed pages cannot be reused
> (from limited experiments, I see that IOMMU L2 tables are rarely
> freed, so it's unlikely a whole page would get freed). But given the
> low number of L2 tables, maybe we can live with that.
> 
> I think 2 is out. Any preference between 1 and 3? I think 1 makes
> better use of the memory, so that'd be my preference. But I'm probably
> missing something.

I would prefer 1 as well. IIRC you already confirmed that alignment
requirements are not broken for custom kmem caches even in presence of
SLUB debug options (and I would say it's a bug to be fixed if they
weren't). I just asked (and didn't get a reply I think) about your
ability to handle the GFP_ATOMIC allocation failures. They should be
rare when only single page allocations are needed for the kmem cache.
But in case they are not an option, then preallocating would be needed,
thus probably option 2.
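
For reference, the allocation failure in the v7s map path already just
propagates an error back to the caller, roughly along these lines
(simplified sketch from drivers/iommu/io-pgtable-arm-v7s.c):

	/* in __arm_v7s_map(): L2 table allocation can fail under GFP_ATOMIC */
	cptep = __arm_v7s_alloc_table(lvl + 1, GFP_ATOMIC, data);
	if (!cptep)
		return -ENOMEM;	/* iommu_map() callers must cope with this */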

> [2] https://patchwork.kernel.org/cover/10677529/, 3 patches
> [3] https://patchwork.codeaurora.org/patch/671639/
> 
> Thanks,
> 
> Nicolas
> 



Re: [RFC 4/4] dma-debug: Make leak-like behaviour apparent

2018-12-04 Thread Christoph Hellwig
On Mon, Dec 03, 2018 at 05:28:09PM +, Robin Murphy wrote:
> Now that we can dynamically allocate DMA debug entries to cope with
> drivers maintaining excessively large numbers of live mappings, a driver
> which *does* actually have a bug leaking mappings (and is not unloaded)
> will no longer trigger the "DMA-API: debugging out of memory - disabling"
> message until it gets to actual kernel OOM conditions, which means it
> could go unnoticed for a while. To that end, let's inform the user each
> time the pool has grown to a multiple of its initial size, which should
> make it apparent that they either have a leak or might want to increase
> the preallocation size.
> 
> Signed-off-by: Robin Murphy 
> ---
> 
> Tagging this one as RFC since people might think it's silly.

I think finding out the numbers is useful, but I'm a little worried
about claiming a possible leak.  Maybe we just need to print a log message
for each new power of 2 of entries reached?
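
Something along these lines, for instance (sketch only, reusing
nr_total_entries from kernel/dma/debug.c and is_power_of_2() from
<linux/log2.h>):

	/* after growing the pool: note each power-of-two milestone */
	if (is_power_of_2(nr_total_entries))
		pr_info("dma_debug_entry pool grown to %u entries\n",
			nr_total_entries);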


Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread Christoph Hellwig
> + for (retry_count = 0; ; retry_count++) {
> + spin_lock_irqsave(&free_entries_lock, flags);
> +
> + if (num_free_entries > 0)
> + break;
>  
>   spin_unlock_irqrestore(&free_entries_lock, flags);

Taking a spinlock just to read a single integer value doesn't really
help anything.

> +
> + if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES &&
> + !prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES))

Don't we need GFP_ATOMIC here?  Also why do we need the retries?
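
A rough sketch of that direction (assuming prealloc_memory() gains a gfp_t
argument and returns 0 on success; the refill race is ignored for brevity):

static struct dma_debug_entry *dma_entry_alloc(void)
{
	struct dma_debug_entry *entry;
	unsigned long flags;

	/* unlocked read: worst case we refill slightly early or late */
	if (num_free_entries == 0 &&
	    prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES, GFP_ATOMIC)) {
		global_disable = true;
		pr_err("debugging out of memory - disabling\n");
		return NULL;
	}

	spin_lock_irqsave(&free_entries_lock, flags);
	entry = list_first_entry(&free_entries, struct dma_debug_entry, list);
	list_del(&entry->list);
	num_free_entries -= 1;
	spin_unlock_irqrestore(&free_entries_lock, flags);

	return entry;
}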


Re: [PATCH 2/4] dma-debug: Refactor dma_debug_entry allocation

2018-12-04 Thread Christoph Hellwig
On Mon, Dec 03, 2018 at 05:28:07PM +, Robin Murphy wrote:
> Make prealloc_memory() a little more general and robust so that it
> serves for runtime reallocations too. The first thing we can do with
> that is clean up dma_debug_resize_entries() quite a bit.

Maybe also renamed it to dma_debug_alloc_entries or something like
that?

Otherwise this looks fine to me.


Re: [PATCH 1/4] dma-debug: Use pr_fmt()

2018-12-04 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig 


Re: [PATCH v3, RFC] iommu/io-pgtable-arm-v7s: Use page_frag to request DMA32 memory

2018-12-04 Thread Christoph Hellwig
On Tue, Dec 04, 2018 at 04:23:00PM +0800, Nicolas Boichat wrote:
> IOMMUs using ARMv7 short-descriptor format require page tables
> (level 1 and 2) to be allocated within the first 4GB of RAM, even
> on 64-bit systems.
> 
> For level 1/2 tables, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32
> is defined (e.g. on arm64 platforms).
> 
> For level 2 tables (1 KB), we use page_frag to allocate these pages,
> as we cannot directly use kmalloc (no slab cache for GFP_DMA32) or
> kmem_cache (mm/ code treats GFP_DMA32 as an invalid flag).
> 
> One downside is that we only free the allocated page if all the
> 4 fragments (4 IOMMU L2 tables) are freed, but given that we
> usually only allocate a limited number of IOMMU L2 tables, this
> should not have too much impact on memory usage: In the absolute
> worst case (4096 L2 page tables, each on their own 4K page),
> we would use 16 MB of memory for 4 MB of L2 tables.

I think this needs to be documented in the code.  That is, move
the explanation above into a comment in the code.
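
For example, something like this above the page_frag allocation in
__arm_v7s_alloc_table() (wording illustrative only):

	/*
	 * L2 tables (1 KB) come from a page_frag cache since there is no slab
	 * cache for GFP_DMA32.  A backing page is only freed once all 4
	 * fragments (4 L2 tables) on it are freed; as only a handful of L2
	 * tables are ever allocated, the worst case is 16 MB of pages for
	 * 4 MB of tables.
	 */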


Re: use generic DMA mapping code in powerpc V4

2018-12-04 Thread Christoph Hellwig
On Tue, Dec 04, 2018 at 10:53:39AM +0100, Christian Zigotzky wrote:
> I don't know why this kernel doesn't recognize the hard disks connected to 
> my physical P5020 board and why the onboard ethernet on my PASEMI board 
> doesn't work. (dma_direct_map_page: overflow)

Do you know if this actually works for the baseline before my patches?
E.g. with commit 721c01ba8b46ddb5355bd6e6b3bbfdabfdf01e97 ?


Re: [PATCH 5/9] dma-mapping: support highmem in the generic remap allocator

2018-12-04 Thread Christoph Hellwig
On Tue, Dec 04, 2018 at 09:38:02AM +0100, Marek Szyprowski wrote:
> Hi All,
> 
> On 2018-11-30 20:05, Robin Murphy wrote:
> > On 05/11/2018 12:19, Christoph Hellwig wrote:
> >> By using __dma_direct_alloc_pages we can deal entirely with struct page
> >> instead of having to derive a kernel virtual address.
> >
> > Simple enough :)
> >
> > Reviewed-by: Robin Murphy 
> 
> This patch has landed linux-next yesterday and I've noticed that it
> breaks operation of many drivers. The change looked simple, but a stupid
> bug managed to slip into the code. After a short investigation I've
> noticed that __dma_direct_alloc_pages() doesn't set dma_handle and zero
> allocated memory, while dma_direct_alloc_pages() did. The other
> difference is the lack of set_memory_decrypted() handling.
> 
> Following patch fixes the issue, but maybe it would be better to fix it
> in kernel/dma/direct.c:

Thanks for spotting this Marek.  Can you send the patch below with a
signoff and a changelog so that I can queue it up?


Re: [PATCH 4/9] dma-mapping: move the arm64 ncoherent alloc/free support to common code

2018-12-04 Thread Christoph Hellwig
> > +int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
> > +{
> > +   unsigned int pool_size_order = get_order(atomic_pool_size);
> > +   unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
> > +   struct page *page;
> > +   void *addr;
> > +   int ret;
> > +
> > +   if (dev_get_cma_area(NULL))
> > +   page = dma_alloc_from_contiguous(NULL, nr_pages,
> > +pool_size_order, false);
> > +   else
> > +   page = alloc_pages(gfp, pool_size_order);
> > +   if (!page)
> > +   goto out;
> > +
> > +   memset(page_address(page), 0, atomic_pool_size);
> 
> Note that this won't work if 'page' is a highmem page - should there
> be a check for that, or a check for the gfp flags?
>
> Also, is this memset() actually useful, or a waste of cycles - when we
> allocate from this pool (see dma_alloc_from_pool()), we always memset()
> the buffer.

Currently there is no user that supports highmem, but yes, the memset
should probably simply be removed.
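
For reference, a minimal sketch of what a highmem-safe zeroing would look
like if the memset were kept, using clear_highpage() from <linux/highmem.h>:

	int i;

	/* zero the pool allocation one page at a time via the kernel mapping */
	for (i = 0; i < nr_pages; i++)
		clear_highpage(page + i);	/* kmaps highmem pages as needed */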


Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread Christoph Hellwig
On Tue, Dec 04, 2018 at 01:11:37PM +, Robin Murphy wrote:
> In fact, having got this far in, what I'd quite like to do is to get rid of 
> dma_debug_resize_entries() such that we never need to free things at all, 
> since then we could allocate whole pages as blocks of entries to save on 
> masses of individual slab allocations.

Yes, we should definitely kill dma_debug_resize_entries.  Allocating
page batches might sound nice, but is that going to introduce additional
complexity?


RE: [PATCH v2] dma-debug: Kconfig for PREALLOC_DMA_DEBUG_ENTRIES

2018-12-04 Thread David Laight
From: Qian Cai
> Sent: 30 November 2018 21:48
> To: h...@lst.de; m.szyprow...@samsung.com; robin.mur...@arm.com
> Cc: yisen.zhu...@huawei.com; salil.me...@huawei.com; john.ga...@huawei.com; 
> linux...@huawei.com;
> iommu@lists.linux-foundation.org; net...@vger.kernel.org; 
> linux-ker...@vger.kernel.org; Qian Cai
> Subject: [PATCH v2] dma-debug: Kconfig for PREALLOC_DMA_DEBUG_ENTRIES
> 
> The number of DMA mappings from Hisilicon HNS ethernet devices is huge,
> so it could trigger "DMA-API: debugging out of memory - disabling".
> 
> hnae_get_handle [1]
>   hnae_init_queue
> hnae_init_ring
>   hnae_alloc_buffers [2]
> debug_dma_map_page
>   dma_entry_alloc
> 
> [1] for (i = 0; i < handle->q_num; i++)
> [2] for (i = 0; i < ring->desc_num; i++)
> 
> Also, "#define HNS_DSAF_MAX_DESC_CNT 1024"
> 
> On this Huawei TaiShan 2280 aarch64 server, it has reached the limit
> already,
> 
> 4 (NICs) x 16 (queues) x 1024 (descriptors per queue) = 65536
> 
> Added a Kconfig entry for PREALLOC_DMA_DEBUG_ENTRIES, to make it easier
> for users to deal with special cases like this.

Ugg. That is worse than a module parameter.

The driver needs to automatically reduce the number of mapping requests.

David




Re: [PATCH 3/4] dma-debug: Dynamically expand the dma_debug_entry pool

2018-12-04 Thread Robin Murphy

Hi John,

On 03/12/2018 18:23, John Garry wrote:

On 03/12/2018 17:28, Robin Murphy wrote:

Certain drivers such as large multi-queue network adapters can use pools
of mapped DMA buffers larger than the default dma_debug_entry pool of
65536 entries, with the result that merely probing such a device can
cause DMA debug to disable itself during boot unless explicitly given an
appropriate "dma_debug_entries=..." option.

Developers trying to debug some other driver on such a system may not be
immediately aware of this, and at worst it can hide bugs if they fail to
realise that dma-debug has already disabled itself unexpectedly by the
time the code of interest gets to run. Even once they do realise, it can
be a bit of a pain to empirically determine a suitable number of
preallocated entries to configure without massively over-allocating.

There's really no need for such a static limit, though, since we can
quite easily expand the pool at runtime in those rare cases that the
preallocated entries are insufficient, which is arguably the least
surprising and most useful behaviour.


Hi Robin,

Do you have an idea on shrinking the pool again when the culprit driver 
is removed, i.e. we have so many unused debug entries now available?


I honestly don't believe it's worth the complication. This is a 
development feature with significant overheads already, so there's not 
an awful lot to gain by trying to optimise memory usage. If a system can 
ever load a driver that makes hundreds of thousands of simultaneous 
mappings, it can almost certainly spare 20-odd megabytes of RAM for the 
corresponding debug entries in perpetuity. Sure, it does mean you'd need 
to reboot to recover memory from a major leak, but that's mostly true of 
the current behaviour too, and rebooting during driver development is 
hardly an unacceptable inconvenience.


In fact, having got this far in, what I'd quite like to do is to get rid 
of dma_debug_resize_entries() such that we never need to free things at 
all, since then we could allocate whole pages as blocks of entries to 
save on masses of individual slab allocations.


Robin.



Thanks,
John



Signed-off-by: Robin Murphy 
---
 kernel/dma/debug.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index de5db800dbfc..46cc075aec99 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -47,6 +47,9 @@
 #ifndef PREALLOC_DMA_DEBUG_ENTRIES
 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
 #endif
+/* If the pool runs out, try this many times to allocate this many 
new entries */

+#define DMA_DEBUG_DYNAMIC_ENTRIES 256
+#define DMA_DEBUG_DYNAMIC_RETRIES 2

 enum {
 dma_debug_single,
@@ -702,12 +705,21 @@ static struct dma_debug_entry 
*dma_entry_alloc(void)

 {
 struct dma_debug_entry *entry;
 unsigned long flags;
+    int retry_count;

-    spin_lock_irqsave(&free_entries_lock, flags);
+    for (retry_count = 0; ; retry_count++) {
+    spin_lock_irqsave(&free_entries_lock, flags);
+
+    if (num_free_entries > 0)
+    break;

-    if (list_empty(&free_entries)) {
-    global_disable = true;
 spin_unlock_irqrestore(&free_entries_lock, flags);
+
+    if (retry_count < DMA_DEBUG_DYNAMIC_RETRIES &&
+    !prealloc_memory(DMA_DEBUG_DYNAMIC_ENTRIES))
+    continue;
+
+    global_disable = true;
 pr_err("debugging out of memory - disabling\n");
 return NULL;
 }







[PATCH 1/1] iommu/arm-smmu: Add support to use Last level cache

2018-12-04 Thread Vivek Gautam
Qualcomm SoCs have an additional level of cache called the
System cache, aka. Last level cache (LLC). This cache sits right
before the DDR, and is tightly coupled with the memory controller.
The cache is available to all the clients present in the SoC system.
The clients request their slices from this system cache, make it
active, and can then start using it.
For these clients with smmu, to start using the system cache for
buffers and, related page tables [1], memory attributes need to be
set accordingly.
This change updates the MAIR and TCR configurations with correct
attributes to use this system cache.

To explain a little about memory attribute requirements here:

Non-coherent I/O devices can't look-up into inner caches. However,
coherent I/O devices can. But both can allocate in the system cache
based on system policy and configured memory attributes in page
tables.
CPUs can access both inner and outer caches (including system cache,
aka. Last level cache), and can allocate into system cache too
based on memory attributes, and system policy.

Further looking at memory types, we have the following -
a) Normal uncached :- MAIR 0x44, inner non-cacheable,
  outer non-cacheable;
b) Normal cached :-   MAIR 0xff, inner read write-back non-transient,
  outer read write-back non-transient;
  attribute setting for coherent I/O devices.

and, for non-coherent i/o devices that can allocate in system cache
another type gets added -
c) Normal sys-cached/non-inner-cached :-
  MAIR 0xf4, inner non-cacheable,
  outer read write-back non-transient

So, CPU will automatically use the system cache for memory marked as
normal cached. The normal sys-cached is downgraded to normal non-cached
memory for CPUs.
Coherent I/O devices can use system cache by marking the memory as
normal cached.
Non-coherent I/O devices, to use system cache, should mark the memory as
normal sys-cached in page tables.
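
For illustration only, the three attribute encodings above written out as
defines (the macro names are made up; the values are the MAIR attributes
quoted above):

#define MAIR_ATTR_NORMAL_UNCACHED	0x44	/* inner NC, outer NC */
#define MAIR_ATTR_NORMAL_CACHED		0xff	/* inner WB, outer WB */
#define MAIR_ATTR_NORMAL_SYS_CACHED	0xf4	/* inner NC, outer WB */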

This change is a realisation of the following changes
from downstream msm-4.9:
iommu: io-pgtable-arm: Support DOMAIN_ATTRIBUTE_USE_UPSTREAM_HINT[2]
iommu: io-pgtable-arm: Implement IOMMU_USE_UPSTREAM_HINT[3]

[1] https://patchwork.kernel.org/patch/10302791/
[2] 
https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9=bf762276796e79ca90014992f4d9da5593fa7d51
[3] 
https://source.codeaurora.org/quic/la/kernel/msm-4.9/commit/?h=msm-4.9=d4c72c413ea27c43f60825193d4de9cb8ffd9602

Signed-off-by: Vivek Gautam 
---

Changes since v1:
 - Addressed Tomasz's comments for basing the change on
   "NO_INNER_CACHE" concept for non-coherent I/O devices
   rather than capturing "SYS_CACHE". This is to indicate
   clearly the intent of non-coherent I/O devices that
   can't access inner caches.

 drivers/iommu/arm-smmu.c   | 15 +++
 drivers/iommu/dma-iommu.c  |  3 +++
 drivers/iommu/io-pgtable-arm.c | 22 +-
 drivers/iommu/io-pgtable.h |  5 +
 include/linux/iommu.h  |  3 +++
 5 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index ba18d89d4732..047f7ff95b0d 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -255,6 +255,7 @@ struct arm_smmu_domain {
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
+   boolno_inner_cache;
 };
 
 struct arm_smmu_option_prop {
@@ -897,6 +898,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->no_inner_cache)
+   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NO_IC;
+
smmu_domain->smmu = smmu;
pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
@@ -1579,6 +1583,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_NESTING:
*(int *)data = (smmu_domain->stage == 
ARM_SMMU_DOMAIN_NESTED);
return 0;
+   case DOMAIN_ATTR_NO_IC:
+   *((int *)data) = smmu_domain->no_inner_cache;
+   return 0;
default:
return -ENODEV;
}
@@ -1619,6 +1626,14 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_NO_IC:
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+   if (*((int *)data))
+  

Re: [PATCH 4/9] dma-mapping: move the arm64 ncoherent alloc/free support to common code

2018-12-04 Thread Russell King - ARM Linux
On Mon, Nov 05, 2018 at 01:19:26PM +0100, Christoph Hellwig wrote:
> The arm64 codebase to implement coherent dma allocation for architectures
> with non-coherent DMA is a good start for a generic implementation, given
> that it uses the generic remap helpers, provides the atomic pool for
> allocations that can't sleep and still is relatively simple and well
> tested.  Move it to kernel/dma and allow architectures to opt into it
> using a config symbol.  Architectures just need to provide a new
> arch_dma_prep_coherent helper to write back and invalidate the caches
> for any memory that gets remapped for uncached access.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/arm64/Kconfig  |   2 +-
>  arch/arm64/mm/dma-mapping.c | 184 ++--
>  include/linux/dma-mapping.h |   5 +
>  include/linux/dma-noncoherent.h |   2 +
>  kernel/dma/Kconfig  |   6 ++
>  kernel/dma/remap.c  | 158 ++-
>  6 files changed, 181 insertions(+), 176 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 5d065acb6d10..2e645ea693ea 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -82,7 +82,7 @@ config ARM64
>   select CRC32
>   select DCACHE_WORD_ACCESS
>   select DMA_DIRECT_OPS
> - select DMA_REMAP
> + select DMA_DIRECT_REMAP
>   select EDAC_SUPPORT
>   select FRAME_POINTER
>   select GENERIC_ALLOCATOR
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index a3ac26284845..e2e7e5d0f94e 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -33,113 +33,6 @@
>  
>  #include 
>  
> -static struct gen_pool *atomic_pool __ro_after_init;
> -
> -#define DEFAULT_DMA_COHERENT_POOL_SIZE  SZ_256K
> -static size_t atomic_pool_size __initdata = DEFAULT_DMA_COHERENT_POOL_SIZE;
> -
> -static int __init early_coherent_pool(char *p)
> -{
> - atomic_pool_size = memparse(p, &p);
> - return 0;
> -}
> -early_param("coherent_pool", early_coherent_pool);
> -
> -static void *__alloc_from_pool(size_t size, struct page **ret_page, gfp_t 
> flags)
> -{
> - unsigned long val;
> - void *ptr = NULL;
> -
> - if (!atomic_pool) {
> - WARN(1, "coherent pool not initialised!\n");
> - return NULL;
> - }
> -
> - val = gen_pool_alloc(atomic_pool, size);
> - if (val) {
> - phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
> -
> - *ret_page = phys_to_page(phys);
> - ptr = (void *)val;
> - memset(ptr, 0, size);
> - }
> -
> - return ptr;
> -}
> -
> -static bool __in_atomic_pool(void *start, size_t size)
> -{
> - return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
> -}
> -
> -static int __free_from_pool(void *start, size_t size)
> -{
> - if (!__in_atomic_pool(start, size))
> - return 0;
> -
> - gen_pool_free(atomic_pool, (unsigned long)start, size);
> -
> - return 1;
> -}
> -
> -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
> - gfp_t flags, unsigned long attrs)
> -{
> - struct page *page;
> - void *ptr, *coherent_ptr;
> - pgprot_t prot = pgprot_writecombine(PAGE_KERNEL);
> -
> - size = PAGE_ALIGN(size);
> -
> - if (!gfpflags_allow_blocking(flags)) {
> - struct page *page = NULL;
> - void *addr = __alloc_from_pool(size, &page, flags);
> -
> - if (addr)
> - *dma_handle = phys_to_dma(dev, page_to_phys(page));
> -
> - return addr;
> - }
> -
> - ptr = dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs);
> - if (!ptr)
> - goto no_mem;
> -
> - /* remove any dirty cache lines on the kernel alias */
> - __dma_flush_area(ptr, size);
> -
> - /* create a coherent mapping */
> - page = virt_to_page(ptr);
> - coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
> -prot, 
> __builtin_return_address(0));
> - if (!coherent_ptr)
> - goto no_map;
> -
> - return coherent_ptr;
> -
> -no_map:
> - dma_direct_free_pages(dev, size, ptr, *dma_handle, attrs);
> -no_mem:
> - return NULL;
> -}
> -
> -void arch_dma_free(struct device *dev, size_t size, void *vaddr,
> - dma_addr_t dma_handle, unsigned long attrs)
> -{
> - if (!__free_from_pool(vaddr, PAGE_ALIGN(size))) {
> - void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle));
> -
> - vunmap(vaddr);
> - dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs);
> - }
> -}
> -
> -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr,
> - dma_addr_t dma_addr)
> -{
> - return __phys_to_pfn(dma_to_phys(dev, dma_addr));
> -}
> -
>  pgprot_t arch_dma_mmap_pgprot(struct device *dev, pgprot_t prot,
>   unsigned long attrs)
>  

Re: use generic DMA mapping code in powerpc V4

2018-12-04 Thread Christian Zigotzky

On 04 December 2018 at 08:31AM, Christian Zigotzky wrote:

Hi All,

Could you please test Christoph's kernel on your PASEMI and NXP 
boards? Download:


'git clone git://git.infradead.org/users/hch/misc.git -b powerpc-dma.5 a'

Thanks,
Christian


I successfully tested this kernel on a virtual e5500 QEMU machine today.

Command: ./qemu-system-ppc64 -M ppce500 -cpu e5500 -m 2048 -kernel 
uImage-dma -drive 
format=raw,file=MATE_PowerPC_Remix_2017_0.9.img,index=0,if=virtio -nic 
user,model=e1000 -append "rw root=/dev/vda" -device virtio-vga -device 
virtio-mouse-pci -device virtio-keyboard-pci -usb -soundhw es1370 -smp 4


QEMU version 3.1.0.

I don't know why this kernel doesn't recognize the hard disks connected 
to my physical P5020 board and why the onboard ethernet on my PASEMI 
board doesn't work. (dma_direct_map_page: overflow)


-- Christian



Re: [PATCH v2 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables

2018-12-04 Thread Nicolas Boichat
On Sun, Nov 11, 2018 at 5:04 PM Nicolas Boichat  wrote:
>
> This is a follow-up to the discussion in [1], to make sure that the page
> tables allocated by iommu/io-pgtable-arm-v7s are contained within 32-bit
> physical address space.
>
> [1] 
> https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html

Hi everyone,

Let's try to summarize here.

First, we confirmed that this is a regression, and IOMMU errors happen
on 4.19 and linux-next/master on MT8173 (elm, Acer Chromebook R13).
The issue most likely starts from ad67f5a6545f ("arm64: replace
ZONE_DMA with ZONE_DMA32"), i.e. 4.15, and presumably breaks a number
of Mediatek platforms (and maybe others?).

We have a few options here:
1. This series [2], that adds support for GFP_DMA32 slab caches,
_without_ adding kmalloc caches (since there are no users of
kmalloc(..., GFP_DMA32)). I think I've addressed all the comments on
the 3 patches, and AFAICT this solution works fine.
2. genalloc. That works, but unless we preallocate 4MB for L2 tables
(which is wasteful as we usually only need a handful of L2 tables),
we'll need changes in the core (use GFP_ATOMIC) to allow allocating on
demand, and as it stands we'd have no way to shrink the allocation.
3. page_frag [3]. That works fine, and the code is quite simple. One
drawback is that fragments in partially freed pages cannot be reused
(from limited experiments, I see that IOMMU L2 tables are rarely
freed, so it's unlikely a whole page would get freed). But given the
low number of L2 tables, maybe we can live with that.

I think 2 is out. Any preference between 1 and 3? I think 1 makes
better use of the memory, so that'd be my preference. But I'm probably
missing something.

[2] https://patchwork.kernel.org/cover/10677529/, 3 patches
[3] https://patchwork.codeaurora.org/patch/671639/

Thanks,

Nicolas


Re: [PATCH v3 1/4] PCI / ACPI: Identify untrusted PCI devices

2018-12-04 Thread Mika Westerberg
On Mon, Dec 03, 2018 at 06:28:00PM -0600, Bjorn Helgaas wrote:
> On Thu, Nov 29, 2018 at 06:51:50PM +0300, Mika Westerberg wrote:
> > A malicious PCI device may use DMA to attack the system. An external
> > Thunderbolt port is a convenient point to attach such a device. The OS
> > may use IOMMU to defend against DMA attacks.
> > 
> > Recent BIOSes with Thunderbolt ports mark these externally facing root
> > ports with this ACPI _DSD [1]:
> 
> I'm not 100% comfortable with the "Recent BIOSes" wording because that
> suggests that we can rely on the fact that *all* BIOSes newer than
> some date X mark these ports.
> 
> Since this _DSD usage is Microsoft-specific and not required by either
> PCIe or ACPI specs, we can't rely on it.  A BIOS that doesn't
> implement it may not be Windows-certified, but it's perfectly
> spec-compliant otherwise and we have to keep in mind the possibility
> that ports without this _DSD may still be externally visible and may
> still be attack vectors.

OK.

I will change it to "Some BIOSes .." following what you suggested
earlier. That should make it clear not all BIOSes are required to
implement this.

> >   Name (_DSD, Package () {
> >   ToUUID ("efcc06cc-73ac-4bc3-bff0-76143807c389"),
> >   Package () {
> >   Package () {"ExternalFacingPort", 1},
> >   Package () {"UID", 0 }
> >   }
> >   })
> > 
> > If we find such a root port, mark it and all its children as untrusted.
> > The rest of the OS may use this information to enable DMA protection
> > against malicious devices. For instance the device may be put behind an
> > IOMMU to keep it from accessing memory outside of what the driver has
> > allocated for it.
> > 
> > While at it, add a comment on top of prp_guids array explaining the
> > possible caveat resulting when these GUIDs are treated equivalent.
> > 
> > [1] 
> > https://docs.microsoft.com/en-us/windows-hardware/drivers/pci/dsd-for-pcie-root-ports#identifying-externally-exposed-pcie-root-ports
> > 
> > Signed-off-by: Mika Westerberg 
> 
> Acked-by: Bjorn Helgaas 

Thanks!
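
(For reference, a minimal sketch of how the "ExternalFacingPort" hint quoted
above could be read through the generic device property API; the actual
patch may of course structure this differently:)

static bool pci_acpi_external_facing_port(struct pci_dev *bridge)
{
	u8 val = 0;

	if (pci_pcie_type(bridge) != PCI_EXP_TYPE_ROOT_PORT)
		return false;

	/* "ExternalFacingPort" comes from the _DSD shown above */
	device_property_read_u8(&bridge->dev, "ExternalFacingPort", &val);
	return val == 1;
}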


Re: [PATCH 5/9] dma-mapping: support highmem in the generic remap allocator

2018-12-04 Thread Marek Szyprowski
Hi All,

On 2018-11-30 20:05, Robin Murphy wrote:
> On 05/11/2018 12:19, Christoph Hellwig wrote:
>> By using __dma_direct_alloc_pages we can deal entirely with struct page
>> instead of having to derive a kernel virtual address.
>
> Simple enough :)
>
> Reviewed-by: Robin Murphy 

This patch has landed linux-next yesterday and I've noticed that it
breaks operation of many drivers. The change looked simple, but a stupid
bug managed to slip into the code. After a short investigation I've
noticed that __dma_direct_alloc_pages() doesn't set dma_handle and zero
allocated memory, while dma_direct_alloc_pages() did. The other
difference is the lack of set_memory_decrypted() handling.

Following patch fixes the issue, but maybe it would be better to fix it
in kernel/dma/direct.c:

diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index dcc82dd668f8..7765ddc56e4e 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -219,8 +219,14 @@ void *arch_dma_alloc(struct device *dev, size_t
size, dma_addr_t *dma_handle,
    ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
    arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
    __builtin_return_address(0));
-   if (!ret)
+   if (!ret) {
    __dma_direct_free_pages(dev, size, page);
+   return ret;
+   }
+
+   *dma_handle = phys_to_dma(dev, page_to_phys(page));
+   memset(ret, 0, size);
+
    return ret;
 }

>
>> Signed-off-by: Christoph Hellwig 
>> ---
>>   kernel/dma/remap.c | 14 +++---
>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
>> index bc42766f52df..8f1fca34b894 100644
>> --- a/kernel/dma/remap.c
>> +++ b/kernel/dma/remap.c
>> @@ -196,7 +196,7 @@ void *arch_dma_alloc(struct device *dev, size_t
>> size, dma_addr_t *dma_handle,
>>   gfp_t flags, unsigned long attrs)
>>   {
>>   struct page *page = NULL;
>> -    void *ret, *kaddr;
>> +    void *ret;
>>     size = PAGE_ALIGN(size);
>>   @@ -208,10 +208,9 @@ void *arch_dma_alloc(struct device *dev,
>> size_t size, dma_addr_t *dma_handle,
>>   return ret;
>>   }
>>   -    kaddr = dma_direct_alloc_pages(dev, size, dma_handle, flags,
>> attrs);
>> -    if (!kaddr)
>> +    page = __dma_direct_alloc_pages(dev, size, dma_handle, flags,
>> attrs);
>> +    if (!page)
>>   return NULL;
>> -    page = virt_to_page(kaddr);
>>     /* remove any dirty cache lines on the kernel alias */
>>   arch_dma_prep_coherent(page, size);
>> @@ -221,7 +220,7 @@ void *arch_dma_alloc(struct device *dev, size_t
>> size, dma_addr_t *dma_handle,
>>   arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
>>   __builtin_return_address(0));
>>   if (!ret)
>> -    dma_direct_free_pages(dev, size, kaddr, *dma_handle, attrs);
>> +    __dma_direct_free_pages(dev, size, page);
>>   return ret;
>>   }
>>   @@ -229,10 +228,11 @@ void arch_dma_free(struct device *dev, size_t
>> size, void *vaddr,
>>   dma_addr_t dma_handle, unsigned long attrs)
>>   {
>>   if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
>> -    void *kaddr = phys_to_virt(dma_to_phys(dev, dma_handle));
>> +    phys_addr_t phys = dma_to_phys(dev, dma_handle);
>> +    struct page *page = pfn_to_page(__phys_to_pfn(phys));
>>     vunmap(vaddr);
>> -    dma_direct_free_pages(dev, size, kaddr, dma_handle, attrs);
>> +    __dma_direct_free_pages(dev, size, page);
>>   }
>>   }
>>  
>
Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


[PATCH v3, RFC] iommu/io-pgtable-arm-v7s: Use page_frag to request DMA32 memory

2018-12-04 Thread Nicolas Boichat
IOMMUs using ARMv7 short-descriptor format require page tables
(level 1 and 2) to be allocated within the first 4GB of RAM, even
on 64-bit systems.

For level 1/2 tables, ensure GFP_DMA32 is used if CONFIG_ZONE_DMA32
is defined (e.g. on arm64 platforms).

For level 2 tables (1 KB), we use page_frag to allocate these pages,
as we cannot directly use kmalloc (no slab cache for GFP_DMA32) or
kmem_cache (mm/ code treats GFP_DMA32 as an invalid flag).

One downside is that we only free the allocated page if all the
4 fragments (4 IOMMU L2 tables) are freed, but given that we
usually only allocate a limited number of IOMMU L2 tables, this
should not have too much impact on memory usage: In the absolute
worst case (4096 L2 page tables, each on their own 4K page),
we would use 16 MB of memory for 4 MB of L2 tables.

Also, print an error when the physical address does not fit in
32-bit, to make debugging easier in the future.

Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
Signed-off-by: Nicolas Boichat 
---

As an alternative to the series [1], which adds support for GFP_DMA32
to kmem_cache in mm/. IMHO the solution in [1] is cleaner and more
efficient, as it allows freed fragments (L2 tables) to be reused, but
this approach does not require any core change.

[1] https://patchwork.kernel.org/cover/10677529/, 3 patches

 drivers/iommu/io-pgtable-arm-v7s.c | 32 --
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c 
b/drivers/iommu/io-pgtable-arm-v7s.c
index 445c3bde04800c..0de6a51eb6755f 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -161,6 +161,12 @@
 
 #define ARM_V7S_TCR_PD1BIT(5)
 
+#ifdef CONFIG_ZONE_DMA32
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA32
+#else
+#define ARM_V7S_TABLE_GFP_DMA GFP_DMA
+#endif
+
 typedef u32 arm_v7s_iopte;
 
 static bool selftest_running;
@@ -169,7 +175,7 @@ struct arm_v7s_io_pgtable {
struct io_pgtable   iop;
 
arm_v7s_iopte   *pgd;
-   struct kmem_cache   *l2_tables;
+   struct page_frag_cache  l2_tables;
spinlock_t  split_lock;
 };
 
@@ -198,13 +204,17 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
void *table = NULL;
 
if (lvl == 1)
-   table = (void *)__get_dma_pages(__GFP_ZERO, get_order(size));
+   table = (void *)__get_free_pages(
+   __GFP_ZERO | ARM_V7S_TABLE_GFP_DMA, get_order(size));
else if (lvl == 2)
-   table = kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA);
+   table = page_frag_alloc(&data->l2_tables, size,
+   gfp | __GFP_ZERO | ARM_V7S_TABLE_GFP_DMA);
phys = virt_to_phys(table);
-   if (phys != (arm_v7s_iopte)phys)
+   if (phys != (arm_v7s_iopte)phys) {
/* Doesn't fit in PTE */
+   dev_err(dev, "Page table does not fit in PTE: %pa", &phys);
goto out_free;
+   }
if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) {
dma = dma_map_single(dev, table, size, DMA_TO_DEVICE);
if (dma_mapping_error(dev, dma))
@@ -227,7 +237,7 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
if (lvl == 1)
free_pages((unsigned long)table, get_order(size));
else
-   kmem_cache_free(data->l2_tables, table);
+   page_frag_free(table);
return NULL;
 }
 
@@ -244,7 +254,7 @@ static void __arm_v7s_free_table(void *table, int lvl,
if (lvl == 1)
free_pages((unsigned long)table, get_order(size));
else
-   kmem_cache_free(data->l2_tables, table);
+   page_frag_free(table);
 }
 
 static void __arm_v7s_pte_sync(arm_v7s_iopte *ptep, int num_entries,
@@ -515,7 +525,6 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop)
__arm_v7s_free_table(iopte_deref(pte, 1), 2, data);
}
__arm_v7s_free_table(data->pgd, 1, data);
-   kmem_cache_destroy(data->l2_tables);
kfree(data);
 }
 
@@ -729,17 +738,11 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct 
io_pgtable_cfg *cfg,
!(cfg->quirks & IO_PGTABLE_QUIRK_NO_PERMS))
return NULL;
 
-   data = kmalloc(sizeof(*data), GFP_KERNEL);
+   data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return NULL;
 
spin_lock_init(&data->split_lock);
-   data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2",
-   ARM_V7S_TABLE_SIZE(2),
-   ARM_V7S_TABLE_SIZE(2),
-   SLAB_CACHE_DMA, NULL);
-   if (!data->l2_tables)
-   goto out_free_data;
 
data->iop.ops = (struct io_pgtable_ops) {
.map= arm_v7s_map,
@@ -789,7