[PATCH v3 1/2] dma-contiguous: Abstract dma_{alloc, free}_contiguous()

2019-05-23 Thread Nicolin Chen
Both dma_alloc_from_contiguous() and dma_release_from_contiguous()
are implemented very simply, yet they require callers to pass certain
parameters, such as count and align, and to derive a boolean parameter
from __GFP_NOWARN in the allocation flags. So every call site
duplicates similar work:
  /* A piece of example */
  unsigned long order = get_order(size);
  size_t count = size >> PAGE_SHIFT;
  page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN);
  [...]
  dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);

Additionally, as CMA can be used only in a context that permits
sleeping, most callers do a gfpflags_allow_blocking() check and fall
back to an allocation of normal pages when the check returns false:
  /* A piece of example */
  if (gfpflags_allow_blocking(flag))
  page = dma_alloc_from_contiguous();
  if (!page)
  page = alloc_pages();
  [...]
  if (!dma_release_from_contiguous(dev, page, count))
  __free_pages(page, get_order(size));

So this patch simplifies those call sites by abstracting these
operations into two new functions: dma_{alloc,free}_contiguous().
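
For illustration only (not part of the patch), the same call sites
would then reduce to something like:
  /* A piece of example, after conversion */
  page = dma_alloc_contiguous(dev, size, gfp);
  [...]
  dma_free_contiguous(dev, page, size);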

As some callers of dma_{alloc,release}_from_contiguous() might be
complicated, this patch only applies the two new functions to
kernel/dma/direct.c as an initial step.

Suggested-by: Christoph Hellwig 
Signed-off-by: Nicolin Chen 
---
Changelog
v2->v3:
 * Added missing "static inline" in header file to fix build error.
v1->v2:
 * Added new functions beside the old ones so we can replace callers
   one by one later.
 * Applied new functions to dma/direct.c only, because it's the best
   example caller to apply and should be safe with the new functions.

 include/linux/dma-contiguous.h | 11 
 kernel/dma/contiguous.c| 48 ++
 kernel/dma/direct.c| 24 +++--
 3 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
index f247e8aa5e3d..00a370c1c140 100644
--- a/include/linux/dma-contiguous.h
+++ b/include/linux/dma-contiguous.h
@@ -115,6 +115,8 @@ struct page *dma_alloc_from_contiguous(struct device *dev, 
size_t count,
   unsigned int order, bool no_warn);
 bool dma_release_from_contiguous(struct device *dev, struct page *pages,
 int count);
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp);
+void dma_free_contiguous(struct device *dev, struct page *page, size_t size);
 
 #else
 
@@ -157,6 +159,15 @@ bool dma_release_from_contiguous(struct device *dev, 
struct page *pages,
return false;
 }
 
+static inline
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
+{
+   return NULL;
+}
+
+static inline
+void dma_free_contiguous(struct device *dev, struct page *page, size_t size) { 
}
+
 #endif
 
 #endif
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index b2a87905846d..21f39a6cb04f 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -214,6 +214,54 @@ bool dma_release_from_contiguous(struct device *dev, 
struct page *pages,
return cma_release(dev_get_cma_area(dev), pages, count);
 }
 
+/**
+ * dma_alloc_contiguous() - allocate contiguous pages
+ * @dev:   Pointer to device for which the allocation is performed.
+ * @size:  Requested allocation size.
+ * @gfp:   Allocation flags.
+ *
+ * This function allocates a contiguous memory buffer for the specified device.
+ * It first tries to use the device-specific contiguous memory area if available,
+ * or the default global one, then tries a fallback allocation of normal pages.
+ */
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
+{
+   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+   size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+   size_t align = get_order(PAGE_ALIGN(size));
+   struct cma *cma = dev_get_cma_area(dev);
+   struct page *page = NULL;
+
+   /* CMA can be used only in the context which permits sleeping */
+   if (cma && gfpflags_allow_blocking(gfp)) {
+   align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
+   page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN);
+   }
+
+   /* Fallback allocation of normal pages */
+   if (!page)
+   page = alloc_pages_node(node, gfp, align);
+
+   return page;
+}
+
+/**
+ * dma_free_contiguous() - release allocated pages
+ * @dev:   Pointer to device for which the pages were allocated.
+ * @page:  Pointer to the allocated pages.
+ * @size:  Size of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_contiguous(). As
+ * cma_release() returns false when the provided pages do not belong to a
+ * contiguous area, and true otherwise, this function does a fallback
+ * __free_pages() upon a false return.
+ */
+void dma_free_contiguous(struct device 
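
(The hunk is cut short here in the archive. A minimal sketch of what the
body would look like, assuming it simply follows the kernel-doc above --
release through cma_release() and fall back to __free_pages() when the
pages did not come from a contiguous area:)

  void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
  {
          if (!cma_release(dev_get_cma_area(dev), page,
                           PAGE_ALIGN(size) >> PAGE_SHIFT))
                  __free_pages(page, get_order(size));
  }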

[PATCH v3 0/2] Optimize dma_*_from_contiguous calls

2019-05-23 Thread Nicolin Chen
[ Per the discussion at v1, we decided to add two new functions and start
  replacing callers one by one. For this series, it only touches the
  dma-direct part. And instead of merging the two PATCHes, I still keep
  them separate so that we may easily revert PATCH-2 if anything bad
  happens, as it did last time -- PATCH-1 is supposed to be a safe cleanup. ]

This series of patches tries to optimize dma_*_from_contiguous calls:
PATCH-1 abstracts two new functions and applies them to kernel/dma/direct.c.
PATCH-2 avoids single-page allocations from the CMA area to reduce
fragmentation.

Please check their commit messages for the detailed changelogs.

Nicolin Chen (2):
  dma-contiguous: Abstract dma_{alloc,free}_contiguous()
  dma-contiguous: Use fallback alloc_pages for single pages

 include/linux/dma-contiguous.h | 11 +++
 kernel/dma/contiguous.c| 57 ++
 kernel/dma/direct.c| 24 +++---
 3 files changed, 72 insertions(+), 20 deletions(-)

-- 
2.17.1



[PATCH v3 2/2] dma-contiguous: Use fallback alloc_pages for single pages

2019-05-23 Thread Nicolin Chen
The addresses within a single page are always contiguous, so it is
not really necessary to allocate single pages from the CMA area.
Since the CMA area has a limited, predefined size, it may run out of
space in heavy use cases, where quite a lot of CMA pages might be
allocated just for single pages.

However, there is also a concern that a device might care where a
page comes from -- it might expect the page to come from its CMA area
and act differently if it does not.

So this patch uses the fallback alloc_pages path instead of one-page
allocations from the global CMA area, in case a device does not have
its own CMA area. This saves resources in the global CMA area for
larger allocations and also reduces the fragmentation caused by
trivial allocations.
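
For illustration (hypothetical sizes, assuming a 4 KiB PAGE_SIZE), the
resulting selection works out as follows:
  dev->cma_area set,    size =  4 KiB  ->  device CMA area
  no device CMA area,   size =  4 KiB  ->  alloc_pages() fallback (global CMA bypassed)
  no device CMA area,   size = 64 KiB  ->  global CMA area (if blocking is allowed)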

Signed-off-by: Nicolin Chen 
---
 kernel/dma/contiguous.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 21f39a6cb04f..6914b92d5c88 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -223,14 +223,23 @@ bool dma_release_from_contiguous(struct device *dev, 
struct page *pages,
  * This function allocates a contiguous memory buffer for the specified device.
  * It first tries to use the device-specific contiguous memory area if available,
  * or the default global one, then tries a fallback allocation of normal pages.
+ *
+ * Note that it bypasses one-page allocations from the global area, as the
+ * addresses within one page are always contiguous; there is no need to
+ * waste CMA pages on those, and skipping them also helps reduce fragmentation.
  */
 struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
 {
int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
size_t align = get_order(PAGE_ALIGN(size));
-   struct cma *cma = dev_get_cma_area(dev);
struct page *page = NULL;
+   struct cma *cma = NULL;
+
+   if (dev && dev->cma_area)
+   cma = dev->cma_area;
+   else if (count > 1)
+   cma = dma_contiguous_default_area;
 
/* CMA can be used only in the context which permits sleeping */
if (cma && gfpflags_allow_blocking(gfp)) {
-- 
2.17.1



Re: [PATCH v2 1/2] dma-contiguous: Abstract dma_{alloc, free}_contiguous()

2019-05-23 Thread Nicolin Chen
On Thu, May 23, 2019 at 08:59:30PM -0600, dann frazier wrote:
> > > diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
> > > index b2a87905846d..21f39a6cb04f 100644
> > > --- a/kernel/dma/contiguous.c
> > > +++ b/kernel/dma/contiguous.c
> > > @@ -214,6 +214,54 @@ bool dma_release_from_contiguous(struct device *dev, 
> > > struct page *pages,
> > > return cma_release(dev_get_cma_area(dev), pages, count);
> > >  }
> >
> > This breaks the build for me if CONFIG_DMA_CMA=n:
> >
> >   LD [M]  fs/9p/9p.o
> > ld: fs/9p/vfs_inode.o: in function `dma_alloc_contiguous':
> > vfs_inode.c:(.text+0xa60): multiple definition of
> > `dma_alloc_contiguous'; fs/9p/vfs_super.o:vfs_super.c:(.text+0x500):
> > first defined here
> >
> > Do the following insertions need to be under an #ifdef CONFIG_DMA_CMA ?
> 
> Ah, no - the problem is actually a missing "static inline" in the
> !CONFIG_DMA_CMA version of dma_alloc_contiguous().

Yea, I saw it. Thanks for the testing and pointing it out.

Sending v3.


Re: [PATCH v2 1/2] dma-contiguous: Abstract dma_{alloc, free}_contiguous()

2019-05-23 Thread dann frazier
On Thu, May 23, 2019 at 7:52 PM dann frazier  wrote:
>
> On Mon, May 6, 2019 at 4:35 PM Nicolin Chen  wrote:
> >
> > Both dma_alloc_from_contiguous() and dma_release_from_contiguous()
> > are very simply implemented, but requiring callers to pass certain
> > parameters like count and align, and taking a boolean parameter to
> > check __GFP_NOWARN in the allocation flags. So every function call
> > duplicates similar work:
> >   /* A piece of example */
> >   unsigned long order = get_order(size);
> >   size_t count = size >> PAGE_SHIFT;
> >   page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN);
> >   [...]
> >   dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
> >
> > Additionally, as CMA can be used only in the context which permits
> > sleeping, most of callers do a gfpflags_allow_blocking() check and
> > a corresponding fallback allocation of normal pages upon any false
> > result:
> >   /* A piece of example */
> >   if (gfpflags_allow_blocking(flag))
> >   page = dma_alloc_from_contiguous();
> >   if (!page)
> >   page = alloc_pages();
> >   [...]
> >   if (!dma_release_from_contiguous(dev, page, count))
> >   __free_pages(page, get_order(size));
> >
> > So this patch simplifies those function calls by abstracting these
> > operations into the two new functions: dma_{alloc,free}_contiguous.
> >
> > As some callers of dma_{alloc,release}_from_contiguous() might be
> > complicated, this patch just implements these two new functions to
> > kernel/dma/direct.c only as an initial step.
> >
> > Suggested-by: Christoph Hellwig 
> > Signed-off-by: Nicolin Chen 
> > ---
> > Changelog
> > v1->v2:
> >  * Added new functions beside the old ones so we can replace callers
> >one by one later.
> >  * Applied new functions to dma/direct.c only, because it's the best
> >example caller to apply and should be safe with the new functions.
> >
> >  include/linux/dma-contiguous.h | 10 +++
> >  kernel/dma/contiguous.c| 48 ++
> >  kernel/dma/direct.c| 24 +++--
> >  3 files changed, 62 insertions(+), 20 deletions(-)
> >
> > diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
> > index f247e8aa5e3d..dacbdcb91a89 100644
> > --- a/include/linux/dma-contiguous.h
> > +++ b/include/linux/dma-contiguous.h
> > @@ -115,6 +115,8 @@ struct page *dma_alloc_from_contiguous(struct device 
> > *dev, size_t count,
> >unsigned int order, bool no_warn);
> >  bool dma_release_from_contiguous(struct device *dev, struct page *pages,
> >  int count);
> > +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t 
> > gfp);
> > +void dma_free_contiguous(struct device *dev, struct page *page, size_t 
> > size);
> >
> >  #else
> >
> > @@ -157,6 +159,14 @@ bool dma_release_from_contiguous(struct device *dev, 
> > struct page *pages,
> > return false;
> >  }
> >
> > +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t 
> > gfp)
> > +{
> > +   return NULL;
> > +}
> > +
> > +static inline
> > +void dma_free_contiguous(struct device *dev, struct page *page, size_t 
> > size) { }
> > +
> >  #endif
> >
> >  #endif
> > diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
> > index b2a87905846d..21f39a6cb04f 100644
> > --- a/kernel/dma/contiguous.c
> > +++ b/kernel/dma/contiguous.c
> > @@ -214,6 +214,54 @@ bool dma_release_from_contiguous(struct device *dev, 
> > struct page *pages,
> > return cma_release(dev_get_cma_area(dev), pages, count);
> >  }
>
> This breaks the build for me if CONFIG_DMA_CMA=n:
>
>   LD [M]  fs/9p/9p.o
> ld: fs/9p/vfs_inode.o: in function `dma_alloc_contiguous':
> vfs_inode.c:(.text+0xa60): multiple definition of
> `dma_alloc_contiguous'; fs/9p/vfs_super.o:vfs_super.c:(.text+0x500):
> first defined here
>
> Do the following insertions need to be under an #ifdef CONFIG_DMA_CMA ?

Ah, no - the problem is actually a missing "static inline" in the
!CONFIG_DMA_CMA version of dma_alloc_contiguous().

  -dann

> > +/**
> > + * dma_alloc_contiguous() - allocate contiguous pages
> > + * @dev:   Pointer to device for which the allocation is performed.
> > + * @size:  Requested allocation size.
> > + * @gfp:   Allocation flags.
> > + *
> > + * This function allocates contiguous memory buffer for specified device. 
> > It
> > + * first tries to use device specific contiguous memory area if available 
> > or
> > + * the default global one, then tries a fallback allocation of normal 
> > pages.
> > + */
> > +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t 
> > gfp)
> > +{
> > +   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
> > +   size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
> > +   size_t align = get_order(PAGE_ALIGN(size));
> > +   struct cma *cma = dev_get_cma_area(dev);
> > +   struct page *page = NULL;
> > +
> > + 

Re: [PATCH v2 1/2] dma-contiguous: Abstract dma_{alloc, free}_contiguous()

2019-05-23 Thread dann frazier
On Mon, May 6, 2019 at 4:35 PM Nicolin Chen  wrote:
>
> Both dma_alloc_from_contiguous() and dma_release_from_contiguous()
> are very simply implemented, but requiring callers to pass certain
> parameters like count and align, and taking a boolean parameter to
> check __GFP_NOWARN in the allocation flags. So every function call
> duplicates similar work:
>   /* A piece of example */
>   unsigned long order = get_order(size);
>   size_t count = size >> PAGE_SHIFT;
>   page = dma_alloc_from_contiguous(dev, count, order, gfp & __GFP_NOWARN);
>   [...]
>   dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
>
> Additionally, as CMA can be used only in the context which permits
> sleeping, most of callers do a gfpflags_allow_blocking() check and
> a corresponding fallback allocation of normal pages upon any false
> result:
>   /* A piece of example */
>   if (gfpflags_allow_blocking(flag))
>   page = dma_alloc_from_contiguous();
>   if (!page)
>   page = alloc_pages();
>   [...]
>   if (!dma_release_from_contiguous(dev, page, count))
>   __free_pages(page, get_order(size));
>
> So this patch simplifies those function calls by abstracting these
> operations into the two new functions: dma_{alloc,free}_contiguous.
>
> As some callers of dma_{alloc,release}_from_contiguous() might be
> complicated, this patch just implements these two new functions to
> kernel/dma/direct.c only as an initial step.
>
> Suggested-by: Christoph Hellwig 
> Signed-off-by: Nicolin Chen 
> ---
> Changelog
> v1->v2:
>  * Added new functions beside the old ones so we can replace callers
>one by one later.
>  * Applied new functions to dma/direct.c only, because it's the best
>example caller to apply and should be safe with the new functions.
>
>  include/linux/dma-contiguous.h | 10 +++
>  kernel/dma/contiguous.c| 48 ++
>  kernel/dma/direct.c| 24 +++--
>  3 files changed, 62 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
> index f247e8aa5e3d..dacbdcb91a89 100644
> --- a/include/linux/dma-contiguous.h
> +++ b/include/linux/dma-contiguous.h
> @@ -115,6 +115,8 @@ struct page *dma_alloc_from_contiguous(struct device 
> *dev, size_t count,
>unsigned int order, bool no_warn);
>  bool dma_release_from_contiguous(struct device *dev, struct page *pages,
>  int count);
> +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t 
> gfp);
> +void dma_free_contiguous(struct device *dev, struct page *page, size_t size);
>
>  #else
>
> @@ -157,6 +159,14 @@ bool dma_release_from_contiguous(struct device *dev, 
> struct page *pages,
> return false;
>  }
>
> +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
> +{
> +   return NULL;
> +}
> +
> +static inline
> +void dma_free_contiguous(struct device *dev, struct page *page, size_t size) 
> { }
> +
>  #endif
>
>  #endif
> diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
> index b2a87905846d..21f39a6cb04f 100644
> --- a/kernel/dma/contiguous.c
> +++ b/kernel/dma/contiguous.c
> @@ -214,6 +214,54 @@ bool dma_release_from_contiguous(struct device *dev, 
> struct page *pages,
> return cma_release(dev_get_cma_area(dev), pages, count);
>  }

This breaks the build for me if CONFIG_DMA_CMA=n:

  LD [M]  fs/9p/9p.o
ld: fs/9p/vfs_inode.o: in function `dma_alloc_contiguous':
vfs_inode.c:(.text+0xa60): multiple definition of
`dma_alloc_contiguous'; fs/9p/vfs_super.o:vfs_super.c:(.text+0x500):
first defined here

Do the following insertions need to be under an #ifdef CONFIG_DMA_CMA ?

  -dann

> +/**
> + * dma_alloc_contiguous() - allocate contiguous pages
> + * @dev:   Pointer to device for which the allocation is performed.
> + * @size:  Requested allocation size.
> + * @gfp:   Allocation flags.
> + *
> + * This function allocates contiguous memory buffer for specified device. It
> + * first tries to use device specific contiguous memory area if available or
> + * the default global one, then tries a fallback allocation of normal pages.
> + */
> +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
> +{
> +   int node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
> +   size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
> +   size_t align = get_order(PAGE_ALIGN(size));
> +   struct cma *cma = dev_get_cma_area(dev);
> +   struct page *page = NULL;
> +
> +   /* CMA can be used only in the context which permits sleeping */
> +   if (cma && gfpflags_allow_blocking(gfp)) {
> +   align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
> +   page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN);
> +   }
> +
> +   /* Fallback allocation of normal pages */
> +   if (!page)
> +   page = alloc_pages_node(node, gfp, align);
> +
> +   

[RFC PATCH v4 15/21] watchdog/hardlockup/hpet: Only enable the HPET watchdog via a boot parameter

2019-05-23 Thread Ricardo Neri
Keep the HPET-based hardlockup detector disabled unless explicitly enabled
via a command-line argument. If such a parameter is not given, the
initialization of the HPET-based hardlockup detector fails and the NMI
watchdog falls back to the perf-based implementation.

Given that __setup("nmi_watchdog=") is already used to control the behavior
of the NMI watchdog (via hardlockup_panic_setup()), it cannot also be used
to control the HPET-based implementation. Instead, use a new
early_param("nmi_watchdog").
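
For example (illustrative only), with CONFIG_X86_HARDLOCKUP_DETECTOR_HPET
enabled, a command line such as:

  nmi_watchdog=panic,hpet

selects the HPET implementation while keeping the panic-on-hardlockup
behavior.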

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Peter Zijlstra 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Mimi Zohar 
Cc: Jan Kiszka 
Cc: Nick Desaulniers 
Cc: Masahiro Yamada 
Cc: Nayna Jain 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 

--
checkpatch gives the following warning:

CHECK: __setup appears un-documented -- check 
Documentation/admin-guide/kernel-parameters.rst
+__setup("nmi_watchdog=", hardlockup_detector_hpet_setup);

This is a false-positive as the option nmi_watchdog is already
documented. The option is re-evaluated in this file as well.
---
 .../admin-guide/kernel-parameters.txt |  8 ++-
 arch/x86/kernel/watchdog_hld_hpet.c   | 22 +++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 138f6664b2e2..17ed3dcda13e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2831,7 +2831,7 @@
Format: [state][,regs][,debounce][,die]
 
nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
-   Format: [panic,][nopanic,][num]
+   Format: [panic,][nopanic,][num,][hpet]
Valid num: 0 or 1
0 - turn hardlockup detector in nmi_watchdog off
1 - turn hardlockup detector in nmi_watchdog on
@@ -2841,6 +2841,12 @@
please see 'nowatchdog'.
This is useful when you use a panic=... timeout and
need the box quickly up again.
+   When hpet is specified, the NMI watchdog will be driven
+   by an HPET timer, if available in the system. Otherwise,
+   it falls back to the default implementation (perf or
+   architecture-specific). Specifying hpet has no effect
+   if the NMI watchdog is not enabled (either at build time
+   or via the command line).
 
These settings can be accessed at runtime via
the nmi_watchdog and hardlockup_panic sysctls.
diff --git a/arch/x86/kernel/watchdog_hld_hpet.c 
b/arch/x86/kernel/watchdog_hld_hpet.c
index dcc50cd29374..76eed714a1cb 100644
--- a/arch/x86/kernel/watchdog_hld_hpet.c
+++ b/arch/x86/kernel/watchdog_hld_hpet.c
@@ -351,6 +351,28 @@ void hardlockup_detector_hpet_stop(void)
disable_timer(hld_data);
 }
 
+/**
+ * hardlockup_detector_hpet_setup() - Parse command-line parameters
+ * @str:   A string containing the kernel command line
+ *
+ * Parse the nmi_watchdog parameter from the kernel command line. If
+ * selected by the user, use this implementation to detect hardlockups.
+ */
+static int __init hardlockup_detector_hpet_setup(char *str)
+{
+   if (!str)
+   return -EINVAL;
+
+   if (parse_option_str(str, "hpet"))
+   hardlockup_use_hpet = true;
+
+   if (!nmi_watchdog_user_enabled && hardlockup_use_hpet)
+   pr_warn("Selecting HPET NMI watchdog has no effect with NMI watchdog disabled\n");
+
+   return 0;
+}
+early_param("nmi_watchdog", hardlockup_detector_hpet_setup);
+
 /**
  * hardlockup_detector_hpet_init() - Initialize the hardlockup detector
  *
-- 
2.17.1



[RFC PATCH v4 17/21] x86/tsc: Switch to perf-based hardlockup detector if TSC become unstable

2019-05-23 Thread Ricardo Neri
The HPET-based hardlockup detector relies on the TSC to determine whether
an observed NMI interrupt originated from the HPET timer. Hence, this
detector can no longer be used once the TSC becomes unstable.

In such a case, permanently stop the HPET-based hardlockup detector and
start the perf-based detector instead.

Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h| 2 ++
 arch/x86/kernel/tsc.c  | 2 ++
 arch/x86/kernel/watchdog_hld.c | 7 +++
 3 files changed, 11 insertions(+)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index fd99f2390714..a82cbe17479d 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -128,6 +128,7 @@ extern int hardlockup_detector_hpet_init(void);
 extern void hardlockup_detector_hpet_stop(void);
 extern void hardlockup_detector_hpet_enable(unsigned int cpu);
 extern void hardlockup_detector_hpet_disable(unsigned int cpu);
+extern void hardlockup_detector_switch_to_perf(void);
 #else
 static inline struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void)
 { return NULL; }
@@ -136,6 +137,7 @@ static inline int hardlockup_detector_hpet_init(void)
 static inline void hardlockup_detector_hpet_stop(void) {}
 static inline void hardlockup_detector_hpet_enable(unsigned int cpu) {}
 static inline void hardlockup_detector_hpet_disable(unsigned int cpu) {}
+static inline void hardlockup_detector_switch_to_perf(void) {}
 #endif /* CONFIG_X86_HARDLOCKUP_DETECTOR_HPET */
 
 #else /* CONFIG_HPET_TIMER */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 59b57605e66c..b2210728ce3d 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1158,6 +1158,8 @@ void mark_tsc_unstable(char *reason)
 
 	clocksource_mark_unstable(&clocksource_tsc_early);
 	clocksource_mark_unstable(&clocksource_tsc);
+
+   hardlockup_detector_switch_to_perf();
 }
 
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
diff --git a/arch/x86/kernel/watchdog_hld.c b/arch/x86/kernel/watchdog_hld.c
index c2512d4c79c5..c8547c227a41 100644
--- a/arch/x86/kernel/watchdog_hld.c
+++ b/arch/x86/kernel/watchdog_hld.c
@@ -76,3 +76,10 @@ void watchdog_nmi_stop(void)
if (detector_type == X86_HARDLOCKUP_DETECTOR_HPET)
hardlockup_detector_hpet_stop();
 }
+
+void hardlockup_detector_switch_to_perf(void)
+{
+   detector_type = X86_HARDLOCKUP_DETECTOR_PERF;
+   hardlockup_detector_hpet_stop();
+   hardlockup_start_all();
+}
-- 
2.17.1



[RFC PATCH v4 19/21] iommu/vt-d: Rework prepare_irte() to support per-irq delivery mode

2019-05-23 Thread Ricardo Neri
A recent change introduced a new member to struct irq_cfg to specify the
delivery mode of an interrupt. Supporting the configuration of the
delivery mode would require adding a third argument to prepare_irte().
Instead, simply take a pointer to an irq_cfg data structure as the only
argument.

Internally, configure the delivery mode of the Interrupt Remapping Table
Entry as specified in the irq_cfg data structure and not as the APIC
setting.

This change does not alter the existing behavior, as the delivery mode of
the APIC is still used to configure the irq_cfg data structure.

Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Borislav Petkov 
Cc: Jacob Pan 
Cc: Joerg Roedel 
Cc: Juergen Gross 
Cc: Bjorn Helgaas 
Cc: Wincy Van 
Cc: Kate Stewart 
Cc: Philippe Ombredanne 
Cc: "Eric W. Biederman" 
Cc: Baoquan He 
Cc: Jan Kiszka 
Cc: Lu Baolu 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri 
---
 drivers/iommu/intel_irq_remapping.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/intel_irq_remapping.c 
b/drivers/iommu/intel_irq_remapping.c
index 4160aa9f3f80..2e61eaca7d7e 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1072,7 +1072,7 @@ static int reenable_irq_remapping(int eim)
return -1;
 }
 
-static void prepare_irte(struct irte *irte, int vector, unsigned int dest)
+static void prepare_irte(struct irte *irte, struct irq_cfg *irq_cfg)
 {
memset(irte, 0, sizeof(*irte));
 
@@ -1086,9 +1086,9 @@ static void prepare_irte(struct irte *irte, int vector, 
unsigned int dest)
 * irq migration in the presence of interrupt-remapping.
*/
irte->trigger_mode = 0;
-   irte->dlvry_mode = apic->irq_delivery_mode;
-   irte->vector = vector;
-   irte->dest_id = IRTE_DEST(dest);
+   irte->dlvry_mode = irq_cfg->delivery_mode;
+   irte->vector = irq_cfg->vector;
+   irte->dest_id = IRTE_DEST(irq_cfg->dest_apicid);
irte->redir_hint = 1;
 }
 
@@ -1265,7 +1265,7 @@ static void intel_irq_remapping_prepare_irte(struct 
intel_ir_data *data,
	struct irte *irte = &data->irte_entry;
	struct msi_msg *msg = &data->msi_entry;
 
-   prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid);
+   prepare_irte(irte, irq_cfg);
switch (info->type) {
case X86_IRQ_ALLOC_TYPE_IOAPIC:
/* Set source-id of interrupt request */
-- 
2.17.1



[RFC PATCH v4 00/21] Implement an HPET-based hardlockup detector

2019-05-23 Thread Ricardo Neri
Hi,

This is the third attempt to demonstrate the implementation of a
hardlockup detector driven by the High Precision Event Timer (HPET). This
version provides a few important updates with respect to the previous
version (please refer to the "Changes since v3" section). The three earlier
implementations can be found in [1], [2], and [3].

== Introduction ==

In CPU architectures that do not have an NMI watchdog, one can be
constructed using a counter of the Performance Monitoring Unit (PMU).
Counters in the PMU have high granularity and high visibility of the CPU.
These capabilities and their limited number make these counters precious
resources. Unfortunately, the perf-based hardlockup detector permanently
consumes one of these counters per CPU.

These counters could be freed for profiling purposes if the hardlockup
detector were driven by another timer.

The hardlockup detector runs relatively infrequently and does not require
visibility of the CPU activity (beyond detecting locked-up CPUs). A
timer that is external to the CPU (e.g., in the chipset) can be used to
drive the detector.

A key requirement is that the timer needs to be capable of issuing a
non-maskable interrupt to the CPU. In most cases, this can be achieved
by tweaking the delivery mode of the interrupt. It is especially
straightforward for MSI interrupts.

== Details of this implementation

This implementation uses an HPET timer to deliver an NMI interrupt via
an MSI message.

Unlike the perf-based hardlockup detector, this implementation is
driven by a single timer. The timer targets one CPU at a time in a round-
robin manner. This means that if a CPU must be monitored every
watchdog_thresh seconds, in a system with N monitored CPUs the timer must
expire every watchdog_thresh/N seconds. A per-CPU timer expiration value is
maintained.

The timer expiration time per CPU is updated every time CPUs are put
online or offline (a CPU hotplug thread enables and disables the watchdog
in these events) or the user changes the file /proc/sys/kernel/
watchdog_cpumask.

Also, given that a single timer drives the detector, a cpumask is needed
to keep track of which online CPUs are allowed to be monitored. This mask
is also updated every time a CPU is put online or offline or when the user
modifies the mask in /proc/sys/kernel/watchdog_cpumask. This mask
is needed to keep the current behavior of the lockup detector.

In order to avoid reading HPET registers in every NMI, the time-stamp
counter is used to determine whether the HPET caused the interrupt. At
every timer expiration, we compute the value the time-stamp counter is
expected to have the next time the timer expires. I have found
experimentally that the expected TSC value consistently has an error of
less than 1.5%.

Furthermore, only one write to HPET registers is done every
watchdog_thresh seconds. This write can be eliminated if the HPET timer
is periodic.

== Parts of this series ==

For clarity, patches are grouped as follows:

 1) New irq definition. Patch 1 adds a definition for NMI delivery mode
in MSI interrupts. No other changes are done to generic irq code.

 2) HPET updates. Patches 2-7 prepare the HPET code to accommodate the
new detector: rework periodic programming, reserve and configure a
timer for the detector and expose a few existing functions.

 3) NMI watchdog. Patches 8-11 updates the existing hardlockup detector
to uncouple it from perf, switch back to the perf implementation if
TSC becomes unstable, and introduce a new NMI handler category
intended to run after the NMI_LOCAL handlers.

 4) New HPET-based hardlockup detector. Patches 12-17 includes changes to
probe the hardware resources, configure the interrupt and rotate the
destination of the interrupts among all monitored CPUs. Also, it
includes an x86-specific shim hardlockup detector that selects
between HPET and perf implementations.

 5) Interrupt remapping. Patches 18-21 add support to operate this new
    detector with interrupt remapping enabled.

Thanks and BR,
Ricardo

Changes since v3:
 * Fixed yet another bug in periodic programming of the HPET timer that
   prevented the system from booting.
 * Fixed computation of HPET frequency to use hpet_readl() only.
 * Added a missing #include in the watchdog_hld_hpet.c
 * Fixed various typos and grammar errors (Randy Dunlap)

Changes since v2:
 * Added functionality to switch to the perf-based hardlockup
   detector if the TSC becomes unstable (Thomas Gleixner).
 * Brought back the round-robin mechanism proposed in v1 (this time not
   using the interrupt subsystem). This also requires to compute
   expiration times as in v1 (Andi Kleen, Stephane Eranian).
 * Fixed a bug in which using a periodic timer was not working (thanks
   to Suravee Suthikulpanit!).
 * In this version, I incorporate support for interrupt remapping in the
   last 4 patches so that they can be reviewed separately if needed.
 * Removed redundant documentation of 

[RFC PATCH v4 16/21] x86/watchdog: Add a shim hardlockup detector

2019-05-23 Thread Ricardo Neri
The generic hardlockup detector is based on perf. It also provides a set
of weak stubs that CPU architectures can override. Add a shim hardlockup
detector for x86 that selects between the perf and HPET implementations.

Specifically, this shim implementation is needed for the HPET-based
hardlockup detector; it can also be used for future implementations.

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Peter Zijlstra 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Mimi Zohar 
Cc: Jan Kiszka 
Cc: Nick Desaulniers 
Cc: Masahiro Yamada 
Cc: Nayna Jain 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Suggested-by: Nicholas Piggin 
Signed-off-by: Ricardo Neri 
---
 arch/x86/Kconfig.debug |  4 ++
 arch/x86/kernel/Makefile   |  1 +
 arch/x86/kernel/watchdog_hld.c | 78 ++
 3 files changed, 83 insertions(+)
 create mode 100644 arch/x86/kernel/watchdog_hld.c

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 445bbb188f10..52c77e2145c9 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -169,11 +169,15 @@ config IOMMU_LEAK
 config HAVE_MMIOTRACE_SUPPORT
def_bool y
 
+config X86_HARDLOCKUP_DETECTOR
+   bool
+
 config X86_HARDLOCKUP_DETECTOR_HPET
bool "Use HPET Timer for Hard Lockup Detection"
select SOFTLOCKUP_DETECTOR
select HARDLOCKUP_DETECTOR
select HARDLOCKUP_DETECTOR_CORE
+   select X86_HARDLOCKUP_DETECTOR
depends on HPET_TIMER && HPET && X86_64
help
  Say y to enable a hardlockup detector that is driven by a High-
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3ad55de67e8b..e60244b8a8ec 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_VM86)  += vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 
 obj-$(CONFIG_HPET_TIMER)   += hpet.o
+obj-$(CONFIG_X86_HARDLOCKUP_DETECTOR) += watchdog_hld.o
 obj-$(CONFIG_X86_HARDLOCKUP_DETECTOR_HPET) += watchdog_hld_hpet.o
 obj-$(CONFIG_APB_TIMER)+= apb_timer.o
 
diff --git a/arch/x86/kernel/watchdog_hld.c b/arch/x86/kernel/watchdog_hld.c
new file mode 100644
index ..c2512d4c79c5
--- /dev/null
+++ b/arch/x86/kernel/watchdog_hld.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A shim hardlockup detector. It overrides the weak stubs of the generic
+ * implementation to select between the perf- or the hpet-based implementation.
+ *
+ * Copyright (C) Intel Corporation 2019
+ */
+
+#include 
+#include 
+
+enum x86_hardlockup_detector {
+   X86_HARDLOCKUP_DETECTOR_PERF,
+   X86_HARDLOCKUP_DETECTOR_HPET,
+};
+
+static enum x86_hardlockup_detector detector_type __read_mostly;
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+   if (detector_type == X86_HARDLOCKUP_DETECTOR_PERF) {
+   hardlockup_detector_perf_enable();
+   return 0;
+   }
+
+   if (detector_type == X86_HARDLOCKUP_DETECTOR_HPET) {
+   hardlockup_detector_hpet_enable(cpu);
+   return 0;
+   }
+
+   return -ENODEV;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+   if (detector_type == X86_HARDLOCKUP_DETECTOR_PERF) {
+   hardlockup_detector_perf_disable();
+   return;
+   }
+
+   if (detector_type == X86_HARDLOCKUP_DETECTOR_HPET) {
+   hardlockup_detector_hpet_disable(cpu);
+   return;
+   }
+}
+
+int __init watchdog_nmi_probe(void)
+{
+   int ret;
+
+   /*
+* Try first with the HPET hardlockup detector. It will only
+* succeed if selected at build time and the nmi_watchdog
+* command-line parameter is configured. This ensure that the
+* perf-based detector is used by default, if selected at
+* build time.
+*/
+   ret = hardlockup_detector_hpet_init();
+   if (!ret) {
+   detector_type = X86_HARDLOCKUP_DETECTOR_HPET;
+   return ret;
+   }
+
+   ret = hardlockup_detector_perf_init();
+   if (!ret) {
+   detector_type = X86_HARDLOCKUP_DETECTOR_PERF;
+   return ret;
+   }
+
+   return ret;
+}
+
+void watchdog_nmi_stop(void)
+{
+   /* Only the HPET lockup detector defines a stop function. */
+   if (detector_type == X86_HARDLOCKUP_DETECTOR_HPET)
+   hardlockup_detector_hpet_stop();
+}
-- 
2.17.1



[RFC PATCH v4 11/21] x86/watchdog/hardlockup: Add an HPET-based hardlockup detector

2019-05-23 Thread Ricardo Neri
This is the initial implementation of a hardlockup detector driven by an
HPET timer. It includes functions to control the timer via its registers,
requests such a timer, installs an NMI interrupt handler and performs the
initial configuration of the timer.

The detector is not functional at this stage. A subsequent changeset will
invoke the interfaces provided by this detector, as well as add the
functionality to determine whether the HPET timer caused the NMI.

In order to detect hardlockups in all the monitored CPUs, move the
interrupt to the next monitored CPU while handling the NMI interrupt; wrap
around when reaching the highest CPU in the mask. This rotation is
achieved by setting the affinity mask to only contain the next CPU to
monitor. A cpumask keeps track of all the CPUs that need to be monitored.
This cpumask is updated when the watchdog is enabled or disabled on a
particular CPU.
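
As a rough sketch (not the exact code of this patch), the rotation
described above amounts to picking the next CPU set in the monitored
cpumask and wrapping around at the end of the mask:

  static u32 next_monitored_cpu(struct hpet_hld_data *hdata)
  {
          u32 cpu = cpumask_next(hdata->handling_cpu,
                                 to_cpumask(hdata->cpu_monitored_mask));

          if (cpu >= nr_cpu_ids)
                  cpu = cpumask_first(to_cpumask(hdata->cpu_monitored_mask));

          return cpu;
  }

The NMI handler then retargets the timer interrupt at that CPU.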

This detector relies on an HPET timer that is capable of using Front Side
Bus interrupts. In order to avoid using the generic interrupt code,
program the MSI message register of the HPET timer directly.

HPET registers are only accessed to kick the timer after looking for
hardlockups. This happens every watchdog_thresh seconds. A subsequent
changeset will determine whether the HPET timer caused the interrupt based
on the value of the time-stamp counter. For now, just add a stub function.

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Peter Zijlstra 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: "Ravi V. Shankar" 
Cc: Mimi Zohar 
Cc: Jan Kiszka 
Cc: Nick Desaulniers 
Cc: Masahiro Yamada 
Cc: Nayna Jain 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/Kconfig.debug  |  11 +
 arch/x86/include/asm/hpet.h |  13 ++
 arch/x86/kernel/Makefile|   1 +
 arch/x86/kernel/hpet.c  |   3 +-
 arch/x86/kernel/watchdog_hld_hpet.c | 335 
 5 files changed, 362 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/watchdog_hld_hpet.c

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index f730680dc818..445bbb188f10 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -169,6 +169,17 @@ config IOMMU_LEAK
 config HAVE_MMIOTRACE_SUPPORT
def_bool y
 
+config X86_HARDLOCKUP_DETECTOR_HPET
+   bool "Use HPET Timer for Hard Lockup Detection"
+   select SOFTLOCKUP_DETECTOR
+   select HARDLOCKUP_DETECTOR
+   select HARDLOCKUP_DETECTOR_CORE
+   depends on HPET_TIMER && HPET && X86_64
+   help
+ Say y to enable a hardlockup detector that is driven by a High-
+ Precision Event Timer. This option is helpful to not use counters
+ from the Performance Monitoring Unit to drive the detector.
+
 config X86_DECODER_SELFTEST
bool "x86 instruction decoder selftest"
depends on DEBUG_KERNEL && KPROBES
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 20abdaa5372d..31fc27508cf3 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -114,12 +114,25 @@ struct hpet_hld_data {
boolhas_periodic;
u32 num;
u64 ticks_per_second;
+   u32 handling_cpu;
+   u32 enabled_cpus;
+   struct msi_msg  msi_msg;
+   unsigned long   cpu_monitored_mask[0];
 };
 
 extern struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void);
+extern int hardlockup_detector_hpet_init(void);
+extern void hardlockup_detector_hpet_stop(void);
+extern void hardlockup_detector_hpet_enable(unsigned int cpu);
+extern void hardlockup_detector_hpet_disable(unsigned int cpu);
 #else
 static inline struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void)
 { return NULL; }
+static inline int hardlockup_detector_hpet_init(void)
+{ return -ENODEV; }
+static inline void hardlockup_detector_hpet_stop(void) {}
+static inline void hardlockup_detector_hpet_enable(unsigned int cpu) {}
+static inline void hardlockup_detector_hpet_disable(unsigned int cpu) {}
 #endif /* CONFIG_X86_HARDLOCKUP_DETECTOR_HPET */
 
 #else /* CONFIG_HPET_TIMER */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3578ad248bc9..3ad55de67e8b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_VM86)  += vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 
 obj-$(CONFIG_HPET_TIMER)   += hpet.o
+obj-$(CONFIG_X86_HARDLOCKUP_DETECTOR_HPET) += watchdog_hld_hpet.o
 obj-$(CONFIG_APB_TIMER)+= apb_timer.o
 
 obj-$(CONFIG_AMD_NB)   += amd_nb.o
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 5f9209949fc7..dd3bb664a188 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -183,7 +183,8 @@ struct 

[RFC PATCH v4 13/21] x86/watchdog/hardlockup/hpet: Determine if HPET timer caused NMI

2019-05-23 Thread Ricardo Neri
The only direct method to determine whether an HPET timer caused an
interrupt is to read the Interrupt Status register. Unfortunately,
reading HPET registers is slow and, therefore, it is not recommended to
read them while in NMI context. Furthermore, status is not available if
the interrupt is generated via the Front Side Bus.

An indirect manner to infer if the non-maskable interrupt we see was
caused by the HPET timer is to use the time-stamp counter. Compute the
value that the time-stamp counter should have at the next interrupt of the
HPET timer. Since the hardlockup detector operates in seconds, high
precision is not needed. This implementation considers that the HPET
caused the NMI if the time-stamp counter reads the expected value +/- 1.5%.
This value is selected as it is equivalent to 1/64 and the division can be
performed using a bit shift operation. Experimentally, the error in the
estimation is consistently less than 1%.
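
As an illustration with hypothetical numbers: for watchdog_thresh = 10 s, a
2 GHz TSC and a single monitored CPU, the expected delta is 2 * 10^10 ticks
and the tolerated error is that delta shifted right by 6, about 3.1 * 10^8
ticks (~1.56%):

  /* at kick time (hypothetical values) */
  tsc_delta      = watchdog_thresh * tsc_ticks_per_cpu;   /* 10 * 2e9 ticks */
  tsc_next       = rdtsc() + tsc_delta;
  tsc_next_error = tsc_delta >> 6;                         /* ~1.56% window  */

  /*
   * In the NMI handler, the interrupt is attributed to the HPET only if
   *   tsc_next - tsc_next_error <= rdtsc() < tsc_next + tsc_next_error,
   * which the patch writes with unsigned arithmetic as
   *   (rdtsc() - tsc_next) + tsc_next_error < 2 * tsc_next_error
   */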

The computation of the expected value of the time-stamp counter must be
performed in relation to watchdog_thresh divided by the number of
monitored CPUs. This quantity is stored in tsc_ticks_per_cpu and must be
updated whenever the number of monitored CPUs changes.

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Peter Zijlstra 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Mimi Zohar 
Cc: Jan Kiszka 
Cc: Nick Desaulniers 
Cc: Masahiro Yamada 
Cc: Nayna Jain 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Suggested-by: Andi Kleen 
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h |  2 ++
 arch/x86/kernel/watchdog_hld_hpet.c | 27 ++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 64acacce095d..fd99f2390714 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -115,6 +115,8 @@ struct hpet_hld_data {
u32 num;
u64 ticks_per_second;
u64 ticks_per_cpu;
+   u64 tsc_next;
+   u64 tsc_ticks_per_cpu;
u32 handling_cpu;
u32 enabled_cpus;
struct msi_msg  msi_msg;
diff --git a/arch/x86/kernel/watchdog_hld_hpet.c 
b/arch/x86/kernel/watchdog_hld_hpet.c
index 74aeb0535d08..dcc50cd29374 100644
--- a/arch/x86/kernel/watchdog_hld_hpet.c
+++ b/arch/x86/kernel/watchdog_hld_hpet.c
@@ -24,6 +24,7 @@
 
 static struct hpet_hld_data *hld_data;
 static bool hardlockup_use_hpet;
+static u64 tsc_next_error;
 
 /**
  * kick_timer() - Reprogram timer to expire in the future
@@ -33,11 +34,22 @@ static bool hardlockup_use_hpet;
  * Reprogram the timer to expire within watchdog_thresh seconds in the future.
  * If the timer supports periodic mode, it is not kicked unless @force is
  * true.
+ *
+ * Also, compute the expected value of the time-stamp counter at the time of
+ * expiration as well as a deviation from the expected value. The maximum
+ * deviation is ~1.5%. This deviation can be computed easily by shifting the
+ * delta between the current and expected time-stamp values by 6 positions.
  */
 static void kick_timer(struct hpet_hld_data *hdata, bool force)
 {
+   u64 tsc_curr, tsc_delta, new_compare, count, period = 0;
bool kick_needed = force || !(hdata->has_periodic);
-   u64 new_compare, count, period = 0;
+
+   tsc_curr = rdtsc();
+
+   tsc_delta = (unsigned long)watchdog_thresh * hdata->tsc_ticks_per_cpu;
+   hdata->tsc_next = tsc_curr + tsc_delta;
+   tsc_next_error = tsc_delta >> 6;
 
/*
 * Update the comparator in increments of watch_thresh seconds relative
@@ -93,6 +105,15 @@ static void enable_timer(struct hpet_hld_data *hdata)
  */
 static bool is_hpet_wdt_interrupt(struct hpet_hld_data *hdata)
 {
+   if (smp_processor_id() == hdata->handling_cpu) {
+   u64 tsc_curr;
+
+   tsc_curr = rdtsc();
+
+   return (tsc_curr - hdata->tsc_next) + tsc_next_error <
+  2 * tsc_next_error;
+   }
+
return false;
 }
 
@@ -260,6 +281,10 @@ static void update_ticks_per_cpu(struct hpet_hld_data 
*hdata)
 
do_div(temp, hdata->enabled_cpus);
hdata->ticks_per_cpu = temp;
+
+   temp = (unsigned long)tsc_khz * 1000L;
+   do_div(temp, hdata->enabled_cpus);
+   hdata->tsc_ticks_per_cpu = temp;
 }
 
 /**
-- 
2.17.1



[RFC PATCH v4 14/21] watchdog/hardlockup: Use parse_option_str() to handle "nmi_watchdog"

2019-05-23 Thread Ricardo Neri
Prepare hardlockup_panic_setup() to handle a comma-separated list of
options. This is needed to pass options to specific implementations of the
hardlockup detector.
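
For instance, parse_option_str() matches individual comma-separated tokens,
so the generic handler and an implementation-specific one can act on the
same string (illustrative values):

  parse_option_str("panic,hpet", "panic")  ->  true   /* hardlockup_panic = 1 */
  parse_option_str("panic,hpet", "hpet")   ->  true   /* handled by the HPET early_param (next patch) */
  parse_option_str("panic,hpet", "0")      ->  false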

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Peter Zijlstra 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Mimi Zohar 
Cc: Jan Kiszka 
Cc: Nick Desaulniers 
Cc: Masahiro Yamada 
Cc: Nayna Jain 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 
---
 kernel/watchdog.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index be589001200a..fd50049449ec 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -70,13 +70,13 @@ void __init hardlockup_detector_disable(void)
 
 static int __init hardlockup_panic_setup(char *str)
 {
-   if (!strncmp(str, "panic", 5))
+   if (parse_option_str(str, "panic"))
hardlockup_panic = 1;
-   else if (!strncmp(str, "nopanic", 7))
+   else if (parse_option_str(str, "nopanic"))
hardlockup_panic = 0;
-   else if (!strncmp(str, "0", 1))
+   else if (parse_option_str(str, "0"))
nmi_watchdog_user_enabled = 0;
-   else if (!strncmp(str, "1", 1))
+   else if (parse_option_str(str, "1"))
nmi_watchdog_user_enabled = 1;
return 1;
 }
-- 
2.17.1



[RFC PATCH v4 21/21] x86/watchdog/hardlockup/hpet: Support interrupt remapping

2019-05-23 Thread Ricardo Neri
When interrupt remapping is enabled in the system, the MSI interrupt
message must follow a special format the IOMMU can understand. Hence,
utilize the functionality provided by the IOMMU driver for this purpose.

The first step is to determine whether interrupt remapping is enabled
by looking for the existence of an interrupt remapping domain. If it
exists, let the IOMMU driver compose the MSI message for us. The hard-
lockup detector is still responsible of writing the message in the
HPET FSB route register.

Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Borislav Petkov 
Cc: Jacob Pan 
Cc: Joerg Roedel 
Cc: Juergen Gross 
Cc: Bjorn Helgaas 
Cc: Wincy Van 
Cc: Kate Stewart 
Cc: Philippe Ombredanne 
Cc: "Eric W. Biederman" 
Cc: Baoquan He 
Cc: Jan Kiszka 
Cc: Lu Baolu 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/kernel/watchdog_hld_hpet.c | 33 -
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/watchdog_hld_hpet.c 
b/arch/x86/kernel/watchdog_hld_hpet.c
index 76eed714a1cb..a266439fdb9e 100644
--- a/arch/x86/kernel/watchdog_hld_hpet.c
+++ b/arch/x86/kernel/watchdog_hld_hpet.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 static struct hpet_hld_data *hld_data;
@@ -117,6 +118,25 @@ static bool is_hpet_wdt_interrupt(struct hpet_hld_data 
*hdata)
return false;
 }
 
+/** irq_remapping_enabled() - Detect if interrupt remapping is enabled
+ * @hdata: A data structure with the HPET block id
+ *
+ * Determine if the HPET block used by the hardlockup detector is under
+ * the remapped interrupt domain.
+ *
+ * Returns: True if interrupt remapping is enabled. False otherwise.
+ */
+static bool irq_remapping_enabled(struct hpet_hld_data *hdata)
+{
+   struct irq_alloc_info info;
+
+   init_irq_alloc_info(&info, NULL);
+   info.type = X86_IRQ_ALLOC_TYPE_HPET;
+   info.hpet_id = hdata->blockid;
+
+   return !!irq_remapping_get_ir_irq_domain(&info);
+}
+
 /**
  * compose_msi_msg() - Populate address and data fields of an MSI message
  * @hdata: A data strucure with the message to populate
@@ -161,6 +181,9 @@ static int update_msi_destid(struct hpet_hld_data *hdata)
 {
u32 destid;
 
+   if (irq_remapping_enabled(hdata))
+   return hld_hpet_intremap_activate_irq(hdata);
+
hdata->msi_msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
destid = apic->calc_dest_apicid(hdata->handling_cpu);
hdata->msi_msg.address_lo |= MSI_ADDR_DEST_ID(destid);
@@ -217,9 +240,17 @@ static int hardlockup_detector_nmi_handler(unsigned int 
type,
  */
 static int setup_irq_msi_mode(struct hpet_hld_data *hdata)
 {
+   s32 ret;
u32 v;
 
-   compose_msi_msg(hdata);
+   if (irq_remapping_enabled(hdata)) {
+   ret = hld_hpet_intremap_alloc_irq(hdata);
+   if (ret)
+   return ret;
+   } else {
+   compose_msi_msg(hdata);
+   }
+
hpet_writel(hdata->msi_msg.data, HPET_Tn_ROUTE(hdata->num));
hpet_writel(hdata->msi_msg.address_lo, HPET_Tn_ROUTE(hdata->num) + 4);
 
-- 
2.17.1



[RFC PATCH v4 20/21] iommu/vt-d: hpet: Reserve an interrupt remampping table entry for watchdog

2019-05-23 Thread Ricardo Neri
When interrupt remapping is enabled, MSI interrupt messages must follow a
special format that the IOMMU can understand. Hence, when the HPET hard
lockup detector is used with interrupt remapping, it must also follow this
special format.

The IOMMU, given the information about a particular interrupt, already
knows how to populate the MSI message with this special format and the
corresponding entry in the interrupt remapping table. Given that this is a
special interrupt case, we want to avoid the interrupt subsystem. Add two
functions to create an entry for the HPET hard lockup detector. Perform
this process in two steps as described below.

When initializing the lockup detector, the function
hld_hpet_intremap_alloc_irq() permanently allocates a new entry in the
interrupt remapping table and populates it with the information the
IOMMU driver needs. In order to populate the table, the IOMMU needs to
know the HPET block ID as described in the ACPI table. Hence, add such
ID to the data of the hardlockup detector.

When the hardlockup detector is enabled, the function
hld_hpet_intremap_activate_irq() activates the recently created entry
in the interrupt remapping table via the modify_irte() function. While
doing this, it specifies which CPU the interrupt must target via its APIC
ID. This function can be called every time the destination ID of the
interrupt needs to be updated; there is no need to allocate or remove
entries in the interrupt remapping table.

Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Borislav Petkov 
Cc: Jacob Pan 
Cc: Joerg Roedel 
Cc: Juergen Gross 
Cc: Bjorn Helgaas 
Cc: Wincy Van 
Cc: Kate Stewart 
Cc: Philippe Ombredanne 
Cc: "Eric W. Biederman" 
Cc: Baoquan He 
Cc: Jan Kiszka 
Cc: Lu Baolu 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h | 11 +++
 arch/x86/kernel/hpet.c  |  1 +
 drivers/iommu/intel_irq_remapping.c | 49 +
 3 files changed, 61 insertions(+)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index a82cbe17479d..811051fa7ade 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -119,6 +119,8 @@ struct hpet_hld_data {
u64 tsc_ticks_per_cpu;
u32 handling_cpu;
u32 enabled_cpus;
+   u8  blockid;
+   void*intremap_data;
struct msi_msg  msi_msg;
unsigned long   cpu_monitored_mask[0];
 };
@@ -129,6 +131,15 @@ extern void hardlockup_detector_hpet_stop(void);
 extern void hardlockup_detector_hpet_enable(unsigned int cpu);
 extern void hardlockup_detector_hpet_disable(unsigned int cpu);
 extern void hardlockup_detector_switch_to_perf(void);
+#ifdef CONFIG_IRQ_REMAP
+extern int hld_hpet_intremap_activate_irq(struct hpet_hld_data *hdata);
+extern int hld_hpet_intremap_alloc_irq(struct hpet_hld_data *hdata);
+#else
+static inline int hld_hpet_intremap_activate_irq(struct hpet_hld_data *hdata)
+{ return -ENODEV; }
+static inline int hld_hpet_intremap_alloc_irq(struct hpet_hld_data *hdata)
+{ return -ENODEV; }
+#endif /* CONFIG_IRQ_REMAP */
 #else
 static inline struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void)
 { return NULL; }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dd3bb664a188..ddc9be81a075 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -202,6 +202,7 @@ struct hpet_hld_data 
*hpet_hardlockup_detector_assign_timer(void)
 */
temp = (u64)cfg << HPET_COUNTER_CLK_PERIOD_SHIFT;
hdata->ticks_per_second = hpet_get_ticks_per_sec(temp);
+   hdata->blockid = hpet_blockid;
 
return hdata;
 }
diff --git a/drivers/iommu/intel_irq_remapping.c 
b/drivers/iommu/intel_irq_remapping.c
index 2e61eaca7d7e..256466dd30cb 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "irq_remapping.h"
 
@@ -1516,3 +1517,51 @@ int dmar_ir_hotplug(struct dmar_drhd_unit *dmaru, bool 
insert)
 
return ret;
 }
+
+#ifdef CONFIG_X86_HARDLOCKUP_DETECTOR_HPET
+int hld_hpet_intremap_activate_irq(struct hpet_hld_data *hdata)
+{
+   u32 destid = apic->calc_dest_apicid(hdata->handling_cpu);
+   struct intel_ir_data *data;
+
+   data = (struct intel_ir_data *)hdata->intremap_data;
+   data->irte_entry.dest_id = IRTE_DEST(destid);
+   return modify_irte(&data->irq_2_iommu, &data->irte_entry);
+}
+
+int hld_hpet_intremap_alloc_irq(struct hpet_hld_data *hdata)
+{
+   struct intel_ir_data *data;
+   struct irq_alloc_info info;
+   struct intel_iommu *iommu;
+   struct irq_cfg irq_cfg;
+   int index;
+
+   iommu = map_hpet_to_ir(hdata->blockid);
+   if (!iommu)
+   return -ENODEV;
+
+   data = kzalloc(sizeof(*data), 

[RFC PATCH v4 10/21] watchdog/hardlockup: Add function to enable NMI watchdog on all allowed CPUs at once

2019-05-23 Thread Ricardo Neri
When there is more than one implementation of the NMI watchdog, there may
be situations in which switching from one to another is needed (e.g., if
the time-stamp counter becomes unstable, the HPET-based NMI watchdog can
no longer be used).

The perf-based implementation of the hardlockup detector makes use of
various per-CPU variables which are accessed via this_cpu operations.
Hence, each CPU needs to enable its own NMI watchdog if using the perf
implementation.

Add functionality to switch from one NMI watchdog to another and do it
from each allowed CPU.
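
For illustration only (not part of this patch), a switch away from the
HPET-based detector could be shaped roughly like the sketch below;
hardlockup_detector_hpet_stop() is the stop routine declared elsewhere in
this series, and example_switch_to_perf() is a placeholder name:

  /* Sketch: fall back to the perf NMI watchdog on every allowed CPU */
  void example_switch_to_perf(void)
  {
          /* Stop driving NMIs from the HPET timer. */
          hardlockup_detector_hpet_stop();

          /* Re-enable the NMI watchdog (now perf-based) everywhere. */
          hardlockup_start_all();
  }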

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: "Rafael J. Wysocki" 
Cc: Don Zickus 
Cc: Nicholas Piggin 
Cc: Michael Ellerman 
Cc: Frederic Weisbecker 
Cc: Alexei Starovoitov 
Cc: Babu Moger 
Cc: "David S. Miller" 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Mathieu Desnoyers 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Philippe Ombredanne 
Cc: Colin Ian King 
Cc: Byungchul Park 
Cc: "Paul E. McKenney" 
Cc: "Luis R. Rodriguez" 
Cc: Waiman Long 
Cc: Josh Poimboeuf 
Cc: Randy Dunlap 
Cc: Davidlohr Bueso 
Cc: Marc Zyngier 
Cc: Kai-Heng Feng 
Cc: Konrad Rzeszutek Wilk 
Cc: David Rientjes 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: sparcli...@vger.kernel.org
Cc: linuxppc-...@lists.ozlabs.org
Signed-off-by: Ricardo Neri 
---
 include/linux/nmi.h |  2 ++
 kernel/watchdog.c   | 15 +++
 2 files changed, 17 insertions(+)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index e5f1a86e20b7..6d828334348b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -83,9 +83,11 @@ static inline void reset_hung_task_detector(void) { }
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
 extern void hardlockup_detector_disable(void);
+extern void hardlockup_start_all(void);
 extern unsigned int hardlockup_panic;
 #else
 static inline void hardlockup_detector_disable(void) {}
+static inline void hardlockup_start_all(void) {}
 #endif
 
 #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f9e7b9306fe..be589001200a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -566,6 +566,21 @@ int lockup_detector_offline_cpu(unsigned int cpu)
return 0;
 }
 
+static int hardlockup_start_fn(void *data)
+{
+   watchdog_nmi_enable(smp_processor_id());
+   return 0;
+}
+
+void hardlockup_start_all(void)
+{
+   int cpu;
+
+	cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
+	for_each_cpu(cpu, &watchdog_allowed_mask)
+   smp_call_on_cpu(cpu, hardlockup_start_fn, NULL, false);
+}
+
 static void lockup_detector_reconfigure(void)
 {
cpus_read_lock();
-- 
2.17.1



[RFC PATCH v4 12/21] watchdog/hardlockup/hpet: Adjust timer expiration on the number of monitored CPUs

2019-05-23 Thread Ricardo Neri
Each CPU should be monitored for hardlockups every watchdog_thresh seconds.
Since all the CPUs in the system are monitored by the same timer and the
timer interrupt is rotated among the monitored CPUs, the timer must expire
every watchdog_thresh/N seconds, where N is the number of monitored CPUs.
Use the new ticks_per_cpu member of struct hpet_hld_data to store the
aforementioned quantity.

The ticks-per-CPU quantity is updated every time the number of monitored
CPUs changes: when the watchdog is enabled or disabled for a specific CPU.
If the timer is used in periodic mode, it needs to be adjusted to reflect
the new expected expiration.
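
As a worked example with hypothetical numbers (not taken from this patch):

  /* Illustration only: ~14.3 MHz HPET, 8 monitored CPUs */
  u64 ticks_per_second = 14318180;
  u32 enabled_cpus = 8;
  u64 ticks_per_cpu = ticks_per_second / enabled_cpus;   /* 1789772 */

  /*
   * With watchdog_thresh = 10 seconds, the timer is programmed to expire
   * every watchdog_thresh * ticks_per_cpu ticks, i.e. every 1.25 seconds,
   * so each individual CPU is still checked every 10 seconds.
   */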

Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Borislav Petkov 
Cc: Jacob Pan 
Cc: "Rafael J. Wysocki" 
Cc: Don Zickus 
Cc: Nicholas Piggin 
Cc: Michael Ellerman 
Cc: Frederic Weisbecker 
Cc: Alexei Starovoitov 
Cc: Babu Moger 
Cc: Mathieu Desnoyers 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Philippe Ombredanne 
Cc: Colin Ian King 
Cc: Byungchul Park 
Cc: "Paul E. McKenney" 
Cc: "Luis R. Rodriguez" 
Cc: Waiman Long 
Cc: Josh Poimboeuf 
Cc: Randy Dunlap 
Cc: Davidlohr Bueso 
Cc: Marc Zyngier 
Cc: Kai-Heng Feng 
Cc: Konrad Rzeszutek Wilk 
Cc: David Rientjes 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h |  1 +
 arch/x86/kernel/watchdog_hld_hpet.c | 46 +++--
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 31fc27508cf3..64acacce095d 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -114,6 +114,7 @@ struct hpet_hld_data {
boolhas_periodic;
u32 num;
u64 ticks_per_second;
+   u64 ticks_per_cpu;
u32 handling_cpu;
u32 enabled_cpus;
struct msi_msg  msi_msg;
diff --git a/arch/x86/kernel/watchdog_hld_hpet.c 
b/arch/x86/kernel/watchdog_hld_hpet.c
index dff4dadabd4c..74aeb0535d08 100644
--- a/arch/x86/kernel/watchdog_hld_hpet.c
+++ b/arch/x86/kernel/watchdog_hld_hpet.c
@@ -45,6 +45,13 @@ static void kick_timer(struct hpet_hld_data *hdata, bool 
force)
 * are able to update the comparator before the counter reaches such new
 * value.
 *
+* Each CPU must be monitored every watchdog_thresh seconds. Since the
+* timer targets one CPU at a time, it must expire every
+*
+*    ticks_per_cpu = watchdog_thresh * ticks_per_second / enabled_cpus
+*
+* as computed in update_ticks_per_cpu().
+*
 * Let it wrap around if needed.
 */
 
@@ -52,10 +59,10 @@ static void kick_timer(struct hpet_hld_data *hdata, bool 
force)
return;
 
if (hdata->has_periodic)
-   period = watchdog_thresh * hdata->ticks_per_second;
+   period = watchdog_thresh * hdata->ticks_per_cpu;
 
count = hpet_readl(HPET_COUNTER);
-   new_compare = count + watchdog_thresh * hdata->ticks_per_second;
+   new_compare = count + watchdog_thresh * hdata->ticks_per_cpu;
hpet_set_comparator(hdata->num, (u32)new_compare, (u32)period);
 }
 
@@ -234,6 +241,27 @@ static int setup_hpet_irq(struct hpet_hld_data *hdata)
return ret;
 }
 
+/**
+ * update_ticks_per_cpu() - Update the number of HPET ticks per CPU
+ * @hdata:	struct with the timer's ticks-per-second and CPU mask
+ *
+ * From the overall ticks-per-second of the timer, compute the number of ticks
+ * after which the timer should expire to monitor each CPU every
+ * watchdog_thresh seconds. The ticks-per-cpu quantity is computed using the
+ * number of CPUs that the watchdog currently monitors.
+ */
+static void update_ticks_per_cpu(struct hpet_hld_data *hdata)
+{
+   u64 temp = hdata->ticks_per_second;
+
+   /* Only update if there are monitored CPUs. */
+   if (!hdata->enabled_cpus)
+   return;
+
+   do_div(temp, hdata->enabled_cpus);
+   hdata->ticks_per_cpu = temp;
+}
+
 /**
  * hardlockup_detector_hpet_enable() - Enable the hardlockup detector
  * @cpu:   CPU Index in which the watchdog will be enabled.
@@ -246,13 +274,23 @@ void hardlockup_detector_hpet_enable(unsigned int cpu)
 {
cpumask_set_cpu(cpu, to_cpumask(hld_data->cpu_monitored_mask));
 
-   if (!hld_data->enabled_cpus++) {
+   hld_data->enabled_cpus++;
+   update_ticks_per_cpu(hld_data);
+
+   if (hld_data->enabled_cpus == 1) {
hld_data->handling_cpu = cpu;
update_msi_destid(hld_data);
/* Force timer kick when detector is just enabled */
kick_timer(hld_data, true);
enable_timer(hld_data);
}
+
+   /*
+* When in periodic mode, we only kick the timer here. Hence,
+* 

[RFC PATCH v4 08/21] watchdog/hardlockup: Decouple the hardlockup detector from perf

2019-05-23 Thread Ricardo Neri
The current default implementation of the hardlockup detector assumes that
it is implemented using perf events. However, the hardlockup detector can
be driven by other sources of non-maskable interrupts (e.g., a properly
configured timer).

Group and wrap in #ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF all the code
specific to perf: create and manage perf events, stop and start the perf-
based detector.

The generic portion of the detector (monitor the timers' thresholds, check
timestamps and detect hardlockups as well as the implementation of
arch_touch_nmi_watchdog()) is now selected with the new intermediate config
symbol CONFIG_HARDLOCKUP_DETECTOR_CORE.

The perf-based implementation of the detector selects the new intermediate
symbol. Other implementations should do the same.

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: "Rafael J. Wysocki" 
Cc: Don Zickus 
Cc: Nicholas Piggin 
Cc: Michael Ellerman 
Cc: Frederic Weisbecker 
Cc: Alexei Starovoitov 
Cc: Babu Moger 
Cc: "David S. Miller" 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Mathieu Desnoyers 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Philippe Ombredanne 
Cc: Colin Ian King 
Cc: Byungchul Park 
Cc: "Paul E. McKenney" 
Cc: "Luis R. Rodriguez" 
Cc: Waiman Long 
Cc: Josh Poimboeuf 
Cc: Randy Dunlap 
Cc: Davidlohr Bueso 
Cc: Marc Zyngier 
Cc: Kai-Heng Feng 
Cc: Konrad Rzeszutek Wilk 
Cc: David Rientjes 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: sparcli...@vger.kernel.org
Cc: linuxppc-...@lists.ozlabs.org
Signed-off-by: Ricardo Neri 
---
 include/linux/nmi.h   |  5 -
 kernel/Makefile   |  2 +-
 kernel/watchdog_hld.c | 32 
 lib/Kconfig.debug |  4 
 4 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5a8b19749769..e5f1a86e20b7 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -94,8 +94,11 @@ static inline void hardlockup_detector_disable(void) {}
 # define NMI_WATCHDOG_SYSCTL_PERM  0444
 #endif
 
-#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_CORE)
 extern void arch_touch_nmi_watchdog(void);
+#endif
+
+#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_disable(void);
diff --git a/kernel/Makefile b/kernel/Makefile
index 33824f0385b3..d07d52a03cc9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -83,7 +83,7 @@ obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_CORE) += watchdog_hld.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index b352e507b17f..bb6435978c46 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,12 +22,8 @@
 
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
-static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
-static atomic_t watchdog_cpus = ATOMIC_INIT(0);
 
 notrace void arch_touch_nmi_watchdog(void)
 {
@@ -98,14 +94,6 @@ static inline bool watchdog_check_timestamp(void)
 }
 #endif
 
-static struct perf_event_attr wd_hw_attr = {
-   .type   = PERF_TYPE_HARDWARE,
-   .config = PERF_COUNT_HW_CPU_CYCLES,
-   .size   = sizeof(struct perf_event_attr),
-   .pinned = 1,
-   .disabled   = 1,
-};
-
 void inspect_for_hardlockups(struct pt_regs *regs)
 {
if (__this_cpu_read(watchdog_nmi_touch) == true) {
@@ -157,6 +145,24 @@ void inspect_for_hardlockups(struct pt_regs *regs)
return;
 }
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+#undef pr_fmt
+#define pr_fmt(fmt) "NMI perf watchdog: " fmt
+
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
+static struct cpumask dead_events_mask;
+
+static atomic_t watchdog_cpus = ATOMIC_INIT(0);
+
+static struct perf_event_attr wd_hw_attr = {
+   .type   = PERF_TYPE_HARDWARE,
+   .config = PERF_COUNT_HW_CPU_CYCLES,
+   .size   = sizeof(struct perf_event_attr),
+   .pinned = 1,
+   .disabled   = 1,
+};
+
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
   struct perf_sample_data *data,
@@ -298,3 +304,5 @@ int __init hardlockup_detector_perf_init(void)
}
return 

[RFC PATCH v4 01/21] x86/msi: Add definition for NMI delivery mode

2019-05-23 Thread Ricardo Neri
Until now, the delivery mode of MSI interrupts has been set to the default
mode configured in the APIC driver. However, hardware imposes no
restriction against configuring each interrupt with a different delivery
mode. Specifying the delivery mode per interrupt is useful when one wants
to change the delivery mode of a particular interrupt; for instance, to
deliver it as non-maskable.
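
As a hedged illustration (not part of this patch), a caller could compose
the MSI data word for an edge-triggered NMI with the existing msidef.h
helpers; the vector value is ignored by the CPU for NMI delivery and is
shown only for completeness:

  msi_data = MSI_DATA_TRIGGER_EDGE |
             MSI_DATA_LEVEL_ASSERT |
             MSI_DATA_DELIVERY_NMI |
             MSI_DATA_VECTOR(0);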

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Joerg Roedel 
Cc: Juergen Gross 
Cc: Bjorn Helgaas 
Cc: Wincy Van 
Cc: Kate Stewart 
Cc: Philippe Ombredanne 
Cc: "Eric W. Biederman" 
Cc: Baoquan He 
Cc: Jan Kiszka 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/msidef.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index ee2f8ccc32d0..38ccfdc2d96e 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -18,6 +18,7 @@
 #define MSI_DATA_DELIVERY_MODE_SHIFT   8
 #define  MSI_DATA_DELIVERY_FIXED   (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
 #define  MSI_DATA_DELIVERY_LOWPRI  (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define  MSI_DATA_DELIVERY_NMI (4 << MSI_DATA_DELIVERY_MODE_SHIFT)
 
 #define MSI_DATA_LEVEL_SHIFT   14
 #define MSI_DATA_LEVEL_DEASSERT(0 << MSI_DATA_LEVEL_SHIFT)
-- 
2.17.1



[RFC PATCH v4 05/21] x86/hpet: Reserve timer for the HPET hardlockup detector

2019-05-23 Thread Ricardo Neri
HPET timer 2 will be used to drive the HPET-based hardlockup detector.
Reserve this timer to ensure it cannot be used by user space programs or
for clock events.

When looking for MSI-capable timers for clock events, skip timer 2 if
the HPET hardlockup detector is selected.

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h |  3 +++
 arch/x86/kernel/hpet.c  | 19 ---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index e7098740f5ee..6f099e2781ce 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -61,6 +61,9 @@
  */
 #define HPET_MIN_PERIOD10UL
 
+/* Timer used for the hardlockup detector */
+#define HPET_WD_TIMER_NR 2
+
 /* hpet memory map physical address */
 extern unsigned long hpet_address;
 extern unsigned long force_hpet_address;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 1723d55219e8..ff0250831786 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -173,7 +173,8 @@ do {
\
 
 /*
  * When the hpet driver (/dev/hpet) is enabled, we need to reserve
- * timer 0 and timer 1 in case of RTC emulation.
+ * timer 0 and timer 1 in case of RTC emulation. Timer 2 is reserved in case
+ * the HPET-based hardlockup detector is used.
  */
 #ifdef CONFIG_HPET
 
@@ -183,7 +184,7 @@ static void hpet_reserve_platform_timers(unsigned int id)
 {
struct hpet __iomem *hpet = hpet_virt_address;
	struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
-   unsigned int nrtimers, i;
+   unsigned int nrtimers, i, start_timer;
struct hpet_data hd;
 
nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
@@ -198,6 +199,13 @@ static void hpet_reserve_platform_timers(unsigned int id)
	hpet_reserve_timer(&hd, 1);
 #endif
 
+   if (IS_ENABLED(CONFIG_X86_HARDLOCKUP_DETECTOR_HPET)) {
+		hpet_reserve_timer(&hd, HPET_WD_TIMER_NR);
+   start_timer = HPET_WD_TIMER_NR + 1;
+   } else {
+   start_timer = HPET_WD_TIMER_NR;
+   }
+
/*
 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
 * is wrong for i8259!) not the output IRQ.  Many BIOS writers
@@ -206,7 +214,7 @@ static void hpet_reserve_platform_timers(unsigned int id)
hd.hd_irq[0] = HPET_LEGACY_8254;
hd.hd_irq[1] = HPET_LEGACY_RTC;
 
-   for (i = 2; i < nrtimers; timer++, i++) {
+   for (i = start_timer; i < nrtimers; timer++, i++) {
		hd.hd_irq[i] = (readl(&timer->hpet_config) &
Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
}
@@ -651,6 +659,11 @@ static void hpet_msi_capability_lookup(unsigned int 
start_timer)
		struct hpet_dev *hdev = &hpet_devs[num_timers_used];
unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
 
+   /* Do not use timer reserved for the HPET watchdog. */
+   if (IS_ENABLED(CONFIG_X86_HARDLOCKUP_DETECTOR_HPET) &&
+   i == HPET_WD_TIMER_NR)
+   continue;
+
/* Only consider HPET timer with MSI support */
if (!(cfg & HPET_TN_FSB_CAP))
continue;
-- 
2.17.1



[RFC PATCH v4 07/21] watchdog/hardlockup: Define a generic function to detect hardlockups

2019-05-23 Thread Ricardo Neri
The procedure to detect hardlockups is independent of the underlying
mechanism that generates the non-maskable interrupt used to drive the
detector. Thus, it can be put in a separate, generic function. In this
manner, it can be invoked by various implementations of the NMI watchdog.

For this purpose, move the bulk of watchdog_overflow_callback() to the
new function inspect_for_hardlockups(). This function can then be called
from the applicable NMI handlers.
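
For example (sketch only; is_my_nmi_source() is a placeholder for whatever
source check a given implementation performs), another NMI watchdog could
call the new function from its handler like this:

  static int example_watchdog_nmi_handler(unsigned int type,
                                          struct pt_regs *regs)
  {
          if (!is_my_nmi_source())
                  return NMI_DONE;

          inspect_for_hardlockups(regs);
          return NMI_HANDLED;
  }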

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Don Zickus 
Cc: Nicholas Piggin 
Cc: Michael Ellerman 
Cc: Frederic Weisbecker 
Cc: Babu Moger 
Cc: "David S. Miller" 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Mathieu Desnoyers 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Cc: Andrew Morton 
Cc: Philippe Ombredanne 
Cc: Colin Ian King 
Cc: "Luis R. Rodriguez" 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: sparcli...@vger.kernel.org
Cc: linuxppc-...@lists.ozlabs.org
Signed-off-by: Ricardo Neri 
---
 include/linux/nmi.h   |  1 +
 kernel/watchdog_hld.c | 18 +++---
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 9003e29cde46..5a8b19749769 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -212,6 +212,7 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 void __user *, size_t *, loff_t *);
+void inspect_for_hardlockups(struct pt_regs *regs);
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #include 
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..b352e507b17f 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,14 +106,8 @@ static struct perf_event_attr wd_hw_attr = {
.disabled   = 1,
 };
 
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
-  struct perf_sample_data *data,
-  struct pt_regs *regs)
+void inspect_for_hardlockups(struct pt_regs *regs)
 {
-   /* Ensure the watchdog never gets throttled */
-   event->hw.interrupts = 0;
-
if (__this_cpu_read(watchdog_nmi_touch) == true) {
__this_cpu_write(watchdog_nmi_touch, false);
return;
@@ -163,6 +157,16 @@ static void watchdog_overflow_callback(struct perf_event 
*event,
return;
 }
 
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+  struct perf_sample_data *data,
+  struct pt_regs *regs)
+{
+   /* Ensure the watchdog never gets throttled */
+   event->hw.interrupts = 0;
+   inspect_for_hardlockups(regs);
+}
+
 static int hardlockup_detector_event_create(void)
 {
unsigned int cpu = smp_processor_id();
-- 
2.17.1



[RFC PATCH v4 04/21] x86/hpet: Add hpet_set_comparator() for periodic and one-shot modes

2019-05-23 Thread Ricardo Neri
Instead of setting the timer period directly in hpet_set_periodic(), add a
new helper function hpet_set_comparator() that only sets the accumulator
and comparator. hpet_set_periodic() will only prepare the timer for
periodic mode and leave the expiration programming to
hpet_set_comparator().

This new function can also be used by other components (e.g., the HPET-
based hardlockup detector) which also need to configure HPET timers. Thus,
add its declaration into the hpet header file.
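
A hedged usage sketch (timer number and values are arbitrary and only for
illustration):

  u32 delta = 1000000;                   /* desired ticks until expiration */
  u32 now = hpet_readl(HPET_COUNTER);

  /* One-shot: expire once, 'delta' ticks from now */
  hpet_set_comparator(2, now + delta, 0);

  /* Periodic: first expiration at now + delta, then every 'delta' ticks */
  hpet_set_comparator(2, now + delta, delta);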

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Originally-by: Suravee Suthikulpanit 
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h |  1 +
 arch/x86/kernel/hpet.c  | 57 +
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index f132fbf984d4..e7098740f5ee 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -102,6 +102,7 @@ extern int hpet_rtc_timer_init(void);
 extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
 extern int hpet_register_irq_handler(rtc_irq_handler handler);
 extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
+extern void hpet_set_comparator(int num, unsigned int cmp, unsigned int 
period);
 
 #endif /* CONFIG_HPET_EMULATE_RTC */
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 5e86e024c489..1723d55219e8 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -290,6 +290,47 @@ static void hpet_legacy_clockevent_register(void)
printk(KERN_DEBUG "hpet clockevent registered\n");
 }
 
+/**
+ * hpet_set_comparator() - Helper function for setting comparator register
+ * @num:   The timer ID
+ * @cmp:   The value to be written to the comparator/accumulator
+ * @period:The value to be written to the period (0 = oneshot mode)
+ *
+ * Helper function for updating comparator, accumulator and period values.
+ *
+ * In periodic mode, HPET needs HPET_TN_SETVAL to be set before writing
+ * to the Tn_CMP to update the accumulator. Then, HPET needs a second
+ * write (with HPET_TN_SETVAL cleared) to Tn_CMP to set the period.
+ * The HPET_TN_SETVAL bit is automatically cleared after the first write.
+ *
+ * For one-shot mode, HPET_TN_SETVAL does not need to be set.
+ *
+ * See the following documents:
+ *   - Intel IA-PC HPET (High Precision Event Timers) Specification
+ *   - AMD-8111 HyperTransport I/O Hub Data Sheet, Publication # 24674
+ */
+void hpet_set_comparator(int num, unsigned int cmp, unsigned int period)
+{
+   if (period) {
+   unsigned int v = hpet_readl(HPET_Tn_CFG(num));
+
+   hpet_writel(v | HPET_TN_SETVAL, HPET_Tn_CFG(num));
+   }
+
+   hpet_writel(cmp, HPET_Tn_CMP(num));
+
+   if (!period)
+   return;
+
+   /*
+* This delay is seldom used: never in one-shot mode and in periodic
+* only when reprogramming the timer.
+*/
+   udelay(1);
+   hpet_writel(period, HPET_Tn_CMP(num));
+}
+EXPORT_SYMBOL_GPL(hpet_set_comparator);
+
 static int hpet_set_periodic(struct clock_event_device *evt, int timer)
 {
unsigned int cfg, cmp, now;
@@ -301,19 +342,11 @@ static int hpet_set_periodic(struct clock_event_device 
*evt, int timer)
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned int)delta;
cfg = hpet_readl(HPET_Tn_CFG(timer));
-   cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
-  HPET_TN_32BIT;
+   cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_32BIT;
hpet_writel(cfg, HPET_Tn_CFG(timer));
-   hpet_writel(cmp, HPET_Tn_CMP(timer));
-   udelay(1);
-   /*
-* HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
-* cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
-* bit is automatically cleared after the first write.
-* (See AMD-8111 HyperTransport I/O Hub Data Sheet,
-* Publication # 24674)
-*/
-   hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+
+   hpet_set_comparator(timer, cmp, (unsigned int)delta);
+
hpet_start_counter();
hpet_print_config();
 
-- 
2.17.1



[RFC PATCH v4 03/21] x86/hpet: Calculate ticks-per-second in a separate function

2019-05-23 Thread Ricardo Neri
It is easier to compute the expiration times of an HPET timer by using
its frequency (i.e., the number of times it ticks in a second) than its
period, as given in the capabilities register.

In addition to the HPET char driver, the HPET-based hardlockup detector
will also need to know the timer's frequency. Thus, create a common
function that both can use.
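
A worked example of the conversion, using the common 69841279 fs HPET
period (the numbers are illustrative, not taken from this patch):

  u64 period = 69841279ULL;                       /* femtoseconds */
  u64 ticks_per_sec = (1000000000000000ULL + period / 2) / period;
  /* ticks_per_sec == 14318180, i.e. roughly 14.318 MHz */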

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 
---
 drivers/char/hpet.c  | 31 ---
 include/linux/hpet.h |  1 +
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 3a1e6b3ccd10..747255f552a9 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -836,6 +836,29 @@ static unsigned long hpet_calibrate(struct hpets *hpetp)
return ret;
 }
 
+u64 hpet_get_ticks_per_sec(u64 hpet_caps)
+{
+   u64 ticks_per_sec, period;
+
+   period = (hpet_caps & HPET_COUNTER_CLK_PERIOD_MASK) >>
+HPET_COUNTER_CLK_PERIOD_SHIFT; /* fs, 10^-15 */
+
+   /*
+* The frequency is the reciprocal of the period. The period is given
+* in femtoseconds per second. Thus, prepare a dividend to obtain the
+* frequency in ticks per second.
+*/
+
+   /* 10^15 femtoseconds per second */
+   ticks_per_sec = 1000ULL;
+   ticks_per_sec += period >> 1; /* round */
+
+   /* The quotient is put in the dividend. We drop the remainder. */
+   do_div(ticks_per_sec, period);
+
+   return ticks_per_sec;
+}
+
 int hpet_alloc(struct hpet_data *hdp)
 {
u64 cap, mcfg;
@@ -844,7 +867,6 @@ int hpet_alloc(struct hpet_data *hdp)
struct hpets *hpetp;
struct hpet __iomem *hpet;
static struct hpets *last;
-   unsigned long period;
unsigned long long temp;
u32 remainder;
 
@@ -894,12 +916,7 @@ int hpet_alloc(struct hpet_data *hdp)
 
last = hpetp;
 
-   period = (cap & HPET_COUNTER_CLK_PERIOD_MASK) >>
-   HPET_COUNTER_CLK_PERIOD_SHIFT; /* fs, 10^-15 */
-   temp = 1000uLL; /* 10^15 femtoseconds per second */
-   temp += period >> 1; /* round */
-   do_div(temp, period);
-   hpetp->hp_tick_freq = temp; /* ticks per second */
+   hpetp->hp_tick_freq = hpet_get_ticks_per_sec(cap);
 
printk(KERN_INFO "hpet%d: at MMIO 0x%lx, IRQ%s",
hpetp->hp_which, hdp->hd_phys_address,
diff --git a/include/linux/hpet.h b/include/linux/hpet.h
index 8604564b985d..e7b36bcf4699 100644
--- a/include/linux/hpet.h
+++ b/include/linux/hpet.h
@@ -107,5 +107,6 @@ static inline void hpet_reserve_timer(struct hpet_data *hd, 
int timer)
 }
 
 int hpet_alloc(struct hpet_data *);
+u64 hpet_get_ticks_per_sec(u64 hpet_caps);
 
 #endif /* !__HPET__ */
-- 
2.17.1



[RFC PATCH v4 18/21] x86/apic: Add a parameter for the APIC delivery mode

2019-05-23 Thread Ricardo Neri
Until now, the delivery mode of APIC interrupts has been set to the default
mode configured in the APIC driver. However, hardware imposes no
restriction against configuring each interrupt with a different delivery
mode. Specifying the delivery mode per interrupt is useful when one wants
to change the delivery mode of a particular interrupt; for instance, to
deliver it as non-maskable.

Add a new member, delivery_mode, to struct irq_cfg. This new member can
be used to update the configuration of the delivery mode in each interrupt
domain. Likewise, add equivalent macros to populate MSI messages.

Currently, all interrupt domains set the delivery mode of interrupts using
the APIC setting. Interrupt domains use an irq_cfg data structure to
configure their own data structures and hardware resources. Thus, in order
to keep the current behavior, set the delivery mode of the irq
configuration to the APIC setting. In this manner, irq domains can
obtain the delivery mode from the irq configuration data instead of the
APIC setting, if needed.
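
For illustration only, a child domain could then build its MSI data from
the per-interrupt configuration rather than the APIC default; 'cfg' and
'msg' are assumed to be the irq_cfg and msi_msg already available in such
code paths:

  msg->data = MSI_DATA_TRIGGER_EDGE |
              MSI_DATA_LEVEL_ASSERT |
              MSI_DATA_DELIVERY_MODE(cfg->delivery_mode) |
              MSI_DATA_VECTOR(cfg->vector);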

Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Borislav Petkov 
Cc: Jacob Pan 
Cc: Joerg Roedel 
Cc: Juergen Gross 
Cc: Bjorn Helgaas 
Cc: Wincy Van 
Cc: Kate Stewart 
Cc: Philippe Ombredanne 
Cc: "Eric W. Biederman" 
Cc: Baoquan He 
Cc: Jan Kiszka 
Cc: Lu Baolu 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Cc: iommu@lists.linux-foundation.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hw_irq.h |  5 +++--
 arch/x86/include/asm/msidef.h |  3 +++
 arch/x86/kernel/apic/vector.c | 10 ++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 32e666e1231e..c024e5976b78 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -117,8 +117,9 @@ struct irq_alloc_info {
 };
 
 struct irq_cfg {
-   unsigned intdest_apicid;
-   unsigned intvector;
+   unsigned intdest_apicid;
+   unsigned intvector;
+   enum ioapic_irq_destination_types   delivery_mode;
 };
 
 extern struct irq_cfg *irq_cfg(unsigned int irq);
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index 38ccfdc2d96e..6d666c90f057 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -16,6 +16,9 @@
 MSI_DATA_VECTOR_MASK)
 
 #define MSI_DATA_DELIVERY_MODE_SHIFT   8
+#define MSI_DATA_DELIVERY_MODE_MASK0x0700
+#define MSI_DATA_DELIVERY_MODE(dm) (((dm) << MSI_DATA_DELIVERY_MODE_SHIFT) 
& \
+MSI_DATA_DELIVERY_MODE_MASK)
 #define  MSI_DATA_DELIVERY_FIXED   (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
 #define  MSI_DATA_DELIVERY_LOWPRI  (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
 #define  MSI_DATA_DELIVERY_NMI (4 << MSI_DATA_DELIVERY_MODE_SHIFT)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3173e07d3791..99436fe7e932 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -548,6 +548,16 @@ static int x86_vector_alloc_irqs(struct irq_domain 
*domain, unsigned int virq,
irqd->chip_data = apicd;
irqd->hwirq = virq + i;
irqd_set_single_target(irqd);
+
+   /*
+* Initialize the delivery mode of this irq to match the
+* default delivery mode of the APIC. This is useful for
+* children irq domains which want to take the delivery
+* mode from the individual irq configuration rather
+* than from the APIC.
+*/
+		apicd->hw_irq_cfg.delivery_mode = apic->irq_delivery_mode;
+
/*
 * Legacy vectors are already assigned when the IOAPIC
 * takes them over. They stay on the same vector. This is
-- 
2.17.1



[RFC PATCH v4 02/21] x86/hpet: Expose hpet_writel() in header

2019-05-23 Thread Ricardo Neri
In order to allow hpet_writel() to be used by other components (e.g.,
the HPET-based hardlockup detector) expose it in the HPET header file.

No empty definition is needed if CONFIG_HPET is not selected, as all
existing callers select this config symbol.

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h | 1 +
 arch/x86/kernel/hpet.c  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 67385d56d4f4..f132fbf984d4 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -72,6 +72,7 @@ extern int is_hpet_enabled(void);
 extern int hpet_enable(void);
 extern void hpet_disable(void);
 extern unsigned int hpet_readl(unsigned int a);
+extern void hpet_writel(unsigned int d, unsigned int a);
 extern void force_hpet_resume(void);
 
 struct irq_data;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index a0573f2e7763..5e86e024c489 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -62,7 +62,7 @@ inline unsigned int hpet_readl(unsigned int a)
return readl(hpet_virt_address + a);
 }
 
-static inline void hpet_writel(unsigned int d, unsigned int a)
+inline void hpet_writel(unsigned int d, unsigned int a)
 {
writel(d, hpet_virt_address + a);
 }
-- 
2.17.1



[RFC PATCH v4 06/21] x86/hpet: Configure the timer used by the hardlockup detector

2019-05-23 Thread Ricardo Neri
Implement the initial configuration of the timer to be used by the
hardlockup detector. Return a data structure with a description of the
timer; this information is subsequently used by the hardlockup detector.

Only provide the timer if it supports Front Side Bus interrupt delivery.
This condition greatly simplifies the implementation of the detector.
Specifically, it helps to avoid the complexities of routing the interrupt
via the IO-APIC (e.g., potential race conditions that arise from re-
programming the IO-APIC in NMI context).
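
A minimal sketch of how the detector is expected to consume the returned
description (error handling and the message are illustrative only):

  struct hpet_hld_data *hdata;

  hdata = hpet_hardlockup_detector_assign_timer();
  if (!hdata)
          return -ENODEV;         /* no FSB-capable timer available */

  pr_info("HPET timer %u, %llu ticks/s, periodic: %d\n",
          hdata->num, hdata->ticks_per_second, hdata->has_periodic);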

Cc: "H. Peter Anvin" 
Cc: Ashok Raj 
Cc: Andi Kleen 
Cc: Tony Luck 
Cc: Clemens Ladisch 
Cc: Arnd Bergmann 
Cc: Philippe Ombredanne 
Cc: Kate Stewart 
Cc: "Rafael J. Wysocki" 
Cc: Stephane Eranian 
Cc: Suravee Suthikulpanit 
Cc: "Ravi V. Shankar" 
Cc: x...@kernel.org
Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/hpet.h | 13 +
 arch/x86/kernel/hpet.c  | 35 +++
 2 files changed, 48 insertions(+)

diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 6f099e2781ce..20abdaa5372d 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -109,6 +109,19 @@ extern void hpet_set_comparator(int num, unsigned int cmp, 
unsigned int period);
 
 #endif /* CONFIG_HPET_EMULATE_RTC */
 
+#ifdef CONFIG_X86_HARDLOCKUP_DETECTOR_HPET
+struct hpet_hld_data {
+   boolhas_periodic;
+   u32 num;
+   u64 ticks_per_second;
+};
+
+extern struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void);
+#else
+static inline struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void)
+{ return NULL; }
+#endif /* CONFIG_X86_HARDLOCKUP_DETECTOR_HPET */
+
 #else /* CONFIG_HPET_TIMER */
 
 static inline int hpet_enable(void) { return 0; }
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ff0250831786..5f9209949fc7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -171,6 +171,41 @@ do {   
\
_hpet_print_config(__func__, __LINE__); \
 } while (0)
 
+#ifdef CONFIG_X86_HARDLOCKUP_DETECTOR_HPET
+struct hpet_hld_data *hpet_hardlockup_detector_assign_timer(void)
+{
+   struct hpet_hld_data *hdata;
+   u64 temp;
+   u32 cfg;
+
+   cfg = hpet_readl(HPET_Tn_CFG(HPET_WD_TIMER_NR));
+
+   if (!(cfg & HPET_TN_FSB_CAP))
+   return NULL;
+
+   hdata = kzalloc(sizeof(*hdata), GFP_KERNEL);
+   if (!hdata)
+   return NULL;
+
+   if (cfg & HPET_TN_PERIODIC_CAP)
+   hdata->has_periodic = true;
+
+   hdata->num = HPET_WD_TIMER_NR;
+
+   cfg = hpet_readl(HPET_PERIOD);
+
+   /*
+* hpet_get_ticks_per_sec() expects the contents of the general
+* capabilities register. The period is in the 32 most significant
+* bits.
+*/
+   temp = (u64)cfg << HPET_COUNTER_CLK_PERIOD_SHIFT;
+   hdata->ticks_per_second = hpet_get_ticks_per_sec(temp);
+
+   return hdata;
+}
+#endif /* CONFIG_X86_HARDLOCKUP_DETECTOR_HPET */
+
 /*
  * When the hpet driver (/dev/hpet) is enabled, we need to reserve
  * timer 0 and timer 1 in case of RTC emulation. Timer 2 is reserved in case
-- 
2.17.1



[RFC PATCH v4 09/21] x86/nmi: Add a NMI_WATCHDOG NMI handler category

2019-05-23 Thread Ricardo Neri
Add NMI_WATCHDOG as a new category of NMI handler. This new category
is to be used with the HPET-based hardlockup detector. This detector
does not have a direct way of checking if the HPET timer is the source of
the NMI. Instead, it indirectly estimates this using the time-stamp counter.

Therefore, we may have false positives in case another NMI occurs within
the estimated time window. For this reason, we want the handler of the
detector to be called after all the NMI_LOCAL handlers. A simple way
of achieving this is with a new NMI handler category.
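
With the new category, the HPET detector (or any other user) would register
its handler roughly as follows; hld_nmi_handler and the "hpet_hld" name are
placeholders for this sketch:

  static int hld_nmi_handler(unsigned int type, struct pt_regs *regs)
  {
          /* check the TSC-based time window, inspect for hardlockups */
          return NMI_HANDLED;
  }

  ret = register_nmi_handler(NMI_WATCHDOG, hld_nmi_handler, 0, "hpet_hld");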

Signed-off-by: Ricardo Neri 
---
 arch/x86/include/asm/nmi.h |  1 +
 arch/x86/kernel/nmi.c  | 10 ++
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 75ded1d13d98..75aa98313cde 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -29,6 +29,7 @@ enum {
NMI_UNKNOWN,
NMI_SERR,
NMI_IO_CHECK,
+   NMI_WATCHDOG,
NMI_MAX
 };
 
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4df7705022b9..43e96aedc6fe 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -64,6 +64,10 @@ static struct nmi_desc nmi_desc[NMI_MAX] =
		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
.head = LIST_HEAD_INIT(nmi_desc[3].head),
},
+   {
+		.lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[4].lock),
+   .head = LIST_HEAD_INIT(nmi_desc[4].head),
+   },
 
 };
 
@@ -174,6 +178,8 @@ int __register_nmi_handler(unsigned int type, struct 
nmiaction *action)
 */
	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
+	WARN_ON_ONCE(type == NMI_WATCHDOG && !list_empty(&desc->head));
+
 
/*
 * some handlers need to be executed first otherwise a fake
@@ -384,6 +390,10 @@ static void default_do_nmi(struct pt_regs *regs)
}
	raw_spin_unlock(&nmi_reason_lock);
 
+   handled = nmi_handle(NMI_WATCHDOG, regs);
+   if (handled == NMI_HANDLED)
+   return;
+
/*
 * Only one NMI can be latched at a time.  To handle
 * this we may process multiple nmi handlers at once to
-- 
2.17.1



[PATCH v5 00/10] Support using MSI interrupts in ntb_transport

2019-05-23 Thread Logan Gunthorpe
This is another resend as there has been no feedback since v4.
Seems Jon has been MIA this past cycle so hopefully he appears on the
list soon.

I've addressed the feedback so far and rebased on the latest kernel
and would like this to be considered for merging this cycle.

The only outstanding issue I know of is that it still will not work
with IDT hardware, but ntb_transport doesn't work with IDT hardware
and there is still no sensible common infrastructure to support
ntb_peer_mw_set_trans(). Thus, I decline to consider that complication
in this patchset. However, I'll be happy to review work that adds this
feature in the future.

Also, as the port number and resource index stuff is a bit complicated,
I made a quick out-of-tree test fixture to ensure it's correct[1]. As
an exercise I also wrote some test code[2] using the upcoming KUnit
feature.

Logan

[1] https://repl.it/repls/ExcitingPresentFile
[2] https://github.com/sbates130272/linux-p2pmem/commits/ntb_kunit

--

Changes in v5:

* Rebased onto v5.2-rc1 (plus the patches in ntb-next)

--

Changes in v4:

* Rebased onto v5.1-rc6 (No changes)

* Numerous grammar and spelling mistakes spotted by Bjorn

--

Changes in v3:

* Rebased onto v5.1-rc1 (Dropped the first two patches as they have
  been merged, and cleaned up some minor conflicts in the PCI tree)

* Added a new patch (#3) to calculate logical port numbers that
  are port numbers from 0 to (number of ports - 1). This is
  then used in ntb_peer_resource_idx() to fix the issues brought
  up by Serge.

* Fixed missing __iomem and iowrite calls (as noticed by Serge)

* Added patch 10 which describes ntb_msi_test in the documentation
  file (as requested by Serge)

* A couple other minor nits and documentation fixes

--

Changes in v2:

* Cleaned up the changes in intel_irq_remapping.c to make them
  less confusing and add a comment. (Per discussion with Jacob and
  Joerg)

* Fixed a nit from Bjorn and collected his Ack

* Added a Kconfig dependency on CONFIG_PCI_MSI for CONFIG_NTB_MSI
  as the Kbuild robot hit a random config that didn't build
  without it.

* Worked in a callback for when the MSI descriptor changes so that
  the clients can resend the new address and data values to the peer.
  On my test system this was never necessary, but there may be
  other platforms where this can occur. I tested this by hacking
  in a path to rewrite the MSI descriptor when I change the cpu
  affinity of an IRQ. There's a bit of uncertainty over the latency
   of the change, but without hardware on which this can actually occur
   we can't test this. This was the result of a discussion with Dave.

--

This patch series adds optional support for using MSI interrupts instead
of NTB doorbells in ntb_transport. This is desirable because doorbells on
current hardware are quite slow and therefore switching to MSI interrupts
provides a significant performance gain. On switchtec hardware, a simple
apples-to-apples comparison shows ntb_netdev/iperf numbers going from
3.88Gb/s to 14.1Gb/s when switching to MSI interrupts.

To do this, a couple changes are required outside of the NTB tree:

1) The IOMMU must know to accept MSI requests from aliased bus numbers,
because NTB hardware typically sends proxied request IDs through
additional requester IDs. The first patch in this series adds support
for the Intel IOMMU. A quirk to add these aliases for switchtec hardware
was already accepted. See commit ad281ecf1c7d ("PCI: Add DMA alias quirk
for Microsemi Switchtec NTB") for a description of NTB proxy IDs and why
this is necessary.

2) NTB transport (and other clients) may often need more MSI interrupts
than the NTB hardware actually advertises support for. However, because
these interrupts will not be triggered by the hardware but through an
NTB memory window, the hardware does not actually need to support them or
know about them. Therefore we add the concept of Virtual MSI
interrupts which are allocated just like any other MSI interrupt but
are not programmed into the hardware's MSI table. This is done in
Patch 2 and then made use of in Patch 3.
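
As a rough sketch of the intended usage (assuming the flag added by the
virtual-MSI patch is named PCI_IRQ_VIRTUAL; pdev, num_needed and nvec are
placeholders), a client needing more vectors than the hardware MSI table
provides might request them like this:

  nvec = pci_alloc_irq_vectors(pdev, 1, num_needed,
                               PCI_IRQ_MSI | PCI_IRQ_VIRTUAL);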

The remaining patches in this series add a library for dealing with MSI
interrupts, a test client and finally support in ntb_transport.

The series is based off of v5.1-rc6 plus the patches in ntb-next.
A git repo is available here:

https://github.com/sbates130272/linux-p2pmem/ ntb_transport_msi_v4

Thanks,

Logan

--

Logan Gunthorpe (10):
  PCI/MSI: Support allocating virtual MSI interrupts
  PCI/switchtec: Add module parameter to request more interrupts
  NTB: Introduce helper functions to calculate logical port number
  NTB: Introduce functions to calculate multi-port resource index
  NTB: Rename ntb.c to support multiple source files in the module
  NTB: Introduce MSI library
  NTB: Introduce NTB MSI Test Client
  NTB: Add ntb_msi_test support to ntb_test
  NTB: Add MSI interrupt support to ntb_transport
  NTB: Describe the ntb_msi_test client in the documentation.

 

[PATCH v5 08/10] NTB: Add ntb_msi_test support to ntb_test

2019-05-23 Thread Logan Gunthorpe
When the ntb_msi_test module is available, the test code will trigger
each of the interrupts and ensure the corresponding occurrences file
gets incremented.

Signed-off-by: Logan Gunthorpe 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
---
 tools/testing/selftests/ntb/ntb_test.sh | 54 -
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/ntb/ntb_test.sh 
b/tools/testing/selftests/ntb/ntb_test.sh
index 17ca36403d04..1a10b8f67727 100755
--- a/tools/testing/selftests/ntb/ntb_test.sh
+++ b/tools/testing/selftests/ntb/ntb_test.sh
@@ -87,10 +87,10 @@ set -e
 
 function _modprobe()
 {
-   modprobe "$@"
+   modprobe "$@" || return 1
 
if [[ "$REMOTE_HOST" != "" ]]; then
-   ssh "$REMOTE_HOST" modprobe "$@"
+   ssh "$REMOTE_HOST" modprobe "$@" || return 1
fi
 }
 
@@ -451,6 +451,30 @@ function pingpong_test()
echo "  Passed"
 }
 
+function msi_test()
+{
+   LOC=$1
+   REM=$2
+
+   write_file 1 $LOC/ready
+
+   echo "Running MSI interrupt tests on: $(subdirname $LOC) / $(subdirname 
$REM)"
+
+   CNT=$(read_file "$LOC/count")
+   for ((i = 0; i < $CNT; i++)); do
+   START=$(read_file $REM/../irq${i}_occurrences)
+   write_file $i $LOC/trigger
+   END=$(read_file $REM/../irq${i}_occurrences)
+
+   if [[ $(($END - $START)) != 1 ]]; then
+   echo "MSI did not trigger the interrupt on the remote 
side!" >&2
+   exit 1
+   fi
+   done
+
+   echo "  Passed"
+}
+
 function perf_test()
 {
USE_DMA=$1
@@ -529,6 +553,29 @@ function ntb_pingpong_tests()
_modprobe -r ntb_pingpong
 }
 
+function ntb_msi_tests()
+{
+   LOCAL_MSI="$DEBUGFS/ntb_msi_test/$LOCAL_DEV"
+   REMOTE_MSI="$REMOTE_HOST:$DEBUGFS/ntb_msi_test/$REMOTE_DEV"
+
+   echo "Starting ntb_msi_test tests..."
+
+   if ! _modprobe ntb_msi_test 2> /dev/null; then
+   echo "  Not doing MSI tests seeing the module is not available."
+   return
+   fi
+
+   port_test $LOCAL_MSI $REMOTE_MSI
+
+   LOCAL_PEER="$LOCAL_MSI/peer$LOCAL_PIDX"
+   REMOTE_PEER="$REMOTE_MSI/peer$REMOTE_PIDX"
+
+   msi_test $LOCAL_PEER $REMOTE_PEER
+   msi_test $REMOTE_PEER $LOCAL_PEER
+
+   _modprobe -r ntb_msi_test
+}
+
 function ntb_perf_tests()
 {
LOCAL_PERF="$DEBUGFS/ntb_perf/$LOCAL_DEV"
@@ -550,6 +597,7 @@ function cleanup()
_modprobe -r ntb_perf 2> /dev/null
_modprobe -r ntb_pingpong 2> /dev/null
_modprobe -r ntb_transport 2> /dev/null
+   _modprobe -r ntb_msi_test 2> /dev/null
set -e
 }
 
@@ -586,5 +634,7 @@ ntb_tool_tests
 echo
 ntb_pingpong_tests
 echo
+ntb_msi_tests
+echo
 ntb_perf_tests
 echo
-- 
2.20.1



[PATCH v5 07/10] NTB: Introduce NTB MSI Test Client

2019-05-23 Thread Logan Gunthorpe
Introduce a tool to test NTB MSI interrupts similar to the other
NTB test tools. This tool creates a debugfs directory for each
NTB device with the following files:

port
irqX_occurrences
peerX/port
peerX/count
peerX/trigger

The 'port' file tells the user the local port number and the
'occurrences' files tell the number of local interrupts that
have been received for each interrupt.

For each peer, the 'port' file and the 'count' file tell you the
peer's port number and number of interrupts respectively. Writing
the interrupt number to the 'trigger' file triggers the interrupt
handler for the peer which should increment their corresponding
'occurrences' file. The 'ready' file indicates if a peer is ready;
writing to this file blocks until it is ready.

The module parameter num_irqs can be used to set the number of
local interrupts. By default this is 4. This is only limited by
the number of unused MSI interrupts registered by the hardware
(this will require support in the hardware driver) and there must
be at least 2*num_irqs + 1 scratchpad (SPAD) registers available.

Signed-off-by: Logan Gunthorpe 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
---
 drivers/ntb/test/Kconfig|   9 +
 drivers/ntb/test/Makefile   |   1 +
 drivers/ntb/test/ntb_msi_test.c | 433 
 3 files changed, 443 insertions(+)
 create mode 100644 drivers/ntb/test/ntb_msi_test.c

diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig
index a5d0eda44438..a3f3e2638935 100644
--- a/drivers/ntb/test/Kconfig
+++ b/drivers/ntb/test/Kconfig
@@ -25,3 +25,12 @@ config NTB_PERF
 to and from the window without additional software interaction.
 
 If unsure, say N.
+
+config NTB_MSI_TEST
+   tristate "NTB MSI Test Client"
+   depends on NTB_MSI
+   help
+ This tool demonstrates the use of the NTB MSI library to
+ send MSI interrupts between peers.
+
+ If unsure, say N.
diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile
index 9e77e0b761c2..d2895ca995e4 100644
--- a/drivers/ntb/test/Makefile
+++ b/drivers/ntb/test/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o
 obj-$(CONFIG_NTB_TOOL) += ntb_tool.o
 obj-$(CONFIG_NTB_PERF) += ntb_perf.o
+obj-$(CONFIG_NTB_MSI_TEST) += ntb_msi_test.o
diff --git a/drivers/ntb/test/ntb_msi_test.c b/drivers/ntb/test/ntb_msi_test.c
new file mode 100644
index ..99d826ed9c34
--- /dev/null
+++ b/drivers/ntb/test/ntb_msi_test.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.1");
+MODULE_AUTHOR("Logan Gunthorpe ");
+MODULE_DESCRIPTION("Test for sending MSI interrupts over an NTB memory 
window");
+
+static int num_irqs = 4;
+module_param(num_irqs, int, 0644);
+MODULE_PARM_DESC(num_irqs, "number of irqs to use");
+
+struct ntb_msit_ctx {
+   struct ntb_dev *ntb;
+   struct dentry *dbgfs_dir;
+   struct work_struct setup_work;
+
+   struct ntb_msit_isr_ctx {
+   int irq_idx;
+   int irq_num;
+   int occurrences;
+   struct ntb_msit_ctx *nm;
+   struct ntb_msi_desc desc;
+   } *isr_ctx;
+
+   struct ntb_msit_peer {
+   struct ntb_msit_ctx *nm;
+   int pidx;
+   int num_irqs;
+   struct completion init_comp;
+   struct ntb_msi_desc *msi_desc;
+   } peers[];
+};
+
+static struct dentry *ntb_msit_dbgfs_topdir;
+
+static irqreturn_t ntb_msit_isr(int irq, void *dev)
+{
+   struct ntb_msit_isr_ctx *isr_ctx = dev;
+   struct ntb_msit_ctx *nm = isr_ctx->nm;
+
+	dev_dbg(&nm->ntb->dev, "Interrupt Occurred: %d",
+   isr_ctx->irq_idx);
+
+   isr_ctx->occurrences++;
+
+   return IRQ_HANDLED;
+}
+
+static void ntb_msit_setup_work(struct work_struct *work)
+{
+   struct ntb_msit_ctx *nm = container_of(work, struct ntb_msit_ctx,
+  setup_work);
+   int irq_count = 0;
+   int irq;
+   int ret;
+   uintptr_t i;
+
+   ret = ntb_msi_setup_mws(nm->ntb);
+   if (ret) {
+		dev_err(&nm->ntb->dev, "Unable to setup MSI windows: %d\n",
+   ret);
+   return;
+   }
+
+   for (i = 0; i < num_irqs; i++) {
+   nm->isr_ctx[i].irq_idx = i;
+   nm->isr_ctx[i].nm = nm;
+
+   if (!nm->isr_ctx[i].irq_num) {
+   irq = ntbm_msi_request_irq(nm->ntb, ntb_msit_isr,
+  KBUILD_MODNAME,
+						   &nm->isr_ctx[i],
+						   &nm->isr_ctx[i].desc);
+   if (irq < 0)
+   break;
+
+   nm->isr_ctx[i].irq_num = irq;
+   }
+
+   ret = 

[PATCH v5 09/10] NTB: Add MSI interrupt support to ntb_transport

2019-05-23 Thread Logan Gunthorpe
Introduce the module parameter 'use_msi' which, when set, uses
MSI interrupts instead of doorbells for each queue pair (QP). The
parameter is only available if NTB MSI support is configured into
the kernel. We also require there to be more than one memory window
(MW) so that an extra one is available to forward the APIC region.

To use MSIs, we request one interrupt per QP and forward the MSI address
and data to the peer using scratch pad registers (SPADS) above the MW
SPADS. (If there are not enough SPADS the MSI interrupt will not be used.)

Once registered, we simply use ntb_msi_peer_trigger and the receiving
ISR simply queues up the rxc_db_work for the queue.

This addition can significantly improve performance of ntb_transport.
In a simple, untuned, apples-to-apples comparison using ntb_netdev
and iperf with switchtec hardware, I see 3.88Gb/s without MSI
interrupts and 14.1Gb/s with MSI, which is a more than 3x improvement.

Signed-off-by: Logan Gunthorpe 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
---
 drivers/ntb/ntb_transport.c | 169 +++-
 1 file changed, 168 insertions(+), 1 deletion(-)

diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index d4f39ba1d976..f1cf0942cb99 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -93,6 +93,12 @@ static bool use_dma;
 module_param(use_dma, bool, 0644);
 MODULE_PARM_DESC(use_dma, "Use DMA engine to perform large data copy");
 
+static bool use_msi;
+#ifdef CONFIG_NTB_MSI
+module_param(use_msi, bool, 0644);
+MODULE_PARM_DESC(use_msi, "Use MSI interrupts instead of doorbells");
+#endif
+
 static struct dentry *nt_debugfs_dir;
 
 /* Only two-ports NTB devices are supported */
@@ -188,6 +194,11 @@ struct ntb_transport_qp {
u64 tx_err_no_buf;
u64 tx_memcpy;
u64 tx_async;
+
+   bool use_msi;
+   int msi_irq;
+   struct ntb_msi_desc msi_desc;
+   struct ntb_msi_desc peer_msi_desc;
 };
 
 struct ntb_transport_mw {
@@ -221,6 +232,10 @@ struct ntb_transport_ctx {
u64 qp_bitmap;
u64 qp_bitmap_free;
 
+   bool use_msi;
+   unsigned int msi_spad_offset;
+   u64 msi_db_mask;
+
bool link_is_up;
struct delayed_work link_work;
struct work_struct link_cleanup;
@@ -667,6 +682,114 @@ static int ntb_transport_setup_qp_mw(struct 
ntb_transport_ctx *nt,
return 0;
 }
 
+static irqreturn_t ntb_transport_isr(int irq, void *dev)
+{
+   struct ntb_transport_qp *qp = dev;
+
+	tasklet_schedule(&qp->rxc_db_work);
+
+   return IRQ_HANDLED;
+}
+
+static void ntb_transport_setup_qp_peer_msi(struct ntb_transport_ctx *nt,
+   unsigned int qp_num)
+{
+	struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
+   int spad = qp_num * 2 + nt->msi_spad_offset;
+
+   if (!nt->use_msi)
+   return;
+
+   if (spad >= ntb_spad_count(nt->ndev))
+   return;
+
+   qp->peer_msi_desc.addr_offset =
+   ntb_peer_spad_read(qp->ndev, PIDX, spad);
+   qp->peer_msi_desc.data =
+   ntb_peer_spad_read(qp->ndev, PIDX, spad + 1);
+
+	dev_dbg(&qp->ndev->pdev->dev, "QP%d Peer MSI addr=%x data=%x\n",
+   qp_num, qp->peer_msi_desc.addr_offset, qp->peer_msi_desc.data);
+
+   if (qp->peer_msi_desc.addr_offset) {
+   qp->use_msi = true;
+		dev_info(&qp->ndev->pdev->dev,
+"Using MSI interrupts for QP%d\n", qp_num);
+   }
+}
+
+static void ntb_transport_setup_qp_msi(struct ntb_transport_ctx *nt,
+  unsigned int qp_num)
+{
+	struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
+   int spad = qp_num * 2 + nt->msi_spad_offset;
+   int rc;
+
+   if (!nt->use_msi)
+   return;
+
+   if (spad >= ntb_spad_count(nt->ndev)) {
+		dev_warn_once(&qp->ndev->pdev->dev,
+ "Not enough SPADS to use MSI interrupts\n");
+   return;
+   }
+
+   ntb_spad_write(qp->ndev, spad, 0);
+   ntb_spad_write(qp->ndev, spad + 1, 0);
+
+   if (!qp->msi_irq) {
+   qp->msi_irq = ntbm_msi_request_irq(qp->ndev, ntb_transport_isr,
+  KBUILD_MODNAME, qp,
+						   &qp->msi_desc);
+   if (qp->msi_irq < 0) {
+			dev_warn(&qp->ndev->pdev->dev,
+"Unable to allocate MSI interrupt for qp%d\n",
+qp_num);
+   return;
+   }
+   }
+
+   rc = ntb_spad_write(qp->ndev, spad, qp->msi_desc.addr_offset);
+   if (rc)
+   goto err_free_interrupt;
+
+   rc = ntb_spad_write(qp->ndev, spad + 1, qp->msi_desc.data);
+   if (rc)
+   goto err_free_interrupt;
+
+	dev_dbg(&qp->ndev->pdev->dev, "QP%d MSI %d addr=%x data=%x\n",
+   

[PATCH v5 06/10] NTB: Introduce MSI library

2019-05-23 Thread Logan Gunthorpe
The NTB MSI library allows passing MSI interrupts across a memory
window. This offers similar functionality to doorbells or messages,
except that it will often have much better latency and the client can
potentially use significantly more remote interrupts than typical hardware
provides for doorbells (which can be important in high-multiport
setups).

The library utilizes one memory window per peer and uses the highest
index memory windows. Before any ntb_msi function may be used, the user
must call ntb_msi_init(). It may then setup and tear down the memory
windows when the link state changes using ntb_msi_setup_mws() and
ntb_msi_clear_mws().

The peer which receives the interrupt must call ntbm_msi_request_irq()
to assign the interrupt handler (this function is functionally
similar to devm_request_irq()) and the returned descriptor must be
transferred to the peer which can use it to trigger the interrupt.
The triggering peer, once having received the descriptor, can
trigger the interrupt by calling ntb_msi_peer_trigger().
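
Putting the pieces together, a client might use the library roughly as
follows (sketch only: error handling is elided, my_isr/my_ctx/desc are
placeholders, and ntb_msi_peer_trigger()'s exact signature is assumed from
the library code):

  /* Receiving peer: set up the windows and hand out a descriptor */
  ntb_msi_init(ntb, NULL);
  ntb_msi_setup_mws(ntb);                        /* on link up */
  irq = ntbm_msi_request_irq(ntb, my_isr, "my_client", my_ctx, &desc);
  /* ... transfer 'desc' to the peer, e.g. via scratchpads ... */

  /* Triggering peer: fire the remote interrupt */
  ntb_msi_peer_trigger(ntb, peer_idx, &desc);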

Signed-off-by: Logan Gunthorpe 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
---
 drivers/ntb/Kconfig  |  11 ++
 drivers/ntb/Makefile |   3 +-
 drivers/ntb/msi.c| 415 +++
 include/linux/ntb.h  |  73 
 4 files changed, 501 insertions(+), 1 deletion(-)
 create mode 100644 drivers/ntb/msi.c

diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig
index 95944e52fa36..5760764052be 100644
--- a/drivers/ntb/Kconfig
+++ b/drivers/ntb/Kconfig
@@ -12,6 +12,17 @@ menuconfig NTB
 
 if NTB
 
+config NTB_MSI
+   bool "MSI Interrupt Support"
+   depends on PCI_MSI
+   help
+Support using MSI interrupt forwarding instead of (or in addition to)
+hardware doorbells. MSI interrupts typically offer lower latency
+than doorbells and more MSI interrupts can be made available to
+clients. However this requires an extra memory window and support
+in the hardware driver for creating the MSI interrupts.
+
+If unsure, say N.
 source "drivers/ntb/hw/Kconfig"
 
 source "drivers/ntb/test/Kconfig"
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 537226f8e78d..cc27ad2ef150 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_NTB) += ntb.o hw/ test/
 obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o
 
-ntb-y := core.o
+ntb-y  := core.o
+ntb-$(CONFIG_NTB_MSI)  += msi.o
diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c
new file mode 100644
index ..9dddf133658f
--- /dev/null
+++ b/drivers/ntb/msi.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION("0.1");
+MODULE_AUTHOR("Logan Gunthorpe ");
+MODULE_DESCRIPTION("NTB MSI Interrupt Library");
+
+struct ntb_msi {
+   u64 base_addr;
+   u64 end_addr;
+
+   void (*desc_changed)(void *ctx);
+
+   u32 __iomem *peer_mws[];
+};
+
+/**
+ * ntb_msi_init() - Initialize the MSI context
+ * @ntb:   NTB device context
+ *
+ * This function must be called before any other ntb_msi function.
+ * It initializes the context for MSI operations and maps
+ * the peer memory windows.
+ *
+ * This function reserves the last N outbound memory windows (where N
+ * is the number of peers).
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_init(struct ntb_dev *ntb,
+void (*desc_changed)(void *ctx))
+{
+   phys_addr_t mw_phys_addr;
+   resource_size_t mw_size;
+   size_t struct_size;
+   int peer_widx;
+   int peers;
+   int ret;
+   int i;
+
+   peers = ntb_peer_port_count(ntb);
+   if (peers <= 0)
+   return -EINVAL;
+
+   struct_size = sizeof(*ntb->msi) + sizeof(*ntb->msi->peer_mws) * peers;
+
+   ntb->msi = devm_kzalloc(&ntb->dev, struct_size, GFP_KERNEL);
+   if (!ntb->msi)
+   return -ENOMEM;
+
+   ntb->msi->desc_changed = desc_changed;
+
+   for (i = 0; i < peers; i++) {
+   peer_widx = ntb_peer_mw_count(ntb) - 1 - i;
+
+   ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr,
+  &mw_size);
+   if (ret)
+   goto unroll;
+
+   ntb->msi->peer_mws[i] = devm_ioremap(&ntb->dev, mw_phys_addr,
+mw_size);
+   if (!ntb->msi->peer_mws[i]) {
+   ret = -EFAULT;
+   goto unroll;
+   }
+   }
+
+   return 0;
+
+unroll:
+   for (i = 0; i < peers; i++)
+   if (ntb->msi->peer_mws[i])
+   devm_iounmap(&ntb->dev, ntb->msi->peer_mws[i]);
+
+   devm_kfree(&ntb->dev, ntb->msi);
+   ntb->msi = NULL;
+   return ret;
+}
+EXPORT_SYMBOL(ntb_msi_init);
+
+/**
+ * ntb_msi_setup_mws() - Initialize the MSI inbound 

[PATCH v5 03/10] NTB: Introduce helper functions to calculate logical port number

2019-05-23 Thread Logan Gunthorpe
This patch introduces the "Logical Port Number" which is similar to the
"Port Number" in that it enumerates the ports in the system.

The original (or Physical) "Port Number" can be any number used by the
hardware to uniquely identify a port in the system. The "Logical Port
Number" enumerates all ports in the system from 0 to the number of
ports minus one.

For example a system with 5 ports might have the following port numbers
which would be enumerated thusly:

Port Number:   1  2  5  7  116
Logical Port Number:   0  1  2  3  4

The logical port number is useful when calculating which resources
to use for which peers. So we thus define two helper functions:
ntb_logical_port_number() and ntb_peer_logical_port_number() which
provide the "Logical Port Number" for the local port and any peer
respectively.
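
As a rough illustration (sketch only; the dev_dbg() calls on &ntb->dev
are just for the example), a driver could dump the mapping like this:

  int pidx;

  dev_dbg(&ntb->dev, "local logical port %d\n",
          ntb_logical_port_number(ntb));
  for (pidx = 0; pidx < ntb_peer_port_count(ntb); pidx++)
          dev_dbg(&ntb->dev, "peer %d -> logical port %d\n",
                  pidx, ntb_peer_logical_port_number(ntb, pidx));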

Signed-off-by: Logan Gunthorpe 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
Cc: Serge Semin 
---
 include/linux/ntb.h | 53 -
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 56a92e3ae3ae..91cf492b16a0 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -616,7 +616,6 @@ static inline int ntb_port_number(struct ntb_dev *ntb)
 
return ntb->ops->port_number(ntb);
 }
-
 /**
  * ntb_peer_port_count() - get the number of peer device ports
  * @ntb:   NTB device context.
@@ -653,6 +652,58 @@ static inline int ntb_peer_port_number(struct ntb_dev 
*ntb, int pidx)
return ntb->ops->peer_port_number(ntb, pidx);
 }
 
+/**
+ * ntb_logical_port_number() - get the logical port number of the local port
+ * @ntb:   NTB device context.
+ *
+ * The Logical Port Number is defined to be a unique number for each
+ * port starting from zero through to the number of ports minus one.
+ * This is in contrast to the Port Number where each port can be assigned
+ * any unique physical number by the hardware.
+ *
+ * The logical port number is useful for calculating the resource indexes
+ * used by peers.
+ *
+ * Return: the logical port number or negative value indicating an error
+ */
+static inline int ntb_logical_port_number(struct ntb_dev *ntb)
+{
+   int lport = ntb_port_number(ntb);
+   int pidx;
+
+   if (lport < 0)
+   return lport;
+
+   for (pidx = 0; pidx < ntb_peer_port_count(ntb); pidx++)
+   if (lport <= ntb_peer_port_number(ntb, pidx))
+   return pidx;
+
+   return pidx;
+}
+
+/**
+ * ntb_peer_logical_port_number() - get the logical peer port by given index
+ * @ntb:   NTB device context.
+ * @pidx:  Peer port index.
+ *
+ * The Logical Port Number is defined to be a unique number for each
+ * port starting from zero through to the number of ports minus one.
+ * This is in contrast to the Port Number where each port can be assigned
+ * any unique physical number by the hardware.
+ *
+ * The logical port number is useful for calculating the resource indexes
+ * used by peers.
+ *
+ * Return: the peer's logical port number or negative value indicating an error
+ */
+static inline int ntb_peer_logical_port_number(struct ntb_dev *ntb, int pidx)
+{
+   if (ntb_peer_port_number(ntb, pidx) < ntb_port_number(ntb))
+   return pidx;
+   else
+   return pidx + 1;
+}
+
 /**
  * ntb_peer_port_idx() - get the peer device port index by given port number
  * @ntb:   NTB device context.
-- 
2.20.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v5 05/10] NTB: Rename ntb.c to support multiple source files in the module

2019-05-23 Thread Logan Gunthorpe
The kbuild system does not support having multiple source files in
a module if one of those source files has the same name as the module.

Therefore, we must rename ntb.c to core.c, while the module remains
ntb.ko.

This is similar to the way the nvme modules are structured.

Signed-off-by: Logan Gunthorpe 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
---
 drivers/ntb/Makefile  | 2 ++
 drivers/ntb/{ntb.c => core.c} | 0
 2 files changed, 2 insertions(+)
 rename drivers/ntb/{ntb.c => core.c} (100%)

diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 1921dec1949d..537226f8e78d 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_NTB) += ntb.o hw/ test/
 obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o
+
+ntb-y := core.o
diff --git a/drivers/ntb/ntb.c b/drivers/ntb/core.c
similarity index 100%
rename from drivers/ntb/ntb.c
rename to drivers/ntb/core.c
-- 
2.20.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v5 01/10] PCI/MSI: Support allocating virtual MSI interrupts

2019-05-23 Thread Logan Gunthorpe
For NTB devices, we want to be able to trigger MSI interrupts
through a memory window. In these cases we may want to use
more interrupts than the NTB PCI device has available in its MSI-X
table.

We allow for this by creating a new 'virtual' interrupt. These
interrupts are allocated as usual but are not programmed into the
MSI-X table (as there may not be space for them).

The MSI address and data will then be handled through an NTB MSI library
introduced later in this series.
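
For reference, a driver opts in to virtual vectors with the new
PCI_IRQ_VIRTUAL flag, roughly as below (this mirrors the switchtec
change later in the series; names like pdev/nirqs are placeholders):

  nvecs = pci_alloc_irq_vectors(pdev, 1, nirqs,
                                PCI_IRQ_MSIX | PCI_IRQ_MSI |
                                PCI_IRQ_VIRTUAL);
  if (nvecs < 0)
          return nvecs;

  /* entries beyond the MSI-X table size are virtual, but still get a
   * Linux IRQ number via pci_irq_vector() */
  irq = pci_irq_vector(pdev, i);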

Signed-off-by: Logan Gunthorpe 
Acked-by: Bjorn Helgaas 
---
 drivers/pci/msi.c   | 54 +
 include/linux/msi.h |  8 +++
 include/linux/pci.h |  9 
 3 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 73986825d221..668bc16ef4d1 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -192,6 +192,9 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, 
u32 flag)
 
 static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
 {
+   if (desc->msi_attrib.is_virtual)
+   return NULL;
+
return desc->mask_base +
desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
 }
@@ -206,14 +209,19 @@ static void __iomem *pci_msix_desc_addr(struct msi_desc 
*desc)
 u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag)
 {
u32 mask_bits = desc->masked;
+   void __iomem *desc_addr;
 
if (pci_msi_ignore_mask)
return 0;
+   desc_addr = pci_msix_desc_addr(desc);
+   if (!desc_addr)
+   return 0;
 
mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
if (flag)
mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
-   writel(mask_bits, pci_msix_desc_addr(desc) + 
PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+   writel(mask_bits, desc_addr + PCI_MSIX_ENTRY_VECTOR_CTRL);
 
return mask_bits;
 }
@@ -273,6 +281,11 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct 
msi_msg *msg)
if (entry->msi_attrib.is_msix) {
void __iomem *base = pci_msix_desc_addr(entry);
 
+   if (!base) {
+   WARN_ON(1);
+   return;
+   }
+
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
msg->data = readl(base + PCI_MSIX_ENTRY_DATA);
@@ -303,6 +316,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct 
msi_msg *msg)
} else if (entry->msi_attrib.is_msix) {
void __iomem *base = pci_msix_desc_addr(entry);
 
+   if (!base)
+   goto skip;
+
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
writel(msg->data, base + PCI_MSIX_ENTRY_DATA);
@@ -327,7 +343,13 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct 
msi_msg *msg)
  msg->data);
}
}
+
+skip:
entry->msg = *msg;
+
+   if (entry->write_msi_msg)
+   entry->write_msi_msg(entry, entry->write_msi_msg_data);
+
 }
 
 void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
@@ -550,6 +572,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct 
irq_affinity *affd)
 
entry->msi_attrib.is_msix   = 0;
entry->msi_attrib.is_64 = !!(control & PCI_MSI_FLAGS_64BIT);
+   entry->msi_attrib.is_virtual= 0;
entry->msi_attrib.entry_nr  = 0;
entry->msi_attrib.maskbit   = !!(control & PCI_MSI_FLAGS_MASKBIT);
entry->msi_attrib.default_irq   = dev->irq; /* Save IOAPIC IRQ */
@@ -674,6 +697,7 @@ static int msix_setup_entries(struct pci_dev *dev, void 
__iomem *base,
struct irq_affinity_desc *curmsk, *masks = NULL;
struct msi_desc *entry;
int ret, i;
+   int vec_count = pci_msix_vec_count(dev);
 
if (affd)
masks = irq_create_affinity_masks(nvec, affd);
@@ -696,6 +720,10 @@ static int msix_setup_entries(struct pci_dev *dev, void 
__iomem *base,
entry->msi_attrib.entry_nr = entries[i].entry;
else
entry->msi_attrib.entry_nr = i;
+
+   entry->msi_attrib.is_virtual =
+   entry->msi_attrib.entry_nr >= vec_count;
+
entry->msi_attrib.default_irq   = dev->irq;
entry->mask_base= base;
 
@@ -714,12 +742,19 @@ static void msix_program_entries(struct pci_dev *dev,
 {
struct msi_desc *entry;
int i = 0;
+   void __iomem *desc_addr;
 
for_each_pci_msi_entry(entry, dev) {
if (entries)
entries[i++].vector = entry->irq;
-   entry->masked = readl(pci_msix_desc_addr(entry) +
-   PCI_MSIX_ENTRY_VECTOR_CTRL);
+

[PATCH v5 02/10] PCI/switchtec: Add module parameter to request more interrupts

2019-05-23 Thread Logan Gunthorpe
Seeing that we want to use more interrupts in the NTB MSI code,
we need to be able to allocate more (sometimes virtual) interrupts
in the switchtec driver. Therefore, add a module parameter to
request the allocation of additional interrupts.

This puts virtually no limit on the number of MSI interrupts available
to NTB clients.

Signed-off-by: Logan Gunthorpe 
Cc: Bjorn Helgaas 
---
 drivers/pci/switch/switchtec.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index e22766c79fe9..8b1db78197d9 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -30,6 +30,10 @@ module_param(use_dma_mrpc, bool, 0644);
 MODULE_PARM_DESC(use_dma_mrpc,
 "Enable the use of the DMA MRPC feature");
 
+static int nirqs = 32;
+module_param(nirqs, int, 0644);
+MODULE_PARM_DESC(nirqs, "number of interrupts to allocate (more may be useful 
for NTB applications)");
+
 static dev_t switchtec_devt;
 static DEFINE_IDA(switchtec_minor_ida);
 
@@ -1247,8 +1251,12 @@ static int switchtec_init_isr(struct switchtec_dev 
*stdev)
int dma_mrpc_irq;
int rc;
 
-   nvecs = pci_alloc_irq_vectors(stdev->pdev, 1, 4,
- PCI_IRQ_MSIX | PCI_IRQ_MSI);
+   if (nirqs < 4)
+   nirqs = 4;
+
+   nvecs = pci_alloc_irq_vectors(stdev->pdev, 1, nirqs,
+ PCI_IRQ_MSIX | PCI_IRQ_MSI |
+ PCI_IRQ_VIRTUAL);
if (nvecs < 0)
return nvecs;
 
-- 
2.20.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v5 10/10] NTB: Describe the ntb_msi_test client in the documentation.

2019-05-23 Thread Logan Gunthorpe
Add a blurb in Documentation/ntb.txt to describe the ntb_msi_test tool's
debugfs interface. Similar to the (out of date) ntb_tool description.

Signed-off-by: Logan Gunthorpe 
---
 Documentation/ntb.txt | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt
index a043854d28df..802a539f1592 100644
--- a/Documentation/ntb.txt
+++ b/Documentation/ntb.txt
@@ -194,6 +194,33 @@ Debugfs Files:
This file is used to read and write peer scratchpads.  See
*spad* for details.
 
+NTB MSI Test Client (ntb\_msi\_test)
+
+
+The MSI test client serves to test and debug the MSI library which
+allows for passing MSI interrupts across NTB memory windows. The
+test client is interacted with through the debugfs filesystem:
+
+* *debugfs*/ntb\_tool/*hw*/
+   A directory in debugfs will be created for each
+   NTB device probed by the tool.  This directory is shortened to *hw*
+   below.
+* *hw*/port
+   This file describes the local port number
+* *hw*/irq*_occurrences
+   One occurrences file exists for each interrupt and, when read,
+   returns the number of times the interrupt has been triggered.
+* *hw*/peer*/port
+   This file describes the port number for each peer
+* *hw*/peer*/count
+   This file describes the number of interrupts that can be
+   triggered on each peer
+* *hw*/peer*/trigger
+   Writing an interrupt number (any number less than the value
+   specified in count) will trigger the interrupt on the
+   specified peer. That peer's interrupt's occurrence file
+   should be incremented.
+
 NTB Hardware Drivers
 
 
-- 
2.20.1



[PATCH v5 04/10] NTB: Introduce functions to calculate multi-port resource index

2019-05-23 Thread Logan Gunthorpe
When using multiple ports, each port uses resources (dbs, msgs, mws, etc.)
on every other port. Creating a mapping for these resources such that
each port has a corresponding resource on every other port is a bit
tricky.

Introduce the ntb_peer_resource_idx() function for this purpose.
It returns the peer resource number that will correspond with the
local peer index on the remote peer.

Also, introduce ntb_peer_highest_mw_idx() which will use
ntb_peer_resource_idx() but return the MW index starting with the
highest index and working down.
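
As a rough usage sketch (the ntb_mw_set_trans() call and the
dma_addr/size arguments are only illustrative):

  int peer_widx = ntb_peer_highest_mw_idx(ntb, pidx);

  if (peer_widx < 0)
          return peer_widx;

  /* program the selected inbound memory window for this peer */
  rc = ntb_mw_set_trans(ntb, pidx, peer_widx, dma_addr, size);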

Signed-off-by: Logan Gunthorpe 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
---
 include/linux/ntb.h | 70 +
 1 file changed, 70 insertions(+)

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 91cf492b16a0..66552830544b 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -1557,4 +1557,74 @@ static inline int ntb_peer_msg_write(struct ntb_dev 
*ntb, int pidx, int midx,
return ntb->ops->peer_msg_write(ntb, pidx, midx, msg);
 }
 
+/**
+ * ntb_peer_resource_idx() - get a resource index for a given peer idx
+ * @ntb:   NTB device context.
+ * @pidx:  Peer port index.
+ *
+ * When constructing a graph of peers, each remote peer must use a different
+ * resource index (mw, doorbell, etc) to communicate with each other
+ * peer.
+ *
+ * In a two peer system, this function should always return 0 such that
+ * resource 0 points to the remote peer on both ports.
+ *
+ * In a 5 peer system, this function will return the following matrix
+ *
+ * pidx \ port    0    1    2    3    4
+ * 0              0    0    1    2    3
+ * 1              0    1    1    2    3
+ * 2              0    1    2    2    3
+ * 3              0    1    2    3    3
+ *
+ * For example, if this function is used to program a peer's memory
+ * windows, port 0 will program MW 0 on all its peers to point to itself.
+ * Port 1 will program MW 0 in port 0 to point to itself and MW 1 on all
+ * other ports, and so on.
+ *
+ * For the legacy two host case, ntb_port_number() and ntb_peer_port_number()
+ * both return zero and therefore this function will always return zero.
+ * So MW 0 on each host would be programmed to point to the other host.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_resource_idx(struct ntb_dev *ntb, int pidx)
+{
+   int local_port, peer_port;
+
+   if (pidx >= ntb_peer_port_count(ntb))
+   return -EINVAL;
+
+   local_port = ntb_logical_port_number(ntb);
+   peer_port = ntb_peer_logical_port_number(ntb, pidx);
+
+   if (peer_port < local_port)
+   return local_port - 1;
+   else
+   return local_port;
+}
+
+/**
+ * ntb_peer_highest_mw_idx() - get a memory window index for a given peer idx
+ * using the highest index memory windows first
+ *
+ * @ntb:   NTB device context.
+ * @pidx:  Peer port index.
+ *
+ * Like ntb_peer_resource_idx(), except it returns indexes starting with
+ * last memory window index.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx)
+{
+   int ret;
+
+   ret = ntb_peer_resource_idx(ntb, pidx);
+   if (ret < 0)
+   return ret;
+
+   return ntb_mw_count(ntb, pidx) - ret - 1;
+}
+
 #endif
-- 
2.20.1



Re: [PATCH v2 03/15] iommu/arm-smmu: Add split pagetable support for arm-smmu-v2

2019-05-23 Thread Jordan Crouse
On Tue, May 21, 2019 at 07:18:32PM +0100, Robin Murphy wrote:
> On 21/05/2019 17:13, Jordan Crouse wrote:
> >Add support for a split pagetable (TTBR0/TTBR1) scheme for arm-smmu-v2.
> >If split pagetables are enabled, create a pagetable for TTBR1 and set
> >up the sign extension bit so that all IOVAs with that bit set are mapped
> >and translated from the TTBR1 pagetable.
> >
> >Signed-off-by: Jordan Crouse 
> >---
> >
> >  drivers/iommu/arm-smmu-regs.h  |  19 +
> >  drivers/iommu/arm-smmu.c   | 179 
> > ++---
> >  drivers/iommu/io-pgtable-arm.c |   3 +-
> >  3 files changed, 186 insertions(+), 15 deletions(-)
> >
> >diff --git a/drivers/iommu/arm-smmu-regs.h b/drivers/iommu/arm-smmu-regs.h
> >index e9132a9..23f27c2 100644
> >--- a/drivers/iommu/arm-smmu-regs.h
> >+++ b/drivers/iommu/arm-smmu-regs.h
> >@@ -195,7 +195,26 @@ enum arm_smmu_s2cr_privcfg {
> >  #define RESUME_RETRY   (0 << 0)
> >  #define RESUME_TERMINATE   (1 << 0)
> >+#define TTBCR_EPD1  (1 << 23)
> >+#define TTBCR_T0SZ_SHIFT0
> >+#define TTBCR_T1SZ_SHIFT16
> >+#define TTBCR_IRGN1_SHIFT   24
> >+#define TTBCR_ORGN1_SHIFT   26
> >+#define TTBCR_RGN_WBWA  1
> >+#define TTBCR_SH1_SHIFT 28
> >+#define TTBCR_SH_IS 3
> >+
> >+#define TTBCR_TG1_16K   (1 << 30)
> >+#define TTBCR_TG1_4K(2 << 30)
> >+#define TTBCR_TG1_64K   (3 << 30)
> >+
> >  #define TTBCR2_SEP_SHIFT   15
> >+#define TTBCR2_SEP_31   (0x0 << TTBCR2_SEP_SHIFT)
> >+#define TTBCR2_SEP_35   (0x1 << TTBCR2_SEP_SHIFT)
> >+#define TTBCR2_SEP_39   (0x2 << TTBCR2_SEP_SHIFT)
> >+#define TTBCR2_SEP_41   (0x3 << TTBCR2_SEP_SHIFT)
> >+#define TTBCR2_SEP_43   (0x4 << TTBCR2_SEP_SHIFT)
> >+#define TTBCR2_SEP_47   (0x5 << TTBCR2_SEP_SHIFT)
> >  #define TTBCR2_SEP_UPSTREAM(0x7 << TTBCR2_SEP_SHIFT)
> >  #define TTBCR2_AS  (1 << 4)
> >diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> >index a795ada..e09c0e6 100644
> >--- a/drivers/iommu/arm-smmu.c
> >+++ b/drivers/iommu/arm-smmu.c
> >@@ -152,6 +152,7 @@ struct arm_smmu_cb {
> > u32 tcr[2];
> > u32 mair[2];
> > struct arm_smmu_cfg *cfg;
> >+unsigned long   split_table_mask;
> >  };
> >  struct arm_smmu_master_cfg {
> >@@ -253,13 +254,14 @@ enum arm_smmu_domain_stage {
> >  struct arm_smmu_domain {
> > struct arm_smmu_device  *smmu;
> >-struct io_pgtable_ops   *pgtbl_ops;
> >+struct io_pgtable_ops   *pgtbl_ops[2];
> 
> This seems a bit off - surely the primary domain and aux domain only ever
> need one set of tables each, but either way there's definitely unnecessary
> redundancy in having four sets of io_pgtable_ops between them.
> 
> > const struct iommu_gather_ops   *tlb_ops;
> > struct arm_smmu_cfg cfg;
> > enum arm_smmu_domain_stage  stage;
> > boolnon_strict;
> > struct mutexinit_mutex; /* Protects smmu pointer */
> > spinlock_t  cb_lock; /* Serialises ATS1* ops and 
> > TLB syncs */
> >+u32 attributes;
> > struct iommu_domain domain;
> >  };
> >@@ -621,6 +623,85 @@ static irqreturn_t arm_smmu_global_fault(int irq, void 
> >*dev)
> > return IRQ_HANDLED;
> >  }
> >+/* Adjust the context bank settings to support TTBR1 */
> >+static void arm_smmu_init_ttbr1(struct arm_smmu_domain *smmu_domain,
> >+struct io_pgtable_cfg *pgtbl_cfg)
> >+{
> >+struct arm_smmu_device *smmu = smmu_domain->smmu;
> >+struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
> >+struct arm_smmu_cb *cb = &smmu_domain->smmu->cbs[cfg->cbndx];
> >+int pgsize = 1 << __ffs(pgtbl_cfg->pgsize_bitmap);
> >+
> >+/* Enable speculative walks through the TTBR1 */
> >+cb->tcr[0] &= ~TTBCR_EPD1;
> >+
> >+cb->tcr[0] |= TTBCR_SH_IS << TTBCR_SH1_SHIFT;
> >+cb->tcr[0] |= TTBCR_RGN_WBWA << TTBCR_IRGN1_SHIFT;
> >+cb->tcr[0] |= TTBCR_RGN_WBWA << TTBCR_ORGN1_SHIFT;
> >+
> >+switch (pgsize) {
> >+case SZ_4K:
> >+cb->tcr[0] |= TTBCR_TG1_4K;
> >+break;
> >+case SZ_16K:
> >+cb->tcr[0] |= TTBCR_TG1_16K;
> >+break;
> >+case SZ_64K:
> >+cb->tcr[0] |= TTBCR_TG1_64K;
> >+break;
> >+}
> >+
> >+/*
> >+ * Outside of the special 49 bit UBS case that has a dedicated sign
> >+ * extension bit, setting the SEP for any other va_size will force us to
> >+ * shrink the size of the T0/T1 regions by one bit to accommodate the
> >+ * SEP
> >+ */
> >+if (smmu->va_size != 48) {
> >+/* 

Re: [PATCH 3/4] iommu: Introduce device fault report API

2019-05-23 Thread Robin Murphy

On 23/05/2019 19:06, Jean-Philippe Brucker wrote:

From: Jacob Pan 

Traditionally, device specific faults are detected and handled within
their own device drivers. When IOMMU is enabled, faults such as DMA
related transactions are detected by IOMMU. There is no generic
reporting mechanism to report faults back to the in-kernel device
driver or the guest OS in case of assigned devices.

This patch introduces a registration API for device specific fault
handlers. This differs from the existing iommu_set_fault_handler/
report_iommu_fault infrastructures in several ways:
- it allows to report more sophisticated fault events (both
   unrecoverable faults and page request faults) due to the nature
   of the iommu_fault struct
- it is device specific and not domain specific.

The current iommu_report_device_fault() implementation only handles
the "shoot and forget" unrecoverable fault case. Handling of page
request faults or stalled faults will come later.

Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Eric Auger 
---
  drivers/iommu/iommu.c | 127 ++
  include/linux/iommu.h |  29 ++
  2 files changed, 156 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 67ee6623f9b2..d546f7baa0d4 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -644,6 +644,13 @@ int iommu_group_add_device(struct iommu_group *group, 
struct device *dev)
goto err_free_name;
}
  
+	dev->iommu_param = kzalloc(sizeof(*dev->iommu_param), GFP_KERNEL);

+   if (!dev->iommu_param) {
+   ret = -ENOMEM;
+   goto err_free_name;
+   }
+   mutex_init(>iommu_param->lock);
+


Note that this gets a bit tricky when we come to move the
fwspec/ops/etc. into iommu_param, since that data can have a longer
lifespan than the group association. I'd suggest moving this management
out to the iommu_{probe,release}_device() level from the start, but
maybe we're happy to come back and change things later as necessary.


Robin.


kobject_get(group->devices_kobj);
  
  	dev->iommu_group = group;

@@ -674,6 +681,8 @@ int iommu_group_add_device(struct iommu_group *group, 
struct device *dev)
mutex_unlock(>mutex);
dev->iommu_group = NULL;
kobject_put(group->devices_kobj);
+   kfree(dev->iommu_param);
+   dev->iommu_param = NULL;
  err_free_name:
kfree(device->name);
  err_remove_link:
@@ -721,6 +730,8 @@ void iommu_group_remove_device(struct device *dev)
  
  	trace_remove_device_from_group(group->id, dev);
  
+	kfree(dev->iommu_param);

+   dev->iommu_param = NULL;
kfree(device->name);
kfree(device);
dev->iommu_group = NULL;
@@ -854,6 +865,122 @@ int iommu_group_unregister_notifier(struct iommu_group 
*group,
  }
  EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
  
+/**

+ * iommu_register_device_fault_handler() - Register a device fault handler
+ * @dev: the device
+ * @handler: the fault handler
+ * @data: private data passed as argument to the handler
+ *
+ * When an IOMMU fault event is received, this handler gets called with the
+ * fault event and data as argument. The handler should return 0 on success.
+ *
+ * Return 0 if the fault handler was installed successfully, or an error.
+ */
+int iommu_register_device_fault_handler(struct device *dev,
+   iommu_dev_fault_handler_t handler,
+   void *data)
+{
+   struct iommu_param *param = dev->iommu_param;
+   int ret = 0;
+
+   /*
+* Device iommu_param should have been allocated when device is
+* added to its iommu_group.
+*/
+   if (!param)
+   return -EINVAL;
+
+   mutex_lock(>lock);
+   /* Only allow one fault handler registered for each device */
+   if (param->fault_param) {
+   ret = -EBUSY;
+   goto done_unlock;
+   }
+
+   get_device(dev);
+   param->fault_param =
+   kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+   if (!param->fault_param) {
+   put_device(dev);
+   ret = -ENOMEM;
+   goto done_unlock;
+   }
+   param->fault_param->handler = handler;
+   param->fault_param->data = data;
+
+done_unlock:
+   mutex_unlock(>lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
+
+/**
+ * iommu_unregister_device_fault_handler() - Unregister the device fault 
handler
+ * @dev: the device
+ *
+ * Remove the device fault handler installed with
+ * iommu_register_device_fault_handler().
+ *
+ * Return 0 on success, or an error.
+ */
+int iommu_unregister_device_fault_handler(struct device *dev)
+{
+   struct iommu_param *param = dev->iommu_param;
+   int ret = 0;
+
+   if (!param)
+   return -EINVAL;
+
+   

Re: [PATCH 2/4] iommu: Introduce device fault data

2019-05-23 Thread Robin Murphy

On 23/05/2019 19:06, Jean-Philippe Brucker wrote:

From: Jacob Pan 

Device faults detected by IOMMU can be reported outside the IOMMU
subsystem for further processing. This patch introduces
a generic device fault data structure.

The fault can be either an unrecoverable fault or a page request,
also referred to as a recoverable fault.

We only care about non internal faults that are likely to be reported
to an external subsystem.

Signed-off-by: Jacob Pan 
Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
Signed-off-by: Eric Auger 
---
  include/linux/iommu.h  |  43 ++
  include/uapi/linux/iommu.h | 118 +
  2 files changed, 161 insertions(+)
  create mode 100644 include/uapi/linux/iommu.h

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a815cf6f6f47..d442f5f3fa93 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,6 +25,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #define IOMMU_READ	(1 << 0)

  #define IOMMU_WRITE   (1 << 1)
@@ -49,6 +50,7 @@ struct device;
  struct iommu_domain;
  struct notifier_block;
  struct iommu_sva;
+struct iommu_fault_event;
  
  /* iommu fault flags */

  #define IOMMU_FAULT_READ  0x0
@@ -58,6 +60,7 @@ typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
struct device *, unsigned long, int, void *);
  typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *,
   void *);
+typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault_event *, void *);
  
  struct iommu_domain_geometry {

dma_addr_t aperture_start; /* First address that can be mapped*/
@@ -301,6 +304,45 @@ struct iommu_device {
struct device *dev;
  };
  
+/**

+ * struct iommu_fault_event - Generic fault event
+ *
+ * Can represent recoverable faults such as a page requests or
+ * unrecoverable faults such as DMA or IRQ remapping faults.
+ *
+ * @fault: fault descriptor
+ * @iommu_private: used by the IOMMU driver for storing fault-specific
+ * data. Users should not modify this field before
+ * sending the fault response.


Sorry if I'm a bit late to the party, but given that description, if 
users aren't allowed to touch this then why expose it to them at all? 
I.e. why not have iommu_report_device_fault() pass just the fault itself 
to the fault handler:


ret = fparam->handler(&evt->fault, fparam->data);

and let the IOMMU core/drivers decapsulate it again later if need be. 
AFAICS drivers could also just embed the entire generic event in their 
own private structure anyway, just as we do for domains.


Robin.


+ */
+struct iommu_fault_event {
+   struct iommu_fault fault;
+   u64 iommu_private;
+};
+
+/**
+ * struct iommu_fault_param - per-device IOMMU fault data
+ * @handler: Callback function to handle IOMMU faults at device level
+ * @data: handler private data
+ */
+struct iommu_fault_param {
+   iommu_dev_fault_handler_t handler;
+   void *data;
+};
+
+/**
+ * struct iommu_param - collection of per-device IOMMU data
+ *
+ * @fault_param: IOMMU detected device fault reporting data
+ *
+ * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
+ * struct iommu_group  *iommu_group;
+ * struct iommu_fwspec *iommu_fwspec;
+ */
+struct iommu_param {
+   struct iommu_fault_param *fault_param;
+};
+
  int  iommu_device_register(struct iommu_device *iommu);
  void iommu_device_unregister(struct iommu_device *iommu);
  int  iommu_device_sysfs_add(struct iommu_device *iommu,
@@ -504,6 +546,7 @@ struct iommu_ops {};
  struct iommu_group {};
  struct iommu_fwspec {};
  struct iommu_device {};
+struct iommu_fault_param {};
  
  static inline bool iommu_present(struct bus_type *bus)

  {
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
new file mode 100644
index ..796402174d6c
--- /dev/null
+++ b/include/uapi/linux/iommu.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * IOMMU user API definitions
+ */
+
+#ifndef _UAPI_IOMMU_H
+#define _UAPI_IOMMU_H
+
+#include 
+
+#define IOMMU_FAULT_PERM_READ  (1 << 0) /* read */
+#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
+#define IOMMU_FAULT_PERM_EXEC  (1 << 2) /* exec */
+#define IOMMU_FAULT_PERM_PRIV  (1 << 3) /* privileged */
+
+/* Generic fault types, can be expanded IRQ remapping fault */
+enum iommu_fault_type {
+   IOMMU_FAULT_DMA_UNRECOV = 1,/* unrecoverable fault */
+   IOMMU_FAULT_PAGE_REQ,   /* page request fault */
+};
+
+enum iommu_fault_reason {
+   IOMMU_FAULT_REASON_UNKNOWN = 0,
+
+   /* Could not access the PASID table (fetch caused external abort) */
+   IOMMU_FAULT_REASON_PASID_FETCH,
+
+   /* PASID entry is invalid or has configuration errors */
+   IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
+
+   /*
+* 

[PATCH 2/4] iommu: Introduce device fault data

2019-05-23 Thread Jean-Philippe Brucker
From: Jacob Pan 

Device faults detected by IOMMU can be reported outside the IOMMU
subsystem for further processing. This patch introduces
a generic device fault data structure.

The fault can be either an unrecoverable fault or a page request,
also referred to as a recoverable fault.

We only care about non internal faults that are likely to be reported
to an external subsystem.
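
A consumer would typically just inspect the generic descriptor, along
these lines (sketch only; the handler name and the use of the device
pointer as private data are placeholders):

  static int my_iommu_fault_handler(struct iommu_fault_event *evt,
                                    void *data)
  {
          struct device *dev = data;

          if (evt->fault.type == IOMMU_FAULT_DMA_UNRECOV)
                  dev_err(dev, "unrecoverable IOMMU fault\n");

          return 0;
  }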

Signed-off-by: Jacob Pan 
Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
Signed-off-by: Eric Auger 
---
 include/linux/iommu.h  |  43 ++
 include/uapi/linux/iommu.h | 118 +
 2 files changed, 161 insertions(+)
 create mode 100644 include/uapi/linux/iommu.h

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a815cf6f6f47..d442f5f3fa93 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IOMMU_READ (1 << 0)
 #define IOMMU_WRITE(1 << 1)
@@ -49,6 +50,7 @@ struct device;
 struct iommu_domain;
 struct notifier_block;
 struct iommu_sva;
+struct iommu_fault_event;
 
 /* iommu fault flags */
 #define IOMMU_FAULT_READ   0x0
@@ -58,6 +60,7 @@ typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
struct device *, unsigned long, int, void *);
 typedef int (*iommu_mm_exit_handler_t)(struct device *dev, struct iommu_sva *,
   void *);
+typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault_event *, void *);
 
 struct iommu_domain_geometry {
dma_addr_t aperture_start; /* First address that can be mapped*/
@@ -301,6 +304,45 @@ struct iommu_device {
struct device *dev;
 };
 
+/**
+ * struct iommu_fault_event - Generic fault event
+ *
+ * Can represent recoverable faults such as a page requests or
+ * unrecoverable faults such as DMA or IRQ remapping faults.
+ *
+ * @fault: fault descriptor
+ * @iommu_private: used by the IOMMU driver for storing fault-specific
+ * data. Users should not modify this field before
+ * sending the fault response.
+ */
+struct iommu_fault_event {
+   struct iommu_fault fault;
+   u64 iommu_private;
+};
+
+/**
+ * struct iommu_fault_param - per-device IOMMU fault data
+ * @handler: Callback function to handle IOMMU faults at device level
+ * @data: handler private data
+ */
+struct iommu_fault_param {
+   iommu_dev_fault_handler_t handler;
+   void *data;
+};
+
+/**
+ * struct iommu_param - collection of per-device IOMMU data
+ *
+ * @fault_param: IOMMU detected device fault reporting data
+ *
+ * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
+ * struct iommu_group  *iommu_group;
+ * struct iommu_fwspec *iommu_fwspec;
+ */
+struct iommu_param {
+   struct iommu_fault_param *fault_param;
+};
+
 int  iommu_device_register(struct iommu_device *iommu);
 void iommu_device_unregister(struct iommu_device *iommu);
 int  iommu_device_sysfs_add(struct iommu_device *iommu,
@@ -504,6 +546,7 @@ struct iommu_ops {};
 struct iommu_group {};
 struct iommu_fwspec {};
 struct iommu_device {};
+struct iommu_fault_param {};
 
 static inline bool iommu_present(struct bus_type *bus)
 {
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
new file mode 100644
index ..796402174d6c
--- /dev/null
+++ b/include/uapi/linux/iommu.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * IOMMU user API definitions
+ */
+
+#ifndef _UAPI_IOMMU_H
+#define _UAPI_IOMMU_H
+
+#include 
+
+#define IOMMU_FAULT_PERM_READ  (1 << 0) /* read */
+#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
+#define IOMMU_FAULT_PERM_EXEC  (1 << 2) /* exec */
+#define IOMMU_FAULT_PERM_PRIV  (1 << 3) /* privileged */
+
+/* Generic fault types, can be expanded IRQ remapping fault */
+enum iommu_fault_type {
+   IOMMU_FAULT_DMA_UNRECOV = 1,/* unrecoverable fault */
+   IOMMU_FAULT_PAGE_REQ,   /* page request fault */
+};
+
+enum iommu_fault_reason {
+   IOMMU_FAULT_REASON_UNKNOWN = 0,
+
+   /* Could not access the PASID table (fetch caused external abort) */
+   IOMMU_FAULT_REASON_PASID_FETCH,
+
+   /* PASID entry is invalid or has configuration errors */
+   IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
+
+   /*
+* PASID is out of range (e.g. exceeds the maximum PASID
+* supported by the IOMMU) or disabled.
+*/
+   IOMMU_FAULT_REASON_PASID_INVALID,
+
+   /*
+* An external abort occurred fetching (or updating) a translation
+* table descriptor
+*/
+   IOMMU_FAULT_REASON_WALK_EABT,
+
+   /*
+* Could not access the page table entry (Bad address),
+* actual translation fault
+*/
+   IOMMU_FAULT_REASON_PTE_FETCH,
+
+   /* Protection flag check failed */
+   IOMMU_FAULT_REASON_PERMISSION,
+
+   /* 

[PATCH 4/4] iommu: Add recoverable fault reporting

2019-05-23 Thread Jean-Philippe Brucker
Some IOMMU hardware features, for example PCI PRI and Arm SMMU Stall,
enable recoverable I/O page faults. Allow IOMMU drivers to report PRI Page
Requests and Stall events through the new fault reporting API. The
consumer of the fault can be either an I/O page fault handler in the host,
or a guest OS.

Once handled, the fault must be completed by sending a page response back
to the IOMMU. Add an iommu_page_response() function to complete a page
fault.
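
Inside a fault handler, completing a recoverable fault would then look
roughly like this (sketch only: the .code field name is an assumption
based on the response codes listed below, and the remaining fields of
struct iommu_page_response, e.g. pasid/grpid, would be copied from the
page request; see the uapi header added by this patch):

  if (evt->fault.type == IOMMU_FAULT_PAGE_REQ) {
          struct iommu_page_response msg = {
                  .code = IOMMU_PAGE_RESP_SUCCESS,
          };

          ret = iommu_page_response(dev, &msg);
  }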

Signed-off-by: Jacob Pan 
Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/iommu.c  | 95 +-
 include/linux/iommu.h  | 19 
 include/uapi/linux/iommu.h | 34 ++
 3 files changed, 146 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d546f7baa0d4..b09b3707f0e4 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -872,7 +872,14 @@ EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
  * @data: private data passed as argument to the handler
  *
  * When an IOMMU fault event is received, this handler gets called with the
- * fault event and data as argument. The handler should return 0 on success.
+ * fault event and data as argument. The handler should return 0 on success. If
+ * the fault is recoverable (IOMMU_FAULT_PAGE_REQ), the handler should also
+ * complete the fault by calling iommu_page_response() with one of the following
+ * response code:
+ * - IOMMU_PAGE_RESP_SUCCESS: retry the translation
+ * - IOMMU_PAGE_RESP_INVALID: terminate the fault
+ * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop reporting
+ *   page faults if possible.
  *
  * Return 0 if the fault handler was installed successfully, or an error.
  */
@@ -907,6 +914,8 @@ int iommu_register_device_fault_handler(struct device *dev,
}
param->fault_param->handler = handler;
param->fault_param->data = data;
+   mutex_init(&param->fault_param->lock);
+   INIT_LIST_HEAD(&param->fault_param->faults);
 
 done_unlock:
mutex_unlock(>lock);
@@ -937,6 +946,12 @@ int iommu_unregister_device_fault_handler(struct device 
*dev)
if (!param->fault_param)
goto unlock;
 
+   /* we cannot unregister handler if there are pending faults */
+   if (!list_empty(&param->fault_param->faults)) {
+   ret = -EBUSY;
+   goto unlock;
+   }
+
kfree(param->fault_param);
param->fault_param = NULL;
put_device(dev);
@@ -953,13 +968,15 @@ EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
  * @evt: fault event data
  *
  * Called by IOMMU drivers when a fault is detected, typically in a threaded 
IRQ
- * handler.
+ * handler. When this function fails and the fault is recoverable, it is the
+ * caller's responsibility to complete the fault.
  *
  * Return 0 on success, or an error.
  */
 int iommu_report_device_fault(struct device *dev, struct iommu_fault_event 
*evt)
 {
struct iommu_param *param = dev->iommu_param;
+   struct iommu_fault_event *evt_pending = NULL;
struct iommu_fault_param *fparam;
int ret = 0;
 
@@ -974,7 +991,27 @@ int iommu_report_device_fault(struct device *dev, struct 
iommu_fault_event *evt)
ret = -EINVAL;
goto done_unlock;
}
+
+   if (evt->fault.type == IOMMU_FAULT_PAGE_REQ &&
+   (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) {
+   evt_pending = kmemdup(evt, sizeof(struct iommu_fault_event),
+ GFP_KERNEL);
+   if (!evt_pending) {
+   ret = -ENOMEM;
+   goto done_unlock;
+   }
+   mutex_lock(&fparam->lock);
+   list_add_tail(&evt_pending->list, &fparam->faults);
+   mutex_unlock(&fparam->lock);
+   }
+
ret = fparam->handler(evt, fparam->data);
+   if (ret && evt_pending) {
+   mutex_lock(&fparam->lock);
+   list_del(&evt_pending->list);
+   mutex_unlock(&fparam->lock);
+   kfree(evt_pending);
+   }
 done_unlock:
mutex_unlock(>lock);
return ret;
@@ -1515,6 +1552,60 @@ int iommu_attach_device(struct iommu_domain *domain, 
struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
+int iommu_page_response(struct device *dev,
+   struct iommu_page_response *msg)
+{
+   bool pasid_valid;
+   int ret = -EINVAL;
+   struct iommu_fault_event *evt;
+   struct iommu_fault_page_request *prm;
+   struct iommu_param *param = dev->iommu_param;
+   struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+   if (!domain || !domain->ops->page_response)
+   return -ENODEV;
+
+   /*
+* Device iommu_param should have been allocated when device is
+* added to its iommu_group.
+*/
+   if (!param || !param->fault_param)
+   return -EINVAL;
+
+   /* Only send response if there is a fault 

[PATCH 3/4] iommu: Introduce device fault report API

2019-05-23 Thread Jean-Philippe Brucker
From: Jacob Pan 

Traditionally, device-specific faults are detected and handled within
their own device drivers. When an IOMMU is enabled, faults such as those
on DMA transactions are detected by the IOMMU, but there is no generic
mechanism to report those faults back to the in-kernel device driver,
or to the guest OS in the case of assigned devices.

This patch introduces a registration API for device-specific fault
handlers. It differs from the existing iommu_set_fault_handler/
report_iommu_fault infrastructure in several ways:
- it allows reporting more sophisticated fault events (both
  unrecoverable faults and page request faults) due to the nature
  of the iommu_fault struct
- it is device specific rather than domain specific.

The current iommu_report_device_fault() implementation only handles
the "shoot and forget" unrecoverable fault case. Handling of page
request faults or stalled faults will come later.
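
Typical driver usage is a simple register/unregister pair (sketch only;
my_iommu_fault_handler and the private data argument are placeholders):

  ret = iommu_register_device_fault_handler(dev, my_iommu_fault_handler,
                                            dev);
  if (ret)
          return ret;
  ...
  iommu_unregister_device_fault_handler(dev);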

Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
Signed-off-by: Jean-Philippe Brucker 
Signed-off-by: Eric Auger 
---
 drivers/iommu/iommu.c | 127 ++
 include/linux/iommu.h |  29 ++
 2 files changed, 156 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 67ee6623f9b2..d546f7baa0d4 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -644,6 +644,13 @@ int iommu_group_add_device(struct iommu_group *group, 
struct device *dev)
goto err_free_name;
}
 
+   dev->iommu_param = kzalloc(sizeof(*dev->iommu_param), GFP_KERNEL);
+   if (!dev->iommu_param) {
+   ret = -ENOMEM;
+   goto err_free_name;
+   }
+   mutex_init(&dev->iommu_param->lock);
+
kobject_get(group->devices_kobj);
 
dev->iommu_group = group;
@@ -674,6 +681,8 @@ int iommu_group_add_device(struct iommu_group *group, 
struct device *dev)
mutex_unlock(>mutex);
dev->iommu_group = NULL;
kobject_put(group->devices_kobj);
+   kfree(dev->iommu_param);
+   dev->iommu_param = NULL;
 err_free_name:
kfree(device->name);
 err_remove_link:
@@ -721,6 +730,8 @@ void iommu_group_remove_device(struct device *dev)
 
trace_remove_device_from_group(group->id, dev);
 
+   kfree(dev->iommu_param);
+   dev->iommu_param = NULL;
kfree(device->name);
kfree(device);
dev->iommu_group = NULL;
@@ -854,6 +865,122 @@ int iommu_group_unregister_notifier(struct iommu_group 
*group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
+/**
+ * iommu_register_device_fault_handler() - Register a device fault handler
+ * @dev: the device
+ * @handler: the fault handler
+ * @data: private data passed as argument to the handler
+ *
+ * When an IOMMU fault event is received, this handler gets called with the
+ * fault event and data as argument. The handler should return 0 on success.
+ *
+ * Return 0 if the fault handler was installed successfully, or an error.
+ */
+int iommu_register_device_fault_handler(struct device *dev,
+   iommu_dev_fault_handler_t handler,
+   void *data)
+{
+   struct iommu_param *param = dev->iommu_param;
+   int ret = 0;
+
+   /*
+* Device iommu_param should have been allocated when device is
+* added to its iommu_group.
+*/
+   if (!param)
+   return -EINVAL;
+
+   mutex_lock(&param->lock);
+   /* Only allow one fault handler registered for each device */
+   if (param->fault_param) {
+   ret = -EBUSY;
+   goto done_unlock;
+   }
+
+   get_device(dev);
+   param->fault_param =
+   kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+   if (!param->fault_param) {
+   put_device(dev);
+   ret = -ENOMEM;
+   goto done_unlock;
+   }
+   param->fault_param->handler = handler;
+   param->fault_param->data = data;
+
+done_unlock:
+   mutex_unlock(&param->lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
+
+/**
+ * iommu_unregister_device_fault_handler() - Unregister the device fault 
handler
+ * @dev: the device
+ *
+ * Remove the device fault handler installed with
+ * iommu_register_device_fault_handler().
+ *
+ * Return 0 on success, or an error.
+ */
+int iommu_unregister_device_fault_handler(struct device *dev)
+{
+   struct iommu_param *param = dev->iommu_param;
+   int ret = 0;
+
+   if (!param)
+   return -EINVAL;
+
+   mutex_lock(&param->lock);
+
+   if (!param->fault_param)
+   goto unlock;
+
+   kfree(param->fault_param);
+   param->fault_param = NULL;
+   put_device(dev);
+unlock:
+   mutex_unlock(&param->lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
+
+/**
+ * iommu_report_device_fault() - Report fault event to device driver
+ * @dev: the device
+ * @evt: fault 

[PATCH 0/4] iommu: Add device fault reporting API

2019-05-23 Thread Jean-Philippe Brucker
Allow device drivers and VFIO to get notifications of IOMMU translation
faults, and to handle recoverable faults (PCI PRI). These four patches
are relatively mature since they are required by three different series,
and have been under discussion for a while:

* Nested translation support for SMMUv3 [1].
* vSVA for VT-d [2].
* My generic host SVA implementation.

I reworked patch 4 according to previous discussions, and moved the page
response structure to UAPI. For the other patches I only fixed comments
and whitespaces. Please have a look and see if it works for you.

[1] [PATCH v7 00/23] SMMUv3 Nested Stage Setup
https://lore.kernel.org/lkml/20190408121911.24103-1-eric.au...@redhat.com/
[2] [PATCH v3 00/16] Shared virtual address IOMMU and VT-d support

https://lore.kernel.org/lkml/1556922737-76313-1-git-send-email-jacob.jun@linux.intel.com/

Jacob Pan (3):
  driver core: Add per device iommu param
  iommu: Introduce device fault data
  iommu: Introduce device fault report API

Jean-Philippe Brucker (1):
  iommu: Add recoverable fault reporting

 drivers/iommu/iommu.c  | 218 +
 include/linux/device.h |   3 +
 include/linux/iommu.h  |  91 
 include/uapi/linux/iommu.h | 152 ++
 4 files changed, 464 insertions(+)
 create mode 100644 include/uapi/linux/iommu.h

-- 
2.21.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 1/4] driver core: Add per device iommu param

2019-05-23 Thread Jean-Philippe Brucker
From: Jacob Pan 

DMA faults can be detected by the IOMMU at the device level. Adding a
pointer to struct device allows the IOMMU subsystem to report relevant
faults back to the device driver for further handling.
For directly assigned devices (or user space drivers), the guest OS is
responsible for handling and responding to per-device IOMMU faults.
Therefore we need a fault reporting mechanism to propagate faults beyond
the IOMMU subsystem.

There are two other IOMMU data pointers under struct device today, so we
introduce iommu_param as a parent pointer such that all per-device IOMMU
data can be consolidated there. The idea was suggested by Greg KH and
Joerg. The name iommu_param is chosen since iommu_data is already in use.

Suggested-by: Greg Kroah-Hartman 
Reviewed-by: Greg Kroah-Hartman 
Signed-off-by: Jacob Pan 
Link: https://lkml.org/lkml/2017/10/6/81
---
 include/linux/device.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/device.h b/include/linux/device.h
index e85264fb6616..f0a975abd6e9 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,6 +42,7 @@ struct iommu_ops;
 struct iommu_group;
 struct iommu_fwspec;
 struct dev_pin_info;
+struct iommu_param;
 
 struct bus_attribute {
struct attributeattr;
@@ -959,6 +960,7 @@ struct dev_links_info {
  * device (i.e. the bus driver that discovered the device).
  * @iommu_group: IOMMU group the device belongs to.
  * @iommu_fwspec: IOMMU-specific properties supplied by firmware.
+ * @iommu_param: Per device generic IOMMU runtime data
  *
  * @offline_disabled: If set, the device is permanently online.
  * @offline:   Set after successful invocation of bus type's .offline().
@@ -1052,6 +1054,7 @@ struct device {
void(*release)(struct device *dev);
struct iommu_group  *iommu_group;
struct iommu_fwspec *iommu_fwspec;
+   struct iommu_param  *iommu_param;
 
booloffline_disabled:1;
booloffline:1;
-- 
2.21.0



Re: [PATCH] swiotlb: sync buffer when mapping FROM_DEVICE

2019-05-23 Thread Robin Murphy

On 23/05/2019 17:43, Christoph Hellwig wrote:

On Thu, May 23, 2019 at 07:35:07AM +0200, Marek Szyprowski wrote:

Don't we have DMA_BIDIRECTIONAL for such case?


Not sure if it was intended for that case, but it definitively should
do the right thing for swiotlb, and it should also do the right thing
in terms of cache maintainance.


Maybe we should update
documentation a bit to point that DMA_FROM_DEVICE expects the whole
buffer to be filled by the device?


Probably. Horia, can you try to use DMA_BIDIRECTIONAL?



Yes, in general that should be a viable option. I got rather focused on 
the distinction that a "partial" FROM_DEVICE mapping would still be 
allowed to physically prevent the device from making any reads, whereas 
BIDIRECTIONAL would not, but I suspect any benefit being lost there is 
mostly one of debugging visibility rather than appreciable security, and 
probably not enough to justify additional complexity on its own - I 
couldn't say off-hand how many IOMMUs actually support write-only 
permissions anyway.


Whichever way, I'd certainly have no objection to formalising what seems 
to be the existing behaviour (both SWIOTLB and ARM dmabounce look 
consistent, at least) as something like "for the DMA_FROM_DEVICE 
direction, any parts of the buffer not written to by the device will 
become undefined". The IOMMU bounce page stuff is going to be another 
one in this boat, too.
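
(For reference, a minimal sketch of the partial-sync pattern suggested
earlier in this thread, using the existing attrs API and assuming the
driver knows how many bytes the device actually wrote:

  buf_dma = dma_map_single_attrs(dev, buf, len, DMA_FROM_DEVICE,
                                 DMA_ATTR_SKIP_CPU_SYNC);
  ...
  dma_sync_single_for_cpu(dev, buf_dma, written_len, DMA_FROM_DEVICE);
  dma_unmap_single_attrs(dev, buf_dma, len, DMA_FROM_DEVICE,
                         DMA_ATTR_SKIP_CPU_SYNC);
)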


Robin.


Re: [PATCH] swiotlb: sync buffer when mapping FROM_DEVICE

2019-05-23 Thread Horia Geanta
On 5/23/2019 7:43 PM, Christoph Hellwig wrote:
> On Thu, May 23, 2019 at 07:35:07AM +0200, Marek Szyprowski wrote:
>> Don't we have DMA_BIDIRECTIONAL for such case?
> 
> Not sure if it was intended for that case, but it definitively should
> do the right thing for swiotlb, and it should also do the right thing
> in terms of cache maintainance.
> 
>> Maybe we should update 
>> documentation a bit to point that DMA_FROM_DEVICE expects the whole 
>> buffer to be filled by the device?
> 
> Probably. Horia, can you try to use DMA_BIDIRECTIONAL?
> 
This works, but at the cost of performance - all the cache lines being written
back to memory, just to be overwritten by the device.

Thanks,
Horia
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] swiotlb: sync buffer when mapping FROM_DEVICE

2019-05-23 Thread Christoph Hellwig
On Thu, May 23, 2019 at 07:35:07AM +0200, Marek Szyprowski wrote:
> Don't we have DMA_BIDIRECTIONAL for such case?

Not sure if it was intended for that case, but it definitively should
do the right thing for swiotlb, and it should also do the right thing
in terms of cache maintainance.

> Maybe we should update 
> documentation a bit to point that DMA_FROM_DEVICE expects the whole 
> buffer to be filled by the device?

Probably. Horia, can you try to use DMA_BIDIRECTIONAL?


Re: [PATCH] swiotlb: sync buffer when mapping FROM_DEVICE

2019-05-23 Thread Horia Geanta
On 5/23/2019 8:35 AM, Marek Szyprowski wrote:
> Hi Robin,
> 
> On 2019-05-22 15:55, Robin Murphy wrote:
>> On 22/05/2019 14:34, Christoph Hellwig wrote:
>>> On Wed, May 22, 2019 at 02:25:38PM +0100, Robin Murphy wrote:
 Sure, but that should be irrelevant since the effective problem here 
 is in
 the sync_*_for_cpu direction, and it's the unmap which nobbles the 
 buffer.
 If the driver does this:

 dma_map_single(whole buffer);
 
 dma_unmap_single(whole buffer);
 

 then it could instead do this and be happy:

 dma_map_single(whole buffer, SKIP_CPU_SYNC);
 
 dma_sync_single_for_cpu(updated part of buffer);
 dma_unmap_single(whole buffer, SKIP_CPU_SYNC);
 
>>>
>>> Assuming the driver knows how much was actually DMAed this would
>>> solve the issue.  Horia, does this work for you?
In my particular case, input is provided as a scatterlist, out of which first N
bytes are problematic (not written to by device and corrupted when swiotlb
bouncing is needed), while remaining bytes (Total - N) are updated by the 
device.

>>
>> Ohhh, and now I've just twigged what you were suggesting - your 
>> DMA_ATTR_PARTIAL flag would mean "treat this as a read-modify-write of 
>> the buffer because we *don't* know exactly which parts the device may 
>> write to". So indeed if we did go down that route we wouldn't need any 
>> of the sync stuff I was worrying about (but I might suggest naming it 
>> DMA_ATTR_UPDATE instead). Apologies for being slow :)
> 
> Don't we have DMA_BIDIRECTIONAL for such case? Maybe we should update 
> documentation a bit to point that DMA_FROM_DEVICE expects the whole 
> buffer to be filled by the device?
> 
Or, put more bluntly, driver must not rely on previous data in the area mapped
DMA_FROM_DEVICE. This limitation stems from the buffer bouncing mechanism of the
swiotlb DMA API backend, which other backends might not suffer from (e.g. 
IOMMU).

Btw, the device I am working on (caam crypto engine) is deployed in several SoCs
configured differently - with or without an IOMMU (and coherent or non-coherent
etc.). IOW it's a "power user" of the DMA API and I appreciate all the help in
solving / clarifying this kind of implicit assumptions.

Thanks,
Horia


Re: [PATCH v3 2/3] vfio: zpci: defining the VFIO headers

2019-05-23 Thread Cornelia Huck
On Thu, 23 May 2019 14:25:25 +0200
Pierre Morel  wrote:

> We define a new device region in vfio.h to be able to
> get the ZPCI CLP information by reading this region from
> userland.
> 
> We create a new file, vfio_zdev.h to define the structure
> of the new region we defined in vfio.h
> 
> Signed-off-by: Pierre Morel 
> ---
>  include/uapi/linux/vfio.h  |  4 
>  include/uapi/linux/vfio_zdev.h | 34 ++
>  2 files changed, 38 insertions(+)
>  create mode 100644 include/uapi/linux/vfio_zdev.h
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 8f10748..56595b8 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -306,6 +306,10 @@ struct vfio_region_info_cap_type {
>  #define VFIO_REGION_TYPE_GFX(1)
>  #define VFIO_REGION_SUBTYPE_GFX_EDID(1)
>  
> +/* IBM Subtypes */
> +#define VFIO_REGION_TYPE_IBM_ZDEV(1)
> +#define VFIO_REGION_SUBTYPE_ZDEV_CLP (1)

I'm afraid that confuses me a bit. You want to add the region to every
vfio-pci device when we're running under s390, right? So this does not
depend on the device type of the actual device (which may or may not be
from IBM), but only on the architecture?

(Generally speaking, I think using regions for this makes sense,
though.)

> +
>  /**
>   * struct vfio_region_gfx_edid - EDID region layout.
>   *
> diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
> new file mode 100644
> index 000..84b1a82
> --- /dev/null
> +++ b/include/uapi/linux/vfio_zdev.h
> @@ -0,0 +1,34 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * Region definition for ZPCI devices
> + *
> + * Copyright IBM Corp. 2019
> + *
> + * Author(s): Pierre Morel 
> + */
> +
> +#ifndef _VFIO_ZDEV_H_
> +#define _VFIO_ZDEV_H_
> +
> +#include 
> +
> +/**
> + * struct vfio_region_zpci_info - ZPCI information.
> + *
> + */
> +struct vfio_region_zpci_info {
> + __u64 dasm;
> + __u64 start_dma;
> + __u64 end_dma;
> + __u64 msi_addr;
> + __u64 flags;
> + __u16 pchid;
> + __u16 mui;
> + __u16 noi;
> + __u8 gid;
> + __u8 version;
> +#define VFIO_PCI_ZDEV_FLAGS_REFRESH 1
> + __u8 util_str[CLP_UTIL_STR_LEN];
> +} __packed;
> +
> +#endif

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v3 04/16] ioasid: Add custom IOASID allocator

2019-05-23 Thread Jacob Pan
On Thu, 23 May 2019 09:14:07 +0200
Auger Eric  wrote:

> Hi Jacob,
> 
> On 5/22/19 9:42 PM, Jacob Pan wrote:
> > On Tue, 21 May 2019 11:55:55 +0200
> > Auger Eric  wrote:
> >   
> >> Hi Jacob,
> >>
> >> On 5/4/19 12:32 AM, Jacob Pan wrote:  
> >>> Sometimes, IOASID allocation must be handled by platform specific
> >>> code. The use cases are guest vIOMMU and pvIOMMU where IOASIDs
> >>> need to be allocated by the host via enlightened or paravirt
> >>> interfaces.
> >>>
> >>> This patch adds an extension to the IOASID allocator APIs such
> >>> that platform drivers can register a custom allocator, possibly
> >>> at boot time, to take over the allocation. Xarray is still used
> >>> for tracking and searching purposes internal to the IOASID code.
> >>> Private data of an IOASID can also be set after the allocation.
> >>>
> >>> There can be multiple custom allocators registered but only one is
> >>> used at a time. In case of hot removal of devices that provides
> >>> the allocator, all IOASIDs must be freed prior to unregistering
> >>> the allocator. Default XArray based allocator cannot be mixed with
> >>> custom allocators, i.e. custom allocators will not be used if
> >>> there are outstanding IOASIDs allocated by the default XA
> >>> allocator.
> >>>
> >>> Signed-off-by: Jacob Pan 
> >>> ---
> >>>  drivers/iommu/ioasid.c | 125
> >>> + 1 file changed,
> >>> 125 insertions(+)
> >>>
> >>> diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
> >>> index 99f5e0a..ed2915a 100644
> >>> --- a/drivers/iommu/ioasid.c
> >>> +++ b/drivers/iommu/ioasid.c
> >>> @@ -17,6 +17,100 @@ struct ioasid_data {
> >>>  };
> >>>  
> >>>  static DEFINE_XARRAY_ALLOC(ioasid_xa);
> >>> +static DEFINE_MUTEX(ioasid_allocator_lock);
> >>> +static struct ioasid_allocator *active_custom_allocator;
> >>> +
> >>> +static LIST_HEAD(custom_allocators);
> >>> +/*
> >>> + * A flag to track if ioasid default allocator is in use, this
> >>> will
> >>> + * prevent custom allocator from being used. The reason is that
> >>> custom allocator
> >>> + * must have unadulterated space to track private data with
> >>> xarray, there cannot
> >>> + * be a mix between default and custom allocated IOASIDs.
> >>> + */
> >>> +static int default_allocator_active;
> >>> +
> >>> +/**
> >>> + * ioasid_register_allocator - register a custom allocator
> >>> + * @allocator: the custom allocator to be registered
> >>> + *
> >>> + * Custom allocators take precedence over the default xarray
> >>> based allocator.
> >>> + * Private data associated with the ASID are managed by ASID
> >>> common code
> >>> + * similar to data stored in xa.
> >>> + *
> >>> + * There can be multiple allocators registered but only one is
> >>> active. In case
> >>> + * of runtime removal of a custom allocator, the next one is
> >>> activated based
> >>> + * on the registration ordering.
> >>> + */
> >>> +int ioasid_register_allocator(struct ioasid_allocator *allocator)
> >>> +{
> >>> + struct ioasid_allocator *pallocator;
> >>> + int ret = 0;
> >>> +
> >>> + if (!allocator)
> >>> + return -EINVAL;
> >> is it really necessary? Isn't it the caller's responsibility?  
> > makes sense. will remove this one and below.  
> >>> +
> >>> + mutex_lock(&ioasid_allocator_lock);
> >>> + /*
> >>> +  * No particular preference since all custom allocators
> >>> end up calling
> >>> +  * the host to allocate IOASIDs. We activate the first
> >>> one and keep
> >>> +  * the later registered allocators in a list in case the
> >>> first one gets
> >>> +  * removed due to hotplug.
> >>> +  */
> >>> + if (list_empty(&custom_allocators))
> >>> + active_custom_allocator = allocator;
> >>> + else {
> >>> + /* Check if the allocator is already registered
> >>> */
> >>> + list_for_each_entry(pallocator,
> >>> &custom_allocators, list) {
> >>> + if (pallocator == allocator) {
> >>> + pr_err("IOASID allocator already
> >>> registered\n");
> >>> + ret = -EEXIST;
> >>> + goto out_unlock;
> >>> + }
> >>> + }
> >>> + }
> >>> + list_add_tail(&allocator->list, &custom_allocators);
> >>> +
> >>> +out_unlock:
> >>> + mutex_unlock(&ioasid_allocator_lock);
> >>> + return ret;
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(ioasid_register_allocator);
> >>> +
> >>> +/**
> >>> + * ioasid_unregister_allocator - Remove a custom IOASID allocator
> >>> + * @allocator: the custom allocator to be removed
> >>> + *
> >>> + * Remove an allocator from the list, activate the next allocator
> >>> in
> >>> + * the order it was registered.
> >>> + */
> >>> +void ioasid_unregister_allocator(struct ioasid_allocator
> >>> *allocator) +{
> >>> + if (!allocator)
> >>> + return;
> >> is it really necessary?  
> >>> +
> >>> + if (list_empty(&custom_allocators)) {
> >>> + pr_warn("No custom IOASID allocators active!\n");
> >>> + return;
> >>> + }
> >>> +
> >>> + mutex_lock(&ioasid_allocator_lock);
> >>> + 
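
For readers following the thread, the registration flow above boils down to a
platform driver handing the IOASID core an ops structure at probe time. A
minimal sketch, assuming struct ioasid_allocator carries alloc/free callbacks
plus the list head used above (the pv_* helpers are purely illustrative and
not part of the patch):

	static ioasid_t pv_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
	{
		/* Forward the request to the host via a paravirt/vIOMMU call. */
		return pv_iommu_alloc_pasid(min, max);	/* hypothetical */
	}

	static void pv_ioasid_free(ioasid_t ioasid, void *data)
	{
		pv_iommu_free_pasid(ioasid);		/* hypothetical */
	}

	static struct ioasid_allocator pv_ioasid_allocator = {
		.alloc	= pv_ioasid_alloc,
		.free	= pv_ioasid_free,
	};

	static int pv_iommu_probe(struct platform_device *pdev)
	{
		/* Custom allocators take precedence over the default XArray one. */
		return ioasid_register_allocator(&pv_ioasid_allocator);
	}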

Re: [RFC v3 0/3] vfio_pci: wrap pci device as a mediated device

2019-05-23 Thread Alex Williamson
On Thu, 23 May 2019 08:44:57 +
"Liu, Yi L"  wrote:

> Hi Alex,
> 
> Sorry to disturb you. Do you want to review this version or a 
> rebased version? :-) If a rebased version is better, I can try to do it asap.

Hi Yi,

Perhaps you missed my comments on 1/3:

https://www.spinics.net/lists/kvm/msg187282.html

In summary, it looks pretty good, but consider a file name more
consistent with the existing files and prune out the code changes from
the code moves so they can be reviewed more easily.  Thanks,

Alex

> > -Original Message-
> > From: Liu, Yi L
> > Sent: Tuesday, April 23, 2019 8:15 PM
> > To: alex.william...@redhat.com; kwankh...@nvidia.com
> > Cc: Tian, Kevin ; baolu...@linux.intel.com; Liu, Yi L
> > ; Sun, Yi Y ; j...@8bytes.org; jean-
> > philippe.bruc...@arm.com; pet...@redhat.com; linux-ker...@vger.kernel.org;
> > k...@vger.kernel.org; yamada.masah...@socionext.com; iommu@lists.linux-
> > foundation.org
> > Subject: [RFC v3 0/3] vfio_pci: wrap pci device as a mediated device
> > 
> > This patchset aims to add a vfio-pci-like meta driver as a demo user of the 
> > vfio
> > changes introduced in "vfio/mdev: IOMMU aware mediated device" patchset from
> > Baolu Lu.
> > 
> > Previous RFC v1 has given two proposals and the discussion could be found in
> > following link. Per the comments, this patchset adds a separate driver 
> > named vfio-
> > mdev-pci. It is a sample driver, but it lives in drivers/vfio/pci due to 
> > code sharing
> > considerations.
> > The corresponding Kconfig definition is in samples/Kconfig.
> > 
> > https://lkml.org/lkml/2019/3/4/529
> > 
> > Besides the test purpose, per Alex's comments, it could also be a good base 
> > driver
> > for experimenting with device specific mdev migration.
> > 
> > Specific interface tested in this proposal:
> > 
> > *) int mdev_set_iommu_device(struct device *dev,
> > struct device *iommu_device)
> >introduced in the patch as below:
> >"[PATCH v5 6/8] vfio/mdev: Add iommu related member in mdev_device"
> > 
> > 
> > Links:
> > *) Link of "vfio/mdev: IOMMU aware mediated device"
> > https://lwn.net/Articles/780522/
> > 
> > Please feel free give your comments.
> > 
> > Thanks,
> > Yi Liu
> > 
> > Change log:
> >   v2->v3:
> >   - use vfio-mdev-pci instead of vfio-pci-mdev
> >   - place the new driver under drivers/vfio/pci while define
> > Kconfig in samples/Kconfig to clarify it is a sample driver
> > 
> >   v1->v2:
> >   - instead of adding kernel option to existing vfio-pci
> > module in v1, v2 follows Alex's suggestion to add a
> > separate vfio-pci-mdev module.
> >   - new patchset subject: "vfio/pci: wrap pci device as a mediated device"
> > 
> > Liu, Yi L (3):
> >   vfio_pci: split vfio_pci.c into two source files
> >   vfio/pci: protect cap/ecap_perm bits alloc/free with atomic op
> >   smaples: add vfio-mdev-pci driver
> > 
> >  drivers/vfio/pci/Makefile   |7 +-
> >  drivers/vfio/pci/common.c   | 1511 
> > +++
> >  drivers/vfio/pci/vfio_mdev_pci.c|  386 +
> >  drivers/vfio/pci/vfio_pci.c | 1476 
> > +-
> >  drivers/vfio/pci/vfio_pci_config.c  |9 +
> >  drivers/vfio/pci/vfio_pci_private.h |   27 +
> >  samples/Kconfig |   11 +
> >  7 files changed, 1962 insertions(+), 1465 deletions(-)  create mode 100644
> > drivers/vfio/pci/common.c  create mode 100644 
> > drivers/vfio/pci/vfio_mdev_pci.c
> > 
> > --
> > 2.7.4  
> 



Re: [PATCH v5 1/1] iommu/io-pgtable-arm: Add support to use system cache

2019-05-23 Thread Vivek Gautam
On Thu, May 23, 2019 at 4:11 PM Robin Murphy  wrote:
>
> On 2019-05-16 10:30 am, Vivek Gautam wrote:
> > A few Qualcomm platforms, such as sdm845, have an additional outer
> > cache called the system cache, aka last-level cache (LLC), that
> > allows non-coherent devices to upgrade to using caching.
> > This cache sits right before the DDR, and is tightly coupled
> > with the memory controller. The clients using this cache request
> > their slices from this system cache, make it active, and can then
> > start using it.
> >
> > There is a fundamental assumption that non-coherent devices can't
> > access caches. This change adds an exception where they *can* use
> > some level of cache despite still being non-coherent overall.
> > The coherent devices that use cacheable memory, and CPU make use of
> > this system cache by default.
> >
> > Looking at memory types, we have following -
> > a) Normal uncached :- MAIR 0x44, inner non-cacheable,
> >outer non-cacheable;
> > b) Normal cached :-   MAIR 0xff, inner read write-back non-transient,
> >outer read write-back non-transient;
> > attribute setting for coherent I/O devices.
> > and, for non-coherent i/o devices that can allocate in system cache
> > another type gets added -
> > c) Normal sys-cached :- MAIR 0xf4, inner non-cacheable,
> >  outer read write-back non-transient
> >
> > Coherent I/O devices use system cache by marking the memory as
> > normal cached.
> > Non-coherent I/O devices should mark the memory as normal
> > sys-cached in page tables to use system cache.
> >
> > Signed-off-by: Vivek Gautam 
>
> Acked-by: Robin Murphy 

Thanks a lot Robin.

regards
Vivek
>
> There's a remote possibility that the IOMMU prot flag might be able to
> be somewhat generalised in future for panfrost, as Mali appears to have
> some pretty funky notions of cacheability, but this certainly looks fine
> for now, thanks.
>
> Robin.
>
> > ---
> >
> > V3 version of this patch and related series can be found at [1].
> > V4 of this patch is available at [2].
> >
> > The example usage of how a smmu master can make use of this protection
> > flag and set the correct memory attributes to start using system cache,
> > can be found at [3]; and here at [3] IOMMU_UPSTREAM_HINT is same as
> > IOMMU_QCOM_SYS_CACHE.
> >
> > Changes since v4:
> >   - Changed ARM_LPAE_MAIR_ATTR_QCOM_SYS_CACHE to
> > ARM_LPAE_MAIR_ATTR_INC_OWBRWA.
> >   - Changed ARM_LPAE_MAIR_ATTR_IDX_QCOM_SYS_CACHE to
> > ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE.
> >   - Added comments to iommu protection flag - IOMMU_QCOM_SYS_CACHE.
> >
> > Changes since v3:
> >   - Dropping support to cache i/o page tables to system cache. Getting 
> > support
> > for data buffers is the first step.
> > Removed io-pgtable quirk and related change to add domain attribute.
> >
> > Glmark2 numbers on SDM845 based cheza board:
> >
> > S.No.|with LLC support   |without LLC support
> >   |   for data buffers   |
> > ---
> > 1|4480; 72.3fps  |4042; 65.2fps
> > 2|4500; 72.6fps  |4039; 65.1fps
> > 3|4523; 72.9fps  |4106; 66.2fps
> > 4|4489; 72.4fps  |4104; 66.2fps
> > 5|4518; 72.9fps  |4072; 65.7fps
> >
> > [1] https://patchwork.kernel.org/cover/10772629/
> > [2] https://lore.kernel.org/patchwork/patch/1072936/
> > [3] https://patchwork.kernel.org/patch/10302791/
> >
> >   drivers/iommu/io-pgtable-arm.c | 9 -
> >   include/linux/iommu.h  | 6 ++
> >   2 files changed, 14 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > index 4e21efbc4459..2454ac11aa97 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -167,10 +167,12 @@
> >   #define ARM_LPAE_MAIR_ATTR_MASK 0xff
> >   #define ARM_LPAE_MAIR_ATTR_DEVICE   0x04
> >   #define ARM_LPAE_MAIR_ATTR_NC   0x44
> > +#define ARM_LPAE_MAIR_ATTR_INC_OWBRWA0xf4
> >   #define ARM_LPAE_MAIR_ATTR_WBRWA0xff
> >   #define ARM_LPAE_MAIR_ATTR_IDX_NC   0
> >   #define ARM_LPAE_MAIR_ATTR_IDX_CACHE1
> >   #define ARM_LPAE_MAIR_ATTR_IDX_DEV  2
> > +#define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE3
> >
> >   #define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0)
> >   #define ARM_MALI_LPAE_TTBR_READ_INNER   BIT(2)
> > @@ -470,6 +472,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct 
> > arm_lpae_io_pgtable *data,
> >   else if (prot & IOMMU_CACHE)
> >   pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
> >   << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> > + else if (prot & IOMMU_QCOM_SYS_CACHE)
> > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE
> > + << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> >   
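
For a driver-side view of what this buys a non-coherent master: once its LLC
slice is active, it simply ORs the new prot bit into its IOMMU mappings and
the io-pgtable code above picks attribute index 3 (MAIR 0xf4). A minimal
sketch, assuming an already-attached domain and an activated slice (only
iommu_map() and IOMMU_QCOM_SYS_CACHE come from existing/this-series code; the
function name is illustrative):

	/* Map a buffer so a non-coherent master allocates it in the LLC. */
	static int qcom_master_map_llc(struct iommu_domain *domain,
				       unsigned long iova, phys_addr_t paddr,
				       size_t size)
	{
		int prot = IOMMU_READ | IOMMU_WRITE | IOMMU_QCOM_SYS_CACHE;

		return iommu_map(domain, iova, paddr, size, prot);
	}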

Re: implement generic dma_map_ops for IOMMUs v6

2019-05-23 Thread Robin Murphy

On 23/05/2019 08:00, Christoph Hellwig wrote:


Hi Robin and Joerg,

I think we are finally ready for the generic dma-iommu series.  I have
various DMA API changes pending, and Tom has patches ready to convert
the AMD and Intel iommu drivers over to it.  I'd love to have this
in a stable branch shared between the dma-mapping and iommu trees
the day after rc2 is released.  I volunteer to create the branch,
but I'm fine with it living in the iommu tree as well.


Yup, I'd also like to see this in -next ASAP in the hope that board 
farms and maybe even real users can start chewing on it. Thanks for 
persevering :)


Robin.




A git tree is also available at:

 git://git.infradead.org/users/hch/misc.git dma-iommu-ops.6

Gitweb:

 
http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-iommu-ops.6

Changes since v5:
  - rebased to latest linus tree and dropped a patch now merged
  - remove the now pointless __dma_iommu_mmap function
  - restore a cleanup from Robin that would have caused a conflict in
last merge window, but is fine now

Changes since v4:
  - rebased to 5.2-rc1

Changes since v3:
  - fold the separate patch to refactor mmap bounds checking
  - don't warn on not finding a vm_area
  - improve a commit log
  - refactor __dma_iommu_free a little differently
  - remove a minor MSI map cleanup to avoid a conflict with the
"Split iommu_dma_map_msi_msg" series

Changes since v2:
  - address various review comments and include patches from Robin

Changes since v1:
  - only include other headers in dma-iommu.h if CONFIG_DMA_IOMMU is enabled
  - keep using a scatterlist in iommu_dma_alloc
  - split out mmap/sgtable fixes and move them early in the series
  - updated a few commit logs
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v3 1/3] s390: pci: Exporting access to CLP PCI function and PCI group

2019-05-23 Thread Pierre Morel
For the generic implementation of VFIO PCI we need to retrieve
the hardware configuration for the PCI functions and the
PCI function groups.

We modify the internal functions using CLP Query PCI Function and
CLP Query PCI Function Group so that they can be called from
outside the s390 architecture PCI code, and we prefix the two
functions with "zdev" to make clear that they can be called
knowing only the associated zdevice.
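
As an illustration of the intended use, a caller outside arch/s390/pci could
then query a function roughly as follows. This is only a sketch: it assumes
the caller supplies its own zeroed, page-sized CLP request/response block
(clp_alloc_block() remains internal to pci_clp.c) and checks the CLP response
code itself:

	static int example_query_fn(struct zpci_dev *zdev)
	{
		struct clp_req_rsp_query_pci *rrb;
		int rc;

		/* CLP request/response blocks are one page, page aligned. */
		rrb = (void *)get_zeroed_page(GFP_KERNEL);
		if (!rrb)
			return -ENOMEM;

		rc = zdev_query_pci_fn(zdev, rrb);
		if (!rc && rrb->response.hdr.rsp != CLP_RC_OK)
			rc = -EIO;

		free_page((unsigned long)rrb);
		return rc;
	}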

Signed-off-by: Pierre Morel 
Reviewed-by: Sebastian Ott 
---
 arch/s390/include/asm/pci.h |  3 ++
 arch/s390/pci/pci_clp.c | 70 +++--
 2 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 305befd..e66b246 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -261,4 +261,7 @@ cpumask_of_pcibus(const struct pci_bus *bus)
 
 #endif /* CONFIG_NUMA */
 
+int zdev_query_pci_fngrp(struct zpci_dev *zdev,
+struct clp_req_rsp_query_pci_grp *rrb);
+int zdev_query_pci_fn(struct zpci_dev *zdev, struct clp_req_rsp_query_pci 
*rrb);
 #endif
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 3a36b07..c57f675 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -113,31 +113,16 @@ static void clp_store_query_pci_fngrp(struct zpci_dev 
*zdev,
}
 }
 
-static int clp_query_pci_fngrp(struct zpci_dev *zdev, u8 pfgid)
+int zdev_query_pci_fngrp(struct zpci_dev *zdev,
+struct clp_req_rsp_query_pci_grp *rrb)
 {
-   struct clp_req_rsp_query_pci_grp *rrb;
-   int rc;
-
-   rrb = clp_alloc_block(GFP_KERNEL);
-   if (!rrb)
-   return -ENOMEM;
-
memset(rrb, 0, sizeof(*rrb));
rrb->request.hdr.len = sizeof(rrb->request);
rrb->request.hdr.cmd = CLP_QUERY_PCI_FNGRP;
rrb->response.hdr.len = sizeof(rrb->response);
-   rrb->request.pfgid = pfgid;
+   rrb->request.pfgid = zdev->pfgid;
 
-   rc = clp_req(rrb, CLP_LPS_PCI);
-   if (!rc && rrb->response.hdr.rsp == CLP_RC_OK)
-   clp_store_query_pci_fngrp(zdev, &rrb->response);
-   else {
-   zpci_err("Q PCI FGRP:\n");
-   zpci_err_clp(rrb->response.hdr.rsp, rc);
-   rc = -EIO;
-   }
-   clp_free_block(rrb);
-   return rc;
+   return clp_req(rrb, CLP_LPS_PCI);
 }
 
 static int clp_store_query_pci_fn(struct zpci_dev *zdev,
@@ -174,32 +159,49 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
return 0;
 }
 
-static int clp_query_pci_fn(struct zpci_dev *zdev, u32 fh)
+int zdev_query_pci_fn(struct zpci_dev *zdev, struct clp_req_rsp_query_pci *rrb)
+{
+
+   memset(rrb, 0, sizeof(*rrb));
+   rrb->request.hdr.len = sizeof(rrb->request);
+   rrb->request.hdr.cmd = CLP_QUERY_PCI_FN;
+   rrb->response.hdr.len = sizeof(rrb->response);
+   rrb->request.fh = zdev->fh;
+
+   return clp_req(rrb, CLP_LPS_PCI);
+}
+
+static int clp_query_pci(struct zpci_dev *zdev)
 {
struct clp_req_rsp_query_pci *rrb;
+   struct clp_req_rsp_query_pci_grp *grrb;
int rc;
 
rrb = clp_alloc_block(GFP_KERNEL);
if (!rrb)
return -ENOMEM;
 
-   memset(rrb, 0, sizeof(*rrb));
-   rrb->request.hdr.len = sizeof(rrb->request);
-   rrb->request.hdr.cmd = CLP_QUERY_PCI_FN;
-   rrb->response.hdr.len = sizeof(rrb->response);
-   rrb->request.fh = fh;
-
-   rc = clp_req(rrb, CLP_LPS_PCI);
-   if (!rc && rrb->response.hdr.rsp == CLP_RC_OK) {
-   rc = clp_store_query_pci_fn(zdev, &rrb->response);
-   if (rc)
-   goto out;
-   rc = clp_query_pci_fngrp(zdev, rrb->response.pfgid);
-   } else {
+   rc = zdev_query_pci_fn(zdev, rrb);
+   if (rc || rrb->response.hdr.rsp != CLP_RC_OK) {
zpci_err("Q PCI FN:\n");
zpci_err_clp(rrb->response.hdr.rsp, rc);
rc = -EIO;
+   goto out;
}
+   rc = clp_store_query_pci_fn(zdev, &rrb->response);
+   if (rc)
+   goto out;
+
+   grrb = (struct clp_req_rsp_query_pci_grp *)rrb;
+   rc = zdev_query_pci_fngrp(zdev, grrb);
+   if (rc || grrb->response.hdr.rsp != CLP_RC_OK) {
+   zpci_err("Q PCI FGRP:\n");
+   zpci_err_clp(grrb->response.hdr.rsp, rc);
+   rc = -EIO;
+   goto out;
+   }
+   clp_store_query_pci_fngrp(zdev, &grrb->response);
+
 out:
clp_free_block(rrb);
return rc;
@@ -219,7 +221,7 @@ int clp_add_pci_device(u32 fid, u32 fh, int configured)
zdev->fid = fid;
 
/* Query function properties and update zdev */
-   rc = clp_query_pci_fn(zdev, fh);
+   rc = clp_query_pci(zdev);
if (rc)
goto error;
 
-- 
2.7.4

___
iommu mailing list
iommu@lists.linux-foundation.org

[PATCH v3 0/3] Retrieving zPCI specific info with VFIO

2019-05-23 Thread Pierre Morel
We define a new configuration entry for VFIO/PCI, VFIO_PCI_ZDEV
to configure access to a zPCI region dedicated for retrieving
zPCI features.

When the VFIO_PCI_ZDEV feature is configured, we initialize
a new device region, VFIO_REGION_SUBTYPE_ZDEV_CLP, to hold
the zPCI device information that userland needs to pass to
a guest driving the zPCI function.


Note that in the current state we do not use the CLP instructions
to access the firmware but get the information directly from
the zdev device.

- This means that patch 1, "s390: pci: Exporting access to CLP PCI
function and PCI group", is not used yet and could be left out of this series
without breaking the other patches.
- But we will need it later, probably in the next iteration,
  to retrieve values that are not saved inside the zdev structure,
  like maxstbl and the supported PCI version.

To share the code with arch/s390/pci/pci_clp.c, the original functions
in pci_clp.c that query PCI functions and PCI function groups are
modified so that they can be exported.

A new function clp_query_pci() replaces clp_query_pci_fn() and
the previous calls to clp_query_pci_fn() and clp_query_pci_fngrp()
are replaced with calls to zdev_query_pci_fn() and zdev_query_pci_fngrp()
using a zdev pointer as argument.
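
From the userland side, consuming the new region then amounts to finding the
device region whose capability chain advertises VFIO_REGION_TYPE_IBM_ZDEV /
VFIO_REGION_SUBTYPE_ZDEV_CLP and reading a struct vfio_region_zpci_info from
its offset. A rough sketch (the capability-chain walk that yields "index" and
all error handling are omitted; only the read itself is shown):

	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = index,	/* found via VFIO_REGION_INFO_CAP_TYPE */
	};
	struct vfio_region_zpci_info zpci;

	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
	if (pread(device_fd, &zpci, sizeof(zpci), info.offset) == sizeof(zpci))
		printf("start_dma=0x%llx end_dma=0x%llx noi=%u\n",
		       (unsigned long long)zpci.start_dma,
		       (unsigned long long)zpci.end_dma, zpci.noi);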


Pierre Morel (3):
  s390: pci: Exporting access to CLP PCI function and PCI group
  vfio: zpci: defining the VFIO headers
  vfio: pci: Using a device region to retrieve zPCI information

 arch/s390/include/asm/pci.h |  3 ++
 arch/s390/pci/pci_clp.c | 70 ---
 drivers/vfio/pci/Kconfig|  7 
 drivers/vfio/pci/Makefile   |  1 +
 drivers/vfio/pci/vfio_pci.c |  9 
 drivers/vfio/pci/vfio_pci_private.h | 10 +
 drivers/vfio/pci/vfio_pci_zdev.c| 83 +
 include/uapi/linux/vfio.h   |  4 ++
 include/uapi/linux/vfio_zdev.h  | 34 +++
 9 files changed, 187 insertions(+), 34 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_zdev.c
 create mode 100644 include/uapi/linux/vfio_zdev.h

-- 
2.7.4

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v3 2/3] vfio: zpci: defining the VFIO headers

2019-05-23 Thread Pierre Morel
We define a new device region in vfio.h to be able to
get the ZPCI CLP information by reading this region from
userland.

We create a new file, vfio_zdev.h to define the structure
of the new region we defined in vfio.h

Signed-off-by: Pierre Morel 
---
 include/uapi/linux/vfio.h  |  4 
 include/uapi/linux/vfio_zdev.h | 34 ++
 2 files changed, 38 insertions(+)
 create mode 100644 include/uapi/linux/vfio_zdev.h

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 8f10748..56595b8 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -306,6 +306,10 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_GFX(1)
 #define VFIO_REGION_SUBTYPE_GFX_EDID(1)
 
+/* IBM Subtypes */
+#define VFIO_REGION_TYPE_IBM_ZDEV  (1)
+#define VFIO_REGION_SUBTYPE_ZDEV_CLP   (1)
+
 /**
  * struct vfio_region_gfx_edid - EDID region layout.
  *
diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
new file mode 100644
index 000..84b1a82
--- /dev/null
+++ b/include/uapi/linux/vfio_zdev.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Region definition for ZPCI devices
+ *
+ * Copyright IBM Corp. 2019
+ *
+ * Author(s): Pierre Morel 
+ */
+
+#ifndef _VFIO_ZDEV_H_
+#define _VFIO_ZDEV_H_
+
+#include 
+
+/**
+ * struct vfio_region_zpci_info - ZPCI information.
+ *
+ */
+struct vfio_region_zpci_info {
+   __u64 dasm;
+   __u64 start_dma;
+   __u64 end_dma;
+   __u64 msi_addr;
+   __u64 flags;
+   __u16 pchid;
+   __u16 mui;
+   __u16 noi;
+   __u8 gid;
+   __u8 version;
+#define VFIO_PCI_ZDEV_FLAGS_REFRESH 1
+   __u8 util_str[CLP_UTIL_STR_LEN];
+} __packed;
+
+#endif
-- 
2.7.4

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v3 3/3] vfio: pci: Using a device region to retrieve zPCI information

2019-05-23 Thread Pierre Morel
We define a new configuration entry for VFIO/PCI, VFIO_PCI_ZDEV

When the VFIO_PCI_ZDEV feature is configured, we initialize
a new device region, VFIO_REGION_SUBTYPE_ZDEV_CLP, to hold
the zPCI device information that userland needs to pass to
a guest driving the zPCI function.

Signed-off-by: Pierre Morel 
---
 drivers/vfio/pci/Kconfig|  7 
 drivers/vfio/pci/Makefile   |  1 +
 drivers/vfio/pci/vfio_pci.c |  9 
 drivers/vfio/pci/vfio_pci_private.h | 10 +
 drivers/vfio/pci/vfio_pci_zdev.c| 83 +
 5 files changed, 110 insertions(+)
 create mode 100644 drivers/vfio/pci/vfio_pci_zdev.c

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index d0f8e4f..9c1181c 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -44,3 +44,10 @@ config VFIO_PCI_NVLINK2
depends on VFIO_PCI && PPC_POWERNV
help
  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
+
+config VFIO_PCI_ZDEV
+   tristate "VFIO PCI Generic for ZPCI devices"
+   depends on VFIO_PCI && S390
+   default y
+   help
+ VFIO PCI support for S390 Z-PCI devices
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 9662c06..fd53819 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -2,5 +2,6 @@
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
 vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
+vfio-pci-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 3fa20e9..b6087d6 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -362,6 +362,15 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
}
}
 
+   if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
+   ret = vfio_pci_zdev_init(vdev);
+   if (ret) {
+   dev_warn(>pdev->dev,
+"Failed to setup ZDEV regions\n");
+   goto disable_exit;
+   }
+   }
+
vfio_pci_probe_mmaps(vdev);
 
return 0;
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 1812cf2..db73cdf 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -189,4 +189,14 @@ static inline int vfio_pci_ibm_npu2_init(struct 
vfio_pci_device *vdev)
return -ENODEV;
 }
 #endif
+
+#if IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)
+extern int vfio_pci_zdev_init(struct vfio_pci_device *vdev);
+#else
+static inline int vfio_pci_zdev_init(struct vfio_pci_device *vdev)
+{
+   return -ENODEV;
+}
+#endif
+
 #endif /* VFIO_PCI_PRIVATE_H */
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
new file mode 100644
index 000..230a4e4
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO ZPCI devices support
+ *
+ * Copyright (C) IBM Corp. 2019.  All rights reserved.
+ * Author: Pierre Morel 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_pci_private.h"
+
+static size_t vfio_pci_zdev_rw(struct vfio_pci_device *vdev,
+  char __user *buf, size_t count, loff_t *ppos,
+  bool iswrite)
+{
+   struct vfio_region_zpci_info *region;
+   struct zpci_dev *zdev;
+   unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+
+   if (!vdev->pdev->bus)
+   return -ENODEV;
+
+   zdev = vdev->pdev->bus->sysdata;
+   if (!zdev)
+   return -ENODEV;
+
+   if ((*ppos & VFIO_PCI_OFFSET_MASK) || (count != sizeof(*region)))
+   return -EINVAL;
+
+   region = vdev->region[index - VFIO_PCI_NUM_REGIONS].data;
+   region->dasm = zdev->dma_mask;
+   region->start_dma = zdev->start_dma;
+   region->end_dma = zdev->end_dma;
+   region->msi_addr = zdev->msi_addr;
+   region->flags = VFIO_PCI_ZDEV_FLAGS_REFRESH;
+   region->gid = zdev->pfgid;
+   region->mui = zdev->fmb_update;
+   region->noi = zdev->max_msi;
+   memcpy(region->util_str, zdev->util_str, CLP_UTIL_STR_LEN);
+
+   if (copy_to_user(buf, region, count))
+   return -EFAULT;
+
+   return count;
+}
+
+static void vfio_pci_zdev_release(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region)
+{
+   kfree(region->data);
+}
+
+static const struct vfio_pci_regops vfio_pci_zdev_regops = {
+   .rw = vfio_pci_zdev_rw,
+   .release= 
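
The patch body is cut off above; for context, the missing piece is the init
routine that registers the region. A plausible sketch, assuming the existing
vfio_pci_register_dev_region() helper and the regops/IDs defined in this
series (not necessarily the author's exact code):

	int vfio_pci_zdev_init(struct vfio_pci_device *vdev)
	{
		struct vfio_region_zpci_info *data;
		int ret;

		data = kzalloc(sizeof(*data), GFP_KERNEL);
		if (!data)
			return -ENOMEM;

		/* Expose a read-only CLP information region to userspace. */
		ret = vfio_pci_register_dev_region(vdev,
				VFIO_REGION_TYPE_IBM_ZDEV,
				VFIO_REGION_SUBTYPE_ZDEV_CLP,
				&vfio_pci_zdev_regops, sizeof(*data),
				VFIO_REGION_INFO_FLAG_READ, data);
		if (ret)
			kfree(data);
		return ret;
	}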

Re: [PATCH v5 1/1] iommu/io-pgtable-arm: Add support to use system cache

2019-05-23 Thread Robin Murphy

On 2019-05-16 10:30 am, Vivek Gautam wrote:

A few Qualcomm platforms, such as sdm845, have an additional outer
cache called the system cache, aka last-level cache (LLC), that
allows non-coherent devices to upgrade to using caching.
This cache sits right before the DDR, and is tightly coupled
with the memory controller. The clients using this cache request
their slices from this system cache, make it active, and can then
start using it.

There is a fundamental assumption that non-coherent devices can't
access caches. This change adds an exception where they *can* use
some level of cache despite still being non-coherent overall.
The coherent devices that use cacheable memory, and CPU make use of
this system cache by default.

Looking at memory types, we have following -
a) Normal uncached :- MAIR 0x44, inner non-cacheable,
   outer non-cacheable;
b) Normal cached :-   MAIR 0xff, inner read write-back non-transient,
   outer read write-back non-transient;
   attribute setting for coherent I/O devices.
and, for non-coherent i/o devices that can allocate in system cache
another type gets added -
c) Normal sys-cached :- MAIR 0xf4, inner non-cacheable,
 outer read write-back non-transient

Coherent I/O devices use system cache by marking the memory as
normal cached.
Non-coherent I/O devices should mark the memory as normal
sys-cached in page tables to use system cache.

Signed-off-by: Vivek Gautam 


Acked-by: Robin Murphy 

There's a remote possibility that the IOMMU prot flag might be able to 
be somewhat generalised in future for panfrost, as Mali appears to have 
some pretty funky notions of cacheability, but this certainly looks fine 
for now, thanks.


Robin.


---

V3 version of this patch and related series can be found at [1].
V4 of this patch is available at [2].

The example usage of how a smmu master can make use of this protection
flag and set the correct memory attributes to start using system cache,
can be found at [3]; and here at [3] IOMMU_UPSTREAM_HINT is same as
IOMMU_QCOM_SYS_CACHE.

Changes since v4:
  - Changed ARM_LPAE_MAIR_ATTR_QCOM_SYS_CACHE to
ARM_LPAE_MAIR_ATTR_INC_OWBRWA.
  - Changed ARM_LPAE_MAIR_ATTR_IDX_QCOM_SYS_CACHE to
ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE.
  - Added comments to iommu protection flag - IOMMU_QCOM_SYS_CACHE.

Changes since v3:
  - Dropping support to cache i/o page tables to system cache. Getting support
for data buffers is the first step.
Removed io-pgtable quirk and related change to add domain attribute.

Glmark2 numbers on SDM845 based cheza board:

S.No.|  with LLC support   |without LLC support
  | for data buffers   |
--- 
1|  4480; 72.3fps  |4042; 65.2fps
2|  4500; 72.6fps  |4039; 65.1fps
3|  4523; 72.9fps  |4106; 66.2fps
4|  4489; 72.4fps  |4104; 66.2fps
5|  4518; 72.9fps  |4072; 65.7fps

[1] https://patchwork.kernel.org/cover/10772629/
[2] https://lore.kernel.org/patchwork/patch/1072936/
[3] https://patchwork.kernel.org/patch/10302791/

  drivers/iommu/io-pgtable-arm.c | 9 -
  include/linux/iommu.h  | 6 ++
  2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 4e21efbc4459..2454ac11aa97 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -167,10 +167,12 @@
  #define ARM_LPAE_MAIR_ATTR_MASK   0xff
  #define ARM_LPAE_MAIR_ATTR_DEVICE 0x04
  #define ARM_LPAE_MAIR_ATTR_NC 0x44
+#define ARM_LPAE_MAIR_ATTR_INC_OWBRWA  0xf4
  #define ARM_LPAE_MAIR_ATTR_WBRWA  0xff
  #define ARM_LPAE_MAIR_ATTR_IDX_NC 0
  #define ARM_LPAE_MAIR_ATTR_IDX_CACHE  1
  #define ARM_LPAE_MAIR_ATTR_IDX_DEV2
+#define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE  3
  
  #define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0)

  #define ARM_MALI_LPAE_TTBR_READ_INNER BIT(2)
@@ -470,6 +472,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct 
arm_lpae_io_pgtable *data,
else if (prot & IOMMU_CACHE)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+   else if (prot & IOMMU_QCOM_SYS_CACHE)
+   pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE
+   << ARM_LPAE_PTE_ATTRINDX_SHIFT);
}
  
  	if (prot & IOMMU_NOEXEC)

@@ -857,7 +862,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
  (ARM_LPAE_MAIR_ATTR_WBRWA
   << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
  (ARM_LPAE_MAIR_ATTR_DEVICE
-  << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
+  << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
+ (ARM_LPAE_MAIR_ATTR_INC_OWBRWA
+  << 

RE: [RFC v3 0/3] vfio_pci: wrap pci device as a mediated device

2019-05-23 Thread Liu, Yi L
Hi Alex,

Sorry to disturb you. Do you want to review this version or a rebased 
version? :-) If a rebased version is better, I can try to do it asap.

Thanks,
Yi Liu

> -Original Message-
> From: Liu, Yi L
> Sent: Tuesday, April 23, 2019 8:15 PM
> To: alex.william...@redhat.com; kwankh...@nvidia.com
> Cc: Tian, Kevin ; baolu...@linux.intel.com; Liu, Yi L
> ; Sun, Yi Y ; j...@8bytes.org; jean-
> philippe.bruc...@arm.com; pet...@redhat.com; linux-ker...@vger.kernel.org;
> k...@vger.kernel.org; yamada.masah...@socionext.com; iommu@lists.linux-
> foundation.org
> Subject: [RFC v3 0/3] vfio_pci: wrap pci device as a mediated device
> 
> This patchset aims to add a vfio-pci-like meta driver as a demo user of the 
> vfio
> changes introduced in "vfio/mdev: IOMMU aware mediated device" patchset from
> Baolu Lu.
> 
> Previous RFC v1 has given two proposals and the discussion could be found in
> following link. Per the comments, this patchset adds a separate driver named 
> vfio-
> mdev-pci. It is a sample driver, but it lives in drivers/vfio/pci due to code 
> sharing
> considerations.
> The corresponding Kconfig definition is in samples/Kconfig.
> 
> https://lkml.org/lkml/2019/3/4/529
> 
> Besides the test purpose, per Alex's comments, it could also be a good base 
> driver
> for experimenting with device specific mdev migration.
> 
> Specific interface tested in this proposal:
> 
> *) int mdev_set_iommu_device(struct device *dev,
>   struct device *iommu_device)
>introduced in the patch as below:
>"[PATCH v5 6/8] vfio/mdev: Add iommu related member in mdev_device"
> 
> 
> Links:
> *) Link of "vfio/mdev: IOMMU aware mediated device"
>   https://lwn.net/Articles/780522/
> 
> Please feel free give your comments.
> 
> Thanks,
> Yi Liu
> 
> Change log:
>   v2->v3:
>   - use vfio-mdev-pci instead of vfio-pci-mdev
>   - place the new driver under drivers/vfio/pci while define
> Kconfig in samples/Kconfig to clarify it is a sample driver
> 
>   v1->v2:
>   - instead of adding kernel option to existing vfio-pci
> module in v1, v2 follows Alex's suggestion to add a
> separate vfio-pci-mdev module.
>   - new patchset subject: "vfio/pci: wrap pci device as a mediated device"
> 
> Liu, Yi L (3):
>   vfio_pci: split vfio_pci.c into two source files
>   vfio/pci: protect cap/ecap_perm bits alloc/free with atomic op
>   smaples: add vfio-mdev-pci driver
> 
>  drivers/vfio/pci/Makefile   |7 +-
>  drivers/vfio/pci/common.c   | 1511 
> +++
>  drivers/vfio/pci/vfio_mdev_pci.c|  386 +
>  drivers/vfio/pci/vfio_pci.c | 1476 +-
>  drivers/vfio/pci/vfio_pci_config.c  |9 +
>  drivers/vfio/pci/vfio_pci_private.h |   27 +
>  samples/Kconfig |   11 +
>  7 files changed, 1962 insertions(+), 1465 deletions(-)  create mode 100644
> drivers/vfio/pci/common.c  create mode 100644 drivers/vfio/pci/vfio_mdev_pci.c
> 
> --
> 2.7.4
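
For context on the interface being exercised: wiring it up in a parent driver
is essentially a one-liner in the mdev create callback. A minimal sketch,
assuming a PCI parent device (mdev_dev()/mdev_parent_dev() are the stock mdev
accessors; only mdev_set_iommu_device() comes from the referenced series):

	static int sample_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
	{
		/*
		 * Tell the vfio/mdev core which physical device backs this
		 * mediated device, so the IOMMU-aware type1 backend can attach
		 * an aux domain to it on behalf of the mdev.
		 */
		return mdev_set_iommu_device(mdev_dev(mdev), mdev_parent_dev(mdev));
	}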



Re: [PATCH v5 1/1] iommu/io-pgtable-arm: Add support to use system cache

2019-05-23 Thread Vivek Gautam
Hi Robin,



On Thu, May 16, 2019 at 3:00 PM Vivek Gautam
 wrote:
>
> A few Qualcomm platforms, such as sdm845, have an additional outer
> cache called the system cache, aka last-level cache (LLC), that
> allows non-coherent devices to upgrade to using caching.
> This cache sits right before the DDR, and is tightly coupled
> with the memory controller. The clients using this cache request
> their slices from this system cache, make it active, and can then
> start using it.
>
> There is a fundamental assumption that non-coherent devices can't
> access caches. This change adds an exception where they *can* use
> some level of cache despite still being non-coherent overall.
> The coherent devices that use cacheable memory, and CPU make use of
> this system cache by default.
>
> Looking at memory types, we have following -
> a) Normal uncached :- MAIR 0x44, inner non-cacheable,
>   outer non-cacheable;
> b) Normal cached :-   MAIR 0xff, inner read write-back non-transient,
>   outer read write-back non-transient;
>   attribute setting for coherent I/O devices.
> and, for non-coherent i/o devices that can allocate in system cache
> another type gets added -
> c) Normal sys-cached :- MAIR 0xf4, inner non-cacheable,
> outer read write-back non-transient
>
> Coherent I/O devices use system cache by marking the memory as
> normal cached.
> Non-coherent I/O devices should mark the memory as normal
> sys-cached in page tables to use system cache.
>
> Signed-off-by: Vivek Gautam 
> ---

Let me know if there's more to improve on this patch.

Best regards
Vivek

>
> V3 version of this patch and related series can be found at [1].
> V4 of this patch is available at [2].
>
> The example usage of how a smmu master can make use of this protection
> flag and set the correct memory attributes to start using system cache,
> can be found at [3]; and here at [3] IOMMU_UPSTREAM_HINT is same as
> IOMMU_QCOM_SYS_CACHE.
>
> Changes since v4:
>  - Changed ARM_LPAE_MAIR_ATTR_QCOM_SYS_CACHE to
>ARM_LPAE_MAIR_ATTR_INC_OWBRWA.
>  - Changed ARM_LPAE_MAIR_ATTR_IDX_QCOM_SYS_CACHE to
>ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE.
>  - Added comments to iommu protection flag - IOMMU_QCOM_SYS_CACHE.
>
> Changes since v3:
>  - Dropping support to cache i/o page tables to system cache. Getting support
>for data buffers is the first step.
>Removed io-pgtable quirk and related change to add domain attribute.
>
> Glmark2 numbers on SDM845 based cheza board:
>
> S.No.|  with LLC support   |without LLC support
>  |  for data buffers   |
> ---
> 1|  4480; 72.3fps  |4042; 65.2fps
> 2|  4500; 72.6fps  |4039; 65.1fps
> 3|  4523; 72.9fps  |4106; 66.2fps
> 4|  4489; 72.4fps  |4104; 66.2fps
> 5|  4518; 72.9fps  |4072; 65.7fps
>
> [1] https://patchwork.kernel.org/cover/10772629/
> [2] https://lore.kernel.org/patchwork/patch/1072936/
> [3] https://patchwork.kernel.org/patch/10302791/
>
>  drivers/iommu/io-pgtable-arm.c | 9 -
>  include/linux/iommu.h  | 6 ++
>  2 files changed, 14 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> index 4e21efbc4459..2454ac11aa97 100644
> --- a/drivers/iommu/io-pgtable-arm.c
> +++ b/drivers/iommu/io-pgtable-arm.c
> @@ -167,10 +167,12 @@
>  #define ARM_LPAE_MAIR_ATTR_MASK0xff
>  #define ARM_LPAE_MAIR_ATTR_DEVICE  0x04
>  #define ARM_LPAE_MAIR_ATTR_NC  0x44
> +#define ARM_LPAE_MAIR_ATTR_INC_OWBRWA  0xf4
>  #define ARM_LPAE_MAIR_ATTR_WBRWA   0xff
>  #define ARM_LPAE_MAIR_ATTR_IDX_NC  0
>  #define ARM_LPAE_MAIR_ATTR_IDX_CACHE   1
>  #define ARM_LPAE_MAIR_ATTR_IDX_DEV 2
> +#define ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE  3
>
>  #define ARM_MALI_LPAE_TTBR_ADRMODE_TABLE (3u << 0)
>  #define ARM_MALI_LPAE_TTBR_READ_INNER  BIT(2)
> @@ -470,6 +472,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct 
> arm_lpae_io_pgtable *data,
> else if (prot & IOMMU_CACHE)
> pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
> << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> +   else if (prot & IOMMU_QCOM_SYS_CACHE)
> +   pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE
> +   << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> }
>
> if (prot & IOMMU_NOEXEC)
> @@ -857,7 +862,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
> void *cookie)
>   (ARM_LPAE_MAIR_ATTR_WBRWA
><< ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_CACHE)) |
>   (ARM_LPAE_MAIR_ATTR_DEVICE
> -  << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV));
> +  << ARM_LPAE_MAIR_ATTR_SHIFT(ARM_LPAE_MAIR_ATTR_IDX_DEV)) |
> + (ARM_LPAE_MAIR_ATTR_INC_OWBRWA
> +  << 
> 

Re: [PATCH v3 04/16] ioasid: Add custom IOASID allocator

2019-05-23 Thread Auger Eric
Hi Jacob,

On 5/22/19 9:42 PM, Jacob Pan wrote:
> On Tue, 21 May 2019 11:55:55 +0200
> Auger Eric  wrote:
> 
>> Hi Jacob,
>>
>> On 5/4/19 12:32 AM, Jacob Pan wrote:
>>> Sometimes, IOASID allocation must be handled by platform specific
>>> code. The use cases are guest vIOMMU and pvIOMMU where IOASIDs need
>>> to be allocated by the host via enlightened or paravirt interfaces.
>>>
>>> This patch adds an extension to the IOASID allocator APIs such that
>>> platform drivers can register a custom allocator, possibly at boot
>>> time, to take over the allocation. Xarray is still used for tracking
>>> and searching purposes internal to the IOASID code. Private data of
>>> an IOASID can also be set after the allocation.
>>>
>>> There can be multiple custom allocators registered but only one is
>>> used at a time. In case of hot removal of devices that provides the
>>> allocator, all IOASIDs must be freed prior to unregistering the
>>> allocator. Default XArray based allocator cannot be mixed with
>>> custom allocators, i.e. custom allocators will not be used if there
>>> are outstanding IOASIDs allocated by the default XA allocator.
>>>
>>> Signed-off-by: Jacob Pan 
>>> ---
>>>  drivers/iommu/ioasid.c | 125
>>> + 1 file changed,
>>> 125 insertions(+)
>>>
>>> diff --git a/drivers/iommu/ioasid.c b/drivers/iommu/ioasid.c
>>> index 99f5e0a..ed2915a 100644
>>> --- a/drivers/iommu/ioasid.c
>>> +++ b/drivers/iommu/ioasid.c
>>> @@ -17,6 +17,100 @@ struct ioasid_data {
>>>  };
>>>  
>>>  static DEFINE_XARRAY_ALLOC(ioasid_xa);
>>> +static DEFINE_MUTEX(ioasid_allocator_lock);
>>> +static struct ioasid_allocator *active_custom_allocator;
>>> +
>>> +static LIST_HEAD(custom_allocators);
>>> +/*
>>> + * A flag to track if ioasid default allocator is in use, this will
>>> + * prevent custom allocator from being used. The reason is that
>>> custom allocator
>>> + * must have unadulterated space to track private data with
>>> xarray, there cannot
>>> + * be a mix between default and custom allocated IOASIDs.
>>> + */
>>> +static int default_allocator_active;
>>> +
>>> +/**
>>> + * ioasid_register_allocator - register a custom allocator
>>> + * @allocator: the custom allocator to be registered
>>> + *
>>> + * Custom allocators take precedence over the default xarray based
>>> allocator.
>>> + * Private data associated with the ASID are managed by ASID
>>> common code
>>> + * similar to data stored in xa.
>>> + *
>>> + * There can be multiple allocators registered but only one is
>>> active. In case
>>> + * of runtime removal of a custom allocator, the next one is
>>> activated based
>>> + * on the registration ordering.
>>> + */
>>> +int ioasid_register_allocator(struct ioasid_allocator *allocator)
>>> +{
>>> +   struct ioasid_allocator *pallocator;
>>> +   int ret = 0;
>>> +
>>> +   if (!allocator)
>>> +   return -EINVAL;  
>> is it really necessary? Isn't it the caller's responsibility?
> makes sense. will remove this one and below.
>>> +
>>> +   mutex_lock(&ioasid_allocator_lock);
>>> +   /*
>>> +* No particular preference since all custom allocators
>>> end up calling
>>> +* the host to allocate IOASIDs. We activate the first one
>>> and keep
>>> +* the later registered allocators in a list in case the
>>> first one gets
>>> +* removed due to hotplug.
>>> +*/
>>> +   if (list_empty(&custom_allocators))
>>> +   active_custom_allocator = allocator;
>>> +   else {
>>> +   /* Check if the allocator is already registered */
>>> +   list_for_each_entry(pallocator,
>>> &custom_allocators, list) {
>>> +   if (pallocator == allocator) {
>>> +   pr_err("IOASID allocator already
>>> registered\n");
>>> +   ret = -EEXIST;
>>> +   goto out_unlock;
>>> +   }
>>> +   }
>>> +   }
>>> +   list_add_tail(&allocator->list, &custom_allocators);
>>> +
>>> +out_unlock:
>>> +   mutex_unlock(&ioasid_allocator_lock);
>>> +   return ret;
>>> +}
>>> +EXPORT_SYMBOL_GPL(ioasid_register_allocator);
>>> +
>>> +/**
>>> + * ioasid_unregister_allocator - Remove a custom IOASID allocator
>>> + * @allocator: the custom allocator to be removed
>>> + *
>>> + * Remove an allocator from the list, activate the next allocator
>>> in
>>> + * the order it was registered.
>>> + */
>>> +void ioasid_unregister_allocator(struct ioasid_allocator
>>> *allocator) +{
>>> +   if (!allocator)
>>> +   return;  
>> is it really necessary?
>>> +
>>> +   if (list_empty(&custom_allocators)) {
>>> +   pr_warn("No custom IOASID allocators active!\n");
>>> +   return;
>>> +   }
>>> +
>>> +   mutex_lock(&ioasid_allocator_lock);
>>> +   list_del(&allocator->list);
>>> +   if (list_empty(&custom_allocators)) {
>>> +   pr_info("No custom IOASID allocators\n");
>>> +   /*
>>> +* All IOASIDs should have been freed before the
>>> last custom
>>> +* allocator is unregistered. Unless default
>>> allocator is 

[PATCH 06/23] iommu/dma: Move domain lookup into __iommu_dma_{map, unmap}

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

Most of the callers don't care, and the couple that do already have the
domain to hand for other reasons are in slow paths where the (trivial)
overhead of a repeated lookup will be utterly immaterial.

Signed-off-by: Robin Murphy 
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 34 --
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c406abe3be01..c04450a4adec 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -448,9 +448,10 @@ static void iommu_dma_free_iova(struct iommu_dma_cookie 
*cookie,
size >> iova_shift(iovad));
 }
 
-static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr,
+static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr,
size_t size)
 {
+   struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = &cookie->iovad;
size_t iova_off = iova_offset(iovad, dma_addr);
@@ -465,8 +466,9 @@ static void __iommu_dma_unmap(struct iommu_domain *domain, 
dma_addr_t dma_addr,
 }
 
 static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
-   size_t size, int prot, struct iommu_domain *domain)
+   size_t size, int prot)
 {
+   struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
size_t iova_off = 0;
dma_addr_t iova;
@@ -565,7 +567,7 @@ static struct page **__iommu_dma_alloc_pages(struct device 
*dev,
 static void __iommu_dma_free(struct device *dev, struct page **pages,
size_t size, dma_addr_t *handle)
 {
-   __iommu_dma_unmap(iommu_get_dma_domain(dev), *handle, size);
+   __iommu_dma_unmap(dev, *handle, size);
__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
*handle = DMA_MAPPING_ERROR;
 }
@@ -718,14 +720,13 @@ static void iommu_dma_sync_sg_for_device(struct device 
*dev,
 static dma_addr_t __iommu_dma_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, int prot)
 {
-   return __iommu_dma_map(dev, page_to_phys(page) + offset, size, prot,
-   iommu_get_dma_domain(dev));
+   return __iommu_dma_map(dev, page_to_phys(page) + offset, size, prot);
 }
 
 static void __iommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-   __iommu_dma_unmap(iommu_get_dma_domain(dev), handle, size);
+   __iommu_dma_unmap(dev, handle, size);
 }
 
 static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
@@ -734,11 +735,10 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, 
struct page *page,
 {
phys_addr_t phys = page_to_phys(page) + offset;
bool coherent = dev_is_dma_coherent(dev);
+   int prot = dma_info_to_prot(dir, coherent, attrs);
dma_addr_t dma_handle;
 
-   dma_handle =__iommu_dma_map(dev, phys, size,
-   dma_info_to_prot(dir, coherent, attrs),
-   iommu_get_dma_domain(dev));
+   dma_handle =__iommu_dma_map(dev, phys, size, prot);
if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
dma_handle != DMA_MAPPING_ERROR)
arch_sync_dma_for_device(dev, phys, size, dir);
@@ -750,7 +750,7 @@ static void iommu_dma_unmap_page(struct device *dev, 
dma_addr_t dma_handle,
 {
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
iommu_dma_sync_single_for_cpu(dev, dma_handle, size, dir);
-   __iommu_dma_unmap(iommu_get_dma_domain(dev), dma_handle, size);
+   __iommu_dma_unmap(dev, dma_handle, size);
 }
 
 /*
@@ -931,21 +931,20 @@ static void iommu_dma_unmap_sg(struct device *dev, struct 
scatterlist *sg,
sg = tmp;
}
end = sg_dma_address(sg) + sg_dma_len(sg);
-   __iommu_dma_unmap(iommu_get_dma_domain(dev), start, end - start);
+   __iommu_dma_unmap(dev, start, end - start);
 }
 
 static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
return __iommu_dma_map(dev, phys, size,
-   dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO,
-   iommu_get_dma_domain(dev));
+   dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO);
 }
 
 static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-   __iommu_dma_unmap(iommu_get_dma_domain(dev), handle, size);
+   __iommu_dma_unmap(dev, handle, size);
 }
 
 static void *iommu_dma_alloc(struct device *dev, size_t size,
@@ -1205,9 +1204,8 @@ void 

[PATCH 20/23] iommu/dma: Don't depend on CONFIG_DMA_DIRECT_REMAP

2019-05-23 Thread Christoph Hellwig
For entirely dma coherent architectures there is no requirement to ever
remap dma coherent allocations.  Move all the remap and pool code under
IS_ENABLED() checks and drop the Kconfig dependency.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 drivers/iommu/Kconfig |  1 -
 drivers/iommu/dma-iommu.c | 16 +---
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index d6d063160dd6..83664db5221d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -97,7 +97,6 @@ config IOMMU_DMA
select IOMMU_IOVA
select IRQ_MSI_IOMMU
select NEED_SG_DMA_LENGTH
-   depends on DMA_DIRECT_REMAP
 
 config FSL_PAMU
bool "Freescale IOMMU support"
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index ea2797d10070..567c300d1926 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -927,10 +927,11 @@ static void __iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr)
struct page *page = NULL, **pages = NULL;
 
/* Non-coherent atomic allocation? Easy */
-   if (dma_free_from_pool(cpu_addr, alloc_size))
+   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+   dma_free_from_pool(cpu_addr, alloc_size))
return;
 
-   if (is_vmalloc_addr(cpu_addr)) {
+   if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
/*
 * If it the address is remapped, then it's either non-coherent
 * or highmem CMA, or an iommu_dma_alloc_remap() construction.
@@ -974,7 +975,7 @@ static void *iommu_dma_alloc_pages(struct device *dev, 
size_t size,
if (!page)
return NULL;
 
-   if (!coherent || PageHighMem(page)) {
+   if (IS_ENABLED(CONFIG_DMA_REMAP) && (!coherent || PageHighMem(page))) {
pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
 
cpu_addr = dma_common_contiguous_remap(page, alloc_size,
@@ -1007,11 +1008,12 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
 
gfp |= __GFP_ZERO;
 
-   if (gfpflags_allow_blocking(gfp) &&
+   if (IS_ENABLED(CONFIG_DMA_REMAP) && gfpflags_allow_blocking(gfp) &&
!(attrs & DMA_ATTR_FORCE_CONTIGUOUS))
return iommu_dma_alloc_remap(dev, size, handle, gfp, attrs);
 
-   if (!gfpflags_allow_blocking(gfp) && !coherent)
+   if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+   !gfpflags_allow_blocking(gfp) && !coherent)
cpu_addr = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp);
else
cpu_addr = iommu_dma_alloc_pages(dev, size, , gfp, attrs);
@@ -1044,7 +1046,7 @@ static int iommu_dma_mmap(struct device *dev, struct 
vm_area_struct *vma,
if (off >= nr_pages || vma_pages(vma) > nr_pages - off)
return -ENXIO;
 
-   if (is_vmalloc_addr(cpu_addr)) {
+   if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
struct page **pages = __iommu_dma_get_pages(cpu_addr);
 
if (pages)
@@ -1066,7 +1068,7 @@ static int iommu_dma_get_sgtable(struct device *dev, 
struct sg_table *sgt,
struct page *page;
int ret;
 
-   if (is_vmalloc_addr(cpu_addr)) {
+   if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
struct page **pages = __iommu_dma_get_pages(cpu_addr);
 
if (pages) {
-- 
2.20.1



[PATCH 21/23] iommu/dma: Switch copyright boilerplace to SPDX

2019-05-23 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Acked-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 13 +
 include/linux/dma-iommu.h | 13 +
 2 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 567c300d1926..0233195dd196 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * A fairly generic DMA-API to IOMMU-API glue layer.
  *
@@ -5,18 +6,6 @@
  *
  * based in part on arch/arm/mm/dma-mapping.c:
  * Copyright (C) 2000-2004 Russell King
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see .
  */
 
 #include 
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index b3cc3fb84079..05556f4d9cce 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -1,17 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2014-2015 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see .
  */
 #ifndef __DMA_IOMMU_H
 #define __DMA_IOMMU_H
-- 
2.20.1



[PATCH 19/23] iommu/dma: Refactor iommu_dma_mmap

2019-05-23 Thread Christoph Hellwig
Inline __iommu_dma_mmap and __iommu_dma_mmap_pfn into the main function,
and use the fact that __iommu_dma_get_pages returns NULL for remapped
contiguous allocations to simplify the code flow a bit.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 60 +++
 1 file changed, 11 insertions(+), 49 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index b56bd8e7d5f9..ea2797d10070 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -650,21 +650,6 @@ static void *iommu_dma_alloc_remap(struct device *dev, 
size_t size,
return NULL;
 }
 
-/**
- * __iommu_dma_mmap - Map a buffer into provided user VMA
- * @pages: Array representing buffer from __iommu_dma_alloc()
- * @size: Size of buffer in bytes
- * @vma: VMA describing requested userspace mapping
- *
- * Maps the pages of the buffer in @pages into @vma. The caller is responsible
- * for verifying the correct size and protection of @vma beforehand.
- */
-static int __iommu_dma_mmap(struct page **pages, size_t size,
-   struct vm_area_struct *vma)
-{
-   return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
-}
-
 static void iommu_dma_sync_single_for_cpu(struct device *dev,
dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
 {
@@ -1042,31 +1027,13 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
return cpu_addr;
 }
 
-static int __iommu_dma_mmap_pfn(struct vm_area_struct *vma,
- unsigned long pfn, size_t size)
-{
-   int ret = -ENXIO;
-   unsigned long nr_vma_pages = vma_pages(vma);
-   unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   unsigned long off = vma->vm_pgoff;
-
-   if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) {
-   ret = remap_pfn_range(vma, vma->vm_start,
- pfn + off,
- vma->vm_end - vma->vm_start,
- vma->vm_page_prot);
-   }
-
-   return ret;
-}
-
 static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
 {
unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
-   struct page **pages;
+   unsigned long pfn;
int ret;
 
vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
@@ -1077,24 +1044,19 @@ static int iommu_dma_mmap(struct device *dev, struct 
vm_area_struct *vma,
if (off >= nr_pages || vma_pages(vma) > nr_pages - off)
return -ENXIO;
 
-   if (!is_vmalloc_addr(cpu_addr)) {
-   unsigned long pfn = page_to_pfn(virt_to_page(cpu_addr));
-   return __iommu_dma_mmap_pfn(vma, pfn, size);
-   }
+   if (is_vmalloc_addr(cpu_addr)) {
+   struct page **pages = __iommu_dma_get_pages(cpu_addr);
 
-   if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-   /*
-* DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped,
-* hence in the vmalloc space.
-*/
-   unsigned long pfn = vmalloc_to_pfn(cpu_addr);
-   return __iommu_dma_mmap_pfn(vma, pfn, size);
+   if (pages)
+   return vm_map_pages(vma, pages, nr_pages);
+   pfn = vmalloc_to_pfn(cpu_addr);
+   } else {
+   pfn = page_to_pfn(virt_to_page(cpu_addr));
}
 
-   pages = __iommu_dma_get_pages(cpu_addr);
-   if (!pages)
-   return -ENXIO;
-   return __iommu_dma_mmap(pages, size, vma);
+   return remap_pfn_range(vma, vma->vm_start, pfn + off,
+  vma->vm_end - vma->vm_start,
+  vma->vm_page_prot);
 }
 
 static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
-- 
2.20.1
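
For reference, the mmap decision flow after this patch condenses to roughly the
following (a sketch assembled from the hunks above, not the verbatim file; the
bounds checking earlier in the function is unchanged):

    if (is_vmalloc_addr(cpu_addr)) {
        struct page **pages = __iommu_dma_get_pages(cpu_addr);

        /* Remapped non-contiguous buffer: map the page array */
        if (pages)
            return vm_map_pages(vma, pages, nr_pages);

        /* Remapped contiguous buffer: fall through to the common PFN path */
        pfn = vmalloc_to_pfn(cpu_addr);
    } else {
        /* Lowmem linear-map buffer */
        pfn = page_to_pfn(virt_to_page(cpu_addr));
    }

    return remap_pfn_range(vma, vma->vm_start, pfn + off,
                           vma->vm_end - vma->vm_start,
                           vma->vm_page_prot);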



[PATCH 22/23] arm64: switch copyright boilerplate to SPDX in dma-mapping.c

2019-05-23 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig 
Acked-by: Robin Murphy 
Reviewed-by: Mukesh Ojha 
Acked-by: Catalin Marinas 
---
 arch/arm64/mm/dma-mapping.c | 15 +--
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index d1661f78eb4d..184ef9ccd69d 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
- * SWIOTLB-based DMA API implementation
- *
  * Copyright (C) 2012 ARM Ltd.
  * Author: Catalin Marinas 
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see .
  */
 
 #include 
-- 
2.20.1



[PATCH 08/23] iommu/dma: Factor out remapped pages lookup

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

Since we duplicate the find_vm_area() logic a few times in places where
we only care about the pages, factor out a helper to abstract it.

Signed-off-by: Robin Murphy 
[hch: don't warn when not finding a region, as we'll rely on that later]
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 32 
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 4596e4860da8..e870ea59a34d 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -554,6 +554,15 @@ static struct page **__iommu_dma_alloc_pages(struct device 
*dev,
return pages;
 }
 
+static struct page **__iommu_dma_get_pages(void *cpu_addr)
+{
+   struct vm_struct *area = find_vm_area(cpu_addr);
+
+   if (!area || !area->pages)
+   return NULL;
+   return area->pages;
+}
+
 /**
  * iommu_dma_free - Free a buffer allocated by __iommu_dma_alloc()
  * @dev: Device which owns this buffer
@@ -1042,11 +1051,11 @@ static void iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr,
dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
dma_common_free_remap(cpu_addr, size, VM_USERMAP);
} else if (is_vmalloc_addr(cpu_addr)){
-   struct vm_struct *area = find_vm_area(cpu_addr);
+   struct page **pages = __iommu_dma_get_pages(cpu_addr);
 
-   if (WARN_ON(!area || !area->pages))
+   if (!pages)
return;
-   __iommu_dma_free(dev, area->pages, iosize, );
+   __iommu_dma_free(dev, pages, iosize, );
dma_common_free_remap(cpu_addr, size, VM_USERMAP);
} else {
__iommu_dma_unmap(dev, handle, iosize);
@@ -1078,7 +1087,7 @@ static int iommu_dma_mmap(struct device *dev, struct 
vm_area_struct *vma,
 {
unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
-   struct vm_struct *area;
+   struct page **pages;
int ret;
 
vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
@@ -1103,11 +1112,10 @@ static int iommu_dma_mmap(struct device *dev, struct 
vm_area_struct *vma,
return __iommu_dma_mmap_pfn(vma, pfn, size);
}
 
-   area = find_vm_area(cpu_addr);
-   if (WARN_ON(!area || !area->pages))
+   pages = __iommu_dma_get_pages(cpu_addr);
+   if (!pages)
return -ENXIO;
-
-   return __iommu_dma_mmap(area->pages, size, vma);
+   return __iommu_dma_mmap(pages, size, vma);
 }
 
 static int __iommu_dma_get_sgtable_page(struct sg_table *sgt, struct page 
*page,
@@ -1125,7 +1133,7 @@ static int iommu_dma_get_sgtable(struct device *dev, 
struct sg_table *sgt,
unsigned long attrs)
 {
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   struct vm_struct *area = find_vm_area(cpu_addr);
+   struct page **pages;
 
if (!is_vmalloc_addr(cpu_addr)) {
struct page *page = virt_to_page(cpu_addr);
@@ -1141,10 +1149,10 @@ static int iommu_dma_get_sgtable(struct device *dev, 
struct sg_table *sgt,
return __iommu_dma_get_sgtable_page(sgt, page, size);
}
 
-   if (WARN_ON(!area || !area->pages))
+   pages = __iommu_dma_get_pages(cpu_addr);
+   if (!pages)
return -ENXIO;
-
-   return sg_alloc_table_from_pages(sgt, area->pages, count, 0, size,
+   return sg_alloc_table_from_pages(sgt, pages, count, 0, size,
 GFP_KERNEL);
 }
 
-- 
2.20.1
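
To make the change concrete, each call site moves along these lines (a
schematic sketch, not verbatim kernel code; use_pages() is a hypothetical
placeholder for whatever the caller does with the page array):

    /* before: open-coded lookup, warning on failure */
    struct vm_struct *area = find_vm_area(cpu_addr);

    if (WARN_ON(!area || !area->pages))
        return -ENXIO;
    use_pages(area->pages);

    /* after: shared helper; each caller decides how to handle NULL */
    struct page **pages = __iommu_dma_get_pages(cpu_addr);

    if (!pages)
        return -ENXIO;
    use_pages(pages);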



[PATCH 16/23] iommu/dma: Cleanup variable naming in iommu_dma_alloc

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

Most importantly, clear up the size / iosize confusion.  Also rename addr
to cpu_addr to match the surrounding code and make the intent a little
clearer.

Signed-off-by: Robin Murphy 
[hch: split from a larger patch]
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 45 +++
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 33d1ce8cc640..9ac76d286df1 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -977,64 +977,63 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
 {
bool coherent = dev_is_dma_coherent(dev);
int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
-   size_t iosize = size;
+   size_t alloc_size = PAGE_ALIGN(size);
struct page *page = NULL;
-   void *addr;
+   void *cpu_addr;
 
-   size = PAGE_ALIGN(size);
gfp |= __GFP_ZERO;
 
if (gfpflags_allow_blocking(gfp) &&
!(attrs & DMA_ATTR_FORCE_CONTIGUOUS))
-   return iommu_dma_alloc_remap(dev, iosize, handle, gfp, attrs);
+   return iommu_dma_alloc_remap(dev, size, handle, gfp, attrs);
 
if (!gfpflags_allow_blocking(gfp) && !coherent) {
-   addr = dma_alloc_from_pool(size, , gfp);
-   if (!addr)
+   cpu_addr = dma_alloc_from_pool(alloc_size, , gfp);
+   if (!cpu_addr)
return NULL;
 
-   *handle = __iommu_dma_map(dev, page_to_phys(page), iosize,
+   *handle = __iommu_dma_map(dev, page_to_phys(page), size,
  ioprot);
if (*handle == DMA_MAPPING_ERROR) {
-   dma_free_from_pool(addr, size);
+   dma_free_from_pool(cpu_addr, alloc_size);
return NULL;
}
-   return addr;
+   return cpu_addr;
}
 
if (gfpflags_allow_blocking(gfp))
-   page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
-get_order(size),
+   page = dma_alloc_from_contiguous(dev, alloc_size >> PAGE_SHIFT,
+get_order(alloc_size),
 gfp & __GFP_NOWARN);
if (!page)
-   page = alloc_pages(gfp, get_order(size));
+   page = alloc_pages(gfp, get_order(alloc_size));
if (!page)
return NULL;
 
-   *handle = __iommu_dma_map(dev, page_to_phys(page), iosize, ioprot);
+   *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot);
if (*handle == DMA_MAPPING_ERROR)
goto out_free_pages;
 
if (!coherent || PageHighMem(page)) {
pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
 
-   addr = dma_common_contiguous_remap(page, size, VM_USERMAP, prot,
-   __builtin_return_address(0));
-   if (!addr)
+   cpu_addr = dma_common_contiguous_remap(page, alloc_size,
+   VM_USERMAP, prot, __builtin_return_address(0));
+   if (!cpu_addr)
goto out_unmap;
 
if (!coherent)
-   arch_dma_prep_coherent(page, iosize);
+   arch_dma_prep_coherent(page, size);
} else {
-   addr = page_address(page);
+   cpu_addr = page_address(page);
}
-   memset(addr, 0, size);
-   return addr;
+   memset(cpu_addr, 0, alloc_size);
+   return cpu_addr;
 out_unmap:
-   __iommu_dma_unmap(dev, *handle, iosize);
+   __iommu_dma_unmap(dev, *handle, size);
 out_free_pages:
-   if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
-   __free_pages(page, get_order(size));
+   if (!dma_release_from_contiguous(dev, page, alloc_size >> PAGE_SHIFT))
+   __free_pages(page, get_order(alloc_size));
return NULL;
 }
 
-- 
2.20.1



[PATCH 15/23] iommu/dma: Split iommu_dma_free

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

Most of it can double up to serve the failure cleanup path for
iommu_dma_alloc().

Signed-off-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 6b8cedae7cff..33d1ce8cc640 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -935,15 +935,12 @@ static void iommu_dma_unmap_resource(struct device *dev, 
dma_addr_t handle,
__iommu_dma_unmap(dev, handle, size);
 }
 
-static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
-   dma_addr_t handle, unsigned long attrs)
+static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
 {
size_t alloc_size = PAGE_ALIGN(size);
int count = alloc_size >> PAGE_SHIFT;
struct page *page = NULL, **pages = NULL;
 
-   __iommu_dma_unmap(dev, handle, size);
-
/* Non-coherent atomic allocation? Easy */
if (dma_free_from_pool(cpu_addr, alloc_size))
return;
@@ -968,6 +965,13 @@ static void iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr,
__free_pages(page, get_order(alloc_size));
 }
 
+static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
+   dma_addr_t handle, unsigned long attrs)
+{
+   __iommu_dma_unmap(dev, handle, size);
+   __iommu_dma_free(dev, size, cpu_addr);
+}
+
 static void *iommu_dma_alloc(struct device *dev, size_t size,
dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
 {
-- 
2.20.1



[PATCH 23/23] arm64: trim includes in dma-mapping.c

2019-05-23 Thread Christoph Hellwig
With most of the previous functionality now elsewhere, a lot of the
headers included in this file are no longer needed.

Signed-off-by: Christoph Hellwig 
Acked-by: Catalin Marinas 
---
 arch/arm64/mm/dma-mapping.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 184ef9ccd69d..1669618db08a 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -5,19 +5,9 @@
  */
 
 #include 
-#include 
-#include 
 #include 
-#include 
-#include 
-#include 
-#include 
 #include 
-#include 
 #include 
-#include 
-#include 
-#include 
 
 #include 
 
-- 
2.20.1



[PATCH 18/23] iommu/dma: Refactor iommu_dma_get_sgtable

2019-05-23 Thread Christoph Hellwig
Inline __iommu_dma_get_sgtable_page into the main function, and use the
fact that __iommu_dma_get_pages returns NULL for remapped contiguous
allocations to simplify the code flow a bit.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 45 +++
 1 file changed, 17 insertions(+), 28 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9f0aa80f2bdd..b56bd8e7d5f9 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1097,42 +1097,31 @@ static int iommu_dma_mmap(struct device *dev, struct 
vm_area_struct *vma,
return __iommu_dma_mmap(pages, size, vma);
 }
 
-static int __iommu_dma_get_sgtable_page(struct sg_table *sgt, struct page 
*page,
-   size_t size)
-{
-   int ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
-
-   if (!ret)
-   sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
-   return ret;
-}
-
 static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
 {
-   unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   struct page **pages;
+   struct page *page;
+   int ret;
 
-   if (!is_vmalloc_addr(cpu_addr)) {
-   struct page *page = virt_to_page(cpu_addr);
-   return __iommu_dma_get_sgtable_page(sgt, page, size);
-   }
+   if (is_vmalloc_addr(cpu_addr)) {
+   struct page **pages = __iommu_dma_get_pages(cpu_addr);
 
-   if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-   /*
-* DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped,
-* hence in the vmalloc space.
-*/
-   struct page *page = vmalloc_to_page(cpu_addr);
-   return __iommu_dma_get_sgtable_page(sgt, page, size);
+   if (pages) {
+   return sg_alloc_table_from_pages(sgt, pages,
+   PAGE_ALIGN(size) >> PAGE_SHIFT,
+   0, size, GFP_KERNEL);
+   }
+
+   page = vmalloc_to_page(cpu_addr);
+   } else {
+   page = virt_to_page(cpu_addr);
}
 
-   pages = __iommu_dma_get_pages(cpu_addr);
-   if (!pages)
-   return -ENXIO;
-   return sg_alloc_table_from_pages(sgt, pages, count, 0, size,
-GFP_KERNEL);
+   ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
+   if (!ret)
+   sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
+   return ret;
 }
 
 static const struct dma_map_ops iommu_dma_ops = {
-- 
2.20.1
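
Condensed from the hunks above, the resulting decision flow reads roughly as
follows (a sketch, not the verbatim file):

    if (is_vmalloc_addr(cpu_addr)) {
        struct page **pages = __iommu_dma_get_pages(cpu_addr);

        /* Remapped non-contiguous buffer: one sg entry per page */
        if (pages)
            return sg_alloc_table_from_pages(sgt, pages,
                    PAGE_ALIGN(size) >> PAGE_SHIFT,
                    0, size, GFP_KERNEL);

        /* Remapped contiguous buffer */
        page = vmalloc_to_page(cpu_addr);
    } else {
        /* Lowmem linear-map buffer */
        page = virt_to_page(cpu_addr);
    }

    /* Physically contiguous buffer: a single scatterlist entry suffices */
    ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
    if (!ret)
        sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
    return ret;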



[PATCH 09/23] iommu/dma: Refactor the page array remapping allocator

2019-05-23 Thread Christoph Hellwig
Move the call to dma_common_pages_remap into __iommu_dma_alloc and
rename it to iommu_dma_alloc_remap.  This creates a self-contained
helper for remapped pages allocation and mapping.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 54 +++
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index e870ea59a34d..0ccc25fd5c86 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -564,9 +564,9 @@ static struct page **__iommu_dma_get_pages(void *cpu_addr)
 }
 
 /**
- * iommu_dma_free - Free a buffer allocated by __iommu_dma_alloc()
+ * iommu_dma_free - Free a buffer allocated by iommu_dma_alloc_remap()
  * @dev: Device which owns this buffer
- * @pages: Array of buffer pages as returned by __iommu_dma_alloc()
+ * @pages: Array of buffer pages as returned by __iommu_dma_alloc_remap()
  * @size: Size of buffer in bytes
  * @handle: DMA address of buffer
  *
@@ -582,33 +582,35 @@ static void __iommu_dma_free(struct device *dev, struct 
page **pages,
 }
 
 /**
- * __iommu_dma_alloc - Allocate and map a buffer contiguous in IOVA space
+ * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
  * @dev: Device to allocate memory for. Must be a real device
  *  attached to an iommu_dma_domain
  * @size: Size of buffer in bytes
+ * @dma_handle: Out argument for allocated DMA handle
  * @gfp: Allocation flags
  * @attrs: DMA attributes for this allocation
- * @prot: IOMMU mapping flags
- * @handle: Out argument for allocated DMA handle
  *
  * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
  * but an IOMMU which supports smaller pages might not map the whole thing.
  *
- * Return: Array of struct page pointers describing the buffer,
- *or NULL on failure.
+ * Return: Mapped virtual address, or NULL on failure.
  */
-static struct page **__iommu_dma_alloc(struct device *dev, size_t size,
-   gfp_t gfp, unsigned long attrs, int prot, dma_addr_t *handle)
+static void *iommu_dma_alloc_remap(struct device *dev, size_t size,
+   dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
 {
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
struct iova_domain *iovad = >iovad;
+   bool coherent = dev_is_dma_coherent(dev);
+   int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
+   pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
+   unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
struct page **pages;
struct sg_table sgt;
dma_addr_t iova;
-   unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap;
+   void *vaddr;
 
-   *handle = DMA_MAPPING_ERROR;
+   *dma_handle = DMA_MAPPING_ERROR;
 
min_size = alloc_sizes & -alloc_sizes;
if (min_size < PAGE_SIZE) {
@@ -634,7 +636,7 @@ static struct page **__iommu_dma_alloc(struct device *dev, 
size_t size,
if (sg_alloc_table_from_pages(, pages, count, 0, size, GFP_KERNEL))
goto out_free_iova;
 
-   if (!(prot & IOMMU_CACHE)) {
+   if (!(ioprot & IOMMU_CACHE)) {
struct scatterlist *sg;
int i;
 
@@ -642,14 +644,21 @@ static struct page **__iommu_dma_alloc(struct device 
*dev, size_t size,
arch_dma_prep_coherent(sg_page(sg), sg->length);
}
 
-   if (iommu_map_sg(domain, iova, sgt.sgl, sgt.orig_nents, prot)
+   if (iommu_map_sg(domain, iova, sgt.sgl, sgt.orig_nents, ioprot)
< size)
goto out_free_sg;
 
-   *handle = iova;
+   vaddr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
+   __builtin_return_address(0));
+   if (!vaddr)
+   goto out_unmap;
+
+   *dma_handle = iova;
sg_free_table();
-   return pages;
+   return vaddr;
 
+out_unmap:
+   __iommu_dma_unmap(dev, iova, size);
 out_free_sg:
sg_free_table();
 out_free_iova:
@@ -1008,18 +1017,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
size >> PAGE_SHIFT);
}
} else {
-   pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
-   struct page **pages;
-
-   pages = __iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
-   handle);
-   if (!pages)
-   return NULL;
-
-   addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
- __builtin_return_address(0));
-   if (!addr)
-   __iommu_dma_free(dev, pages, iosize, handle);
+   addr = iommu_dma_alloc_remap(dev, 

[PATCH 10/23] iommu/dma: Remove __iommu_dma_free

2019-05-23 Thread Christoph Hellwig
We only have a single caller of this function left, so open code it there.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 21 ++---
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 0ccc25fd5c86..191c0a4c8e31 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -563,24 +563,6 @@ static struct page **__iommu_dma_get_pages(void *cpu_addr)
return area->pages;
 }
 
-/**
- * iommu_dma_free - Free a buffer allocated by iommu_dma_alloc_remap()
- * @dev: Device which owns this buffer
- * @pages: Array of buffer pages as returned by __iommu_dma_alloc_remap()
- * @size: Size of buffer in bytes
- * @handle: DMA address of buffer
- *
- * Frees both the pages associated with the buffer, and the array
- * describing them
- */
-static void __iommu_dma_free(struct device *dev, struct page **pages,
-   size_t size, dma_addr_t *handle)
-{
-   __iommu_dma_unmap(dev, *handle, size);
-   __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
-   *handle = DMA_MAPPING_ERROR;
-}
-
 /**
  * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space
  * @dev: Device to allocate memory for. Must be a real device
@@ -1053,7 +1035,8 @@ static void iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr,
 
if (!pages)
return;
-   __iommu_dma_free(dev, pages, iosize, );
+   __iommu_dma_unmap(dev, handle, iosize);
+   __iommu_dma_free_pages(pages, size >> PAGE_SHIFT);
dma_common_free_remap(cpu_addr, size, VM_USERMAP);
} else {
__iommu_dma_unmap(dev, handle, iosize);
-- 
2.20.1



[PATCH 12/23] iommu/dma: Refactor iommu_dma_alloc

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

Shuffle around the self-contained atomic and non-contiguous cases to
return early and get out of the way of the CMA case that we're about to
work on next.

Signed-off-by: Robin Murphy 
[hch: slight changes to the code flow]
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 60 +++
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f61e3f8861a8..41e87756c076 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -973,14 +973,19 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
 {
bool coherent = dev_is_dma_coherent(dev);
int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
+   pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
size_t iosize = size;
+   struct page *page;
void *addr;
 
size = PAGE_ALIGN(size);
gfp |= __GFP_ZERO;
 
+   if (gfpflags_allow_blocking(gfp) &&
+   !(attrs & DMA_ATTR_FORCE_CONTIGUOUS))
+   return iommu_dma_alloc_remap(dev, iosize, handle, gfp, attrs);
+
if (!gfpflags_allow_blocking(gfp)) {
-   struct page *page;
/*
 * In atomic context we can't remap anything, so we'll only
 * get the virtually contiguous buffer we need by way of a
@@ -1002,39 +1007,34 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
__free_pages(page, get_order(size));
else
dma_free_from_pool(addr, size);
-   addr = NULL;
-   }
-   } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-   pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
-   struct page *page;
-
-   page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
-   get_order(size), gfp & __GFP_NOWARN);
-   if (!page)
return NULL;
-
-   *handle = __iommu_dma_map(dev, page_to_phys(page), iosize, 
ioprot);
-   if (*handle == DMA_MAPPING_ERROR) {
-   dma_release_from_contiguous(dev, page,
-   size >> PAGE_SHIFT);
-   return NULL;
-   }
-   addr = dma_common_contiguous_remap(page, size, VM_USERMAP,
-  prot,
-  __builtin_return_address(0));
-   if (addr) {
-   if (!coherent)
-   arch_dma_prep_coherent(page, iosize);
-   memset(addr, 0, size);
-   } else {
-   __iommu_dma_unmap(dev, *handle, iosize);
-   dma_release_from_contiguous(dev, page,
-   size >> PAGE_SHIFT);
}
-   } else {
-   addr = iommu_dma_alloc_remap(dev, iosize, handle, gfp, attrs);
+   return addr;
}
+
+   page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
+get_order(size), gfp & __GFP_NOWARN);
+   if (!page)
+   return NULL;
+
+   *handle = __iommu_dma_map(dev, page_to_phys(page), iosize, ioprot);
+   if (*handle == DMA_MAPPING_ERROR)
+   goto out_free_pages;
+
+   addr = dma_common_contiguous_remap(page, size, VM_USERMAP, prot,
+   __builtin_return_address(0));
+   if (!addr)
+   goto out_unmap;
+
+   if (!coherent)
+   arch_dma_prep_coherent(page, iosize);
+   memset(addr, 0, size);
return addr;
+out_unmap:
+   __iommu_dma_unmap(dev, *handle, iosize);
+out_free_pages:
+   dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
+   return NULL;
 }
 
 static int __iommu_dma_mmap_pfn(struct vm_area_struct *vma,
-- 
2.20.1



[PATCH 11/23] iommu/dma: Refactor iommu_dma_free

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

The freeing logic was made particularly horrible by part of it being
opaque to the arch wrapper, which led to a lot of convoluted repetition
to ensure each path did everything in the right order. Now that it's
all private, we can pick apart and consolidate the logically-distinct
steps of freeing the IOMMU mapping, the underlying pages, and the CPU
remap (if necessary) into something much more manageable.

Signed-off-by: Robin Murphy 
[various cosmetic changes to the code flow]
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 73 ++-
 1 file changed, 33 insertions(+), 40 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 191c0a4c8e31..f61e3f8861a8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -935,6 +935,39 @@ static void iommu_dma_unmap_resource(struct device *dev, 
dma_addr_t handle,
__iommu_dma_unmap(dev, handle, size);
 }
 
+static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
+   dma_addr_t handle, unsigned long attrs)
+{
+   size_t alloc_size = PAGE_ALIGN(size);
+   int count = alloc_size >> PAGE_SHIFT;
+   struct page *page = NULL, **pages = NULL;
+
+   __iommu_dma_unmap(dev, handle, size);
+
+   /* Non-coherent atomic allocation? Easy */
+   if (dma_free_from_pool(cpu_addr, alloc_size))
+   return;
+
+   if (is_vmalloc_addr(cpu_addr)) {
+   /*
+* If the address is remapped, then it's either non-coherent
+* or highmem CMA, or an iommu_dma_alloc_remap() construction.
+*/
+   pages = __iommu_dma_get_pages(cpu_addr);
+   if (!pages)
+   page = vmalloc_to_page(cpu_addr);
+   dma_common_free_remap(cpu_addr, alloc_size, VM_USERMAP);
+   } else {
+   /* Lowmem means a coherent atomic or CMA allocation */
+   page = virt_to_page(cpu_addr);
+   }
+
+   if (pages)
+   __iommu_dma_free_pages(pages, count);
+   if (page && !dma_release_from_contiguous(dev, page, count))
+   __free_pages(page, get_order(alloc_size));
+}
+
 static void *iommu_dma_alloc(struct device *dev, size_t size,
dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
 {
@@ -1004,46 +1037,6 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
return addr;
 }
 
-static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr,
-   dma_addr_t handle, unsigned long attrs)
-{
-   size_t iosize = size;
-
-   size = PAGE_ALIGN(size);
-   /*
-* @cpu_addr will be one of 4 things depending on how it was allocated:
-* - A remapped array of pages for contiguous allocations.
-* - A remapped array of pages from iommu_dma_alloc_remap(), for all
-*   non-atomic allocations.
-* - A non-cacheable alias from the atomic pool, for atomic
-*   allocations by non-coherent devices.
-* - A normal lowmem address, for atomic allocations by
-*   coherent devices.
-* Hence how dodgy the below logic looks...
-*/
-   if (dma_in_atomic_pool(cpu_addr, size)) {
-   __iommu_dma_unmap(dev, handle, iosize);
-   dma_free_from_pool(cpu_addr, size);
-   } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-   struct page *page = vmalloc_to_page(cpu_addr);
-
-   __iommu_dma_unmap(dev, handle, iosize);
-   dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
-   dma_common_free_remap(cpu_addr, size, VM_USERMAP);
-   } else if (is_vmalloc_addr(cpu_addr)){
-   struct page **pages = __iommu_dma_get_pages(cpu_addr);
-
-   if (!pages)
-   return;
-   __iommu_dma_unmap(dev, handle, iosize);
-   __iommu_dma_free_pages(pages, size >> PAGE_SHIFT);
-   dma_common_free_remap(cpu_addr, size, VM_USERMAP);
-   } else {
-   __iommu_dma_unmap(dev, handle, iosize);
-   __free_pages(virt_to_page(cpu_addr), get_order(size));
-   }
-}
-
 static int __iommu_dma_mmap_pfn(struct vm_area_struct *vma,
  unsigned long pfn, size_t size)
 {
-- 
2.20.1



[PATCH 17/23] iommu/dma: Refactor iommu_dma_alloc, part 2

2019-05-23 Thread Christoph Hellwig
All the logic in iommu_dma_alloc that deals with page allocation from
the CMA or page allocators can be split into a self-contained helper,
and we can then map the result of that or the atomic pool allocation
with the iommu later.  This also allows reusing __iommu_dma_free to
tear down the allocations and MMU mappings when the IOMMU mapping
fails.

Based on a patch from Robin Murphy.

Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 65 +--
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 9ac76d286df1..9f0aa80f2bdd 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -972,35 +972,14 @@ static void iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr,
__iommu_dma_free(dev, size, cpu_addr);
 }
 
-static void *iommu_dma_alloc(struct device *dev, size_t size,
-   dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
+static void *iommu_dma_alloc_pages(struct device *dev, size_t size,
+   struct page **pagep, gfp_t gfp, unsigned long attrs)
 {
bool coherent = dev_is_dma_coherent(dev);
-   int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
size_t alloc_size = PAGE_ALIGN(size);
struct page *page = NULL;
void *cpu_addr;
 
-   gfp |= __GFP_ZERO;
-
-   if (gfpflags_allow_blocking(gfp) &&
-   !(attrs & DMA_ATTR_FORCE_CONTIGUOUS))
-   return iommu_dma_alloc_remap(dev, size, handle, gfp, attrs);
-
-   if (!gfpflags_allow_blocking(gfp) && !coherent) {
-   cpu_addr = dma_alloc_from_pool(alloc_size, , gfp);
-   if (!cpu_addr)
-   return NULL;
-
-   *handle = __iommu_dma_map(dev, page_to_phys(page), size,
- ioprot);
-   if (*handle == DMA_MAPPING_ERROR) {
-   dma_free_from_pool(cpu_addr, alloc_size);
-   return NULL;
-   }
-   return cpu_addr;
-   }
-
if (gfpflags_allow_blocking(gfp))
page = dma_alloc_from_contiguous(dev, alloc_size >> PAGE_SHIFT,
 get_order(alloc_size),
@@ -1010,33 +989,59 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
if (!page)
return NULL;
 
-   *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot);
-   if (*handle == DMA_MAPPING_ERROR)
-   goto out_free_pages;
-
if (!coherent || PageHighMem(page)) {
pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
 
cpu_addr = dma_common_contiguous_remap(page, alloc_size,
VM_USERMAP, prot, __builtin_return_address(0));
if (!cpu_addr)
-   goto out_unmap;
+   goto out_free_pages;
 
if (!coherent)
arch_dma_prep_coherent(page, size);
} else {
cpu_addr = page_address(page);
}
+
+   *pagep = page;
memset(cpu_addr, 0, alloc_size);
return cpu_addr;
-out_unmap:
-   __iommu_dma_unmap(dev, *handle, size);
 out_free_pages:
if (!dma_release_from_contiguous(dev, page, alloc_size >> PAGE_SHIFT))
__free_pages(page, get_order(alloc_size));
return NULL;
 }
 
+static void *iommu_dma_alloc(struct device *dev, size_t size,
+   dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
+{
+   bool coherent = dev_is_dma_coherent(dev);
+   int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
+   struct page *page = NULL;
+   void *cpu_addr;
+
+   gfp |= __GFP_ZERO;
+
+   if (gfpflags_allow_blocking(gfp) &&
+   !(attrs & DMA_ATTR_FORCE_CONTIGUOUS))
+   return iommu_dma_alloc_remap(dev, size, handle, gfp, attrs);
+
+   if (!gfpflags_allow_blocking(gfp) && !coherent)
+   cpu_addr = dma_alloc_from_pool(PAGE_ALIGN(size), , gfp);
+   else
+   cpu_addr = iommu_dma_alloc_pages(dev, size, , gfp, attrs);
+   if (!cpu_addr)
+   return NULL;
+
+   *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot);
+   if (*handle == DMA_MAPPING_ERROR) {
+   __iommu_dma_free(dev, size, cpu_addr);
+   return NULL;
+   }
+
+   return cpu_addr;
+}
+
 static int __iommu_dma_mmap_pfn(struct vm_area_struct *vma,
  unsigned long pfn, size_t size)
 {
-- 
2.20.1



[PATCH 13/23] iommu/dma: Don't remap CMA unnecessarily

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

Always remapping CMA allocations was largely a bodge to keep the freeing
logic manageable when it was split between here and an arch wrapper. Now
that it's all together and streamlined, we can relax that limitation.

Signed-off-by: Robin Murphy 
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 19 ---
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 41e87756c076..3629bc2f59ee 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -973,7 +973,6 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
 {
bool coherent = dev_is_dma_coherent(dev);
int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
-   pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
size_t iosize = size;
struct page *page;
void *addr;
@@ -1021,13 +1020,19 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
if (*handle == DMA_MAPPING_ERROR)
goto out_free_pages;
 
-   addr = dma_common_contiguous_remap(page, size, VM_USERMAP, prot,
-   __builtin_return_address(0));
-   if (!addr)
-   goto out_unmap;
+   if (!coherent || PageHighMem(page)) {
+   pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
 
-   if (!coherent)
-   arch_dma_prep_coherent(page, iosize);
+   addr = dma_common_contiguous_remap(page, size, VM_USERMAP, prot,
+   __builtin_return_address(0));
+   if (!addr)
+   goto out_unmap;
+
+   if (!coherent)
+   arch_dma_prep_coherent(page, iosize);
+   } else {
+   addr = page_address(page);
+   }
memset(addr, 0, size);
return addr;
 out_unmap:
-- 
2.20.1
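
In other words, the remap is now done only when it is actually needed, either
because the device requires a non-cacheable alias or because the page may lack
a kernel linear mapping. A condensed sketch of the hunk above, with the
reasoning spelled out in comments (not the verbatim file):

    if (!coherent || PageHighMem(page)) {
        pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);

        /* Non-cacheable alias and/or highmem page: remap in vmalloc space */
        addr = dma_common_contiguous_remap(page, size, VM_USERMAP, prot,
                __builtin_return_address(0));
        if (!addr)
            goto out_unmap;
        if (!coherent)
            arch_dma_prep_coherent(page, iosize);
    } else {
        /* Coherent lowmem CMA: the linear-map address is fine as-is */
        addr = page_address(page);
    }
    memset(addr, 0, size);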



[PATCH 14/23] iommu/dma: Merge the CMA and alloc_pages allocation paths

2019-05-23 Thread Christoph Hellwig
Instead of having separate code paths for the non-blocking alloc_pages
and CMA allocations, merge them into one.  There is a slight behavior
change here in that we try the page allocator if CMA fails.
This matches what dma-direct and other iommu drivers do and will be
needed to use the dma-iommu code on architectures without DMA remapping
later on.

Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 32 
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 3629bc2f59ee..6b8cedae7cff 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -974,7 +974,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
bool coherent = dev_is_dma_coherent(dev);
int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
size_t iosize = size;
-   struct page *page;
+   struct page *page = NULL;
void *addr;
 
size = PAGE_ALIGN(size);
@@ -984,35 +984,26 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
!(attrs & DMA_ATTR_FORCE_CONTIGUOUS))
return iommu_dma_alloc_remap(dev, iosize, handle, gfp, attrs);
 
-   if (!gfpflags_allow_blocking(gfp)) {
-   /*
-* In atomic context we can't remap anything, so we'll only
-* get the virtually contiguous buffer we need by way of a
-* physically contiguous allocation.
-*/
-   if (coherent) {
-   page = alloc_pages(gfp, get_order(size));
-   addr = page ? page_address(page) : NULL;
-   } else {
-   addr = dma_alloc_from_pool(size, , gfp);
-   }
+   if (!gfpflags_allow_blocking(gfp) && !coherent) {
+   addr = dma_alloc_from_pool(size, , gfp);
if (!addr)
return NULL;
 
*handle = __iommu_dma_map(dev, page_to_phys(page), iosize,
  ioprot);
if (*handle == DMA_MAPPING_ERROR) {
-   if (coherent)
-   __free_pages(page, get_order(size));
-   else
-   dma_free_from_pool(addr, size);
+   dma_free_from_pool(addr, size);
return NULL;
}
return addr;
}
 
-   page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
-get_order(size), gfp & __GFP_NOWARN);
+   if (gfpflags_allow_blocking(gfp))
+   page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
+get_order(size),
+gfp & __GFP_NOWARN);
+   if (!page)
+   page = alloc_pages(gfp, get_order(size));
if (!page)
return NULL;
 
@@ -1038,7 +1029,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
 out_unmap:
__iommu_dma_unmap(dev, *handle, iosize);
 out_free_pages:
-   dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
+   if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
+   __free_pages(page, get_order(size));
return NULL;
 }
 
-- 
2.20.1
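
The merged path follows the usual CMA-with-fallback idiom; condensed from the
hunks above, the allocation and the matching error/free handling look roughly
like this (a sketch, not the verbatim file):

    struct page *page = NULL;

    /* CMA may only be used from a sleepable context */
    if (gfpflags_allow_blocking(gfp))
        page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
                                         get_order(size),
                                         gfp & __GFP_NOWARN);
    /* Fall back to the page allocator if CMA is unavailable or fails */
    if (!page)
        page = alloc_pages(gfp, get_order(size));

    /* ... and when unwinding, only __free_pages() what did not come from CMA */
    if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
        __free_pages(page, get_order(size));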



[PATCH 07/23] iommu/dma: Squash __iommu_dma_{map,unmap}_page helpers

2019-05-23 Thread Christoph Hellwig
From: Robin Murphy 

The remaining internal callsites don't care about having prototypes
compatible with the relevant dma_map_ops callbacks, so the extra
level of indirection just wastes space and complicates things.

Signed-off-by: Robin Murphy 
Signed-off-by: Christoph Hellwig 
---
 drivers/iommu/dma-iommu.c | 25 +++--
 1 file changed, 7 insertions(+), 18 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c04450a4adec..4596e4860da8 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -717,18 +717,6 @@ static void iommu_dma_sync_sg_for_device(struct device 
*dev,
arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
 }
 
-static dma_addr_t __iommu_dma_map_page(struct device *dev, struct page *page,
-   unsigned long offset, size_t size, int prot)
-{
-   return __iommu_dma_map(dev, page_to_phys(page) + offset, size, prot);
-}
-
-static void __iommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
-   size_t size, enum dma_data_direction dir, unsigned long attrs)
-{
-   __iommu_dma_unmap(dev, handle, size);
-}
-
 static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -974,7 +962,8 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
if (!addr)
return NULL;
 
-   *handle = __iommu_dma_map_page(dev, page, 0, iosize, ioprot);
+   *handle = __iommu_dma_map(dev, page_to_phys(page), iosize,
+ ioprot);
if (*handle == DMA_MAPPING_ERROR) {
if (coherent)
__free_pages(page, get_order(size));
@@ -991,7 +980,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
if (!page)
return NULL;
 
-   *handle = __iommu_dma_map_page(dev, page, 0, iosize, ioprot);
+   *handle = __iommu_dma_map(dev, page_to_phys(page), iosize, 
ioprot);
if (*handle == DMA_MAPPING_ERROR) {
dma_release_from_contiguous(dev, page,
size >> PAGE_SHIFT);
@@ -1005,7 +994,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t 
size,
arch_dma_prep_coherent(page, iosize);
memset(addr, 0, size);
} else {
-   __iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs);
+   __iommu_dma_unmap(dev, *handle, iosize);
dma_release_from_contiguous(dev, page,
size >> PAGE_SHIFT);
}
@@ -1044,12 +1033,12 @@ static void iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr,
 * Hence how dodgy the below logic looks...
 */
if (dma_in_atomic_pool(cpu_addr, size)) {
-   __iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
+   __iommu_dma_unmap(dev, handle, iosize);
dma_free_from_pool(cpu_addr, size);
} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
struct page *page = vmalloc_to_page(cpu_addr);
 
-   __iommu_dma_unmap_page(dev, handle, iosize, 0, attrs);
+   __iommu_dma_unmap(dev, handle, iosize);
dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
dma_common_free_remap(cpu_addr, size, VM_USERMAP);
} else if (is_vmalloc_addr(cpu_addr)){
@@ -1060,7 +1049,7 @@ static void iommu_dma_free(struct device *dev, size_t 
size, void *cpu_addr,
__iommu_dma_free(dev, area->pages, iosize, );
dma_common_free_remap(cpu_addr, size, VM_USERMAP);
} else {
-   __iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
+   __iommu_dma_unmap(dev, handle, iosize);
__free_pages(virt_to_page(cpu_addr), get_order(size));
}
 }
-- 
2.20.1



implement generic dma_map_ops for IOMMUs v6

2019-05-23 Thread Christoph Hellwig


Hi Robin and Joerg,

I think we are finally ready for the generic dma-iommu series.  I have
various DMA API changes pending, and Tom has patches ready to convert
the AMD and Intel iommu drivers over to it.  I'd love to have this
in a stable branch shared between the dma-mapping and iommu trees
the day after rc2 is released.  I volunteer to create the branch,
but I'm fine with it living in the iommu tree as well.


A git tree is also available at:

git://git.infradead.org/users/hch/misc.git dma-iommu-ops.6

Gitweb:


http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/dma-iommu-ops.6

Changes since v5:
 - rebased to latest linus tree and dropped a patch now merged
 - remove the now pointless __dma_iommu_mmap function
 - restore a cleanup from Robin that would have caused a conflict in
   last merge window, but is fine now

Changes since v4:
 - rebased to 5.2-rc1

Changes since v3:
 - fold the separate patch to refactor mmap bounds checking
 - don't warn on not finding a vm_area
 - improve a commit log
 - refactor __dma_iommu_free a little differently
 - remove a minor MSI map cleanup to avoid a conflict with the
   "Split iommu_dma_map_msi_msg" series

Changes since v2:
 - address various review comments and include patches from Robin

Changes since v1:
 - only include other headers in dma-iommu.h if CONFIG_DMA_IOMMU is enabled
 - keep using a scatterlist in iommu_dma_alloc
 - split out mmap/sgtable fixes and move them early in the series
 - updated a few commit logs


[PATCH 02/23] iommu/dma: Remove the flush_page callback

2019-05-23 Thread Christoph Hellwig
We now have an arch_dma_prep_coherent architecture hook that is used
for the generic DMA remap allocator, and we should use the same
interface for the dma-iommu code.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
Acked-by: Catalin Marinas 
---
 arch/arm64/mm/dma-mapping.c | 8 +---
 drivers/iommu/dma-iommu.c   | 8 +++-
 include/linux/dma-iommu.h   | 3 +--
 3 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 674860e3e478..10a8852c8b6a 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -104,12 +104,6 @@ arch_initcall(arm64_dma_init);
 #include 
 #include 
 
-/* Thankfully, all cache ops are by VA so we can ignore phys here */
-static void flush_page(struct device *dev, const void *virt, phys_addr_t phys)
-{
-   __dma_flush_area(virt, PAGE_SIZE);
-}
-
 static void *__iommu_alloc_attrs(struct device *dev, size_t size,
 dma_addr_t *handle, gfp_t gfp,
 unsigned long attrs)
@@ -186,7 +180,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t 
size,
struct page **pages;
 
pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
-   handle, flush_page);
+   handle);
if (!pages)
return NULL;
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 129c4badf9ae..aac12433ffef 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -560,8 +561,6 @@ void iommu_dma_free(struct device *dev, struct page 
**pages, size_t size,
  * @attrs: DMA attributes for this allocation
  * @prot: IOMMU mapping flags
  * @handle: Out argument for allocated DMA handle
- * @flush_page: Arch callback which must ensure PAGE_SIZE bytes from the
- * given VA/PA are visible to the given non-coherent device.
  *
  * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
  * but an IOMMU which supports smaller pages might not map the whole thing.
@@ -570,8 +569,7 @@ void iommu_dma_free(struct device *dev, struct page 
**pages, size_t size,
  *or NULL on failure.
  */
 struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
-   unsigned long attrs, int prot, dma_addr_t *handle,
-   void (*flush_page)(struct device *, const void *, phys_addr_t))
+   unsigned long attrs, int prot, dma_addr_t *handle)
 {
struct iommu_domain *domain = iommu_get_dma_domain(dev);
struct iommu_dma_cookie *cookie = domain->iova_cookie;
@@ -615,7 +613,7 @@ struct page **iommu_dma_alloc(struct device *dev, size_t 
size, gfp_t gfp,
 */
sg_miter_start(, sgt.sgl, sgt.orig_nents, 
SG_MITER_FROM_SG);
while (sg_miter_next())
-   flush_page(dev, miter.addr, page_to_phys(miter.page));
+   arch_dma_prep_coherent(miter.page, PAGE_SIZE);
sg_miter_stop();
}
 
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index dfb83f9c24dc..e1ef265b578b 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -44,8 +44,7 @@ int dma_info_to_prot(enum dma_data_direction dir, bool 
coherent,
  * the arch code to take care of attributes and cache maintenance
  */
 struct page **iommu_dma_alloc(struct device *dev, size_t size, gfp_t gfp,
-   unsigned long attrs, int prot, dma_addr_t *handle,
-   void (*flush_page)(struct device *, const void *, phys_addr_t));
+   unsigned long attrs, int prot, dma_addr_t *handle);
 void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
dma_addr_t *handle);
 
-- 
2.20.1



[PATCH 05/23] iommu/dma: Move __iommu_dma_map

2019-05-23 Thread Christoph Hellwig
Moving this function up to its unmap counterpart helps to keep related
code together for the following changes.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 46 +++
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index e34ba23353cb..c406abe3be01 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -464,6 +464,29 @@ static void __iommu_dma_unmap(struct iommu_domain *domain, 
dma_addr_t dma_addr,
iommu_dma_free_iova(cookie, dma_addr, size);
 }
 
+static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
+   size_t size, int prot, struct iommu_domain *domain)
+{
+   struct iommu_dma_cookie *cookie = domain->iova_cookie;
+   size_t iova_off = 0;
+   dma_addr_t iova;
+
+   if (cookie->type == IOMMU_DMA_IOVA_COOKIE) {
+   iova_off = iova_offset(>iovad, phys);
+   size = iova_align(>iovad, size + iova_off);
+   }
+
+   iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
+   if (!iova)
+   return DMA_MAPPING_ERROR;
+
+   if (iommu_map(domain, iova, phys - iova_off, size, prot)) {
+   iommu_dma_free_iova(cookie, iova, size);
+   return DMA_MAPPING_ERROR;
+   }
+   return iova + iova_off;
+}
+
 static void __iommu_dma_free_pages(struct page **pages, int count)
 {
while (count--)
@@ -692,29 +715,6 @@ static void iommu_dma_sync_sg_for_device(struct device 
*dev,
arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir);
 }
 
-static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
-   size_t size, int prot, struct iommu_domain *domain)
-{
-   struct iommu_dma_cookie *cookie = domain->iova_cookie;
-   size_t iova_off = 0;
-   dma_addr_t iova;
-
-   if (cookie->type == IOMMU_DMA_IOVA_COOKIE) {
-   iova_off = iova_offset(>iovad, phys);
-   size = iova_align(>iovad, size + iova_off);
-   }
-
-   iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev);
-   if (!iova)
-   return DMA_MAPPING_ERROR;
-
-   if (iommu_map(domain, iova, phys - iova_off, size, prot)) {
-   iommu_dma_free_iova(cookie, iova, size);
-   return DMA_MAPPING_ERROR;
-   }
-   return iova + iova_off;
-}
-
 static dma_addr_t __iommu_dma_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, int prot)
 {
-- 
2.20.1



[PATCH 03/23] iommu/dma: Use for_each_sg in iommu_dma_alloc

2019-05-23 Thread Christoph Hellwig
arch_dma_prep_coherent can handle physically contiguous ranges larger
than PAGE_SIZE just fine, which means we don't need a page-based
iterator.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 drivers/iommu/dma-iommu.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index aac12433ffef..9b7f120d7381 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -606,15 +606,11 @@ struct page **iommu_dma_alloc(struct device *dev, size_t 
size, gfp_t gfp,
goto out_free_iova;
 
if (!(prot & IOMMU_CACHE)) {
-   struct sg_mapping_iter miter;
-   /*
-* The CPU-centric flushing implied by SG_MITER_TO_SG isn't
-* sufficient here, so skip it by using the "wrong" direction.
-*/
-   sg_miter_start(, sgt.sgl, sgt.orig_nents, 
SG_MITER_FROM_SG);
-   while (sg_miter_next())
-   arch_dma_prep_coherent(miter.page, PAGE_SIZE);
-   sg_miter_stop();
+   struct scatterlist *sg;
+   int i;
+
+   for_each_sg(sgt.sgl, sg, sgt.orig_nents, i)
+   arch_dma_prep_coherent(sg_page(sg), sg->length);
}
 
if (iommu_map_sg(domain, iova, sgt.sgl, sgt.orig_nents, prot)
-- 
2.20.1



[PATCH 04/23] iommu/dma: move the arm64 wrappers to common code

2019-05-23 Thread Christoph Hellwig
There is nothing really arm64 specific in the iommu_dma_ops
implementation, so move it to dma-iommu.c and keep a lot of symbols
self-contained.  Note the implementation does depend on the
DMA_DIRECT_REMAP infrastructure for now, so we'll have to make the
DMA_IOMMU support depend on it, but this will be relaxed soon.

Signed-off-by: Christoph Hellwig 
Acked-by: Robin Murphy 
---
 arch/arm64/mm/dma-mapping.c | 394 +--
 drivers/iommu/Kconfig   |   1 +
 drivers/iommu/dma-iommu.c   | 398 +---
 include/linux/dma-iommu.h   |  42 +---
 4 files changed, 378 insertions(+), 457 deletions(-)

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 10a8852c8b6a..d1661f78eb4d 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -58,37 +59,6 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
__dma_flush_area(page_address(page), size);
 }
 
-#ifdef CONFIG_IOMMU_DMA
-static int __swiotlb_get_sgtable_page(struct sg_table *sgt,
- struct page *page, size_t size)
-{
-   int ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
-
-   if (!ret)
-   sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
-
-   return ret;
-}
-
-static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
- unsigned long pfn, size_t size)
-{
-   int ret = -ENXIO;
-   unsigned long nr_vma_pages = vma_pages(vma);
-   unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-   unsigned long off = vma->vm_pgoff;
-
-   if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) {
-   ret = remap_pfn_range(vma, vma->vm_start,
- pfn + off,
- vma->vm_end - vma->vm_start,
- vma->vm_page_prot);
-   }
-
-   return ret;
-}
-#endif /* CONFIG_IOMMU_DMA */
-
 static int __init arm64_dma_init(void)
 {
WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
@@ -100,374 +70,18 @@ static int __init arm64_dma_init(void)
 arch_initcall(arm64_dma_init);
 
 #ifdef CONFIG_IOMMU_DMA
-#include 
-#include 
-#include 
-
-static void *__iommu_alloc_attrs(struct device *dev, size_t size,
-dma_addr_t *handle, gfp_t gfp,
-unsigned long attrs)
-{
-   bool coherent = dev_is_dma_coherent(dev);
-   int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs);
-   size_t iosize = size;
-   void *addr;
-
-   if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
-   return NULL;
-
-   size = PAGE_ALIGN(size);
-
-   /*
-* Some drivers rely on this, and we probably don't want the
-* possibility of stale kernel data being read by devices anyway.
-*/
-   gfp |= __GFP_ZERO;
-
-   if (!gfpflags_allow_blocking(gfp)) {
-   struct page *page;
-   /*
-* In atomic context we can't remap anything, so we'll only
-* get the virtually contiguous buffer we need by way of a
-* physically contiguous allocation.
-*/
-   if (coherent) {
-   page = alloc_pages(gfp, get_order(size));
-   addr = page ? page_address(page) : NULL;
-   } else {
-   addr = dma_alloc_from_pool(size, , gfp);
-   }
-   if (!addr)
-   return NULL;
-
-   *handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot);
-   if (*handle == DMA_MAPPING_ERROR) {
-   if (coherent)
-   __free_pages(page, get_order(size));
-   else
-   dma_free_from_pool(addr, size);
-   addr = NULL;
-   }
-   } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
-   pgprot_t prot = arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs);
-   struct page *page;
-
-   page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
-   get_order(size), gfp & __GFP_NOWARN);
-   if (!page)
-   return NULL;
-
-   *handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot);
-   if (*handle == DMA_MAPPING_ERROR) {
-   dma_release_from_contiguous(dev, page,
-   size >> PAGE_SHIFT);
-   return NULL;
-   }
-   addr = dma_common_contiguous_remap(page, size, VM_USERMAP,
-  prot,
-  __builtin_return_address(0));
-

[PATCH 01/23] iommu/dma: Cleanup dma-iommu.h

2019-05-23 Thread Christoph Hellwig
There is no need for a __KERNEL__ guard outside uapi; also add a missing
comment describing the #else cpp statement.  Last but not least, include
 instead of the asm version, which is frowned upon.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 include/linux/dma-iommu.h | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 476e0c54de2d..dfb83f9c24dc 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -16,9 +16,8 @@
 #ifndef __DMA_IOMMU_H
 #define __DMA_IOMMU_H
 
-#ifdef __KERNEL__
+#include 
 #include 
-#include 
 
 #ifdef CONFIG_IOMMU_DMA
 #include 
@@ -86,7 +85,7 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc,
 
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
 
-#else
+#else /* CONFIG_IOMMU_DMA */
 
 struct iommu_domain;
 struct msi_desc;
@@ -128,5 +127,4 @@ static inline void iommu_dma_get_resv_regions(struct device 
*dev, struct list_he
 }
 
 #endif /* CONFIG_IOMMU_DMA */
-#endif /* __KERNEL__ */
 #endif /* __DMA_IOMMU_H */
-- 
2.20.1