[PATCH] powerpc/8xx: Allow pinning IMMR TLB when using early debug console

2018-12-19 Thread Christophe Leroy
CONFIG_PPC_EARLY_DEBUG_CPM requires the IMMR area TLB to be pinned,
otherwise it doesn't survive MMU_init and the boot fails.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7996ec33f1b4..45aed802ba86 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1192,7 +1192,7 @@ config PIN_TLB_DATA
 
 config PIN_TLB_IMMR
bool "Pinned TLB for IMMR"
-   depends on PIN_TLB
+   depends on PIN_TLB || PPC_EARLY_DEBUG_CPM
default y
 
 config PIN_TLB_TEXT
-- 
2.13.3



Re: [PATCH V4 5/5] arch/powerpc/mm/hugetlb: NestMMU workaround for hugetlb mprotect RW upgrade

2018-12-19 Thread Christoph Hellwig
On Thu, Dec 20, 2018 at 11:30:12AM +1100, Benjamin Herrenschmidt wrote:
> On Wed, 2018-12-19 at 08:50 +0530, Aneesh Kumar K.V wrote:
> > Christoph Hellwig  writes:
> > 
> > > On Tue, Dec 18, 2018 at 03:11:37PM +0530, Aneesh Kumar K.V wrote:
> > > > +EXPORT_SYMBOL(huge_ptep_modify_prot_start);
> > > 
> > > The only user of this function is the one you added in the last patch
> > > in mm/hugetlb.c, so there is no need to export this function.
> > > 
> > > > +
> > > > +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
> > > > +				  pte_t *ptep, pte_t old_pte, pte_t pte)
> > > > +{
> > > > +
> > > > +	if (radix_enabled())
> > > > +		return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
> > > > +							   old_pte, pte);
> > > > +	set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
> > > > +}
> > > > +EXPORT_SYMBOL(huge_ptep_modify_prot_commit);
> > > 
> > > Same here.
> > 
> > That was done considering that ptep_modify_prot_start/commit was defined
> > in asm-generic/pgtable.h. I was trying to make sure I didn't break
> > anything with the patch. Also, s390 does have that EXPORT_SYMBOL() for the
> > same; hugetlb just inherited it.
> > 
> > If you feel strongly about it, I can drop the EXPORT_SYMBOL().
> 
> At the very least it should be _GPL

In general yes, but in this case it just needs to go.  The s390
maintainers already have a patch queued to remove their
ptep_modify_prot_start / ptep_modify_prot_commit exports.
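
For context, the start/commit pair is only reachable from the hugetlb
mprotect path; a rough sketch of that caller pattern (reconstructed from the
series description, not quoted from the patch):

	pte_t old_pte, new_pte;

	/* clear the PTE and flush the TLB ... */
	old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
	/* ... compute the upgraded protection ... */
	new_pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
	/* ... and install the new PTE */
	huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, new_pte);

Since the only caller is core mm code that is always built in, no module can
reach these helpers, which is why the exports can simply be dropped.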


[RFC PATCH v2 3/3] powerpc/nohash32: Add KASAN support

2018-12-19 Thread Christophe Leroy
This patch adds KASAN support for nohash PPC32.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/include/asm/kasan.h | 22 +
 arch/powerpc/include/asm/nohash/32/pgtable.h |  2 +
 arch/powerpc/include/asm/ppc_asm.h   |  5 ++
 arch/powerpc/include/asm/setup.h |  5 ++
 arch/powerpc/include/asm/string.h| 14 ++
 arch/powerpc/kernel/Makefile |  3 ++
 arch/powerpc/kernel/setup-common.c   |  2 +
 arch/powerpc/kernel/setup_32.c   |  3 ++
 arch/powerpc/lib/Makefile|  2 +
 arch/powerpc/lib/copy_32.S   |  9 ++--
 arch/powerpc/mm/Makefile |  3 ++
 arch/powerpc/mm/dump_linuxpagetables.c   |  8 +++
 arch/powerpc/mm/kasan_init.c | 73 
 arch/powerpc/mm/mem.c|  4 ++
 15 files changed, 153 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kasan.h
 create mode 100644 arch/powerpc/mm/kasan_init.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8ea7c2c02cbf..44be55c087be 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -174,6 +174,7 @@ config PPC
select GENERIC_TIME_VSYSCALL
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
+   select HAVE_ARCH_KASAN  if PPC32 && PPC_MMU_NOHASH
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS   if COMPAT
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
new file mode 100644
index ..3c2f98c40307
--- /dev/null
+++ b/arch/powerpc/include/asm/kasan.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_KASAN_H
+#define __ASM_KASAN_H
+
+#ifndef __ASSEMBLY__
+
+#include 
+#include 
+#include 
+
+#define KASAN_SHADOW_SCALE_SHIFT   3
+#define KASAN_SHADOW_SIZE	((~0UL - PAGE_OFFSET + 1) >> KASAN_SHADOW_SCALE_SHIFT)
+
+#define KASAN_SHADOW_START	(ALIGN_DOWN(FIXADDR_START - KASAN_SHADOW_SIZE, PGDIR_SIZE))
+#define KASAN_SHADOW_END	(KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
+#define KASAN_SHADOW_OFFSET	(KASAN_SHADOW_START - (PAGE_OFFSET >> KASAN_SHADOW_SCALE_SHIFT))
+
+void kasan_early_init(void);
+void kasan_init(void);
+
+#endif
+#endif
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index bed433358260..b3b52f02be1a 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -71,6 +71,8 @@ extern int icache_44x_need_flush;
  */
 #ifdef CONFIG_HIGHMEM
 #define KVIRT_TOP  PKMAP_BASE
+#elif defined(CONFIG_KASAN)
+#define KVIRT_TOP  KASAN_SHADOW_START
 #else
 #define KVIRT_TOP	(0xfe000000UL)	/* for now, could be FIXMAP_BASE ? */
 #endif
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index b5d023680801..80d520e34552 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -251,6 +251,11 @@ GLUE(.,name):
 
 #define _GLOBAL_TOC(name) _GLOBAL(name)
 
+#define KASAN_OVERRIDE(x, y) \
+   .weak x; \
+   .set x, y
+
+
 #endif
 
 /*
diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index 1fffbba8d6a5..16572484149c 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -67,6 +67,11 @@ void do_barrier_nospec_fixups_range(bool enable, void *start, void *end);
 static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void *end) { };
 #endif
 
+#ifndef CONFIG_KASAN
+static inline void kasan_early_init(void) { }
+static inline void kasan_init(void) { }
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_SETUP_H */
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 1647de15a31e..28795a72aba1 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -27,6 +27,20 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
+void * __memset(void *, int, __kernel_size_t);
+void * __memcpy(void *, const void *, __kernel_size_t);
+void * __memmove(void *, const void *, __kernel_size_t);
+
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use the not-instrumented versions of the mem* functions.
+ */
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
 #ifdef CONFIG_PPC64
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 
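
A note on the shadow macros in asm/kasan.h above: each shadow byte covers
2^KASAN_SHADOW_SCALE_SHIFT = 8 bytes of kernel address space, and
KASAN_SHADOW_OFFSET is chosen so that shadow(addr) = KASAN_SHADOW_OFFSET +
(addr >> 3). A minimal user-space sketch of the arithmetic (the SHADOW_START
value below is an assumed placeholder, not the value the patch derives from
FIXADDR_START):

	#include <stdio.h>

	#define PAGE_OFFSET   0xc0000000UL	/* typical PPC32 linear map base */
	#define SCALE_SHIFT   3			/* one shadow byte per 8 bytes */
	#define SHADOW_SIZE   ((unsigned long)((0x100000000ULL - PAGE_OFFSET) >> SCALE_SHIFT))
	#define SHADOW_START  0xf8000000UL	/* assumption: PGDIR-aligned slot below the fixmap */
	#define SHADOW_OFFSET (SHADOW_START - (PAGE_OFFSET >> SCALE_SHIFT))

	int main(void)
	{
		unsigned long addr = PAGE_OFFSET + 0x1234;	/* some kernel address */
		unsigned long shadow = SHADOW_OFFSET + (addr >> SCALE_SHIFT);

		/* 1GB of kernel space needs a 128MB shadow */
		printf("shadow size %#lx, addr %#lx -> shadow byte %#lx\n",
		       SHADOW_SIZE, addr, shadow);
		return 0;
	}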

[RFC PATCH v2 2/3] powerpc/32: Move early_init() in a separate file

2018-12-19 Thread Christophe Leroy
In preparation for KASAN, move early_init() into a separate
file in order to allow deactivating KASAN for that function.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/Makefile   |  2 +-
 arch/powerpc/kernel/early_32.c | 35 +++
 arch/powerpc/kernel/setup_32.c | 26 --
 3 files changed, 36 insertions(+), 27 deletions(-)
 create mode 100644 arch/powerpc/kernel/early_32.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index a5a6a243f3cf..e9a9419b98b6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -93,7 +93,7 @@ extra-y   += vmlinux.lds
 
 obj-$(CONFIG_RELOCATABLE)  += reloc_$(BITS).o
 
-obj-$(CONFIG_PPC32)+= entry_32.o setup_32.o
+obj-$(CONFIG_PPC32)+= entry_32.o setup_32.o early_32.o
 obj-$(CONFIG_PPC64)+= dma-iommu.o iommu.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_BOOTX_TEXT)   += btext.o
diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c
new file mode 100644
index ..b3e40d6d651c
--- /dev/null
+++ b/arch/powerpc/kernel/early_32.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Early init before relocation
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * We're called here very early in the boot.
+ *
+ * Note that the kernel may be running at an address which is different
+ * from the address that it was linked at, so we must use RELOC/PTRRELOC
+ * to access static data (including strings).  -- paulus
+ */
+notrace unsigned long __init early_init(unsigned long dt_ptr)
+{
+   unsigned long offset = reloc_offset();
+
+   /* First zero the BSS */
+   memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start);
+
+   /*
+* Identify the CPU type and fix up code sections
+* that depend on which cpu we have.
+*/
+   identify_cpu(offset, mfspr(SPRN_PVR));
+
+   apply_feature_fixups();
+
+   return KERNELBASE + offset;
+}
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 00b8f54fed29..62efe32d890d 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -64,32 +64,6 @@ EXPORT_SYMBOL(DMA_MODE_READ);
 EXPORT_SYMBOL(DMA_MODE_WRITE);
 
 /*
- * We're called here very early in the boot.
- *
- * Note that the kernel may be running at an address which is different
- * from the address that it was linked at, so we must use RELOC/PTRRELOC
- * to access static data (including strings).  -- paulus
- */
-notrace unsigned long __init early_init(unsigned long dt_ptr)
-{
-   unsigned long offset = reloc_offset();
-
-   /* First zero the BSS */
-   memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start);
-
-   /*
-* Identify the CPU type and fix up code sections
-* that depend on which cpu we have.
-*/
-   identify_cpu(offset, mfspr(SPRN_PVR));
-
-   apply_feature_fixups();
-
-   return KERNELBASE + offset;
-}
-
-
-/*
  * This is run before start_kernel(), the kernel has been relocated
  * and we are running with enough of the MMU enabled to have our
  * proper kernel virtual addresses
-- 
2.13.3



[RFC PATCH v2 0/3] KASAN for nohash PPC32

2018-12-19 Thread Christophe Leroy
This series adds KASAN support to nohash PPC32.

Tested on 8xx

Changes in v2:
- Rebased.
- Using __set_pte_at() to build the early table.
- Worked around and got rid of the patch adding asm/page.h in asm/pgtable-types.h
  ==> might be fixed independently but needed for this series.

In principle, this should also work on the 603. For now I have not been
able to boot it: it stops before the early console is active, so I'm quite
blind at the moment and don't really know what's wrong. Any ideas?

For hash32 (not 603), it cannot work as is because, due to the HASHPTE flag,
we can't use the same pagetable for several PGD entries.

Christophe Leroy (3):
  powerpc/mm: prepare kernel for KAsan on PPC32
  powerpc/32: Move early_init() in a separate file
  powerpc/nohash32: Add KASAN support

 arch/powerpc/Kconfig |  1 +
 arch/powerpc/include/asm/kasan.h | 22 +
 arch/powerpc/include/asm/nohash/32/pgtable.h |  2 +
 arch/powerpc/include/asm/ppc_asm.h   |  5 ++
 arch/powerpc/include/asm/setup.h |  5 ++
 arch/powerpc/include/asm/string.h| 14 ++
 arch/powerpc/kernel/Makefile |  5 +-
 arch/powerpc/kernel/cputable.c   |  4 +-
 arch/powerpc/kernel/early_32.c   | 35 +
 arch/powerpc/kernel/setup-common.c   |  2 +
 arch/powerpc/kernel/setup_32.c   | 31 ++--
 arch/powerpc/lib/Makefile|  2 +
 arch/powerpc/lib/copy_32.S   |  9 ++--
 arch/powerpc/mm/Makefile |  3 ++
 arch/powerpc/mm/dump_linuxpagetables.c   |  8 +++
 arch/powerpc/mm/kasan_init.c | 73 
 arch/powerpc/mm/mem.c|  4 ++
 17 files changed, 191 insertions(+), 34 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kasan.h
 create mode 100644 arch/powerpc/kernel/early_32.c
 create mode 100644 arch/powerpc/mm/kasan_init.c

-- 
2.13.3



[RFC PATCH v2 1/3] powerpc/mm: prepare kernel for KAsan on PPC32

2018-12-19 Thread Christophe Leroy
In kernel/cputable.c, explicitly use memcpy() in order
to allow GCC to replace it with __memcpy() when KASAN is
selected.

Since commit 400c47d81ca38 ("powerpc32: memset: only use dcbz once cache is
enabled"), memset() can be used before activation of the cache,
so no need to use memset_io() for zeroing the BSS.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/cputable.c | 4 ++--
 arch/powerpc/kernel/setup_32.c | 6 ++
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 2da01340c84c..9ea031b05f19 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -2145,7 +2145,7 @@ void __init set_cur_cpu_spec(struct cpu_spec *s)
	struct cpu_spec *t = &the_cpu_spec;
 
t = PTRRELOC(t);
-   *t = *s;
+   memcpy(t, s, sizeof(*t));
 
	*PTRRELOC(&cur_cpu_spec) = &the_cpu_spec;
 }
@@ -2160,7 +2160,7 @@ static struct cpu_spec * __init setup_cpu_spec(unsigned long offset,
old = *t;
 
/* Copy everything, then do fixups */
-   *t = *s;
+   memcpy(t, s, sizeof(*t));
 
/*
 * If we are overriding a previous value derived from the real
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 972c98d1e208..00b8f54fed29 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -74,10 +74,8 @@ notrace unsigned long __init early_init(unsigned long dt_ptr)
 {
unsigned long offset = reloc_offset();
 
-   /* First zero the BSS -- use memset_io, some platforms don't have
-* caches on yet */
-   memset_io((void __iomem *)PTRRELOC(&__bss_start), 0,
-   __bss_stop - __bss_start);
+   /* First zero the BSS */
+   memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start);
 
/*
 * Identify the CPU type and fix up code sections
-- 
2.13.3
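
The reason the struct assignment must become an explicit memcpy() is the
override patch 3/3 adds to asm/string.h: the preprocessor can rename an
explicit call to the uninstrumented __memcpy(), but it cannot rename the
implicit memcpy call GCC emits for a struct copy. A standalone sketch of the
mechanism (the struct is a stand-in, not the real cpu_spec):

	/* Uninstrumented variant provided by the architecture. */
	void *__memcpy(void *dst, const void *src, unsigned long len);

	/* What asm/string.h does for files built without KASAN instrumentation: */
	#define memcpy(dst, src, len) __memcpy(dst, src, len)

	struct spec_example { int pvr_value; char name[16]; };

	static void copy_spec(struct spec_example *t, const struct spec_example *s)
	{
		/*
		 * "*t = *s;" may be lowered to a call to the real memcpy
		 * symbol, which the macro cannot intercept; code running
		 * before the KASAN shadow is initialised would then jump
		 * into the instrumented memcpy and crash.  The explicit
		 * call is textually rewritten instead:
		 */
		memcpy(t, s, sizeof(*t));	/* becomes __memcpy(t, s, ...) */
	}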



Re: [PATCH V5 1/3] mm: Add get_user_pages_cma_migrate

2018-12-19 Thread Aneesh Kumar K.V

On 12/20/18 11:50 AM, Alexey Kardashevskiy wrote:



On 20/12/2018 16:52, Aneesh Kumar K.V wrote:

On 12/20/18 11:18 AM, Alexey Kardashevskiy wrote:



On 20/12/2018 16:22, Aneesh Kumar K.V wrote:

On 12/20/18 9:49 AM, Alexey Kardashevskiy wrote:



On 19/12/2018 14:40, Aneesh Kumar K.V wrote:

This helper does a get_user_pages_fast and if it finds pages in the CMA area
it will try to migrate them before taking a page reference. This makes sure
that we don't keep non-movable pages (due to page reference count) in the
CMA area. Not being able to move pages out of the CMA area results in CMA
allocation failures.

Signed-off-by: Aneesh Kumar K.V 




.

+ * We did migrate all the pages. Try to get the page references again,
+ * migrating any new CMA pages which we failed to isolate earlier.
+ */
+    drain_allow = true;
+    goto get_user_again;



So it is possible to have pages pinned, then successfully migrated
(migrate_pages() returned 0), then pinned again, then some pages may
end
up in CMA again and migrate again and nothing seems to prevent this
loop
from being endless. What do I miss?



pages used as target page for migration won't be allocated from CMA
region.



Then migrate_allow should be set to "false" regardless what
migrate_pages() returned and then I am totally missing the point of this
goto and going through the loop again even when we know for sure it
won't do literally anything but checking is_migrate_cma_page() even
though we know pages won't be allocated from CMA.



Because we might have failed to isolate all the pages in the first attempt.


isolate==migrate?


no

The calls to isolate_lru_page and isolate_huge_page. We can fail because
the percpu pagevec is not fully drained.



-aneesh



Re: [PATCH V5 1/3] mm: Add get_user_pages_cma_migrate

2018-12-19 Thread Alexey Kardashevskiy



On 20/12/2018 16:52, Aneesh Kumar K.V wrote:
> On 12/20/18 11:18 AM, Alexey Kardashevskiy wrote:
>>
>>
>> On 20/12/2018 16:22, Aneesh Kumar K.V wrote:
>>> On 12/20/18 9:49 AM, Alexey Kardashevskiy wrote:


 On 19/12/2018 14:40, Aneesh Kumar K.V wrote:
> This helper does a get_user_pages_fast and if it finds pages in the CMA area
> it will try to migrate them before taking a page reference. This makes sure
> that we don't keep non-movable pages (due to page reference count) in the
> CMA area. Not being able to move pages out of the CMA area results in CMA
> allocation failures.
>
> Signed-off-by: Aneesh Kumar K.V 

>>>
>>> .
> + * We did migrate all the pages. Try to get the page references again,
> + * migrating any new CMA pages which we failed to isolate earlier.
> + */
> +    drain_allow = true;
> +    goto get_user_again;


 So it is possible to have pages pinned, then successfully migrated
 (migrate_pages() returned 0), then pinned again, then some pages may
 end
 up in CMA again and migrate again and nothing seems to prevent this
 loop
 from being endless. What do I miss?

>>>
>>> pages used as target page for migration won't be allocated from CMA
>>> region.
>>
>>
>> Then migrate_allow should be set to "false" regardless what
>> migrate_pages() returned and then I am totally missing the point of this
>> goto and going through the loop again even when we know for sure it
>> won't do literally anything but checking is_migrate_cma_page() even
>> though we know pages won't be allocated from CMA.
>>
> 
> Because we might have failed to isolate all the pages in the first attempt.

isolate==migrate?

If we failed to migrate, then migrate_pages() returns non zero (positive
or negative), we set migrate_allow to false, empty the cma_page_list
and repeat but we won't add anything to cma_page_list as
migrate_allow==false.

If we succeeded to migrate, then we repeat the loop with
migrate_allow==true but it does not matter as is_migrate_cma_page() is
expected to return false because we just successfully migrated
_everything_ so we won't be adding anything to cma_page_list either.

What have I missed?

-- 
Alexey
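
For readers following the thread, the loop in question looks roughly like
this (reconstructed from the discussion; names and details are
approximations, not the patch verbatim):

	bool drain_allow = true, migrate_allow = true;
	LIST_HEAD(cma_page_list);

	get_user_again:
		ret = get_user_pages_fast(start, nr_pages, write, pages);

		for (i = 0; i < ret; i++) {
			if (is_migrate_cma_page(pages[i]) && migrate_allow) {
				if (drain_allow) {
					lru_add_drain_all();	/* flush percpu pagevecs */
					drain_allow = false;
				}
				/*
				 * Isolation fails if the page still sits in a
				 * percpu pagevec; that page stays pinned and
				 * is retried on the next pass.
				 */
				if (!isolate_lru_page(pages[i])) {
					list_add_tail(&pages[i]->lru, &cma_page_list);
					put_page(pages[i]);	/* drop the gup reference */
				}
			}
		}

		if (!list_empty(&cma_page_list)) {
			if (migrate_pages(&cma_page_list, new_non_cma_page,
					  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE))
				migrate_allow = false;	/* migration failed: pin as-is */
			drain_allow = true;
			goto get_user_again;	/* re-pin, catching pages missed above */
		}

So the retry exists for isolation failures, not migration failures: after a
fully clean pass nothing lands on cma_page_list and the loop terminates,
which is the point Aneesh is making above.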


Re: [PATCH V5 1/3] mm: Add get_user_pages_cma_migrate

2018-12-19 Thread Aneesh Kumar K.V

On 12/20/18 11:18 AM, Alexey Kardashevskiy wrote:



On 20/12/2018 16:22, Aneesh Kumar K.V wrote:

On 12/20/18 9:49 AM, Alexey Kardashevskiy wrote:



On 19/12/2018 14:40, Aneesh Kumar K.V wrote:

This helper does a get_user_pages_fast and if it finds pages in the CMA area
it will try to migrate them before taking a page reference. This makes sure
that we don't keep non-movable pages (due to page reference count) in the
CMA area. Not being able to move pages out of the CMA area results in CMA
allocation failures.

Signed-off-by: Aneesh Kumar K.V 




.

+ * We did migrate all the pages. Try to get the page references again,
+ * migrating any new CMA pages which we failed to isolate earlier.
+ */
+    drain_allow = true;
+    goto get_user_again;



So it is possible to have pages pinned, then successfully migrated
(migrate_pages() returned 0), then pinned again, then some pages may end
up in CMA again and migrate again and nothing seems to prevent this loop
from being endless. What do I miss?



pages used as target page for migration won't be allocated from CMA region.



Then migrate_allow should be set to "false" regardless what
migrate_pages() returned and then I am totally missing the point of this
goto and going through the loop again even when we know for sure it
won't do literally anything but checking is_migrate_cma_page() even
though we know pages won't be allocated from CMA.



Because we might have failed to isolate all the pages in the first attempt.

-aneesh



Re: [PATCH V5 1/3] mm: Add get_user_pages_cma_migrate

2018-12-19 Thread Alexey Kardashevskiy



On 20/12/2018 16:22, Aneesh Kumar K.V wrote:
> On 12/20/18 9:49 AM, Alexey Kardashevskiy wrote:
>>
>>
>> On 19/12/2018 14:40, Aneesh Kumar K.V wrote:
>>> This helper does a get_user_pages_fast and if it finds pages in the CMA area
>>> it will try to migrate them before taking a page reference. This makes sure
>>> that we don't keep non-movable pages (due to page reference count) in the
>>> CMA area. Not being able to move pages out of the CMA area results in CMA
>>> allocation failures.
>>>
>>> Signed-off-by: Aneesh Kumar K.V 
>>
> 
> .
>>> + * We did migrate all the pages. Try to get the page references again,
>>> + * migrating any new CMA pages which we failed to isolate earlier.
>>> + */
>>> +    drain_allow = true;
>>> +    goto get_user_again;
>>
>>
>> So it is possible to have pages pinned, then successfully migrated
>> (migrate_pages() returned 0), then pinned again, then some pages may end
>> up in CMA again and migrate again and nothing seems to prevent this loop
>> from being endless. What do I miss?
>>
> 
> pages used as target page for migration won't be allocated from CMA region.


Then migrate_allow should be set to "false" regardless what
migrate_pages() returned and then I am totally missing the point of this
goto and going through the loop again even when we know for sure it
won't do literally anything but checking is_migrate_cma_page() even
though we know pages won't be allocated from CMA.

It should be simple gup_fast() instead of goto and then we won't need
goto/migrate_allow.


-- 
Alexey


[PATCH] powerpc/8xx: Map a second 8M text page at startup when needed.

2018-12-19 Thread Christophe Leroy
Some debug setups like CONFIG_KASAN generate huge
kernels with a text size over the 8M limit.

This patch maps a second 8M page when _einittext is over 8M.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 27 +--
 arch/powerpc/mm/8xx_mmu.c  |  4 
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index b171b7c0a0e7..f6bc4392ea9f 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -334,8 +334,8 @@ InstructionTLBMiss:
rlwinm  r10, r10, 16, 0xfff8
cmpli   cr0, r10, PAGE_OFFSET@h
 #ifndef CONFIG_PIN_TLB_TEXT
-	/* It is assumed that kernel code fits into the first 8M page */
-0:	cmpli	cr7, r10, (PAGE_OFFSET + 0x0800000)@h
+	/* It is assumed that kernel code fits into the first two 8M pages */
+0:	cmpli	cr7, r10, (PAGE_OFFSET + 0x1000000)@h
patch_site  0b, patch__itlbmiss_linmem_top
 #endif
 #endif
@@ -904,6 +904,29 @@ initial_mmu:
li  r8, MI_BOOTINIT /* Create RPN for address 0 */
mtspr   SPRN_MI_RPN, r8 /* Store TLB entry */
 
+   /* Map a second 8M page if needed */
+   lis r9, _einittext@h
+   orisr9, r9, _einittext@l
+	cmpli	cr0, r9, (PAGE_OFFSET + 0x800000)@h
+   blt 1f
+
+#ifdef CONFIG_PIN_TLB_TEXT
+   lis r8, MI_RSV4I@h
+   ori r8, r8, 0x1d00
+
+   mtspr   SPRN_MI_CTR, r8 /* Set instruction MMU control */
+#endif
+
+	lis	r8, (KERNELBASE + 0x800000)@h	/* Create vaddr for TLB */
+   ori r8, r8, MI_EVALID   /* Mark it valid */
+   mtspr   SPRN_MI_EPN, r8
+   li  r8, MI_PS8MEG /* Set 8M byte page */
+   ori r8, r8, MI_SVALID   /* Make it valid */
+   mtspr   SPRN_MI_TWC, r8
+   li  r8, MI_BOOTINIT /* Create RPN for address 0 */
+   addis   r8, r8, 0x80
+   mtspr   SPRN_MI_RPN, r8 /* Store TLB entry */
+1:
lis r8, MI_APG_INIT@h   /* Set protection modes */
ori r8, r8, MI_APG_INIT@l
mtspr   SPRN_MI_AP, r8
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index e2b6687ebb50..1bdbfbf9fe16 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -122,6 +122,10 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
 #endif
} else {
mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
+#ifndef CONFIG_PIN_TLB_TEXT
+   mmu_patch_cmp_limit(__itlbmiss_linmem_top,
+   _ALIGN(__pa(_einittext), 8 << 20));
+#endif
}
 
mmu_patch_cmp_limit(__dtlbmiss_linmem_top, mapped);
-- 
2.13.3
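
The mmu_patch_cmp_limit() call just patches the compare in the ITLB miss
handler to the end of init text rounded up to an 8M boundary. Assuming
_ALIGN() rounds up the way ALIGN_UP does below, the arithmetic is:

	#include <stdio.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

	int main(void)
	{
		/* hypothetical: init text ends a bit past 9.5M */
		unsigned long einittext_pa = 0x9a0000;
		unsigned long limit = ALIGN_UP(einittext_pa, 8UL << 20);

		/* 0x9a0000 rounds up to 0x1000000 (16M), so the handler now
		 * accepts text addresses covered by the second 8M page. */
		printf("itlbmiss limit = %#lx\n", limit);
		return 0;
	}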



[RFC/WIP] powerpc: Fix 32-bit handling of MSR_EE on exceptions

2018-12-19 Thread Benjamin Herrenschmidt
Hi folks !

While trying to figure out why we occasionally had lockdep barf about
interrupt state on ppc32 (440 in my case, but I could reproduce on e500
as well using qemu), I realized that we are still doing something
rather gothic and wrong on 32-bit which we stopped doing on 64-bit
a while ago.

We have that thing where some handlers "copy" the EE value from the
original stack frame into the new MSR before transferring to the
handler.

Thus for a number of exceptions, we enter the handlers with interrupts
enabled.

This is rather fishy, some of the stuff that handlers might do early
on such as irq_enter/exit or user_exit, context tracking, etc... should
be run with interrupts off afaik.

Generally our handlers know when to re-enable interrupts if needed
(though some of the FSL specific SPE ones don't).

The problem we were having is that we assumed these interrupts would
return with interrupts enabled. However that isn't the case.

Instead, this changes things so that we always enter exception handlers
with interrupts *off* with the notable exception of syscalls which are
special (and get a fast path).

Currently, the patch only changes BookE (440 and E5xx tested in qemu);
the same recipe needs to be applied to 6xx, 8xx and 40x.

Also I'm not sure whether we need to create a stack frame around some
of the calls to trace_hardirqs_* in asm. ppc64 does it, due to problems
with the irqsoff tracer, but I haven't managed to reproduce those
issues. We need to look into it a bit more.

I'll work more on this in the next few days, comments appreciated.

Not-signed-off-by: Benjamin Herrenschmidt 

---
 arch/powerpc/kernel/entry_32.S   | 113 ++-
 arch/powerpc/kernel/head_44x.S   |   9 +--
 arch/powerpc/kernel/head_booke.h |  34 ---
 arch/powerpc/kernel/head_fsl_booke.S |  28 -
 arch/powerpc/kernel/traps.c  |   8 +++
 5 files changed, 111 insertions(+), 81 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 3841d74..39b4cb5 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -34,6 +34,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 /*
 * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it includes MSR_CE.
@@ -205,20 +208,46 @@ transfer_to_handler_cont:
	mflr	r9
lwz r11,0(r9)   /* virtual address of handler */
lwz r9,4(r9)/* where to go when done */
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
+   mtspr   SPRN_NRI, r0
+#endif
+
 #ifdef CONFIG_TRACE_IRQFLAGS
+   /*
+* When tracing IRQ state (lockdep) we enable the MMU before we call
+* the IRQ tracing functions as they might access vmalloc space or
+* perform IOs for console output.
+*
+* To speed up the syscall path where interrupts stay on, let's check
+* first if we are changing the MSR value at all.
+*/
+   lwz r12,_MSR(r1)
+   xor r0,r10,r12
+   andi.   r0,r0,MSR_EE
+   bne 1f
+
+   /* MSR isn't changing, just transition directly */
+   lwz r0,GPR0(r1)
+   mtspr   SPRN_SRR0,r11
+   mtspr   SPRN_SRR1,r10
	mtlr	r9
+   SYNC
+   RFI
+
+1: /* MSR is changing, re-enable MMU so we can notify lockdep. We need to
+* keep interrupts disabled at this point otherwise we might risk
+* taking an interrupt before we tell lockdep they are enabled.
+*/
lis r12,reenable_mmu@h
ori r12,r12,reenable_mmu@l
+   lis r0,MSR_KERNEL@h
+   ori r0,r0,MSR_KERNEL@l
mtspr   SPRN_SRR0,r12
-   mtspr   SPRN_SRR1,r10
+   mtspr   SPRN_SRR1,r0
SYNC
RFI
-reenable_mmu:  /* re-enable mmu so we can */
-   mfmsr   r10
-   lwz r12,_MSR(r1)
-   xor r10,r10,r12
-   andi.   r10,r10,MSR_EE  /* Did EE change? */
-   beq 1f
 
+reenable_mmu:
/*
 * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1.
 * If from user mode there is only one stack frame on the stack, and
@@ -239,8 +268,29 @@ reenable_mmu:				/* re-enable mmu so we can */
stw r3,16(r1)
stw r4,20(r1)
stw r5,24(r1)
-   bl  trace_hardirqs_off
-   lwz r5,24(r1)
+
+   /* Are we enabling or disabling interrupts ? */
+   andi.   r0,r10,MSR_EE
+   beq 1f
+
+	/* If we are enabling interrupts, this is a syscall. They shouldn't
+	 * happen while interrupts are disabled, so let's do a warning here.
+	 */
+0: trap
+   EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
+   bl  trace_hardirqs_on
+
+   /* Now enable for real */
+   mfmsr   r10
+   ori r10,r10,MSR_EE
+   mtmsr   r10
+   b   2f
+
+   /* If we are disabling interrupts (normal case), 
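
The ordering the patch is careful about can be summarised in C terms; a
sketch of the invariant (not kernel code): lockdep must be told "on" before
MSR_EE is really set, and told "off" only after it is really cleared, so an
interrupt arriving in between never observes inconsistent state:

	static void irqs_on(void)
	{
		trace_hardirqs_on();		/* tell lockdep first ...     */
		arch_local_irq_enable();	/* ... then really set MSR_EE */
	}

	static void irqs_off(void)
	{
		arch_local_irq_disable();	/* really clear MSR_EE first  */
		trace_hardirqs_off();		/* ... then tell lockdep      */
	}

This is the same order the reworked reenable_mmu path above follows: bl
trace_hardirqs_on, then mfmsr/ori/mtmsr to actually set MSR_EE.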

Re: [PATCH kernel v5 14/20] powerpc/powernv/npu: Add compound IOMMU groups

2018-12-19 Thread Michael Ellerman
Alexey Kardashevskiy  writes:
> On 19/12/2018 21:00, Michael Ellerman wrote:
>> Alexey Kardashevskiy  writes:
>>> On 19/12/2018 11:17, Michael Ellerman wrote:
 Alexey Kardashevskiy  writes:
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index dc629ee..3468eaa 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -372,8 +358,263 @@ struct npu {
 ...
> +
> +static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
> + struct pnv_ioda_pe *pe)
> +{
> + if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
> + return;
> +
> + npucomp->pe[npucomp->pe_num] = pe;
> + ++npucomp->pe_num;
> +}
> +
> +struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
> +{
> + struct iommu_table_group *table_group;
> + struct npu_comp *npucomp;
> + struct pci_dev *gpdev = NULL;
> + struct pci_controller *hose;
> + struct pci_dev *npdev;
> +
> +	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
> + npdev = pnv_pci_get_npu_dev(gpdev, 0);
> + if (npdev)
> + break;
> + }
> +
> + if (!npdev)
> + /* It is not an NPU attached device, skip */
> + return NULL;

 This breaks some configs with:

   arch/powerpc/platforms/powernv/npu-dma.c:550:5: error: 'npdev' may be 
 used uninitialized in this function [-Werror=uninitialized]
>>>
>>>
>>> gcc 5, 7 and 8 do not warn about this, I have to disable
>>> list_for_each_entry() above to recreate this.
>>>
>>> I even compiled gcc 5.5 which some of your buildmachines use and yet no
>>> error on this:
>>>
>>> make O=/home/aik/pbuild/kernel-le/ KCFLAGS=-Werror=all ARCH=powerpc
>>> CROSS_COMPILE=/opt/cross/gcc-powerpc64le-linux-5.5.0-nolibc/bin/powerpc64le-linux-
>>> arch/powerpc/platforms/powernv/npu-dma.o
>> 
>> Odd. That error is from kisskb like the others.
>> 
>> http://kisskb.ellerman.id.au/kisskb/buildresult/13622793/
>> 
>> Seems it's GCC 4.6.3 that is producing that one. Not sure why newer
>> compilers aren't warning about it.
>> 
>> 
>> It's pretty obviously correct though, unless you can prove that the list
>> is never empty?
>
> I know these are correct and I want to catch these before I post patches :-/
>
> Can I get that gcc 4.6.3? It does not compile on my ubuntu for whatever
> reason.

It's the old korg one, it's on the ka's in /opt/cross.

It's this one:

https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/x86_64/4.6.3/x86_64-gcc-4.6.3-nolibc_powerpc64-linux.tar.xz

cheers
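
The warning itself is the classic empty-list case; a minimal sketch (using
the names from the patch) of why GCC 4.6 flags it, and the obvious fix:

	struct pci_dev *npdev;	/* old: no initializer */

	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
		npdev = pnv_pci_get_npu_dev(gpdev, 0);
		if (npdev)
			break;
	}

	/*
	 * If the device list is empty the loop body never runs, so this
	 * reads an uninitialized pointer.  Declaring
	 *	struct pci_dev *npdev = NULL;
	 * silences the warning and is also correct for the empty list.
	 */
	if (!npdev)
		return NULL;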


Re: [PATCH V5 1/3] mm: Add get_user_pages_cma_migrate

2018-12-19 Thread Aneesh Kumar K.V

On 12/20/18 9:49 AM, Alexey Kardashevskiy wrote:



On 19/12/2018 14:40, Aneesh Kumar K.V wrote:

This helper does a get_user_pages_fast and if it finds pages in the CMA area
it will try to migrate them before taking a page reference. This makes sure that
we don't keep non-movable pages (due to page reference count) in the CMA area.
Not being able to move pages out of the CMA area results in CMA allocation failures.

Signed-off-by: Aneesh Kumar K.V 




.

+* We did migrate all the pages. Try to get the page references again,
+* migrating any new CMA pages which we failed to isolate earlier.
+*/
+   drain_allow = true;
+   goto get_user_again;



So it is possible to have pages pinned, then successfully migrated
(migrate_pages() returned 0), then pinned again, then some pages may end
up in CMA again and migrate again and nothing seems to prevent this loop
from being endless. What do I miss?



pages used as target page for migration won't be allocated from CMA region.

-aneesh



Re: [PATCH V5 2/3] powerpc/mm/iommu: Allow migration of cma allocated pages during mm_iommu_get

2018-12-19 Thread Alexey Kardashevskiy



On 19/12/2018 14:40, Aneesh Kumar K.V wrote:
> Current code doesn't do page migration if the page allocated is a compound 
> page.
> With HugeTLB migration support, we can end up allocating hugetlb pages from
> CMA region. Also THP pages can be allocated from CMA region. This patch 
> updates
> the code to handle compound pages correctly.
> 
> This uses the new helper get_user_pages_cma_migrate. It does one get_user_pages
> with the right count, instead of doing one get_user_pages per page. That avoids
> reading the page table multiple times.
> 
> The patch also convert the hpas member of mm_iommu_table_group_mem_t to a 
> union.
> We use the same storage location to store pointers to struct page. We cannot
> update alll the code path use struct page *, because we access hpas in real 
> mode

s/alll/all/


> and we can't do that struct page * to pfn conversion in real mode.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/mmu_context_iommu.c | 120 
>  1 file changed, 35 insertions(+), 85 deletions(-)
> 
> diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
> b/arch/powerpc/mm/mmu_context_iommu.c
> index 56c2234cc6ae..1d5161f93ce6 100644
> --- a/arch/powerpc/mm/mmu_context_iommu.c
> +++ b/arch/powerpc/mm/mmu_context_iommu.c
> @@ -21,6 +21,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  static DEFINE_MUTEX(mem_list_mutex);
>  
> @@ -34,8 +35,18 @@ struct mm_iommu_table_group_mem_t {
>   atomic64_t mapped;
>   unsigned int pageshift;
>   u64 ua; /* userspace address */
> - u64 entries;/* number of entries in hpas[] */

Still a valid comment imho, or you could s'hpas'hpas/hpages' but
replacing hpas with hpages seems strange.


> - u64 *hpas;  /* vmalloc'ed */
> + u64 entries;/* number of entries in hpages[] */
> + /*
> +  * in mm_iommu_get we temporarily use this to store
> +  * struct page address.
> +  *
> +  * We need to convert ua to hpa in real mode. Make it
> +  * simpler by storing physicall address.

s/physicall/physical/


> +  */
> + union {
> + struct page **hpages;   /* vmalloc'ed */
> + phys_addr_t *hpas;
> + };
>  };
>  
>  static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
> @@ -78,63 +89,14 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
>  }
>  EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
>  
> -/*
> - * Taken from alloc_migrate_target with changes to remove CMA allocations
> - */
> -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
> -{
> - gfp_t gfp_mask = GFP_USER;
> - struct page *new_page;
> -
> - if (PageCompound(page))
> - return NULL;
> -
> - if (PageHighMem(page))
> - gfp_mask |= __GFP_HIGHMEM;
> -
> - /*
> -  * We don't want the allocation to force an OOM if possibe
> -  */
> - new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
> - return new_page;
> -}
> -
> -static int mm_iommu_move_page_from_cma(struct page *page)
> -{
> - int ret = 0;
> - LIST_HEAD(cma_migrate_pages);
> -
> - /* Ignore huge pages for now */
> - if (PageCompound(page))
> - return -EBUSY;
> -
> - lru_add_drain();
> - ret = isolate_lru_page(page);
> - if (ret)
> - return ret;
> -
> -	list_add(&page->lru, &cma_migrate_pages);
> - put_page(page); /* Drop the gup reference */
> -
> -	ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
> - NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
> - if (ret) {
> -		if (!list_empty(&cma_migrate_pages))
> -			putback_movable_pages(&cma_migrate_pages);
> - }
> -
> - return 0;
> -}
> -
>  long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>   struct mm_iommu_table_group_mem_t **pmem)
>  {
>   struct mm_iommu_table_group_mem_t *mem;
> - long i, j, ret = 0, locked_entries = 0;
> + long i, ret = 0, locked_entries = 0;
>   unsigned int pageshift;
>   unsigned long flags;
>   unsigned long cur_ua;
> - struct page *page = NULL;
>  
>	mutex_lock(&mem_list_mutex);
>  
> @@ -181,41 +143,24 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>   goto unlock_exit;
>   }
>  
> + ret = get_user_pages_cma_migrate(ua, entries, 1, mem->hpages);

btw the get_user_pages_cma_migrate() name suggests to me (yeah, not a native
speaker and an ignorant person in general :) ) that it migrates and pins
pages while it can actually pin pages without migrating them (if it
could not).


> + if (ret != entries) {
> + /* free the reference taken */
> + for (i = 0; i < ret; i++)
> + put_page(mem->hpages[i]);
> +
> + vfree(mem->hpas);
> + kfree(mem);
> + ret = -EFAULT;
> + goto unlock_exit;
> 
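
For what it's worth, the union only works because the same slots are
rewritten in place once pinning succeeds; something like the following
sketch (reconstructed from the description, not the patch itself):

	/*
	 * After get_user_pages_cma_migrate() filled mem->hpages[], convert
	 * each entry to a physical address in place: hpages[] and hpas[]
	 * overlay the same storage, and both are 64-bit on ppc64.
	 */
	for (i = 0; i < entries; ++i) {
		struct page *page = mem->hpages[i];

		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
	}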

Re: [PATCH V5 1/3] mm: Add get_user_pages_cma_migrate

2018-12-19 Thread Alexey Kardashevskiy



On 19/12/2018 14:40, Aneesh Kumar K.V wrote:
> This helper does a get_user_pages_fast and if it finds pages in the CMA area
> it will try to migrate them before taking a page reference. This makes sure that
> we don't keep non-movable pages (due to page reference count) in the CMA area.
> Not being able to move pages out of the CMA area results in CMA allocation failures.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  include/linux/hugetlb.h |   2 +
>  include/linux/migrate.h |   3 +
>  mm/hugetlb.c|   4 +-
>  mm/migrate.c| 139 
>  4 files changed, 146 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 087fd5f48c91..1eed0cdaec0e 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, 
> int preferred_nid,
>   nodemask_t *nmask);
>  struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct 
> *vma,
>   unsigned long address);
> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +  int nid, nodemask_t *nmask);
>  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>   pgoff_t idx);
>  
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index f2b4abbca55e..d82b35afd2eb 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -286,6 +286,9 @@ static inline int migrate_vma(const struct 
> migrate_vma_ops *ops,
>  }
>  #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
>  
> +extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int 
> write,
> +   struct page **pages);


ah, sorry for commenting on the same patch again, but
./scripts/checkpatch.pl complains a lot about this patch.


-- 
Alexey


Re: [PATCH V5 1/3] mm: Add get_user_pages_cma_migrate

2018-12-19 Thread Alexey Kardashevskiy



On 19/12/2018 14:40, Aneesh Kumar K.V wrote:
> This helper does a get_user_pages_fast and if it finds pages in the CMA area
> it will try to migrate them before taking a page reference. This makes sure that
> we don't keep non-movable pages (due to page reference count) in the CMA area.
> Not being able to move pages out of the CMA area results in CMA allocation failures.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  include/linux/hugetlb.h |   2 +
>  include/linux/migrate.h |   3 +
>  mm/hugetlb.c|   4 +-
>  mm/migrate.c| 139 
>  4 files changed, 146 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 087fd5f48c91..1eed0cdaec0e 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, 
> int preferred_nid,
>   nodemask_t *nmask);
>  struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct 
> *vma,
>   unsigned long address);
> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +  int nid, nodemask_t *nmask);
>  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>   pgoff_t idx);
>  
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index f2b4abbca55e..d82b35afd2eb 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -286,6 +286,9 @@ static inline int migrate_vma(const struct 
> migrate_vma_ops *ops,
>  }
>  #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
>  
> +extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int 
> write,
> +   struct page **pages);
> +
>  #endif /* CONFIG_MIGRATION */
>  
>  #endif /* _LINUX_MIGRATE_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 7f2a28ab46d5..faf3102ae45e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1585,8 +1585,8 @@ static struct page *alloc_surplus_huge_page(struct 
> hstate *h, gfp_t gfp_mask,
>   return page;
>  }
>  
> -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> - int nid, nodemask_t *nmask)
> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +  int nid, nodemask_t *nmask)
>  {
>   struct page *page;
>  
> diff --git a/mm/migrate.c b/mm/migrate.c
> index f7e4bfdc13b7..d564558fba03 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2946,3 +2946,142 @@ int migrate_vma(const struct migrate_vma_ops *ops,
>  }
>  EXPORT_SYMBOL(migrate_vma);
>  #endif /* defined(MIGRATE_VMA_HELPER) */
> +
> +static struct page *new_non_cma_page(struct page *page, unsigned long 
> private)
> +{
> + /*
> +  * We want to make sure we allocate the new page from the same node
> +  * as the source page.
> +  */
> + int nid = page_to_nid(page);
> + /*
> +  * Trying to allocate a page for migration. Ignore allocation
> +  * failure warnings
> +  */
> + gfp_t gfp_mask = GFP_USER | __GFP_THISNODE | __GFP_NOWARN;
> +
> + if (PageHighMem(page))
> + gfp_mask |= __GFP_HIGHMEM;
> +
> +#ifdef CONFIG_HUGETLB_PAGE
> + if (PageHuge(page)) {
> + struct hstate *h = page_hstate(page);
> + /*
> +  * We don't want to dequeue from the pool because pool pages 
> will
> +  * mostly be from the CMA region.
> +  */
> + return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
> + }
> +#endif
> + if (PageTransHuge(page)) {
> + struct page *thp;
> + /*
> +  * ignore allocation failure warnings
> +  */
> + gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_THISNODE | 
> __GFP_NOWARN;
> +
> + /*
> +  * Remove the movable mask so that we don't allocate from
> +  * CMA area again.
> +  */
> + thp_gfpmask &= ~__GFP_MOVABLE;
> + thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
> + if (!thp)
> + return NULL;
> + prep_transhuge_page(thp);
> + return thp;
> + }
> +
> + return __alloc_pages_node(nid, gfp_mask, 0);
> +}
> +
> +/**
> + * get_user_pages_cma_migrate() - pin user pages in memory by migrating 
> pages in CMA region
> + * @start:   starting user address
> + * @nr_pages:number of pages from start to pin
> + * @write:   whether pages will be written to
> + * @pages:   array that receives pointers to the pages pinned.
> + *   Should be at least nr_pages long.
> + *
> + * Attempt to pin user pages in memory without taking mm->mmap_sem.
> + * If not successful, it will fall back to taking the lock and
> + * calling get_user_pages().
> + *
> + * If the pinned pages are backed by 

Re: [PATCH -next] powerpc/eeh: Fix debugfs_simple_attr.cocci warnings

2018-12-19 Thread Russell Currey
On Thu, 2018-12-20 at 02:42 +0000, YueHaibing wrote:
> Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE
> for debugfs files.
> 
> Semantic patch information:
> Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file()
> imposes some significant overhead as compared to
> DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe().
> 
> Generated by:
> scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci
> 
> Signed-off-by: YueHaibing 

Acked-by: Russell Currey 



[PATCH -next] powerpc/eeh: Fix debugfs_simple_attr.cocci warnings

2018-12-19 Thread YueHaibing
Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE
for debugfs files.

Semantic patch information:
Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file()
imposes some significant overhead as compared to
DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe().

Generated by: scripts/coccinelle/api/debugfs/debugfs_simple_attr.cocci

Signed-off-by: YueHaibing 
---
 arch/powerpc/kernel/eeh.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 23fe62f..ae05203 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1808,10 +1808,10 @@ static int eeh_freeze_dbgfs_get(void *data, u64 *val)
return 0;
 }
 
-DEFINE_SIMPLE_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
-   eeh_enable_dbgfs_set, "0x%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get,
-   eeh_freeze_dbgfs_set, "0x%llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get,
+eeh_enable_dbgfs_set, "0x%llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get,
+eeh_freeze_dbgfs_set, "0x%llx\n");
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1819,12 +1819,12 @@ static int __init eeh_init_proc(void)
if (machine_is(pseries) || machine_is(powernv)) {
proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show);
 #ifdef CONFIG_DEBUG_FS
-   debugfs_create_file("eeh_enable", 0600,
-powerpc_debugfs_root, NULL,
-			    &eeh_enable_dbgfs_ops);
-   debugfs_create_file("eeh_max_freezes", 0600,
-   powerpc_debugfs_root, NULL,
-			    &eeh_freeze_dbgfs_ops);
+   debugfs_create_file_unsafe("eeh_enable", 0600,
+  powerpc_debugfs_root, NULL,
+				   &eeh_enable_dbgfs_ops);
+   debugfs_create_file_unsafe("eeh_max_freezes", 0600,
+  powerpc_debugfs_root, NULL,
+				   &eeh_freeze_dbgfs_ops);
 #endif
}
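
For reference, the reason the conversion pairs DEFINE_DEBUGFS_ATTRIBUTE with
debugfs_create_file_unsafe(): the attribute macro already wraps the get/set
callbacks in debugfs_file_get()/debugfs_file_put(), so the per-call proxying
that debugfs_create_file() adds on top is redundant overhead. The general
pattern, with hypothetical names:

	static u64 myvalue;

	static int myattr_get(void *data, u64 *val) { *val = *(u64 *)data; return 0; }
	static int myattr_set(void *data, u64 val)  { *(u64 *)data = val;  return 0; }
	DEFINE_DEBUGFS_ATTRIBUTE(myattr_fops, myattr_get, myattr_set, "0x%llx\n");

	/* in init code, under some existing parent dentry: */
	debugfs_create_file_unsafe("myattr", 0600, parent, &myvalue, &myattr_fops);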
 

Re: [PATCH kernel v6 14/20] powerpc/powernv/npu: Add compound IOMMU groups

2018-12-19 Thread Alexey Kardashevskiy



On 19/12/2018 19:52, Alexey Kardashevskiy wrote:
> At the moment the powernv platform registers an IOMMU group for each PE.
> There is an exception though: an NVLink bridge which is attached to
> the corresponding GPU's IOMMU group making it a master.
> 
> Now we have POWER9 systems with GPUs connected to each other directly
> bypassing PCI. At the moment we do not control state of these links so
> we have to put such interconnected GPUs to one IOMMU group which
> means that the old scheme with one GPU as a master won't work - there will
> be up to 3 GPUs in such group.
> 
> This introduces a npu_comp struct which represents a compound IOMMU
> group made of multiple PEs - PCI PEs (for GPUs) and NPU PEs (for NVLink
> bridges). This converts the existing NVLink1 code to use the new scheme.
> From now on, each PE must have a valid iommu_table_group_ops which will
> either be called directly (for a single PE group) or indirectly from
> a compound group handlers.
> 
> This moves IOMMU group registration for NVLink-connected GPUs to npu-dma.c.
> For POWER8, this stores a new compound group pointer in the PE (so a GPU
> is still a master); for POWER9 the new group pointer is stored in an NPU
> (which is allocated per a PCI host controller).
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v5:
> * now read page sizes from PHB NVLink to narrow down what the compound PE
> can actually support (hint: 4K/64K only)
> ---
>  arch/powerpc/include/asm/pci.h|   1 +
>  arch/powerpc/platforms/powernv/pci.h  |   7 +
>  arch/powerpc/platforms/powernv/npu-dma.c  | 291 --
>  arch/powerpc/platforms/powernv/pci-ioda.c | 159 
>  4 files changed, 322 insertions(+), 136 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
> index baf2886..0c72f18 100644
> --- a/arch/powerpc/include/asm/pci.h
> +++ b/arch/powerpc/include/asm/pci.h
> @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
>  extern int pnv_npu2_init(struct pci_controller *hose);
>  extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
>   unsigned long msr);
> +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
>  
>  #endif /* __ASM_POWERPC_PCI_H */
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index cf9f748..aef4bb5 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -62,6 +62,7 @@ struct pnv_ioda_pe {
>  
>   /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
>   struct iommu_table_group table_group;
> + struct npu_comp *npucomp;
>  
>   /* 64-bit TCE bypass region */
>   booltce_bypass_enabled;
> @@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>  extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
>  extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
>  extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
> + __u64 window_size, __u32 levels);
>  extern int pnv_eeh_post_init(void);
>  
>  extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
> @@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe 
> *pe, const char *level,
>  extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
>  extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool 
> rm);
>  extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
> +extern struct iommu_table_group *pnv_try_setup_npu_table_group(
> + struct pnv_ioda_pe *pe);
> +extern struct iommu_table_group *pnv_npu_compound_attach(
> + struct pnv_ioda_pe *pe);
>  
>  /* pci-ioda-tce.c */
>  #define POWERNV_IOMMU_DEFAULT_LEVELS 1
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index dc629ee..3468eaa 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -328,31 +328,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = {
>   .unset_window = pnv_npu_unset_window,
>   .take_ownership = pnv_npu_take_ownership,
>  };
> -
> -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
> -{
> - struct pnv_phb *phb = npe->phb;
> - struct pci_bus *pbus = phb->hose->bus;
> - struct pci_dev *npdev, *gpdev = NULL, *gptmp;
> -	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
> -
> - if (!gpe || !gpdev)
> - return NULL;
> -
> - npe->table_group.ops = _pci_npu_ops;
> -
> -	list_for_each_entry(npdev, &pbus->devices, bus_list) {
> - gptmp = pnv_pci_get_gpu_dev(npdev);
> -
> - if (gptmp != gpdev)
> - continue;
> -
> -  

Re: [PATCH kernel v5 14/20] powerpc/powernv/npu: Add compound IOMMU groups

2018-12-19 Thread Alexey Kardashevskiy



On 19/12/2018 21:00, Michael Ellerman wrote:
> Alexey Kardashevskiy  writes:
>> On 19/12/2018 11:17, Michael Ellerman wrote:
>>> Alexey Kardashevskiy  writes:
 diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
 b/arch/powerpc/platforms/powernv/npu-dma.c
 index dc629ee..3468eaa 100644
 --- a/arch/powerpc/platforms/powernv/npu-dma.c
 +++ b/arch/powerpc/platforms/powernv/npu-dma.c
 @@ -372,8 +358,263 @@ struct npu {
>>> ...
 +
 +static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
 +  struct pnv_ioda_pe *pe)
 +{
 +  if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
 +  return;
 +
 +  npucomp->pe[npucomp->pe_num] = pe;
 +  ++npucomp->pe_num;
 +}
 +
 +struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
 +{
 +  struct iommu_table_group *table_group;
 +  struct npu_comp *npucomp;
 +  struct pci_dev *gpdev = NULL;
 +  struct pci_controller *hose;
 +  struct pci_dev *npdev;
 +
 +	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
 +  npdev = pnv_pci_get_npu_dev(gpdev, 0);
 +  if (npdev)
 +  break;
 +  }
 +
 +  if (!npdev)
 +  /* It is not an NPU attached device, skip */
 +  return NULL;
>>>
>>> This breaks some configs with:
>>>
>>>   arch/powerpc/platforms/powernv/npu-dma.c:550:5: error: 'npdev' may be 
>>> used uninitialized in this function [-Werror=uninitialized]
>>
>>
>> gcc 5, 7 and 8 do not warn about this, I have to disable
>> list_for_each_entry() above to recreate this.
>>
>> I even compiled gcc 5.5 which some of your buildmachines use and yet no
>> error on this:
>>
>> make O=/home/aik/pbuild/kernel-le/ KCFLAGS=-Werror=all ARCH=powerpc
>> CROSS_COMPILE=/opt/cross/gcc-powerpc64le-linux-5.5.0-nolibc/bin/powerpc64le-linux-
>> arch/powerpc/platforms/powernv/npu-dma.o
> 
> Odd. That error is from kisskb like the others.
> 
> http://kisskb.ellerman.id.au/kisskb/buildresult/13622793/
> 
> Seems it's GCC 4.6.3 that is producing that one. Not sure why newer
> compilers aren't warning about it.
> 
> 
> It's pretty obviously correct though, unless you can prove that the list
> is never empty?

I know these are correct and I want to catch these before I post patches :-/

Can I get that gcc 4.6.3? It does not compile on my ubuntu for whatever
reason.


> 
>   struct pci_dev *npdev;
> 
>   list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
>   npdev = pnv_pci_get_npu_dev(gpdev, 0);
>   if (npdev)
>   break;
>   }
> 
>   if (!npdev)
>   /* It is not an NPU attached device, skip */
>   return NULL;
> 
> 
> cheers
> 

-- 
Alexey


[PATCH kernel v6.1 20/20 REPOST] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-19 Thread Alexey Kardashevskiy
POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
pluggable PCIe devices but still have PCIe links which are used
for config space and MMIO. In addition to that the GPUs have 6 NVLinks
which are connected to other GPUs and the POWER9 CPU. POWER9 chips
have a special unit on a die called an NPU which is an NVLink2 host bus
adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
These systems also support ATS (address translation services) which is
a part of the NVLink2 protocol. Such GPUs also share their on-board RAM
(16GB or 32GB) with the system via the same NVLink2, so a CPU has
cache-coherent access to the GPU RAM.

This exports GPU RAM to the userspace as a new VFIO device region. This
preregisters the new memory as device memory as it might be used for DMA.
This inserts pfns from the fault handler as the GPU memory is not onlined
until the vendor driver is loaded and has trained the NVLinks, so doing this
earlier causes low level errors which we fence in the firmware so that
they do not hurt the host system but are still better avoided; for the same
reason this does not map GPU RAM into the host kernel (the usual thing for
emulated access otherwise).

This exports an ATSD (Address Translation Shootdown) register of NPU which
allows TLB invalidations inside GPU for an operating system. The register
conveniently occupies a single 64k page. It is also presented to
the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
each of them can be used for TLB invalidation in a GPU linked to this NPU.
This allocates one ATSD register per NVLink bridge, allowing up to
6 registers to be passed. Due to a host firmware bug (just recently fixed),
only 1 ATSD register per NPU was actually advertised to the host system,
so this passes that lone register via the first NVLink bridge device in
the group, which is still enough as QEMU collects them all back and
presents them to the guest via vPHB to mimic the emulated NPU PHB on the host.

In order to provide the userspace with the information about GPU-to-NVLink
connections, this exports an additional capability called "tgt"
(which is an abbreviated host system bus address). The "tgt" property
tells the GPU its own system address and allows the guest driver to
conglomerate the routing information so each GPU knows how to get directly
to the other GPUs.

For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
know LPID (a logical partition ID or a KVM guest hardware ID in other
words) and PID (a memory context ID of a userspace process, not to be
confused with a linux pid). This assigns a GPU to LPID in the NPU and
this is why this adds a listener for KVM on an IOMMU group. A PID comes
via NVLink from a GPU and NPU uses a PID wildcard to pass it through.

This requires coherent memory and ATSD to be available on the host as
the GPU vendor only supports configurations with both features enabled
and other configurations are known not to work. Because of this and
because of the ways the features are advertised to the host system
(which is a device tree with very platform specific properties),
this requires enabled POWERNV platform.

The V100 GPUs do not advertise any of these capabilities via the config
space and there are more than just one device ID so this relies on
the platform to tell whether these GPUs have special abilities such as
NVLinks.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v6.1:
* fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD

v6:
* reworked capabilities - tgt for nvlink and gpu and link-speed
for nvlink only

v5:
* do not memremap GPU RAM for emulation, map it only when it is needed
* allocate 1 ATSD register per NVLink bridge, if none left, then expose
the region with a zero size
* separate caps per device type
* addressed AW review comments

v4:
* added nvlink-speed to the NPU bridge capability as this turned out to
be not a constant value
* instead of looking at the exact device ID (which also changes from system
to system), now this (indirectly) looks at the device tree to know
if GPU and NPU support NVLink

v3:
* reworded the commit log about tgt
* added tracepoints (do we want them enabled for entire vfio-pci?)
* added code comments
* added write|mmap flags to the new regions
* auto enabled VFIO_PCI_NVLINK2 config option
* added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu
references; these are required by the NVIDIA driver
* keep notifier registered only for short time
---
 drivers/vfio/pci/Makefile   |   1 +
 drivers/vfio/pci/trace.h| 102 ++
 drivers/vfio/pci/vfio_pci_private.h |  14 +
 include/uapi/linux/vfio.h   |  37 +++
 drivers/vfio/pci/vfio_pci.c |  27 +-
 drivers/vfio/pci/vfio_pci_nvlink2.c | 482 
 drivers/vfio/pci/Kconfig|   6 +
 7 files changed, 667 insertions(+), 2 deletions(-)
 create mode 100644 drivers/vfio/pci/trace.h
 create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

diff 

Re: [PATCH V4 5/5] arch/powerpc/mm/hugetlb: NestMMU workaround for hugetlb mprotect RW upgrade

2018-12-19 Thread Benjamin Herrenschmidt
On Wed, 2018-12-19 at 08:50 +0530, Aneesh Kumar K.V wrote:
> Christoph Hellwig  writes:
> 
> > On Tue, Dec 18, 2018 at 03:11:37PM +0530, Aneesh Kumar K.V wrote:
> > > +EXPORT_SYMBOL(huge_ptep_modify_prot_start);
> > 
> > The only user of this function is the one you added in the last patch
> > in mm/hugetlb.c, so there is no need to export this function.
> > 
> > > +
> > > +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned 
> > > long addr,
> > > +   pte_t *ptep, pte_t old_pte, pte_t pte)
> > > +{
> > > +
> > > + if (radix_enabled())
> > > + return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
> > > +old_pte, pte);
> > > + set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
> > > +}
> > > +EXPORT_SYMBOL(huge_ptep_modify_prot_commit);
> > 
> > Same here.
> 
> That was done considering that ptep_modify_prot_start/commit was defined
> in asm-generic/pgtable.h. I was trying to make sure I didn't break
> anything with the patch. Also s390 do have that EXPORT_SYMBOL() for the
> same. hugetlb just inherited that.
> 
> If you feel strongly about it, I can drop the EXPORT_SYMBOL().

At the very least it should be _GPL

Cheers,
Ben.
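
For reference, the pair under discussion is only meant to be called from
core mm code; a rough sketch of the hugetlb mprotect path that uses it
(simplified from mm/hugetlb.c, so take the details with a grain of salt):

	pte_t old_pte, pte;

	old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
	pte = huge_pte_modify(old_pte, newprot);	/* the RW upgrade */
	huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);

Since the only caller is built-in, nothing outside the core kernel ever
needs the symbols, which is why there is no need to export them at all.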



Re: [PATCH v2 2/2] kgdb/treewide: constify struct kgdb_arch arch_kgdb_ops

2018-12-19 Thread Daniel Thompson
On Thu, Dec 06, 2018 at 08:07:40PM +, Christophe Leroy wrote:
> checkpatch.pl reports the following:
> 
>   WARNING: struct kgdb_arch should normally be const
>   #28: FILE: arch/mips/kernel/kgdb.c:397:
>   +struct kgdb_arch arch_kgdb_ops = {
> 
> This report makes sense; like all other ops structs, this
> one should also be const. This patch makes the change.
> 
> Cc: Vineet Gupta 
> Cc: Russell King 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Cc: Yoshinori Sato 
> Cc: Richard Kuo 
> Cc: Michal Simek 
> Cc: Ralf Baechle 
> Cc: Paul Burton 
> Cc: James Hogan 
> Cc: Ley Foon Tan 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Rich Felker 
> Cc: "David S. Miller" 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: Borislav Petkov 
> Cc: x...@kernel.org
> Acked-by: Daniel Thompson 
> Acked-by: Paul Burton 
> Signed-off-by: Christophe Leroy 

I've not heard any objections from the arch/ maintainers so...

Applied! Thanks.


> -
> ---
>  v2: Added CCs to all maintainers/supporters identified by get_maintainer.pl 
> and Acks from Daniel and Paul.
> 
>  arch/arc/kernel/kgdb.c| 2 +-
>  arch/arm/kernel/kgdb.c| 2 +-
>  arch/arm64/kernel/kgdb.c  | 2 +-
>  arch/h8300/kernel/kgdb.c  | 2 +-
>  arch/hexagon/kernel/kgdb.c| 2 +-
>  arch/microblaze/kernel/kgdb.c | 2 +-
>  arch/mips/kernel/kgdb.c   | 2 +-
>  arch/nios2/kernel/kgdb.c  | 2 +-
>  arch/powerpc/kernel/kgdb.c| 2 +-
>  arch/sh/kernel/kgdb.c | 2 +-
>  arch/sparc/kernel/kgdb_32.c   | 2 +-
>  arch/sparc/kernel/kgdb_64.c   | 2 +-
>  arch/x86/kernel/kgdb.c| 2 +-
>  include/linux/kgdb.h  | 2 +-
>  14 files changed, 14 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c
> index 9a3c34af2ae8..bfd04b442e36 100644
> --- a/arch/arc/kernel/kgdb.c
> +++ b/arch/arc/kernel/kgdb.c
> @@ -204,7 +204,7 @@ void kgdb_roundup_cpus(unsigned long flags)
>   local_irq_disable();
>  }
>  
> -struct kgdb_arch arch_kgdb_ops = {
> +const struct kgdb_arch arch_kgdb_ops = {
>   /* breakpoint instruction: TRAP_S 0x3 */
>  #ifdef CONFIG_CPU_BIG_ENDIAN
>   .gdb_bpt_instr  = {0x78, 0x7e},
> diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
> index caa0dbe3dc61..21a6d5958955 100644
> --- a/arch/arm/kernel/kgdb.c
> +++ b/arch/arm/kernel/kgdb.c
> @@ -274,7 +274,7 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
>   * and we handle the normal undef case within the do_undefinstr
>   * handler.
>   */
> -struct kgdb_arch arch_kgdb_ops = {
> +const struct kgdb_arch arch_kgdb_ops = {
>  #ifndef __ARMEB__
>   .gdb_bpt_instr  = {0xfe, 0xde, 0xff, 0xe7}
>  #else /* ! __ARMEB__ */
> diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
> index a20de58061a8..fe1d1f935b90 100644
> --- a/arch/arm64/kernel/kgdb.c
> +++ b/arch/arm64/kernel/kgdb.c
> @@ -357,7 +357,7 @@ void kgdb_arch_exit(void)
>   unregister_die_notifier(&kgdb_notifier);
>  }
>  
> -struct kgdb_arch arch_kgdb_ops;
> +const struct kgdb_arch arch_kgdb_ops;
>  
>  int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
>  {
> diff --git a/arch/h8300/kernel/kgdb.c b/arch/h8300/kernel/kgdb.c
> index 1a1d30cb0609..602e478afbd5 100644
> --- a/arch/h8300/kernel/kgdb.c
> +++ b/arch/h8300/kernel/kgdb.c
> @@ -129,7 +129,7 @@ void kgdb_arch_exit(void)
>   /* Nothing to do */
>  }
>  
> -struct kgdb_arch arch_kgdb_ops = {
> +const struct kgdb_arch arch_kgdb_ops = {
>   /* Breakpoint instruction: trapa #2 */
>   .gdb_bpt_instr = { 0x57, 0x20 },
>  };
> diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c
> index 16c24b22d0b2..f1924d483e78 100644
> --- a/arch/hexagon/kernel/kgdb.c
> +++ b/arch/hexagon/kernel/kgdb.c
> @@ -83,7 +83,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
>   { "syscall_nr", GDB_SIZEOF_REG, offsetof(struct pt_regs, syscall_nr)},
>  };
>  
> -struct kgdb_arch arch_kgdb_ops = {
> +const struct kgdb_arch arch_kgdb_ops = {
>   /* trap0(#0xDB) 0x0cdb0054 */
>   .gdb_bpt_instr = {0x54, 0x00, 0xdb, 0x0c},
>  };
> diff --git a/arch/microblaze/kernel/kgdb.c b/arch/microblaze/kernel/kgdb.c
> index 6366f69d118e..130cd0f064ce 100644
> --- a/arch/microblaze/kernel/kgdb.c
> +++ b/arch/microblaze/kernel/kgdb.c
> @@ -143,7 +143,7 @@ void kgdb_arch_exit(void)
>  /*
>   * Global data
>   */
> -struct kgdb_arch arch_kgdb_ops = {
> +const struct kgdb_arch arch_kgdb_ops = {
>  #ifdef __MICROBLAZEEL__
>   .gdb_bpt_instr = {0x18, 0x00, 0x0c, 0xba}, /* brki r16, 0x18 */
>  #else
> diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
> index 31eff1bec577..edfdc2ec2d16 100644
> --- a/arch/mips/kernel/kgdb.c
> +++ b/arch/mips/kernel/kgdb.c
> @@ -394,7 +394,7 @@ int kgdb_arch_handle_exception(int vector, int signo, int 
> err_code,
>   return -1;
>  }
>  
> -struct kgdb_arch arch_kgdb_ops = {
> +const struct kgdb_arch arch_kgdb_ops = {
>  #ifdef CONFIG_CPU_BIG_ENDIAN
>   .gdb_bpt_instr = { spec_op << 

Re: [PATCH v2 1/2] mips/kgdb: prepare arch_kgdb_ops for constness

2018-12-19 Thread Daniel Thompson
On Thu, Dec 06, 2018 at 08:07:38PM +, Christophe Leroy wrote:
> MIPS is the only architecture modifying arch_kgdb_ops during init.
> This patch makes the init static, so that it can be changed to
> const in following patch, as recommended by checkpatch.pl
> 
> Suggested-by: Paul Burton 
> Acked-by: Daniel Thompson 
> Acked-by: Paul Burton 
> Signed-off-by: Christophe Leroy 

Applied! Thanks.


> -
> ---
>  v2: Added acks from Daniel and Paul.
> 
>  arch/mips/kernel/kgdb.c | 16 +++-
>  1 file changed, 7 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
> index eb6c0d582626..31eff1bec577 100644
> --- a/arch/mips/kernel/kgdb.c
> +++ b/arch/mips/kernel/kgdb.c
> @@ -394,18 +394,16 @@ int kgdb_arch_handle_exception(int vector, int signo, 
> int err_code,
>   return -1;
>  }
>  
> -struct kgdb_arch arch_kgdb_ops;
> +struct kgdb_arch arch_kgdb_ops = {
> +#ifdef CONFIG_CPU_BIG_ENDIAN
> + .gdb_bpt_instr = { spec_op << 2, 0x00, 0x00, break_op },
> +#else
> + .gdb_bpt_instr = { break_op, 0x00, 0x00, spec_op << 2 },
> +#endif
> +};
>  
>  int kgdb_arch_init(void)
>  {
> - union mips_instruction insn = {
> - .r_format = {
> - .opcode = spec_op,
> - .func   = break_op,
> - }
> - };
> - memcpy(arch_kgdb_ops.gdb_bpt_instr, insn.byte, BREAK_INSTR_SIZE);
> -
>   register_die_notifier(&kgdb_notifier);
>  
>   return 0;
> -- 
> 2.13.3
> 


Re: [REPOST PATCH v6 2/4] kgdb: Fix kgdb_roundup_cpus() for arches who used smp_call_function()

2018-12-19 Thread Daniel Thompson
On Tue, Dec 04, 2018 at 07:38:26PM -0800, Douglas Anderson wrote:
> When I had lockdep turned on and dropped into kgdb I got a nice splat
> on my system.  Specifically it hit:
>   DEBUG_LOCKS_WARN_ON(current->hardirq_context)
> 
> Specifically it looked like this:
>   sysrq: SysRq : DEBUG
>   [ cut here ]
>   DEBUG_LOCKS_WARN_ON(current->hardirq_context)
>   WARNING: CPU: 0 PID: 0 at .../kernel/locking/lockdep.c:2875 
> lockdep_hardirqs_on+0xf0/0x160
>   CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.19.0 #27
>   pstate: 604003c9 (nZCv DAIF +PAN -UAO)
>   pc : lockdep_hardirqs_on+0xf0/0x160
>   ...
>   Call trace:
>lockdep_hardirqs_on+0xf0/0x160
>trace_hardirqs_on+0x188/0x1ac
>kgdb_roundup_cpus+0x14/0x3c
>kgdb_cpu_enter+0x53c/0x5cc
>kgdb_handle_exception+0x180/0x1d4
>kgdb_compiled_brk_fn+0x30/0x3c
>brk_handler+0x134/0x178
>do_debug_exception+0xfc/0x178
>el1_dbg+0x18/0x78
>kgdb_breakpoint+0x34/0x58
>sysrq_handle_dbg+0x54/0x5c
>__handle_sysrq+0x114/0x21c
>handle_sysrq+0x30/0x3c
>qcom_geni_serial_isr+0x2dc/0x30c
>   ...
>   ...
>   irq event stamp: ...45
>   hardirqs last  enabled at (...44): [...] __do_softirq+0xd8/0x4e4
>   hardirqs last disabled at (...45): [...] el1_irq+0x74/0x130
>   softirqs last  enabled at (...42): [...] _local_bh_enable+0x2c/0x34
>   softirqs last disabled at (...43): [...] irq_exit+0xa8/0x100
>   ---[ end trace adf21f830c46e638 ]---
> 
> Looking closely at it, it seems like a really bad idea to be calling
> local_irq_enable() in kgdb_roundup_cpus().  If nothing else that seems
> like it could violate spinlock semantics and cause a deadlock.
> 
> Instead, let's use a private csd alongside
> smp_call_function_single_async() to round up the other CPUs.  Using
> smp_call_function_single_async() doesn't require interrupts to be
> enabled so we can remove the offending bit of code.
> 
> In order to avoid duplicating this across all the architectures that
> use the default kgdb_roundup_cpus(), we'll add a "weak" implementation
> to debug_core.c.
> 
> Looking at all the people who previously had copies of this code,
> there were a few variants.  I've attempted to keep the variants
> working like they used to.  Specifically:
> * For arch/arc we passed NULL to kgdb_nmicallback() instead of
>   get_irq_regs().
> * For arch/mips there was a bit of extra code around
>   kgdb_nmicallback()
> 
> NOTE: In this patch we will still get into trouble if we try to round
> up a CPU that failed to round up before.  We'll try to round it up
> again and potentially hang when we try to grab the csd lock.  That's
> not new behavior but we'll still try to do better in a future patch.
> 
> Suggested-by: Daniel Thompson 
> Signed-off-by: Douglas Anderson 
> Cc: Vineet Gupta 
> Cc: Russell King 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Cc: Richard Kuo 
> Cc: Ralf Baechle 
> Cc: Paul Burton 
> Cc: James Hogan 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Yoshinori Sato 
> Cc: Rich Felker 
> Cc: "David S. Miller" 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: Borislav Petkov 
> Cc: "H. Peter Anvin" 
> Acked-by: Will Deacon 

I've not heard any objections from the arch/ maintainers so...

Applied! Thanks.


> -
> ---
> 
> Changes in v6:
> - Moved smp_call_function_single_async() error check to patch 3.
> 
> Changes in v5:
> - Add a comment about get_irq_regs().
> - get_cpu() => raw_smp_processor_id() in kgdb_roundup_cpus().
> - for_each_cpu() => for_each_online_cpu()
> - Error check smp_call_function_single_async()
> 
> Changes in v4: None
> Changes in v3:
> - No separate init call.
> - Don't round up the CPU that is doing the rounding up.
> - Add "#ifdef CONFIG_SMP" to match the rest of the file.
> - Updated desc saying we don't solve the "failed to roundup" case.
> - Document the ignored parameter.
> 
> Changes in v2:
> - Removing irq flags separated from fixing lockdep splat.
> - Don't use smp_call_function (Daniel).
> 
>  arch/arc/kernel/kgdb.c | 10 ++
>  arch/arm/kernel/kgdb.c | 12 ---
>  arch/arm64/kernel/kgdb.c   | 12 ---
>  arch/hexagon/kernel/kgdb.c | 27 -
>  arch/mips/kernel/kgdb.c|  9 +
>  arch/powerpc/kernel/kgdb.c |  4 ++--
>  arch/sh/kernel/kgdb.c  | 12 ---
>  include/linux/kgdb.h   | 15 --
>  kernel/debug/debug_core.c  | 41 ++
>  9 files changed, 59 insertions(+), 83 deletions(-)
> 
> diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c
> index 0932851028e0..68d9fe4b5aa7 100644
> --- a/arch/arc/kernel/kgdb.c
> +++ b/arch/arc/kernel/kgdb.c
> @@ -192,18 +192,12 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned 
> long ip)
>   instruction_pointer(regs) = ip;
>  }
>  
> -static void kgdb_call_nmi_hook(void *ignored)
> +void kgdb_call_nmi_hook(void *ignored)
>  {
> + /* Default implementation passes get_irq_regs() but we don't */
>   
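
The diff is truncated in this digest; the shape of the new weak helpers the
patch adds to kernel/debug/debug_core.c is roughly the following (per-CPU
csd, no interrupt enabling needed):

#ifdef CONFIG_SMP
static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);

void __weak kgdb_call_nmi_hook(void *ignored)
{
	/*
	 * NOTE: get_irq_regs() is supposed to return the registers from
	 * before the IPI interrupt happened, so it is expected to be
	 * correct here.
	 */
	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
}

void __weak kgdb_roundup_cpus(void)
{
	call_single_data_t *csd;
	int this_cpu = raw_smp_processor_id();
	int cpu;

	for_each_online_cpu(cpu) {
		/* No need to round up ourselves */
		if (cpu == this_cpu)
			continue;

		csd = &per_cpu(kgdb_roundup_csd, cpu);
		csd->func = kgdb_call_nmi_hook;
		smp_call_function_single_async(cpu, csd);
	}
}
#endif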

Re: [REPOST PATCH v6 1/4] kgdb: Remove irq flags from roundup

2018-12-19 Thread Daniel Thompson
On Tue, Dec 04, 2018 at 07:38:25PM -0800, Douglas Anderson wrote:
> The function kgdb_roundup_cpus() was passed a parameter that was
> documented as:
> 
> > the flags that will be used when restoring the interrupts. There is
> > local_irq_save() call before kgdb_roundup_cpus().
> 
> Nobody used those flags.  Anyone who wanted to temporarily turn on
> interrupts just did local_irq_enable() and local_irq_disable() without
> looking at them.  So we can definitely remove the flags.
> 
> Signed-off-by: Douglas Anderson 
> Cc: Vineet Gupta 
> Cc: Russell King 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Cc: Richard Kuo 
> Cc: Ralf Baechle 
> Cc: Paul Burton 
> Cc: James Hogan 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Yoshinori Sato 
> Cc: Rich Felker 
> Cc: "David S. Miller" 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: Borislav Petkov 
> Cc: "H. Peter Anvin" 
> Acked-by: Will Deacon 

I've not heard any objections from the arch/ maintainers so...

Applied! Thanks.


> ---
> 
> Changes in v6: None
> Changes in v5: None
> Changes in v4: None
> Changes in v3: None
> Changes in v2:
> - Removing irq flags separated from fixing lockdep splat.
> 
>  arch/arc/kernel/kgdb.c | 2 +-
>  arch/arm/kernel/kgdb.c | 2 +-
>  arch/arm64/kernel/kgdb.c   | 2 +-
>  arch/hexagon/kernel/kgdb.c | 9 ++---
>  arch/mips/kernel/kgdb.c| 2 +-
>  arch/powerpc/kernel/kgdb.c | 2 +-
>  arch/sh/kernel/kgdb.c  | 2 +-
>  arch/sparc/kernel/smp_64.c | 2 +-
>  arch/x86/kernel/kgdb.c | 9 ++---
>  include/linux/kgdb.h   | 9 ++---
>  kernel/debug/debug_core.c  | 2 +-
>  11 files changed, 14 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c
> index 9a3c34af2ae8..0932851028e0 100644
> --- a/arch/arc/kernel/kgdb.c
> +++ b/arch/arc/kernel/kgdb.c
> @@ -197,7 +197,7 @@ static void kgdb_call_nmi_hook(void *ignored)
>   kgdb_nmicallback(raw_smp_processor_id(), NULL);
>  }
>  
> -void kgdb_roundup_cpus(unsigned long flags)
> +void kgdb_roundup_cpus(void)
>  {
>   local_irq_enable();
>   smp_call_function(kgdb_call_nmi_hook, NULL, 0);
> diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
> index caa0dbe3dc61..f21077b077be 100644
> --- a/arch/arm/kernel/kgdb.c
> +++ b/arch/arm/kernel/kgdb.c
> @@ -175,7 +175,7 @@ static void kgdb_call_nmi_hook(void *ignored)
> kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
>  }
>  
> -void kgdb_roundup_cpus(unsigned long flags)
> +void kgdb_roundup_cpus(void)
>  {
> local_irq_enable();
> smp_call_function(kgdb_call_nmi_hook, NULL, 0);
> diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c
> index a20de58061a8..12c339ff6e75 100644
> --- a/arch/arm64/kernel/kgdb.c
> +++ b/arch/arm64/kernel/kgdb.c
> @@ -289,7 +289,7 @@ static void kgdb_call_nmi_hook(void *ignored)
>   kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
>  }
>  
> -void kgdb_roundup_cpus(unsigned long flags)
> +void kgdb_roundup_cpus(void)
>  {
>   local_irq_enable();
>   smp_call_function(kgdb_call_nmi_hook, NULL, 0);
> diff --git a/arch/hexagon/kernel/kgdb.c b/arch/hexagon/kernel/kgdb.c
> index 16c24b22d0b2..012e0e230ac2 100644
> --- a/arch/hexagon/kernel/kgdb.c
> +++ b/arch/hexagon/kernel/kgdb.c
> @@ -119,17 +119,12 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned 
> long pc)
>  
>  /**
>   * kgdb_roundup_cpus - Get other CPUs into a holding pattern
> - * @flags: Current IRQ state
>   *
>   * On SMP systems, we need to get the attention of the other CPUs
>   * and get them be in a known state.  This should do what is needed
>   * to get the other CPUs to call kgdb_wait(). Note that on some arches,
>   * the NMI approach is not used for rounding up all the CPUs. For example,
> - * in case of MIPS, smp_call_function() is used to roundup CPUs. In
> - * this case, we have to make sure that interrupts are enabled before
> - * calling smp_call_function(). The argument to this function is
> - * the flags that will be used when restoring the interrupts. There is
> - * local_irq_save() call before kgdb_roundup_cpus().
> + * in case of MIPS, smp_call_function() is used to roundup CPUs.
>   *
>   * On non-SMP systems, this is not called.
>   */
> @@ -139,7 +134,7 @@ static void hexagon_kgdb_nmi_hook(void *ignored)
>   kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
>  }
>  
> -void kgdb_roundup_cpus(unsigned long flags)
> +void kgdb_roundup_cpus(void)
>  {
>   local_irq_enable();
>   smp_call_function(hexagon_kgdb_nmi_hook, NULL, 0);
> diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
> index eb6c0d582626..2b05effc17b4 100644
> --- a/arch/mips/kernel/kgdb.c
> +++ b/arch/mips/kernel/kgdb.c
> @@ -219,7 +219,7 @@ static void kgdb_call_nmi_hook(void *ignored)
>   set_fs(old_fs);
>  }
>  
> -void kgdb_roundup_cpus(unsigned long flags)
> +void kgdb_roundup_cpus(void)
>  {
>   local_irq_enable();
>   

Re: [PATCH kernel v6 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-19 Thread Alex Williamson
[cc +kvm, +lkml]

Ditto list cc comment from 18/20, and doubly so on this with updates to
the vfio uapi.  Also comment below...

On Wed, 19 Dec 2018 19:52:32 +1100
Alexey Kardashevskiy  wrote:

> POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
> pluggable PCIe devices but still have PCIe links which are used
> for config space and MMIO. In addition to that the GPUs have 6 NVLinks
> which are connected to other GPUs and the POWER9 CPU. POWER9 chips
> have a special unit on a die called an NPU which is an NVLink2 host bus
> adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
> These systems also support ATS (address translation services) which is
> a part of the NVLink2 protocol. Such GPUs also share on-board RAM
> (16GB or 32GB) with the system via the same NVLink2 so a CPU has
> cache-coherent access to a GPU RAM.
> 
> This exports GPU RAM to the userspace as a new VFIO device region. This
> preregisters the new memory as device memory as it might be used for DMA.
> This inserts pfns from the fault handler as the GPU memory is not onlined
> until the vendor driver is loaded and has trained the NVLinks; doing this
> earlier causes low-level errors which we fence in the firmware so they
> do not hurt the host system, but it is still better avoided. For the same
> reason this does not map GPU RAM into the host kernel (the usual thing for
> emulated access otherwise).
> 
> This exports an ATSD (Address Translation Shootdown) register of NPU which
> allows TLB invalidations inside GPU for an operating system. The register
> conveniently occupies a single 64k page. It is also presented to
> the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
> each of them can be used for TLB invalidation in a GPU linked to this NPU.
> This allocates one ATSD register per NVLink bridge, allowing up to
> 6 registers to be passed. Due to a host firmware bug (just recently fixed),
> only 1 ATSD register per NPU was actually advertised to the host system,
> so this passes that lone register via the first NVLink bridge device in
> the group, which is still enough as QEMU collects them all back and
> presents them to the guest via a vPHB to mimic the emulated NPU PHB on the host.
> 
> In order to provide the userspace with the information about GPU-to-NVLink
> connections, this exports an additional capability called "tgt"
> (which is an abbreviated host system bus address). The "tgt" property
> tells the GPU its own system address and allows the guest driver to
> conglomerate the routing information so each GPU knows how to get directly
> to the other GPUs.
> 
> For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
> know LPID (a logical partition ID or a KVM guest hardware ID in other
> words) and PID (a memory context ID of a userspace process, not to be
> confused with a linux pid). This assigns a GPU to LPID in the NPU and
> this is why this adds a listener for KVM on an IOMMU group. A PID comes
> via NVLink from a GPU and NPU uses a PID wildcard to pass it through.
> 
> This requires coherent memory and ATSD to be available on the host as
> the GPU vendor only supports configurations with both features enabled
> and other configurations are known not to work. Because of this and
> because of the ways the features are advertised to the host system
> (which is a device tree with very platform specific properties),
> this requires the POWERNV platform to be enabled.
> 
> The V100 GPUs do not advertise any of these capabilities via the config
> space, and there is more than one device ID, so this relies on
> the platform to tell whether these GPUs have special abilities such as
> NVLinks.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v6:
> * reworked capabilities - tgt for nvlink and gpu and link-speed
> for nvlink only
> 
> v5:
> * do not memremap GPU RAM for emulation, map it only when it is needed
> * allocate 1 ATSD register per NVLink bridge, if none left, then expose
> the region with a zero size
> * separate caps per device type
> * addressed AW review comments
> 
> v4:
> * added nvlink-speed to the NPU bridge capability as this turned out to
> be not a constant value
> * instead of looking at the exact device ID (which also changes from system
> to system), now this (indirectly) looks at the device tree to know
> if GPU and NPU support NVLink
> 
> v3:
> * reworded the commit log about tgt
> * added tracepoints (do we want them enabled for entire vfio-pci?)
> * added code comments
> * added write|mmap flags to the new regions
> * auto enabled VFIO_PCI_NVLINK2 config option
> * added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu
> references; these are required by the NVIDIA driver
> * keep notifier registered only for short time
> ---
>  drivers/vfio/pci/Makefile   |   1 +
>  drivers/vfio/pci/trace.h| 102 ++
>  drivers/vfio/pci/vfio_pci_private.h |  14 +
>  include/uapi/linux/vfio.h   |  38 +++
>  

Re: [PATCH kernel v6 19/20] vfio_pci: Allow regions to add own capabilities

2018-12-19 Thread Alex Williamson
[cc +kvm, +lkml]

Ditto list cc comment from 18/20

On Wed, 19 Dec 2018 19:52:31 +1100
Alexey Kardashevskiy  wrote:

> VFIO regions already support region capabilities with a limited set of
> fields. However the subdriver might have to report to the userspace
> additional bits.
> 
> This adds an add_capability() hook to vfio_pci_regops.
> 
> Signed-off-by: Alexey Kardashevskiy 
> Acked-by: Alex Williamson 
> ---
> Changes:
> v3:
> * removed confusing rationale for the patch, the next patch makes
> use of it anyway
> ---
>  drivers/vfio/pci/vfio_pci_private.h | 3 +++
>  drivers/vfio/pci/vfio_pci.c | 6 ++
>  2 files changed, 9 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_private.h 
> b/drivers/vfio/pci/vfio_pci_private.h
> index 86aab05..93c1738 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -62,6 +62,9 @@ struct vfio_pci_regops {
>   int (*mmap)(struct vfio_pci_device *vdev,
>   struct vfio_pci_region *region,
>   struct vm_area_struct *vma);
> + int (*add_capability)(struct vfio_pci_device *vdev,
> +   struct vfio_pci_region *region,
> +   struct vfio_info_cap *caps);
>  };
>  
>  struct vfio_pci_region {
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 4a6f7c0..6cb70cf 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -763,6 +763,12 @@ static long vfio_pci_ioctl(void *device_data,
>   if (ret)
>   return ret;
>  
> + if (vdev->region[i].ops->add_capability) {
> + ret = vdev->region[i].ops->add_capability(vdev,
> + &vdev->region[i], &caps);
> + if (ret)
> + return ret;
> + }
>   }
>   }
>  
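
To show what a subdriver hook may look like, here is a hypothetical
implementation reporting a region-type capability (the real first user is
the NVLink2 subdriver later in the series; all names prefixed "foo" are
made up for illustration):

static int vfio_pci_foo_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_region_info_cap_type cap = {
		.header.id = VFIO_REGION_INFO_CAP_TYPE,
		.header.version = 1,
		.type = region->type,
		.subtype = region->subtype,
	};

	/* Copies the capability and links it into the returned chain */
	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}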



Re: [PATCH kernel v6 18/20] vfio_pci: Allow mapping extra regions

2018-12-19 Thread Alex Williamson
[cc +kvm, +lkml]

Sorry, just noticed these are only visible on ppc lists or for those
directly cc'd.  vfio's official development list is the kvm list.  I'll
let spapr specific changes get away without copying this list, but
changes like this really need to be visible to everyone.  Thanks,

Alex

On Wed, 19 Dec 2018 19:52:30 +1100
Alexey Kardashevskiy  wrote:

> So far we only allowed mapping of MMIO BARs to the userspace. However
> there are GPUs with on-board coherent RAM accessible via side
> channels which we also want to map to the userspace. The first client
> for this is NVIDIA V100 GPU with NVLink2 direct links to a POWER9
> NPU-enabled CPU; such GPUs have 16GB RAM which is coherently mapped
> to the system address space, we are going to export these as an extra
> PCI region.
> 
> We already support extra PCI regions and this adds support for mapping
> them to the userspace.
> 
> Signed-off-by: Alexey Kardashevskiy 
> Reviewed-by: David Gibson 
> Acked-by: Alex Williamson 
> ---
> Changes:
> v2:
> * reverted one of mistakenly removed error checks
> ---
>  drivers/vfio/pci/vfio_pci_private.h | 3 +++
>  drivers/vfio/pci/vfio_pci.c | 9 +
>  2 files changed, 12 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_private.h 
> b/drivers/vfio/pci/vfio_pci_private.h
> index cde3b5d..86aab05 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -59,6 +59,9 @@ struct vfio_pci_regops {
> size_t count, loff_t *ppos, bool iswrite);
>   void(*release)(struct vfio_pci_device *vdev,
>  struct vfio_pci_region *region);
> + int (*mmap)(struct vfio_pci_device *vdev,
> + struct vfio_pci_region *region,
> + struct vm_area_struct *vma);
>  };
>  
>  struct vfio_pci_region {
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index fef5002..4a6f7c0 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -1130,6 +1130,15 @@ static int vfio_pci_mmap(void *device_data, struct 
> vm_area_struct *vma)
>   return -EINVAL;
>   if ((vma->vm_flags & VM_SHARED) == 0)
>   return -EINVAL;
> + if (index >= VFIO_PCI_NUM_REGIONS) {
> + int regnum = index - VFIO_PCI_NUM_REGIONS;
> + struct vfio_pci_region *region = vdev->region + regnum;
> +
> + if (region && region->ops && region->ops->mmap &&
> + (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
> + return region->ops->mmap(vdev, region, vma);
> + return -EINVAL;
> + }
>   if (index >= VFIO_PCI_ROM_REGION_INDEX)
>   return -EINVAL;
>   if (!vdev->bar_mmap_supported[index])



Re: [PATCH 3/8] crypto4xx_core: don't abuse __dma_sync_page

2018-12-19 Thread Christian Lamparter
On Sunday, December 16, 2018 6:19:46 PM CET Christoph Hellwig wrote:
> This function is internal to the DMA API implementation.  Instead use the
> DMA API to properly unmap.  Note that the DMA API usage in this driver
> is a disaster and urgently needs some work - it is missing all the unmaps,
> seems to do a secondary map where it looks like it should to a unmap
> in one place to work around cache coherency and the directions passed in
> seem to be partially wrong.
> 
> Signed-off-by: Christoph Hellwig 
I've loaded the series (+dir -> direction patch) onto a cross-compiled
vanilla 4.20-rc7. I can report that the box didn't crash, though I would
have liked to test with DMA_DEBUG.

Tested-by: Christian Lamparter 

> ---
>  drivers/crypto/amcc/crypto4xx_core.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/crypto/amcc/crypto4xx_core.c 
> b/drivers/crypto/amcc/crypto4xx_core.c
> index 6eaec9ba0f68..63cb6956c948 100644
> --- a/drivers/crypto/amcc/crypto4xx_core.c
> +++ b/drivers/crypto/amcc/crypto4xx_core.c
> @@ -596,7 +596,7 @@ static void crypto4xx_aead_done(struct crypto4xx_device 
> *dev,
> pd->pd_ctl_len.bf.pkt_len,
> dst);
>   } else {
> - __dma_sync_page(sg_page(dst), dst->offset, dst->length,
> + dma_unmap_page(dev->core_dev->device, pd->dest, dst->length,
>   DMA_FROM_DEVICE);
>   }
>  
> 
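
For reference, the pairing the DMA API expects here looks roughly like this
(a sketch only, reusing the driver's names; the handle returned by the map
is the one that must be passed to the unmap, with the same size and
direction):

	/* submission path: map the destination page once, keep the handle */
	pd->dest = dma_map_page(dev->core_dev->device, sg_page(dst),
				dst->offset, dst->length, DMA_FROM_DEVICE);
	if (dma_mapping_error(dev->core_dev->device, pd->dest))
		return -ENOMEM;

	/* completion path: unmap the same handle */
	dma_unmap_page(dev->core_dev->device, pd->dest, dst->length,
		       DMA_FROM_DEVICE);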






Re: [PATCH kernel v5 14/20] powerpc/powernv/npu: Add compound IOMMU groups

2018-12-19 Thread Michael Ellerman
Alexey Kardashevskiy  writes:
> On 19/12/2018 11:17, Michael Ellerman wrote:
>> Alexey Kardashevskiy  writes:
>>> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
>>> b/arch/powerpc/platforms/powernv/npu-dma.c
>>> index dc629ee..3468eaa 100644
>>> --- a/arch/powerpc/platforms/powernv/npu-dma.c
>>> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
>>> @@ -372,8 +358,263 @@ struct npu {
>> ...
>>> +
>>> +static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
>>> +   struct pnv_ioda_pe *pe)
>>> +{
>>> +   if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
>>> +   return;
>>> +
>>> +   npucomp->pe[npucomp->pe_num] = pe;
>>> +   ++npucomp->pe_num;
>>> +}
>>> +
>>> +struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe 
>>> *pe)
>>> +{
>>> +   struct iommu_table_group *table_group;
>>> +   struct npu_comp *npucomp;
>>> +   struct pci_dev *gpdev = NULL;
>>> +   struct pci_controller *hose;
>>> +   struct pci_dev *npdev;
>>> +
>>> +   list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
>>> +   npdev = pnv_pci_get_npu_dev(gpdev, 0);
>>> +   if (npdev)
>>> +   break;
>>> +   }
>>> +
>>> +   if (!npdev)
>>> +   /* It is not an NPU attached device, skip */
>>> +   return NULL;
>> 
>> This breaks some configs with:
>> 
>>   arch/powerpc/platforms/powernv/npu-dma.c:550:5: error: 'npdev' may be used 
>> uninitialized in this function [-Werror=uninitialized]
>
>
> gcc 5, 7 and 8 do not warn about this, I have to disable
> list_for_each_entry() above to recreate this.
>
> I even compiled gcc 5.5 which some of your buildmachines use and yet no
> error on this:
>
> make O=/home/aik/pbuild/kernel-le/ KCFLAGS=-Werror=all ARCH=powerpc
> CROSS_COMPILE=/opt/cross/gcc-powerpc64le-linux-5.5.0-nolibc/bin/powerpc64le-linux-
> arch/powerpc/platforms/powernv/npu-dma.o

Odd. That error is from kisskb like the others.

http://kisskb.ellerman.id.au/kisskb/buildresult/13622793/

Seems it's GCC 4.6.3 that is producing that one. Not sure why newer
compilers aren't warning about it.


It's pretty obviously correct though, unless you can prove that the list
is never empty?

struct pci_dev *npdev;

list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
npdev = pnv_pci_get_npu_dev(gpdev, 0);
if (npdev)
break;
}

if (!npdev)
/* It is not an NPU attached device, skip */
return NULL;


cheers
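
The straightforward way to satisfy the older compiler is to give the
pointer a known value before the loop:

	struct pci_dev *npdev = NULL;

	list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
		npdev = pnv_pci_get_npu_dev(gpdev, 0);
		if (npdev)
			break;
	}

	if (!npdev)
		/* It is not an NPU attached device, skip */
		return NULL;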


Re: [PATCH kernel v5 10/20] powerpc/iommu_api: Move IOMMU groups setup to a single place

2018-12-19 Thread Michael Ellerman
Alexey Kardashevskiy  writes:
> On 19/12/2018 10:35, Michael Ellerman wrote:
>> Alexey Kardashevskiy  writes:
>> 
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
>>> b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index b86a6e0..1168b185 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -2735,12 +2733,68 @@ static struct iommu_table_group_ops 
>>> pnv_pci_ioda2_npu_ops = {
>>> .release_ownership = pnv_ioda2_release_ownership,
>>>  };
>>>  
>>> +static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe 
>>> *pe,
>>> +   struct pci_bus *bus)
>>> +{
>>> +   struct pci_dev *dev;
>>> +
>>> +   list_for_each_entry(dev, &bus->devices, bus_list) {
>>> +   iommu_add_device(&pe->table_group, &dev->dev);
>>> +
>>> +   if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>> +   pnv_ioda_setup_bus_iommu_group_add_devices(pe,
>>> +   dev->subordinate);
>>> +   }
>>> +}
>>> +
>>> +static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe)
>>> +{
>>> +   if (!pnv_pci_ioda_pe_dma_weight(pe))
>>> +   return;
>>> +
>>> +   iommu_register_group(&pe->table_group, pe->phb->hose->global_number,
>>> +   pe->pe_number);
>>> +
>>> +   /*
>>> +* set_iommu_table_base(&pe->pdev->dev, tbl) should have been called
>>> +* by now
>>> +*/
>>> +   if (pe->flags & PNV_IODA_PE_DEV)
>>> +   iommu_add_device(&pe->table_group, &pe->pdev->dev);
>>> +   else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
>>> +   pnv_ioda_setup_bus_iommu_group_add_devices(pe, pe->pbus);
>>> +}
>>> +
>> 
>> This breaks skiroot_defconfig with:
>> 
>> arch/powerpc/platforms/powernv/pci-ioda.c:2731:13: error: 
>> 'pnv_ioda_setup_bus_iommu_group' defined but not used 
>> [-Werror=unused-function]
>> 
>>   http://kisskb.ellerman.id.au/kisskb/buildresult/13623033/
>
>
> How do you enable these warnings? I do not get them no matter what I do.

Just build skiroot_defconfig ?

Or turn off CONFIG_SPAPR_TCE_IOMMU, CONFIG_IOMMU_SUPPORT,
CONFIG_IOMMU_API etc.

cheers


Re: [PATCH] powerpc/32: Include .branch_lt in data section

2018-12-19 Thread Michael Ellerman
Alan Modra  writes:
> On Thu, Nov 15, 2018 at 11:47:52PM +1100, Michael Ellerman wrote:
>> Alan Modra  writes:
>> 
>> > On Wed, Nov 14, 2018 at 01:32:18PM +1030, Joel Stanley wrote:
>> >> I wasn't sure where this should go or if the ordering matters.
>> >
>> > The usual answer is: "Look at where the section goes in the standard
>> > linker scripts."   But that doesn't apply here.  The section will be
>> > empty for a kernel build so it doesn't matter where it goes.
>> 
>> If it's empty why don't we just discard it?
>
> That can be a recipe for finding linker bugs.  Not that I'm against
> you finding linker bugs.  ;-)

Seems we might have found a linker bug :)

With a binutils 2.29 toolchain discarding .branch_lt causes a segfault
when linking:

  http://kisskb.ellerman.id.au/kisskb/buildresult/13618838/log/

  /kisskb/src/scripts/link-vmlinux.sh: line 85: 74627 Segmentation fault  
(core dumped) ${LD} ${KBUILD_LDFLAGS} ${LDFLAGS_vmlinux} -o ${2} -T ${lds} 
${objects}


That toolchain is here:

  
https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/x86_64/5.5.0/x86_64-gcc-5.5.0-nolibc-powerpc64-linux.tar.xz

It seems to be better with binutils 2.30, it doesn't crash but still
doesn't link:

  http://kisskb.ellerman.id.au/kisskb/buildresult/13620269/log/

  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-ld: 
linkage table error against `0002bf55.xdp_rxq_info_reg_mem_model'
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-ld: 
stubs don't match calculated size
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-ld: 
can not build stubs: Bad value
  make[1]: *** [/kisskb/src/Makefile:1036: vmlinux] Error 1

So I guess I'll take this version of the patch for now.

cheers


Re: [PATCH 07/11] powerpc/fsl: Flush the branch predictor at each kernel entry (32 bit)

2018-12-19 Thread kbuild test robot
Hi Diana,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on v4.20-rc7 next-20181218]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Diana-Craciun/powerpc-fsl-NXP-PowerPC-Spectre-variant-2-workarounds/20181213-015503
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-icon_defconfig (attached as .config)
compiler: powerpc-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.2.0 make.cross ARCH=powerpc 

All warnings (new ones prefixed by >>):

>> powerpc-linux-gnu-ld: warning: orphan section `__btb_flush_fixup' from 
>> `arch/powerpc/kernel/head_44x.o' being placed in section `__btb_flush_fixup'.
>> powerpc-linux-gnu-ld: warning: orphan section `__btb_flush_fixup' from 
>> `arch/powerpc/kernel/head_44x.o' being placed in section `__btb_flush_fixup'.
>> powerpc-linux-gnu-ld: warning: orphan section `__btb_flush_fixup' from 
>> `arch/powerpc/kernel/head_44x.o' being placed in section `__btb_flush_fixup'.

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


RE: [PATCHv3 4/4] misc: pci_endpoint_test: Add the layerscape EP device support

2018-12-19 Thread Xiaowei Bao


-Original Message-
From: Greg KH  
Sent: 19 December 2018 17:21
To: Xiaowei Bao 
Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com; 
shawn...@kernel.org; Leo Li ; kis...@ti.com; 
lorenzo.pieral...@arm.com; a...@arndb.de; M.h. Lian ; 
Mingkai Hu ; Roy Zang ; 
kstew...@linuxfoundation.org; cyrille.pitc...@free-electrons.com; 
pombreda...@nexb.com; shawn@rock-chips.com; niklas.cas...@axis.com; 
linux-...@vger.kernel.org; devicet...@vger.kernel.org; 
linux-ker...@vger.kernel.org; linux-arm-ker...@lists.infradead.org; 
linuxppc-dev@lists.ozlabs.org
Subject: Re: [PATCHv3 4/4] misc: pci_endpoint_test: Add the layerscape EP 
device support

On Mon, Dec 03, 2018 at 06:35:05PM +0800, Xiaowei Bao wrote:
> Add the layerscape EP device support in pci_endpoint_test driver.
> 
> Signed-off-by: Xiaowei Bao 
> ---
> v2:
>  - no change
> v3:
>  - no change
> 
>  drivers/misc/pci_endpoint_test.c |2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/misc/pci_endpoint_test.c 
> b/drivers/misc/pci_endpoint_test.c
> index 896e2df..744d10c 100644
> --- a/drivers/misc/pci_endpoint_test.c
> +++ b/drivers/misc/pci_endpoint_test.c
> @@ -788,6 +788,8 @@ static void pci_endpoint_test_remove(struct 
> pci_dev *pdev)  static const struct pci_device_id pci_endpoint_test_tbl[] = {
>   { PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
>   { PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
> + /* 0x81c0: The device id of ls1046a in NXP. */

There is no need for a comment, just this line is fine:

> + { PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x81c0) },

If you note, no other lines have comments.

If you want to say that this is a specific device, then use a #define with that
device name and then use that define here, as was done for
PCI_DEVICE_ID_TI_DRA72x.

thanks,

greg k-h
[Xiaowei Bao] Hi Greg, OK, I can remove the comment in the next version of the patch.


Re: [PATCHv3 4/4] misc: pci_endpoint_test: Add the layerscape EP device support

2018-12-19 Thread Greg KH
On Mon, Dec 03, 2018 at 06:35:05PM +0800, Xiaowei Bao wrote:
> Add the layerscape EP device support in pci_endpoint_test driver.
> 
> Signed-off-by: Xiaowei Bao 
> ---
> v2:
>  - no change
> v3:
>  - no change
> 
>  drivers/misc/pci_endpoint_test.c |2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/misc/pci_endpoint_test.c 
> b/drivers/misc/pci_endpoint_test.c
> index 896e2df..744d10c 100644
> --- a/drivers/misc/pci_endpoint_test.c
> +++ b/drivers/misc/pci_endpoint_test.c
> @@ -788,6 +788,8 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev)
>  static const struct pci_device_id pci_endpoint_test_tbl[] = {
>   { PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
>   { PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
> + /* 0x81c0: The device id of ls1046a in NXP. */

There is no need for a comment, just this line is fine:

> + { PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x81c0) },

If you note, no other lines have comments.

If you want to say that this is a specific device, then use a #define
with that device name and then use that define here, as was done for
PCI_DEVICE_ID_TI_DRA72x.

thanks,

greg k-h
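
Concretely, the suggestion amounts to something like this (the macro name
below is only an example; the submitter picks the final one):

#define PCI_DEVICE_ID_LS1046A	0x81c0

static const struct pci_device_id pci_endpoint_test_tbl[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
	{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, PCI_DEVICE_ID_LS1046A) },
	{ }
};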


[PATCH kernel v6 18/20] vfio_pci: Allow mapping extra regions

2018-12-19 Thread Alexey Kardashevskiy
So far we only allowed mapping of MMIO BARs to the userspace. However
there are GPUs with on-board coherent RAM accessible via side
channels which we also want to map to the userspace. The first client
for this is NVIDIA V100 GPU with NVLink2 direct links to a POWER9
NPU-enabled CPU; such GPUs have 16GB RAM which is coherently mapped
to the system address space, which we are going to export as an extra
PCI region.

We already support extra PCI regions and this adds support for mapping
them to the userspace.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
Acked-by: Alex Williamson 
---
Changes:
v2:
* reverted one of mistakenly removed error checks
---
 drivers/vfio/pci/vfio_pci_private.h | 3 +++
 drivers/vfio/pci/vfio_pci.c | 9 +
 2 files changed, 12 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d..86aab05 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -59,6 +59,9 @@ struct vfio_pci_regops {
  size_t count, loff_t *ppos, bool iswrite);
void(*release)(struct vfio_pci_device *vdev,
   struct vfio_pci_region *region);
+   int (*mmap)(struct vfio_pci_device *vdev,
+   struct vfio_pci_region *region,
+   struct vm_area_struct *vma);
 };
 
 struct vfio_pci_region {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index fef5002..4a6f7c0 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1130,6 +1130,15 @@ static int vfio_pci_mmap(void *device_data, struct 
vm_area_struct *vma)
return -EINVAL;
if ((vma->vm_flags & VM_SHARED) == 0)
return -EINVAL;
+   if (index >= VFIO_PCI_NUM_REGIONS) {
+   int regnum = index - VFIO_PCI_NUM_REGIONS;
+   struct vfio_pci_region *region = vdev->region + regnum;
+
+   if (region && region->ops && region->ops->mmap &&
+   (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+   return region->ops->mmap(vdev, region, vma);
+   return -EINVAL;
+   }
if (index >= VFIO_PCI_ROM_REGION_INDEX)
return -EINVAL;
if (!vdev->bar_mmap_supported[index])
-- 
2.17.1
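
To illustrate the intended use of the new hook, a minimal hypothetical
region implementation could back the mmap with a contiguous physical range
(the NVLink2 subdriver at the end of the series instead installs a fault
handler; all "foo" names below are made up):

static int vfio_pci_foo_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	/* base pfn stashed in region->data when the region was registered */
	unsigned long base_pfn = (unsigned long)region->data;
	unsigned long req = vma->vm_end - vma->vm_start;

	if (req > region->size)
		return -EINVAL;

	return remap_pfn_range(vma, vma->vm_start, base_pfn, req,
			       vma->vm_page_prot);
}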



[PATCH kernel v6 16/20] powerpc/powernv/npu: Check mmio_atsd array bounds when populating

2018-12-19 Thread Alexey Kardashevskiy
A broken device tree might contain more than 8 values and introduce
a hard-to-debug memory corruption bug. This adds a boundary check.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 31dfc11..2c405a4 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -1179,8 +1179,9 @@ int pnv_npu2_init(struct pci_controller *hose)
 
npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush");
 
-   for (i = 0; !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
-   i, &mmio_atsd); i++)
+   for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) &&
+   !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
+   i, &mmio_atsd); i++)
npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
 
pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i);
-- 
2.17.1



[PATCH kernel v6 12/20] powerpc/powernv/npu: Move single TVE handling to NPU PE

2018-12-19 Thread Alexey Kardashevskiy
Normal PCI PEs have 2 TVEs, one per DMA window; however an NPU PE has only
one which points to one of two tables of the corresponding PCI PE.

So whenever a new DMA window is programmed to PEs, the NPU PE needs to
release old table in order to use the new one.

Commit d41ce7b1bcc3e ("powerpc/powernv/npu: Do not try invalidating 32bit
table when 64bit table is enabled") did just that but in pci-ioda.c
while it actually belongs to npu-dma.c.

This moves the single TVE handling to npu-dma.c. This does not implement
restoring though, as it is highly unlikely that we could set the table on
the PCI PE but not on the NPU PE; and if that failed, we could only set
the 32bit table on the NPU PE, a configuration which is not really
supported or wanted.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/npu-dma.c  |  8 +++
 arch/powerpc/platforms/powernv/pci-ioda.c | 27 +++
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index ef1457f..26063fb 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -130,6 +130,11 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
tbl->it_level_size : tbl->it_size;
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+   int num2 = (num == 0) ? 1 : 0;
+
+   /* NPU has just one TVE so if there is another table, remove it first */
+   if (npe->table_group.tables[num2])
+   pnv_npu_unset_window(npe, num2);
 
pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
start_addr, start_addr + win_size - 1,
@@ -160,6 +165,9 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
struct pnv_phb *phb = npe->phb;
int64_t rc;
 
+   if (!npe->table_group.tables[num])
+   return 0;
+
pe_info(npe, "Removing DMA window\n");
 
rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index a5879ab..1ee3c5d6 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2672,23 +2672,14 @@ static struct pnv_ioda_pe *gpe_table_group_to_npe(
 static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
int num, struct iommu_table *tbl)
 {
-   struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-   int num2 = (num == 0) ? 1 : 0;
long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
 
if (ret)
return ret;
 
-   if (table_group->tables[num2])
-   pnv_npu_unset_window(npe, num2);
-
-   ret = pnv_npu_set_window(npe, num, tbl);
-   if (ret) {
+   ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl);
+   if (ret)
pnv_pci_ioda2_unset_window(table_group, num);
-   if (table_group->tables[num2])
-   pnv_npu_set_window(npe, num2,
-   table_group->tables[num2]);
-   }
 
return ret;
 }
@@ -2697,24 +2688,12 @@ static long pnv_pci_ioda2_npu_unset_window(
struct iommu_table_group *table_group,
int num)
 {
-   struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-   int num2 = (num == 0) ? 1 : 0;
long ret = pnv_pci_ioda2_unset_window(table_group, num);
 
if (ret)
return ret;
 
-   if (!npe->table_group.tables[num])
-   return 0;
-
-   ret = pnv_npu_unset_window(npe, num);
-   if (ret)
-   return ret;
-
-   if (table_group->tables[num2])
-   ret = pnv_npu_set_window(npe, num2, table_group->tables[num2]);
-
-   return ret;
+   return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
 }
 
 static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
-- 
2.17.1



[PATCH kernel v6 10/20] powerpc/iommu_api: Move IOMMU groups setup to a single place

2018-12-19 Thread Alexey Kardashevskiy
Registering new IOMMU groups and adding devices to them are separated in
code and the latter is dug in the DMA setup code which it does not
really belong to.

This moves IOMMU groups setup to a separate helper which registers a group
and adds devices as before. This does not make a difference as IOMMU
groups are not used anyway; the only dependency here is that
iommu_add_device() requires a valid pointer to an iommu_table
(set by set_iommu_table_base()).

To keep the old behaviour, this does not add new IOMMU groups for PEs
with no DMA weight and also skips NVLink bridges which do not have
pci_controller_ops::setup_bridge (the normal way of adding PEs).

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
Changes:
v5:
* fixed compile with defined but not used pnv_ioda_setup_bus_iommu_group();
unfortunately defining a dummy version looks uglier than #ifdef
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 82 +++
 1 file changed, 68 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index b86a6e0..f6ab13d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1538,6 +1538,9 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe);
+#ifdef CONFIG_IOMMU_API
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe);
+#endif
 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 {
struct pci_bus*bus;
@@ -1591,6 +1594,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, 
u16 num_vfs)
mutex_unlock(&phb->ioda.pe_list_mutex);
 
pnv_pci_ioda2_setup_dma_pe(phb, pe);
+#ifdef CONFIG_IOMMU_API
+   pnv_ioda_setup_bus_iommu_group(pe);
+#endif
}
 }
 
@@ -1930,21 +1936,16 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct 
pci_dev *pdev)
return mask;
 }
 
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-  struct pci_bus *bus,
-  bool add_to_group)
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
 {
struct pci_dev *dev;
 
 list_for_each_entry(dev, &bus->devices, bus_list) {
 set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
 set_dma_offset(&dev->dev, pe->tce_bypass_base);
-   if (add_to_group)
-   iommu_add_device(&pe->table_group, &dev->dev);
 
if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-   pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-   add_to_group);
+   pnv_ioda_setup_bus_dma(pe, dev->subordinate);
}
 }
 
@@ -2374,7 +2375,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb 
*phb,
iommu_init_table(tbl, phb->hose->node);
 
if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
return;
  fail:
@@ -2607,7 +2608,7 @@ static void pnv_ioda2_take_ownership(struct 
iommu_table_group *table_group)
pnv_pci_ioda2_set_bypass(pe, false);
pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (pe->pbus)
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
iommu_tce_table_put(tbl);
 }
 
@@ -2618,7 +2619,7 @@ static void pnv_ioda2_release_ownership(struct 
iommu_table_group *table_group)
 
pnv_pci_ioda2_setup_default_config(pe);
if (pe->pbus)
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
@@ -2735,12 +2736,68 @@ static struct iommu_table_group_ops 
pnv_pci_ioda2_npu_ops = {
.release_ownership = pnv_ioda2_release_ownership,
 };
 
+static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
+   struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+   list_for_each_entry(dev, &bus->devices, bus_list) {
+   iommu_add_device(&pe->table_group, &dev->dev);
+
+   if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+   pnv_ioda_setup_bus_iommu_group_add_devices(pe,
+   dev->subordinate);
+   }
+}
+
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe)
+{
+   if (!pnv_pci_ioda_pe_dma_weight(pe))
+   return;
+
+   iommu_register_group(&pe->table_group, pe->phb->hose->global_number,
+   pe->pe_number);
+
+   /*
+* set_iommu_table_base(&pe->pdev->dev, tbl) should have been called
+* by now
+*/
+   if 
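(The hunk is truncated in this digest; per the same helper quoted in full in
the review thread above, it continues:)
+   if (pe->flags & PNV_IODA_PE_DEV)
+   iommu_add_device(&pe->table_group, &pe->pdev->dev);
+   else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+   pnv_ioda_setup_bus_iommu_group_add_devices(pe, pe->pbus);
+}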

[PATCH kernel v6 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-19 Thread Alexey Kardashevskiy
POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
pluggable PCIe devices but still have PCIe links which are used
for config space and MMIO. In addition to that the GPUs have 6 NVLinks
which are connected to other GPUs and the POWER9 CPU. POWER9 chips
have a special unit on a die called an NPU which is an NVLink2 host bus
adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
These systems also support ATS (address translation services) which is
a part of the NVLink2 protocol. Such GPUs also share on-board RAM
(16GB or 32GB) with the system via the same NVLink2 so a CPU has
cache-coherent access to a GPU RAM.

This exports GPU RAM to the userspace as a new VFIO device region. This
preregisters the new memory as device memory as it might be used for DMA.
This inserts pfns from the fault handler as the GPU memory is not onlined
until the vendor driver is loaded and has trained the NVLinks; doing this
earlier causes low-level errors which we fence in the firmware so they
do not hurt the host system, but it is still better avoided. For the same
reason this does not map GPU RAM into the host kernel (the usual thing for
emulated access otherwise).

This exports an ATSD (Address Translation Shootdown) register of NPU which
allows TLB invalidations inside GPU for an operating system. The register
conveniently occupies a single 64k page. It is also presented to
the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
each of them can be used for TLB invalidation in a GPU linked to this NPU.
This allocates one ATSD register per NVLink bridge, allowing up to
6 registers to be passed. Due to a host firmware bug (just recently fixed),
only 1 ATSD register per NPU was actually advertised to the host system,
so this passes that lone register via the first NVLink bridge device in
the group, which is still enough as QEMU collects them all back and
presents them to the guest via a vPHB to mimic the emulated NPU PHB on the host.

In order to provide the userspace with the information about GPU-to-NVLink
connections, this exports an additional capability called "tgt"
(which is an abbreviated host system bus address). The "tgt" property
tells the GPU its own system address and allows the guest driver to
conglomerate the routing information so each GPU knows how to get directly
to the other GPUs.
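
To illustrate how the userspace side retrieves such a capability, the
lookup is a plain walk of the capability chain appended to the region info
buffer (a sketch; the id to match comes from the uapi header):

static struct vfio_info_cap_header *
region_cap_find(struct vfio_region_info *info, __u16 id)
{
	struct vfio_info_cap_header *hdr;
	__u32 off;

	if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS))
		return NULL;

	/* cap_offset and each "next" are offsets from the start of info */
	for (off = info->cap_offset; off; off = hdr->next) {
		hdr = (struct vfio_info_cap_header *)((char *)info + off);
		if (hdr->id == id)
			return hdr;
	}

	return NULL;
}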

For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
know LPID (a logical partition ID or a KVM guest hardware ID in other
words) and PID (a memory context ID of a userspace process, not to be
confused with a linux pid). This assigns a GPU to LPID in the NPU and
this is why this adds a listener for KVM on an IOMMU group. A PID comes
via NVLink from a GPU and NPU uses a PID wildcard to pass it through.

This requires coherent memory and ATSD to be available on the host as
the GPU vendor only supports configurations with both features enabled
and other configurations are known not to work. Because of this and
because of the ways the features are advertised to the host system
(which is a device tree with very platform specific properties),
this requires the POWERNV platform to be enabled.

The V100 GPUs do not advertise any of these capabilities via the config
space, and there is more than one device ID, so this relies on
the platform to tell whether these GPUs have special abilities such as
NVLinks.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v6:
* reworked capabilities - tgt for nvlink and gpu and link-speed
for nvlink only

v5:
* do not memremap GPU RAM for emulation, map it only when it is needed
* allocate 1 ATSD register per NVLink bridge, if none left, then expose
the region with a zero size
* separate caps per device type
* addressed AW review comments

v4:
* added nvlink-speed to the NPU bridge capability as this turned out to
be not a constant value
* instead of looking at the exact device ID (which also changes from system
to system), now this (indirectly) looks at the device tree to know
if GPU and NPU support NVLink

v3:
* reworded the commit log about tgt
* added tracepoints (do we want them enabled for entire vfio-pci?)
* added code comments
* added write|mmap flags to the new regions
* auto enabled VFIO_PCI_NVLINK2 config option
* added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu
references; these are required by the NVIDIA driver
* keep notifier registered only for short time
---
 drivers/vfio/pci/Makefile   |   1 +
 drivers/vfio/pci/trace.h| 102 ++
 drivers/vfio/pci/vfio_pci_private.h |  14 +
 include/uapi/linux/vfio.h   |  38 +++
 drivers/vfio/pci/vfio_pci.c |  27 +-
 drivers/vfio/pci/vfio_pci_nvlink2.c | 482 
 drivers/vfio/pci/Kconfig|   6 +
 7 files changed, 668 insertions(+), 2 deletions(-)
 create mode 100644 drivers/vfio/pci/trace.h
 create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 

[PATCH kernel v6 09/20] powerpc/powernv/pseries: Rework device adding to IOMMU groups

2018-12-19 Thread Alexey Kardashevskiy
The powernv platform registers IOMMU groups and adds devices to them
from the pci_controller_ops::setup_bridge() hook except one case when
virtual functions (SRIOV VFs) are added from a bus notifier.

The pseries platform registers IOMMU groups from
the pci_controller_ops::dma_bus_setup() hook and adds devices from
the pci_controller_ops::dma_dev_setup() hook. The very same bus notifier
used for powernv does not add devices for pseries though as
__of_scan_bus() adds devices first, then it does the bus/dev DMA setup.

Both platforms use iommu_add_device() which takes a device and expects
it to have a valid IOMMU table struct with an iommu_table_group pointer
which in turn points the iommu_group struct (which represents
an IOMMU group). Although the helper seems easy to use, it relies on
some pre-existing device configuration and associated data structures
which it does not really need.

This simplifies iommu_add_device() to take the table_group pointer
directly. Pseries already has a table_group pointer handy and the bus
notifier is not used there anyway. For powernv, this copies the existing bus
notifier and makes it work for powernv only, which gives an easy way of
getting to the table_group pointer. This was tested on VFs but should
also support physical PCI hotplug.

Since iommu_add_device() receives the table_group pointer directly, and
since pseries neither does TCE cache invalidation (the hypervisor does)
nor allows multiple groups per VFIO container (in other words, sharing
an IOMMU table between partitionable endpoints), this also removes
iommu_table_group_link from pseries.
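
For illustration, the powernv bus notifier after this rework resolves the
PE from the device and hands its table_group straight to
iommu_add_device(); roughly (a sketch of the idea, not the exact hunk):

static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb,
		unsigned long action, void *data)
{
	struct device *dev = data;
	struct pnv_ioda_pe *pe;

	switch (action) {
	case BUS_NOTIFY_ADD_DEVICE:
		/* The PE (and hence the table_group) is known by now */
		pe = pnv_ioda_get_pe(to_pci_dev(dev));
		if (!pe)
			return 0;
		return iommu_add_device(&pe->table_group, dev);
	case BUS_NOTIFY_DEL_DEVICE:
		iommu_del_device(dev);
		return 0;
	default:
		return 0;
	}
}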

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/include/asm/iommu.h  | 12 ++---
 arch/powerpc/kernel/iommu.c   | 58 ++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 +---
 arch/powerpc/platforms/powernv/pci.c  | 43 -
 arch/powerpc/platforms/pseries/iommu.c| 46 +-
 5 files changed, 74 insertions(+), 95 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index a8aeac0..e847ff6 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -215,9 +215,9 @@ struct iommu_table_group {
 
 extern void iommu_register_group(struct iommu_table_group *table_group,
 int pci_domain_number, unsigned long pe_num);
-extern int iommu_add_device(struct device *dev);
+extern int iommu_add_device(struct iommu_table_group *table_group,
+   struct device *dev);
 extern void iommu_del_device(struct device *dev);
-extern int __init tce_iommu_bus_notifier_init(void);
 extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
unsigned long entry, unsigned long *hpa,
enum dma_data_direction *direction);
@@ -228,7 +228,8 @@ static inline void iommu_register_group(struct 
iommu_table_group *table_group,
 {
 }
 
-static inline int iommu_add_device(struct device *dev)
+static inline int iommu_add_device(struct iommu_table_group *table_group,
+   struct device *dev)
 {
return 0;
 }
@@ -236,11 +237,6 @@ static inline int iommu_add_device(struct device *dev)
 static inline void iommu_del_device(struct device *dev)
 {
 }
-
-static inline int __init tce_iommu_bus_notifier_init(void)
-{
-return 0;
-}
 #endif /* !CONFIG_IOMMU_API */
 
 int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index cbcc615..9d5d109 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1078,11 +1078,8 @@ void iommu_release_ownership(struct iommu_table *tbl)
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-int iommu_add_device(struct device *dev)
+int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
 {
-   struct iommu_table *tbl;
-   struct iommu_table_group_link *tgl;
-
/*
 * The sysfs entries should be populated before
 * binding IOMMU group. If sysfs entries isn't
@@ -1098,32 +1095,10 @@ int iommu_add_device(struct device *dev)
return -EBUSY;
}
 
-   tbl = get_iommu_table_base(dev);
-   if (!tbl) {
-   pr_debug("%s: Skipping device %s with no tbl\n",
-__func__, dev_name(dev));
-   return 0;
-   }
-
-	tgl = list_first_entry_or_null(&tbl->it_group_list,
-			struct iommu_table_group_link, next);
-   if (!tgl) {
-   pr_debug("%s: Skipping device %s with no group\n",
-__func__, dev_name(dev));
-   return 0;
-   }
pr_debug("%s: Adding %s to iommu group %d\n",
-__func__, dev_name(dev),
-iommu_group_id(tgl->table_group->group));
+__func__, dev_name(dev),  iommu_group_id(table_group->group));
 
-   if (PAGE_SIZE < 

[PATCH kernel v6 19/20] vfio_pci: Allow regions to add own capabilities

2018-12-19 Thread Alexey Kardashevskiy
VFIO regions already support region capabilities with a limited set of
fields. However the subdriver might have to report additional bits to
the userspace.

This adds an add_capability() hook to vfio_pci_regops.

Signed-off-by: Alexey Kardashevskiy 
Acked-by: Alex Williamson 
---
Changes:
v3:
* removed confusing rationale for the patch, the next patch makes
use of it anyway
---
 drivers/vfio/pci/vfio_pci_private.h | 3 +++
 drivers/vfio/pci/vfio_pci.c | 6 ++
 2 files changed, 9 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 86aab05..93c1738 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -62,6 +62,9 @@ struct vfio_pci_regops {
int (*mmap)(struct vfio_pci_device *vdev,
struct vfio_pci_region *region,
struct vm_area_struct *vma);
+   int (*add_capability)(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region,
+ struct vfio_info_cap *caps);
 };
 
 struct vfio_pci_region {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 4a6f7c0..6cb70cf 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -763,6 +763,12 @@ static long vfio_pci_ioctl(void *device_data,
if (ret)
return ret;
 
+   if (vdev->region[i].ops->add_capability) {
+   ret = vdev->region[i].ops->add_capability(vdev,
+					&vdev->region[i], &caps);
+   if (ret)
+   return ret;
+   }
}
}
 
-- 
2.17.1



[PATCH kernel v6 08/20] powerpc/pseries: Remove IOMMU API support for non-LPAR systems

2018-12-19 Thread Alexey Kardashevskiy
The pci_dma_bus_setup_pSeries and pci_dma_dev_setup_pSeries hooks are
registered for the pseries platform which does not have FW_FEATURE_LPAR;
these would be pre-powernv platforms for which we never supported PCI
passthrough anyway, so remove it.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/platforms/pseries/iommu.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index cbcc8ce..2783cb7 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -645,7 +645,6 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
iommu_table_setparms(pci->phb, dn, tbl);
	tbl->it_ops = &iommu_table_pseries_ops;
iommu_init_table(tbl, pci->phb->node);
-   iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
 
/* Divide the rest (1.75GB) among the children */
pci->phb->dma_window_size = 0x8000ul;
@@ -756,10 +755,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
iommu_table_setparms(phb, dn, tbl);
	tbl->it_ops = &iommu_table_pseries_ops;
iommu_init_table(tbl, phb->node);
-   iommu_register_group(PCI_DN(dn)->table_group,
-   pci_domain_nr(phb->bus), 0);
	set_iommu_table_base(&dev->dev, tbl);
-	iommu_add_device(&dev->dev);
return;
}
 
@@ -770,11 +766,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
dn = dn->parent;
 
-   if (dn && PCI_DN(dn)) {
+   if (dn && PCI_DN(dn))
		set_iommu_table_base(&dev->dev,
				PCI_DN(dn)->table_group->tables[0]);
-		iommu_add_device(&dev->dev);
-   } else
+   else
printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
   pci_name(dev));
 }
-- 
2.17.1



[PATCH kernel v6 17/20] powerpc/powernv/npu: Fault user page into the hypervisor's pagetable

2018-12-19 Thread Alexey Kardashevskiy
When a page fault happens in a GPU, the GPU signals the OS and the GPU
driver calls the fault handler which populates a page table; this allows
the GPU to complete an ATS request.

On bare metal, get_user_pages() is enough as it adds a pte to
the kernel page table, but under KVM the partition scope tree does not get
updated so ATS will still fail.

This reads a byte from an effective address, which causes an HV storage
interrupt and makes KVM update the partition scope tree.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 2c405a4..ed81426 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -1133,6 +1133,8 @@ int pnv_npu2_handle_fault(struct npu_context *context, 
uintptr_t *ea,
u64 rc = 0, result = 0;
int i, is_write;
struct page *page[1];
+   const char __user *u;
+   char c;
 
/* mmap_sem should be held so the struct_mm must be present */
struct mm_struct *mm = context->mm;
@@ -1145,18 +1147,17 @@ int pnv_npu2_handle_fault(struct npu_context *context, 
uintptr_t *ea,
is_write ? FOLL_WRITE : 0,
page, NULL, NULL);
 
-   /*
-* To support virtualised environments we will have to do an
-* access to the page to ensure it gets faulted into the
-* hypervisor. For the moment virtualisation is not supported in
-* other areas so leave the access out.
-*/
if (rc != 1) {
status[i] = rc;
result = -EFAULT;
continue;
}
 
+   /* Make sure partition scoped tree gets a pte */
+   u = page_address(page[0]);
+   if (__get_user(c, u))
+   result = -EFAULT;
+
status[i] = 0;
put_page(page[0]);
}
-- 
2.17.1



[PATCH kernel v6 15/20] powerpc/powernv/npu: Add release_ownership hook

2018-12-19 Thread Alexey Kardashevskiy
In order to make ATS work and translate addresses for arbitrary
LPID and PID, we need to program an NPU with LPID and allow PID wildcard
matching with a specific MSR mask.

This implements a helper to assign a GPU to an LPAR and program the NPU
with a wildcard for the PID, and a helper to do the clean-up. The first
helper takes an MSR (only DR/HV/PR/SF bits are allowed) to program into
the NPU2 for ATS checkout request support.

This exports pnv_npu2_unmap_lpar_dev() as following patches will use it
from the VFIO driver.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* removed opal_purge_cache as it is a part of reset in skiboot now
---
 arch/powerpc/platforms/powernv/npu-dma.c | 51 
 1 file changed, 51 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 3468eaa..31dfc11 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -300,6 +300,7 @@ static void pnv_npu_take_ownership(struct iommu_table_group 
*table_group)
table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
+   struct pci_dev *gpdev = NULL;
 
/*
 * Note: NPU has just a single TVE in the hardware which means that
@@ -321,12 +322,28 @@ static void pnv_npu_take_ownership(struct 
iommu_table_group *table_group)
return;
}
pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+   if (gpdev)
+   pnv_npu2_unmap_lpar_dev(gpdev);
+}
+
+static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
+{
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
+   struct pci_dev *gpdev = NULL;
+
+	get_gpu_pci_dev_and_pe(npe, &gpdev);
+   if (gpdev)
+   pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
 }
 
 static struct iommu_table_group_ops pnv_pci_npu_ops = {
.set_window = pnv_npu_set_window,
.unset_window = pnv_npu_unset_window,
.take_ownership = pnv_npu_take_ownership,
+   .release_ownership = pnv_npu_release_ownership,
 };
 #endif /* !CONFIG_IOMMU_API */
 
@@ -1237,3 +1254,37 @@ void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned 
long msr)
	list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
pnv_npu2_map_lpar_dev(gpdev, 0, msr);
 }
+
+int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
+{
+   int ret;
+   struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+   struct pci_controller *hose;
+   struct pnv_phb *nphb;
+
+   if (!npdev)
+   return -ENODEV;
+
+   hose = pci_bus_to_host(npdev->bus);
+   nphb = hose->private_data;
+
+	dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
+   nphb->opal_id);
+   ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
+   PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+   if (ret < 0) {
+		dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
+   return ret;
+   }
+
+   /* Set LPID to 0 anyway, just to be safe */
+	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
+   ret = opal_npu_map_lpar(nphb->opal_id,
+   PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/,
+   0 /* LPCR bits */);
+   if (ret)
+		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
-- 
2.17.1



[PATCH kernel v6 04/20] powerpc/powernv: Move npu struct from pnv_phb to pci_controller

2018-12-19 Thread Alexey Kardashevskiy
The powernv PCI code stores NPU data in the pnv_phb struct. The latter
is referenced by pci_controller::private_data. We are going to have NPU2
support in the pseries platform as well but it does not store any
private_data in the pci_controller struct; and even if it did,
it would be a different data structure.

This makes npu a pointer and stores it one level higher in
the pci_controller struct.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* removed !npu checks as this is out of scope of this patch
* added WARN_ON_ONCE in WARN_ON_ONCE(pnv_npu2_init(phb))

v4:
* changed subj from "powerpc/powernv: Detach npu struct from pnv_phb"
* got rid of global list of npus - store them now in pci_controller
* got rid of npdev_to_npu() helper
---
 arch/powerpc/include/asm/pci-bridge.h |  1 +
 arch/powerpc/platforms/powernv/pci.h  | 16 -
 arch/powerpc/platforms/powernv/npu-dma.c  | 74 +--
 arch/powerpc/platforms/powernv/pci-ioda.c |  2 +-
 4 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 94d4490..aee4fcc 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -129,6 +129,7 @@ struct pci_controller {
 #endif /* CONFIG_PPC64 */
 
void *private_data;
+   struct npu *npu;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 2131373..f2d50974 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -8,9 +8,6 @@
 
 struct pci_dn;
 
-/* Maximum possible number of ATSD MMIO registers per NPU */
-#define NV_NMMU_ATSD_REGS 8
-
 enum pnv_phb_type {
PNV_PHB_IODA1   = 0,
PNV_PHB_IODA2   = 1,
@@ -176,19 +173,6 @@ struct pnv_phb {
unsigned intdiag_data_size;
u8  *diag_data;
 
-   /* Nvlink2 data */
-   struct npu {
-   int index;
-   __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
-   unsigned int mmio_atsd_count;
-
-   /* Bitmask for MMIO register usage */
-   unsigned long mmio_atsd_usage;
-
-   /* Do we need to explicitly flush the nest mmu? */
-   bool nmmu_flush;
-   } npu;
-
int p2p_target_count;
 };
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 91d488f..5e66439 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -327,6 +327,25 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct 
pnv_ioda_pe *npe)
return gpe;
 }
 
+/*
+ * NPU2 ATS
+ */
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+
+/* An NPU descriptor, valid for POWER9 only */
+struct npu {
+   int index;
+   __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+   unsigned int mmio_atsd_count;
+
+   /* Bitmask for MMIO register usage */
+   unsigned long mmio_atsd_usage;
+
+   /* Do we need to explicitly flush the nest mmu? */
+   bool nmmu_flush;
+};
+
 /* Maximum number of nvlinks per npu */
 #define NV_MAX_LINKS 6
 
@@ -478,7 +497,6 @@ static void acquire_atsd_reg(struct npu_context 
*npu_context,
int i, j;
struct npu *npu;
struct pci_dev *npdev;
-   struct pnv_phb *nphb;
 
for (i = 0; i <= max_npu2_index; i++) {
mmio_atsd_reg[i].reg = -1;
@@ -493,8 +511,7 @@ static void acquire_atsd_reg(struct npu_context 
*npu_context,
if (!npdev)
continue;
 
-   nphb = pci_bus_to_host(npdev->bus)->private_data;
-	npu = &nphb->npu;
+   npu = pci_bus_to_host(npdev->bus)->npu;
mmio_atsd_reg[i].npu = npu;
mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
while (mmio_atsd_reg[i].reg < 0) {
@@ -662,6 +679,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev 
*gpdev,
struct pnv_phb *nphb;
struct npu *npu;
struct npu_context *npu_context;
+   struct pci_controller *hose;
 
/*
 * At present we don't support GPUs connected to multiple NPUs and I'm
@@ -689,8 +707,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev 
*gpdev,
return ERR_PTR(-EINVAL);
}
 
-   nphb = pci_bus_to_host(npdev->bus)->private_data;
-	npu = &nphb->npu;
+   hose = pci_bus_to_host(npdev->bus);
+   nphb = hose->private_data;
+   npu = hose->npu;
 
/*
 * Setup the NPU context table for a particular GPU. These need to be
@@ -764,7 +783,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev 
*gpdev,
 */
WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], 

[PATCH kernel v6 14/20] powerpc/powernv/npu: Add compound IOMMU groups

2018-12-19 Thread Alexey Kardashevskiy
At the moment the powernv platform registers an IOMMU group for each PE.
There is an exception though: an NVLink bridge which is attached to
the corresponding GPU's IOMMU group making it a master.

Now we have POWER9 systems with GPUs connected to each other directly,
bypassing PCI. At the moment we do not control the state of these links so
we have to put such interconnected GPUs into one IOMMU group, which
means that the old scheme with one GPU as a master won't work - there will
be up to 3 GPUs in such a group.

This introduces an npu_comp struct which represents a compound IOMMU
group made of multiple PEs - PCI PEs (for GPUs) and NPU PEs (for NVLink
bridges). This converts the existing NVLink1 code to use the new scheme.
From now on, each PE must have a valid iommu_table_group_ops which will
either be called directly (for a single PE group) or indirectly from
a compound group handler.

This moves IOMMU group registration for NVLink-connected GPUs to npu-dma.c.
For POWER8, this stores a new compound group pointer in the PE (so a GPU
is still a master); for POWER9 the new group pointer is stored in an NPU
(which is allocated per PCI host controller).
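
The dispatch itself is straightforward: the compound group installs
iommu_table_group_ops which fan each callback out to every component PE.
A sketch of the idea (names follow this patch; the array bound is assumed
to be the constant the patch defines):

struct npu_comp {
	struct iommu_table_group table_group;
	int pe_num;
	struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
};

static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
		int num, struct iommu_table *tbl)
{
	int i;
	long ret;
	struct npu_comp *npucomp = container_of(table_group,
			struct npu_comp, table_group);

	/* Apply the window to every PE in the compound group */
	for (i = 0; i < npucomp->pe_num; ++i) {
		struct pnv_ioda_pe *pe = npucomp->pe[i];

		if (!pe->table_group.ops->set_window)
			continue;

		ret = pe->table_group.ops->set_window(&pe->table_group,
				num, tbl);
		if (ret)
			return ret;
	}

	return 0;
}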

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* now read page sizes from the PHB NVLink to narrow down what the compound PE
can actually support (hint: 4K/64K only)
---
 arch/powerpc/include/asm/pci.h|   1 +
 arch/powerpc/platforms/powernv/pci.h  |   7 +
 arch/powerpc/platforms/powernv/npu-dma.c  | 291 --
 arch/powerpc/platforms/powernv/pci-ioda.c | 159 
 4 files changed, 322 insertions(+), 136 deletions(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index baf2886..0c72f18 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev 
*gpdev, int index);
 extern int pnv_npu2_init(struct pci_controller *hose);
 extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
unsigned long msr);
+extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
 
 #endif /* __ASM_POWERPC_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index cf9f748..aef4bb5 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -62,6 +62,7 @@ struct pnv_ioda_pe {
 
/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
struct iommu_table_group table_group;
+   struct npu_comp *npucomp;
 
/* 64-bit TCE bypass region */
booltce_bypass_enabled;
@@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
 extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+   __u64 window_size, __u32 levels);
 extern int pnv_eeh_post_init(void);
 
 extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, 
const char *level,
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
+extern struct iommu_table_group *pnv_try_setup_npu_table_group(
+   struct pnv_ioda_pe *pe);
+extern struct iommu_table_group *pnv_npu_compound_attach(
+   struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index dc629ee..3468eaa 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -328,31 +328,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = {
.unset_window = pnv_npu_unset_window,
.take_ownership = pnv_npu_take_ownership,
 };
-
-struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
-{
-   struct pnv_phb *phb = npe->phb;
-   struct pci_bus *pbus = phb->hose->bus;
-   struct pci_dev *npdev, *gpdev = NULL, *gptmp;
-	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-
-   if (!gpe || !gpdev)
-   return NULL;
-
-   npe->table_group.ops = _pci_npu_ops;
-
-	list_for_each_entry(npdev, &pbus->devices, bus_list) {
-   gptmp = pnv_pci_get_gpu_dev(npdev);
-
-   if (gptmp != gpdev)
-   continue;
-
-		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
-		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
-   }
-
-   return gpe;
-}
 #endif /* !CONFIG_IOMMU_API */
 
 /*
@@ -360,6 +335,17 @@ 

[PATCH kernel v6 13/20] powerpc/powernv/npu: Convert NPU IOMMU helpers to iommu_table_group_ops

2018-12-19 Thread Alexey Kardashevskiy
At the moment the NPU IOMMU is manipulated directly from the IODA2 PCI
PE code; the PCI PE acts as a master to the NPU PE. Soon we will have
compound IOMMU groups with several PEs from several different PHBs (such as
interconnected GPUs and NPUs) so there will be no single master but
one big IOMMU group.

This makes a first step and converts an NPU PE from a set of extern
functions to a table group.

This should cause no behavioral change. Note that
pnv_npu_release_ownership() has never been implemented.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/platforms/powernv/pci.h  |  5 
 arch/powerpc/platforms/powernv/npu-dma.c  | 34 ++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 +--
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index ddb4f02..cf9f748 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -216,11 +216,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, 
const char *level,
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
-extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
-   struct iommu_table *tbl);
-extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
-extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
-extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 26063fb..dc629ee 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -121,9 +121,14 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct 
pnv_ioda_pe *npe,
return pe;
 }
 
-long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+static long pnv_npu_unset_window(struct iommu_table_group *table_group,
+   int num);
+
+static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
struct iommu_table *tbl)
 {
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
const unsigned long size = tbl->it_indirect_levels ?
@@ -134,7 +139,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 
/* NPU has just one TVE so if there is another table, remove it first */
if (npe->table_group.tables[num2])
-   pnv_npu_unset_window(npe, num2);
+		pnv_npu_unset_window(&npe->table_group, num2);
 
pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
start_addr, start_addr + win_size - 1,
@@ -160,8 +165,10 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
return 0;
 }
 
-long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
+static long pnv_npu_unset_window(struct iommu_table_group *table_group, int 
num)
 {
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
 
@@ -206,7 +213,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
if (!gpe)
return;
 
-   rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
+	rc = pnv_npu_set_window(&npe->table_group, 0,
+   gpe->table_group.tables[0]);
 
/*
 * NVLink devices use the same TCE table configuration as
@@ -231,7 +239,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
return -EINVAL;
 
-   rc = pnv_npu_unset_window(npe, 0);
+	rc = pnv_npu_unset_window(&npe->table_group, 0);
if (rc != OPAL_SUCCESS)
return rc;
 
@@ -284,9 +292,12 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, 
bool bypass)
}
 }
 
+#ifdef CONFIG_IOMMU_API
 /* Switch ownership from platform code to external user (e.g. VFIO) */
-void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
 {
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
 
@@ -297,7 +308,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
 * if it was enabled at the moment of ownership change.
 */
if (npe->table_group.tables[0]) {
-   pnv_npu_unset_window(npe, 0);
+		pnv_npu_unset_window(&npe->table_group, 0);
return;
}
 
@@ -312,6 

[PATCH kernel v6 02/20] powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a region

2018-12-19 Thread Alexey Kardashevskiy
Normally mm_iommu_get() should add a reference and mm_iommu_put() should
remove it. However historically mm_iommu_find() does the referencing and
mm_iommu_get() is doing allocation and referencing.

We are going to add another helper to preregister device memory so
instead of having mm_iommu_new() (which pre-registers the normal memory
and references the region), we need separate helpers for pre-registering
and referencing.

This renames:
- mm_iommu_get to mm_iommu_new;
- mm_iommu_find to mm_iommu_get.

This changes mm_iommu_get() to reference the region so the name now
reflects what it does.

This removes the check for exact match from mm_iommu_new() as we want it
to fail on existing regions; mm_iommu_get() should be used instead.
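
The intended calling pattern after the rename is roughly the following
(illustrative sketch only; example_register() is a hypothetical caller,
not a function from this series):

static long example_register(struct mm_struct *mm, unsigned long ua,
		unsigned long entries)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	/* Reference an existing region if one matches exactly... */
	mem = mm_iommu_get(mm, ua, entries);
	if (!mem)
		/* ...otherwise create one; this fails on overlapping regions */
		ret = mm_iommu_new(mm, ua, entries, &mem);
	if (ret)
		return ret;

	/* ... use the region ... */

	mm_iommu_put(mm, mem);	/* drop the reference taken above */

	return 0;
}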

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
Changes:
v5:
* fixed a bug with uninitialized @found in tce_iommu_unregister_pages()
* reworded the commit log

v4:
* squashed "powerpc/mm/iommu: Make mm_iommu_new() fail on existing regions" 
into this

v2:
* merged 2 patches into one
---
 arch/powerpc/include/asm/mmu_context.h |  4 +--
 arch/powerpc/mm/mmu_context_iommu.c| 19 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c| 35 +-
 3 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index c05efd2..268e112 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -21,7 +21,7 @@ struct mm_iommu_table_group_mem_t;
 
 extern int isolate_lru_page(struct page *page);/* from internal.h */
 extern bool mm_iommu_preregistered(struct mm_struct *mm);
-extern long mm_iommu_get(struct mm_struct *mm,
+extern long mm_iommu_new(struct mm_struct *mm,
unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem);
 extern long mm_iommu_put(struct mm_struct *mm,
@@ -32,7 +32,7 @@ extern struct mm_iommu_table_group_mem_t 
*mm_iommu_lookup(struct mm_struct *mm,
unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
struct mm_struct *mm, unsigned long ua, unsigned long size);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned int pageshift, unsigned long *hpa);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index 0741d90..25a4b7f7 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -89,7 +89,7 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
-long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long 
entries,
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long 
entries,
struct mm_iommu_table_group_mem_t **pmem)
 {
struct mm_iommu_table_group_mem_t *mem;
@@ -100,12 +100,6 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
unsigned long entries,
 
	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
next) {
-   if ((mem->ua == ua) && (mem->entries == entries)) {
-   ++mem->used;
-   *pmem = mem;
-   goto unlock_exit;
-   }
-
/* Overlap? */
if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
(ua < (mem->ua +
@@ -192,7 +186,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
unsigned long entries,
 
return ret;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_get);
+EXPORT_SYMBOL_GPL(mm_iommu_new);
 
 static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
 {
@@ -308,21 +302,26 @@ struct mm_iommu_table_group_mem_t 
*mm_iommu_lookup_rm(struct mm_struct *mm,
return ret;
 }
 
-struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
unsigned long ua, unsigned long entries)
 {
struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
 
+	mutex_lock(&mem_list_mutex);
+
	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
if ((mem->ua == ua) && (mem->entries == entries)) {
ret = mem;
+   ++mem->used;
break;
}
}
 
+	mutex_unlock(&mem_list_mutex);
+
return ret;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_find);
+EXPORT_SYMBOL_GPL(mm_iommu_get);
 
 long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned int pageshift, unsigned long 

[PATCH kernel v6 11/20] powerpc/powernv: Reference iommu_table while it is linked to a group

2018-12-19 Thread Alexey Kardashevskiy
The iommu_table pointer stored in iommu_table_group may get stale
by accident; this adds referencing and removes a now-redundant comment
about it.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 3 ++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 7639b21..697449a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -368,6 +368,7 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
found = false;
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
if (table_group->tables[i] == tbl) {
+   iommu_tce_table_put(tbl);
table_group->tables[i] = NULL;
found = true;
break;
@@ -393,7 +394,7 @@ long pnv_pci_link_table_and_group(int node, int num,
tgl->table_group = table_group;
list_add_rcu(>next, >it_group_list);
 
-   table_group->tables[num] = tbl;
+   table_group->tables[num] = iommu_tce_table_get(tbl);
 
return 0;
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index f6ab13d..a5879ab 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2719,10 +2719,6 @@ static long pnv_pci_ioda2_npu_unset_window(
 
 static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
 {
-   /*
-* Detach NPU first as pnv_ioda2_take_ownership() will destroy
-* the iommu_table if 32bit DMA is enabled.
-*/
pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
pnv_ioda2_take_ownership(table_group);
 }
-- 
2.17.1



[PATCH kernel v6 07/20] powerpc/pseries/npu: Enable platform support

2018-12-19 Thread Alexey Kardashevskiy
We already changed the NPU API for GPUs not to call OPAL, and the remaining
bit is initializing the NPU structures.

This searches for POWER9 NVLinks attached to any device on a PHB and
initializes an NPU structure if any are found.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* added WARN_ON_ONCE

v4:
* dropped "IBM,npu-vphb" compatible type on PHB and use the type of NVLink
---
 arch/powerpc/platforms/pseries/pci.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/pci.c 
b/arch/powerpc/platforms/pseries/pci.c
index 41d8a4d..7725825 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include <asm/pci.h>
 #include "pseries.h"
 
 #if 0
@@ -237,6 +238,8 @@ static void __init pSeries_request_regions(void)
 
 void __init pSeries_final_fixup(void)
 {
+   struct pci_controller *hose;
+
pSeries_request_regions();
 
eeh_probe_devices();
@@ -246,6 +249,25 @@ void __init pSeries_final_fixup(void)
ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable;
 #endif
+	list_for_each_entry(hose, &hose_list, list_node) {
+   struct device_node *dn = hose->dn, *nvdn;
+
+   while (1) {
+   dn = of_find_all_nodes(dn);
+   if (!dn)
+   break;
+   nvdn = of_parse_phandle(dn, "ibm,nvlink", 0);
+   if (!nvdn)
+   continue;
+   if (!of_device_is_compatible(nvdn, "ibm,npu-link"))
+   continue;
+   if (!of_device_is_compatible(nvdn->parent,
+   "ibm,power9-npu"))
+   continue;
+   WARN_ON_ONCE(pnv_npu2_init(hose));
+   break;
+   }
+   }
 }
 
 /*
-- 
2.17.1



[PATCH kernel v6 06/20] powerpc/pseries/iommu: Use memory@ nodes in max RAM address calculation

2018-12-19 Thread Alexey Kardashevskiy
We might have memory@ nodes with "linux,usable-memory" set to zero
(for example, to replicate powernv's behaviour for GPU coherent memory)
which means that the memory needs extra initialization but can be used
afterwards. The pseries platform will then try mapping it for DMA, so
the DMA window needs to cover those memory regions too; if the window
cannot cover new memory regions, the memory onlining fails.

This walks through the memory nodes to find the highest RAM address so
that a huge DMA window can cover that too in case this memory gets
onlined later.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v4:
* uses of_read_number directly instead of cut-n-pasted read_n_cells
---
 arch/powerpc/platforms/pseries/iommu.c | 33 +-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 06f0296..cbcc8ce 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -964,6 +964,37 @@ struct failed_ddw_pdn {
 
 static LIST_HEAD(failed_ddw_pdn_list);
 
+static phys_addr_t ddw_memory_hotplug_max(void)
+{
+   phys_addr_t max_addr = memory_hotplug_max();
+   struct device_node *memory;
+
+   for_each_node_by_type(memory, "memory") {
+   unsigned long start, size;
+   int ranges, n_mem_addr_cells, n_mem_size_cells, len;
+   const __be32 *memcell_buf;
+
+		memcell_buf = of_get_property(memory, "reg", &len);
+   if (!memcell_buf || len <= 0)
+   continue;
+
+   n_mem_addr_cells = of_n_addr_cells(memory);
+   n_mem_size_cells = of_n_size_cells(memory);
+
+   /* ranges in cell */
+   ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+
+   start = of_read_number(memcell_buf, n_mem_addr_cells);
+   memcell_buf += n_mem_addr_cells;
+   size = of_read_number(memcell_buf, n_mem_size_cells);
+   memcell_buf += n_mem_size_cells;
+
+   max_addr = max_t(phys_addr_t, max_addr, start + size);
+   }
+
+   return max_addr;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1053,7 +1084,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
}
/* verify the window * number of ptes will map the partition */
/* check largest block * page size > max memory hotplug addr */
-   max_addr = memory_hotplug_max();
+   max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
  "%llu-sized pages\n", max_addr,  
query.largest_available_block,
-- 
2.17.1



[PATCH kernel v6 05/20] powerpc/powernv/npu: Move OPAL calls away from context manipulation

2018-12-19 Thread Alexey Kardashevskiy
When introduced, the NPU context init/destroy helpers called OPAL which
enabled/disabled PID (a userspace memory context ID) filtering in an NPU
per GPU; this was a requirement for P9 DD1.0. However newer chip
revisions added PID wildcard support so there is no more need to
call OPAL every time a new context is initialized. Also, since the PID
wildcard support was added, skiboot does not clear wildcard entries
in the NPU so these remain in the hardware until the system reboots.

This moves LPID and wildcard programming to the PE setup code which
executes once during the booting process so NPU2 context init/destroy
won't need to do additional configuration.

This replaces the check for FW_FEATURE_OPAL with a check for npu!=NULL as
this is the way to tell if the NPU support is present and configured.

This moves pnv_npu2_init() declaration as pseries should be able to use it.
This keeps pnv_npu2_map_lpar() in powernv as pseries is not allowed to
call that. This exports pnv_npu2_map_lpar_dev() as following patches
will use it from the VFIO driver.

While at it, replace redundant list_for_each_entry_safe() with
a simpler list_for_each_entry().
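
For reference, the exported map helper (whose hunk is cut short below)
ends up looking roughly like this; this is a sketch reconstructed from
the unmap helper in 15/20, so the exact OPAL call arguments should be
checked against the actual patch:

int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
		unsigned long msr)
{
	int ret;
	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
	struct pci_controller *hose;
	struct pnv_phb *nphb;

	if (!npdev)
		return -ENODEV;

	hose = pci_bus_to_host(npdev->bus);
	nphb = hose->private_data;

	/* Tell the NPU which LPID the GPU's requests belong to */
	dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
			nphb->opal_id, lparid);
	ret = opal_npu_map_lpar(nphb->opal_id,
			PCI_DEVID(gpdev->bus->number, gpdev->devfn),
			lparid, 0 /* LPCR bits */);
	if (ret) {
		dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
		return ret;
	}

	/* Enable ATS checkout with a PID wildcard under the given MSR mask */
	dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
			nphb->opal_id, msr);
	ret = opal_npu_init_context(nphb->opal_id, 0 /*__unused*/, msr,
			PCI_DEVID(gpdev->bus->number, gpdev->devfn));
	if (ret < 0)
		dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
	else
		ret = 0;

	return ret;
}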

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* add few checks for npu!=NULL

v4:
* add flags check in pnv_npu2_init_context()
---
 arch/powerpc/include/asm/pci.h|   3 +
 arch/powerpc/platforms/powernv/pci.h  |   2 +-
 arch/powerpc/platforms/powernv/npu-dma.c  | 111 --
 arch/powerpc/platforms/powernv/pci-ioda.c |  15 ++-
 4 files changed, 77 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 2af9ded..baf2886 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -129,5 +129,8 @@ extern void pcibios_scan_phb(struct pci_controller *hose);
 
 extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev);
 extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
+extern int pnv_npu2_init(struct pci_controller *hose);
+extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
+   unsigned long msr);
 
 #endif /* __ASM_POWERPC_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index f2d50974..ddb4f02 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -190,6 +190,7 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_init_npu_phb(struct device_node *np);
 extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
+extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
@@ -220,7 +221,6 @@ extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int 
num,
 extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
 extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
 extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
-extern int pnv_npu2_init(struct pnv_phb *phb);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 5e66439..ef1457f 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -512,6 +512,9 @@ static void acquire_atsd_reg(struct npu_context 
*npu_context,
continue;
 
npu = pci_bus_to_host(npdev->bus)->npu;
+   if (!npu)
+   continue;
+
mmio_atsd_reg[i].npu = npu;
mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
while (mmio_atsd_reg[i].reg < 0) {
@@ -676,7 +679,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev 
*gpdev,
u32 nvlink_index;
struct device_node *nvlink_dn;
struct mm_struct *mm = current->mm;
-   struct pnv_phb *nphb;
struct npu *npu;
struct npu_context *npu_context;
struct pci_controller *hose;
@@ -687,13 +689,14 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev 
*gpdev,
 */
struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
 
-   if (!firmware_has_feature(FW_FEATURE_OPAL))
-   return ERR_PTR(-ENODEV);
-
if (!npdev)
/* No nvlink associated with this GPU device */
return ERR_PTR(-ENODEV);
 
+   /* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */
+   if (flags & ~(MSR_DR | MSR_PR | MSR_HV))
+   return ERR_PTR(-EINVAL);
+
nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 

[PATCH kernel v6 03/20] powerpc/vfio/iommu/kvm: Do not pin device memory

2018-12-19 Thread Alexey Kardashevskiy
This new memory does not have page structs as it is not plugged into
the host so gup() will fail anyway.

This adds 2 helpers:
- mm_iommu_newdev() to preregister the "memory device" memory so
the rest of the API can still be used;
- mm_iommu_is_devmem() to know if a physical address is one of these
new regions which we must avoid unpinning.

This adds @mm to tce_page_is_contained() and iommu_tce_xchg() to test
if the memory is device memory to avoid pfn_to_page().

This adds a check for device memory in mm_iommu_ua_mark_dirty_rm() which
does delayed page dirtying.
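
With the new helper, the containment check can skip pfn_to_page() for
device memory along these lines (a sketch of the reworked
tce_page_is_contained(); the compound-page arithmetic mirrors the
pre-existing check):

static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
		unsigned int page_shift)
{
	struct page *page;
	unsigned long size = 0;

	/* Device memory has no struct page: trust the preregistered size */
	if (mm_iommu_is_devmem(mm, hpa, page_shift, &size))
		return size == (1UL << page_shift);

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	/*
	 * Check that the TCE table granularity is not bigger than the size
	 * of a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return page_shift <= PAGE_SHIFT +
			compound_order(compound_head(page));
}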

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: Paul Mackerras 
---
Changes:
v6:
* added dummy mm_iommu_is_devmem() for !CONFIG_SPAPR_TCE_IOMMU
* removed "extern" from c file

v5:
* mm_iommu_is_devmem() now returns the actual size which might be smaller
than the pageshift so tce_page_is_contained() won't do pfn_to_page()
if @hpa..@hpa+64K is preregistered but page_shift is bigger than 16
* removed David's r-by because of the change in mm_iommu_is_devmem

v4:
* added device memory check in the real mode path
---
 arch/powerpc/include/asm/iommu.h   |  5 +-
 arch/powerpc/include/asm/mmu_context.h | 11 +++
 arch/powerpc/kernel/iommu.c| 11 ++-
 arch/powerpc/kvm/book3s_64_vio.c   | 18 ++---
 arch/powerpc/mm/mmu_context_iommu.c| 93 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c| 29 +---
 6 files changed, 135 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 35db0cb..a8aeac0 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -218,8 +218,9 @@ extern void iommu_register_group(struct iommu_table_group 
*table_group,
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
-extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
-   unsigned long *hpa, enum dma_data_direction *direction);
+extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
+   unsigned long entry, unsigned long *hpa,
+   enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 268e112..c50bd6a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -24,6 +24,9 @@ extern bool mm_iommu_preregistered(struct mm_struct *mm);
 extern long mm_iommu_new(struct mm_struct *mm,
unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem);
+extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+   unsigned long entries, unsigned long dev_hpa,
+   struct mm_iommu_table_group_mem_t **pmem);
 extern long mm_iommu_put(struct mm_struct *mm,
struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_init(struct mm_struct *mm);
@@ -39,8 +42,16 @@ extern long mm_iommu_ua_to_hpa(struct 
mm_iommu_table_group_mem_t *mem,
 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua);
+extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+   unsigned int pageshift, unsigned long *size);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#else
+static inline bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+   unsigned int pageshift, unsigned long *size)
+{
+   return false;
+}
 #endif
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
 extern void set_context(unsigned long id, pgd_t *pgd);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index f0dc680..cbcc615 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DBG(...)
 
@@ -993,15 +994,19 @@ int iommu_tce_check_gpa(unsigned long page_shift, 
unsigned long gpa)
 }
 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
 
-long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
-   unsigned long *hpa, enum dma_data_direction *direction)
+long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
+   unsigned long entry, unsigned long *hpa,
+   enum dma_data_direction *direction)
 {
long ret;
+   unsigned long size = 0;
 
ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
 
if (!ret && ((*direction == DMA_FROM_DEVICE) ||
-  

[PATCH kernel v6 01/20] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-12-19 Thread Alexey Kardashevskiy
The skiboot firmware has a hot reset handler which fences the NVIDIA V100
GPU RAM on Witherspoons and makes accesses no-ops instead of throwing HMIs:
https://github.com/open-power/skiboot/commit/fca2b2b839a67

Now we are going to pass the V100 through via VFIO, which most certainly
involves KVM guests, and these are often terminated without getting a chance
to offline GPU RAM, so we end up with a running machine with misconfigured
memory. Accessing this memory produces hardware management interrupts (HMI)
which bring the host down.

To suppress HMIs, this wires up this hot reset hook to vfio_pci_disable()
via pci_disable_device() which switches NPU2 to a safe mode and prevents
HMIs.

Signed-off-by: Alexey Kardashevskiy 
Acked-by: Alistair Popple 
Reviewed-by: David Gibson 
---
Changes:
v2:
* updated the commit log
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9ee7a30..29c6837 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3676,6 +3676,15 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
pnv_ioda_release_pe(pe);
 }
 
+static void pnv_npu_disable_device(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+   struct eeh_pe *eehpe = edev ? edev->pe : NULL;
+
+   if (eehpe && eeh_ops && eeh_ops->reset)
+   eeh_ops->reset(eehpe, EEH_RESET_HOT);
+}
+
 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
struct pnv_phb *phb = hose->private_data;
@@ -3720,6 +3729,7 @@ static const struct pci_controller_ops 
pnv_npu_ioda_controller_ops = {
.reset_secondary_bus= pnv_pci_reset_secondary_bus,
.dma_set_mask   = pnv_npu_dma_set_mask,
.shutdown   = pnv_pci_ioda_shutdown,
+   .disable_device = pnv_npu_disable_device,
 };
 
 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
-- 
2.17.1



[PATCH kernel v6 00/20] powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough

2018-12-19 Thread Alexey Kardashevskiy


This is for passing through NVIDIA V100 GPUs on POWER9 systems.
20/20 has the details of hardware setup.

This implements support for NVIDIA V100 GPU with coherent memory and
NPU/ATS support available in the POWER9 CPU. The aim is to support
unmodified vendor driver in the guest.

This is pushed to (both guest and host kernels):
https://github.com/aik/linux/tree/nv2

Matching qemu is pushed to github:
https://github.com/aik/qemu/tree/nv2

Skiboot bits are here:
https://github.com/aik/skiboot/tree/nv2

The individual patches have changelogs. v6 is mostly about compile
fixes; it also changes VFIO capabilities in 20/20.


Please comment. Thanks.



Alexey Kardashevskiy (20):
  powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2
  powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a
region
  powerpc/vfio/iommu/kvm: Do not pin device memory
  powerpc/powernv: Move npu struct from pnv_phb to pci_controller
  powerpc/powernv/npu: Move OPAL calls away from context manipulation
  powerpc/pseries/iommu: Use memory@ nodes in max RAM address
calculation
  powerpc/pseries/npu: Enable platform support
  powerpc/pseries: Remove IOMMU API support for non-LPAR systems
  powerpc/powernv/pseries: Rework device adding to IOMMU groups
  powerpc/iommu_api: Move IOMMU groups setup to a single place
  powerpc/powernv: Reference iommu_table while it is linked to a group
  powerpc/powernv/npu: Move single TVE handling to NPU PE
  powerpc/powernv/npu: Convert NPU IOMMU helpers to
iommu_table_group_ops
  powerpc/powernv/npu: Add compound IOMMU groups
  powerpc/powernv/npu: Add release_ownership hook
  powerpc/powernv/npu: Check mmio_atsd array bounds when populating
  powerpc/powernv/npu: Fault user page into the hypervisor's pagetable
  vfio_pci: Allow mapping extra regions
  vfio_pci: Allow regions to add own capabilities
  vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

 drivers/vfio/pci/Makefile |   1 +
 arch/powerpc/include/asm/iommu.h  |  17 +-
 arch/powerpc/include/asm/mmu_context.h|  15 +-
 arch/powerpc/include/asm/pci-bridge.h |   1 +
 arch/powerpc/include/asm/pci.h|   4 +
 arch/powerpc/platforms/powernv/pci.h  |  30 +-
 drivers/vfio/pci/trace.h  | 102 
 drivers/vfio/pci/vfio_pci_private.h   |  20 +
 include/uapi/linux/vfio.h |  38 ++
 arch/powerpc/kernel/iommu.c   |  69 +--
 arch/powerpc/kvm/book3s_64_vio.c  |  18 +-
 arch/powerpc/mm/mmu_context_iommu.c   | 110 +++-
 arch/powerpc/platforms/powernv/npu-dma.c  | 549 +++---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c |   3 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 237 
 arch/powerpc/platforms/powernv/pci.c  |  43 +-
 arch/powerpc/platforms/pseries/iommu.c|  88 ++-
 arch/powerpc/platforms/pseries/pci.c  |  22 +
 drivers/vfio/pci/vfio_pci.c   |  42 +-
 drivers/vfio/pci/vfio_pci_nvlink2.c   | 482 +++
 drivers/vfio/vfio_iommu_spapr_tce.c   |  64 +-
 drivers/vfio/pci/Kconfig  |   6 +
 22 files changed, 1570 insertions(+), 391 deletions(-)
 create mode 100644 drivers/vfio/pci/trace.h
 create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

-- 
2.17.1



Re: [PATCH v3] powerpc: implement CONFIG_DEBUG_VIRTUAL

2018-12-19 Thread Michael Ellerman
Christophe Leroy  writes:
> On 12/19/2018 06:57 AM, Christophe Leroy wrote:
...
>
> In fact the solution is the following:
>
> diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
> index 4fc77a99c9bf..60401af2bc8f 100644
> --- a/arch/powerpc/mm/pgtable_32.c
> +++ b/arch/powerpc/mm/pgtable_32.c
> @@ -143,7 +143,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long 
> size, pgprot_t prot, void *call
>* Don't allow anybody to remap normal RAM that we're using.
>* mem_init() sets high_memory so only do the check after that.
>*/
> - if (slab_is_available() && (p < virt_to_phys(high_memory)) &&
> + if (slab_is_available() && (p <= virt_to_phys(high_memory - 1)) &&
>   page_is_ram(__phys_to_pfn(p))) {
>   printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n",
>  (unsigned long long)p, __builtin_return_address(0));
>
>
> I'll send an updated patch in a few minutes.

Awesome, thanks. I'll take v4.

cheers