[PATCH 5/5] powerpc/mm: ppc 603 doesn't need update_mmu_cache()

2019-08-15 Thread Christophe Leroy
On powerpc 603, there is no hash table, so get out of
update_mmu_cache() early.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/book3s32/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 3a62bf99f93f..c20269be79ec 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -319,6 +319,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea)
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
  pte_t *ptep)
 {
+   if (!mmu_has_feature(MMU_FTR_HPTE_TABLE))
+   return;
/*
 * We don't need to worry about _PAGE_PRESENT here because we are
 * called with either mm->page_table_lock held or ptl lock held
-- 
2.13.3



[PATCH 4/5] powerpc/mm: Simplify update_mmu_cache() on BOOK3S32

2019-08-15 Thread Christophe Leroy
On BOOK3S32, hash_preload() uses neither is_exec nor trap,
so drop those parameters and simplify update_mmu_cache().
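For context, the trap values tested below are Book3S exception
vectors: 0x300 is a data storage interrupt (a data access fault) and
0x400 an instruction storage interrupt (an instruction fetch fault).
A sketch of the whole function as it reads after this patch:

	void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
			      pte_t *ptep)
	{
		/* Only preload hash entries for accessed user PTEs */
		if (!pte_young(*ptep) || address >= TASK_SIZE)
			return;

		/* init gets here at boot before thread.regs exists */
		if (!current->thread.regs)
			return;

		/* Only fill the hash when coming from a fault (DSI or ISI) */
		if (TRAP(current->thread.regs) != 0x300 &&
		    TRAP(current->thread.regs) != 0x400)
			return;

		hash_preload(vma->vm_mm, address);
	}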

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/book3s32/mmu.c | 29 +++--
 arch/powerpc/mm/mmu_decl.h |  3 +--
 arch/powerpc/mm/pgtable_32.c   |  2 +-
 3 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 3e3c4077cdb7..3a62bf99f93f 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -297,8 +297,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys,
 /*
  * Preload a translation in the hash table
  */
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- bool is_exec, unsigned long trap)
+void hash_preload(struct mm_struct *mm, unsigned long ea)
 {
pmd_t *pmd;
 
@@ -324,34 +323,20 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 * We don't need to worry about _PAGE_PRESENT here because we are
 * called with either mm->page_table_lock held or ptl lock held
 */
-   unsigned long trap;
-   bool is_exec;
 
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
 
-   /* We try to figure out if we are coming from an instruction
-* access fault and pass that down to __hash_page so we avoid
-* double-faulting on execution of fresh text. We have to test
-* for regs NULL since init will get here first thing at boot
-*
-* We also avoid filling the hash if not coming from a fault
-*/
+   /* We have to test for regs NULL since init will get here first thing at boot */
+   if (!current->thread.regs)
+   return;
 
-   trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
-   switch (trap) {
-   case 0x300:
-   is_exec = false;
-   break;
-   case 0x400:
-   is_exec = true;
-   break;
-   default:
+   /* We also avoid filling the hash if not coming from a fault */
+   if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400)
return;
-   }
 
-   hash_preload(vma->vm_mm, address, is_exec, trap);
+   hash_preload(vma->vm_mm, address);
 }
 
 /*
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 9f325a7a09cb..adbaf2167214 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -91,8 +91,7 @@ void print_system_hash_info(void);
 
 #ifdef CONFIG_PPC32
 
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- bool is_exec, unsigned long trap);
+void hash_preload(struct mm_struct *mm, unsigned long ea);
 
 extern void mapin_ram(void);
 extern void setbat(int index, unsigned long virt, phys_addr_t phys,
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 35cb96cfc258..97f401a06fcc 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -252,7 +252,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
 #ifdef CONFIG_PPC_BOOK3S_32
if (ktext)
-   hash_preload(&init_mm, v, false, 0x300);
+   hash_preload(&init_mm, v);
 #endif
v += PAGE_SIZE;
p += PAGE_SIZE;
-- 
2.13.3



[PATCH 3/5] powerpc/mm: move update_mmu_cache() into book3s hash utils.

2019-08-15 Thread Christophe Leroy
update_mmu_cache() is only for BOOK3S, and can be simplified
for BOOK3S32.

Move it out of mem.c into respective BOOK3S32 and BOOK3S64
files containing hash utils.

The BOOK3S64 version of hash_preload() is only used locally,
so declare it static.

Remove the radix_enabled() check from the BOOK3S32 version:
radix is only possible on BOOK3S64.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/book3s32/mmu.c| 45 +
 arch/powerpc/mm/book3s64/hash_utils.c | 54 +--
 arch/powerpc/mm/mem.c | 52 -
 arch/powerpc/mm/mmu_decl.h|  7 ++---
 4 files changed, 100 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index e249fbf6b9c3..3e3c4077cdb7 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -310,6 +310,51 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
 }
 
 /*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+   /*
+* We don't need to worry about _PAGE_PRESENT here because we are
+* called with either mm->page_table_lock held or ptl lock held
+*/
+   unsigned long trap;
+   bool is_exec;
+
+   /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+   if (!pte_young(*ptep) || address >= TASK_SIZE)
+   return;
+
+   /* We try to figure out if we are coming from an instruction
+* access fault and pass that down to __hash_page so we avoid
+* double-faulting on execution of fresh text. We have to test
+* for regs NULL since init will get here first thing at boot
+*
+* We also avoid filling the hash if not coming from a fault
+*/
+
+   trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+   switch (trap) {
+   case 0x300:
+   is_exec = false;
+   break;
+   case 0x400:
+   is_exec = true;
+   break;
+   default:
+   return;
+   }
+
+   hash_preload(vma->vm_mm, address, is_exec, trap);
+}
+
+/*
  * Initialize the hash table and patch the instructions in hashtable.S.
  */
 void __init MMU_init_hw(void)
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index e6d471058597..fdad5bc07d79 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1518,8 +1518,8 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
 }
 #endif
 
-void hash_preload(struct mm_struct *mm, unsigned long ea,
- bool is_exec, unsigned long trap)
+static void hash_preload(struct mm_struct *mm, unsigned long ea,
+bool is_exec, unsigned long trap)
 {
int hugepage_shift;
unsigned long vsid;
@@ -1599,6 +1599,56 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
local_irq_restore(flags);
 }
 
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ * We use it to preload an HPTE into the hash table corresponding to
+ * the updated linux PTE.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+   /*
+* We don't need to worry about _PAGE_PRESENT here because we are
+* called with either mm->page_table_lock held or ptl lock held
+*/
+   unsigned long trap;
+   bool is_exec;
+
+   if (radix_enabled()) {
+   prefetch((void *)address);
+   return;
+   }
+
+   /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
+   if (!pte_young(*ptep) || address >= TASK_SIZE)
+   return;
+
+   /* We try to figure out if we are coming from an instruction
+* access fault and pass that down to __hash_page so we avoid
+* double-faulting on execution of fresh text. We have to test
+* for regs NULL since init will get here first thing at boot
+*
+* We also avoid filling the hash if not coming from a fault
+*/
+
+   trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL;
+   switch (trap) {
+   case 0x300:
+   is_exec = false;
+   break;
+   case 0x400:
+   is_exec = true;
+   break;
+   default:
+   return;
+   }
+
+   hash_preload(vma->vm_mm, address, is_exec, trap);
+}
+
 #ifdef 

[PATCH 2/5] powerpc/mm: move FSL_BOOK3E version of update_mmu_cache()

2019-08-15 Thread Christophe Leroy
Move the FSL_BOOK3E version of update_mmu_cache() to the same
place as book3e_hugetlb_preload(), as update_mmu_cache() is
the only user of book3e_hugetlb_preload().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hugetlb.h  |  3 ---
 arch/powerpc/mm/mem.c   |  8 
 arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 16 ++--
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 20a101046cff..bd6504c28c2f 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -31,9 +31,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
return 0;
 }
 
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
-   pte_t pte);
-
 #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
 void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
unsigned long end, unsigned long floor,
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 582ad728ac9d..c45d44538ddb 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -457,14 +457,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
hash_preload(vma->vm_mm, address, is_exec, trap);
 }
 #endif /* CONFIG_PPC_BOOK3S */
-#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep)
-{
-   if (is_vm_hugetlb_page(vma))
-   book3e_hugetlb_preload(vma, address, *ptep);
-}
-#endif
 
 /*
  * System memory should not be in /proc/iomem but various tools expect it
diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
index 61915f4d3c7f..8b88be91b622 100644
--- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
+++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c
@@ -122,8 +122,8 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid)
return found;
 }
 
-void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
-   pte_t pte)
+static void
+book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte)
 {
unsigned long mas1, mas2;
u64 mas7_3;
@@ -183,6 +183,18 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
local_irq_restore(flags);
 }
 
+/*
+ * This is called at the end of handling a user page fault, when the
+ * fault has been handled by updating a PTE in the linux page tables.
+ *
+ * This must always be called with the pte lock held.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
+{
+   if (is_vm_hugetlb_page(vma))
+   book3e_hugetlb_preload(vma, address, *ptep);
+}
+
 void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
struct hstate *hstate = hstate_file(vma->vm_file);
-- 
2.13.3



[PATCH 1/5] powerpc/mm: define empty update_mmu_cache() as static inline

2019-08-15 Thread Christophe Leroy
Only BOOK3S and FSL_BOOK3E have a useful update_mmu_cache().

For the others, just define it static inline.

In the meantime, simplify the FSL_BOOK3E related ifdef as
book3e_hugetlb_preload() only exists when CONFIG_PPC_FSL_BOOK3E
is selected.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/pgtable.h | 11 +++
 arch/powerpc/include/asm/nohash/pgtable.h | 13 +
 arch/powerpc/include/asm/pgtable.h| 12 
 arch/powerpc/mm/mem.c | 11 +++
 4 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h
index 6436b65ac7bc..0e1263455d73 100644
--- a/arch/powerpc/include/asm/book3s/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/pgtable.h
@@ -26,5 +26,16 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 unsigned long size, pgprot_t vma_prot);
 #define __HAVE_PHYS_MEM_ACCESS_PROT
 
+/*
+ * This gets called at the end of handling a page fault, when
+ * the kernel has put a new PTE into the page table for the process.
+ * We use it to ensure coherency between the i-cache and d-cache
+ * for the page which has just been mapped in.
+ * On machines which use an MMU hash table, we use this to put a
+ * corresponding HPTE into the hash table ahead of time, instead of
+ * waiting for the inevitable extra hash-table miss exception.
+ */
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
+
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 1ca1c1864b32..7fed9dc0f147 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -293,5 +293,18 @@ static inline int pgd_huge(pgd_t pgd)
 #define is_hugepd(hpd) (hugepd_ok(hpd))
 #endif
 
+/*
+ * This gets called at the end of handling a page fault, when
+ * the kernel has put a new PTE into the page table for the process.
+ * We use it to ensure coherency between the i-cache and d-cache
+ * for the page which has just been mapped in.
+ */
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep);
+#else
+static inline
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {}
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index c58ba7963688..c70916a7865a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -77,18 +77,6 @@ extern void paging_init(void);
 
 #include 
 
-
-/*
- * This gets called at the end of handling a page fault, when
- * the kernel has put a new PTE into the page table for the process.
- * We use it to ensure coherency between the i-cache and d-cache
- * for the page which has just been mapped in.
- * On machines which use an MMU hash table, we use this to put a
- * corresponding HPTE into the hash table ahead of time, instead of
- * waiting for the inevitable extra hash-table miss exception.
- */
-extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
-
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_large(pmd) 0
 #endif
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9191a66b3bc5..582ad728ac9d 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -414,10 +414,10 @@ EXPORT_SYMBOL(flush_icache_user_range);
  * 
  * This must always be called with the pte lock held.
  */
+#ifdef CONFIG_PPC_BOOK3S
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
  pte_t *ptep)
 {
-#ifdef CONFIG_PPC_BOOK3S
/*
 * We don't need to worry about _PAGE_PRESENT here because we are
 * called with either mm->page_table_lock held or ptl lock held
@@ -455,13 +455,16 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
}
 
hash_preload(vma->vm_mm, address, is_exec, trap);
+}
 #endif /* CONFIG_PPC_BOOK3S */
-#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
-   && defined(CONFIG_HUGETLB_PAGE)
+#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE)
+void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
if (is_vm_hugetlb_page(vma))
book3e_hugetlb_preload(vma, address, *ptep);
-#endif
 }
+#endif
 
 /*
  * System memory should not be in /proc/iomem but various tools expect it
-- 
2.13.3



[PATCH] ASoC: imx-audmux: Add driver suspend and resume to support MEGA Fast

2019-08-15 Thread Shengjiu Wang
For i.MX6 SoloX, there is an SoC mode that shuts down the power
supply of all modules during the system suspend and resume procedure.
Thus, AUDMUX needs to save the values of all its registers before
system suspend and restore them after system resume.
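For reference, the reg_max values chosen below map onto the AUDMUX
register file as follows; the per-port counts are an inference from
the offset macros already in this driver, not something the patch
states:

	/*
	 * v2 AUDMUX (i.MX31): two registers per port,
	 *	IMX_AUDMUX_V2_PTCR(x) = x * 8
	 *	IMX_AUDMUX_V2_PDCR(x) = x * 8 + 4
	 * so reg_max = 14 with a stride of i * 4 walks
	 * PTCR0/PDCR0 .. PTCR6/PDCR6 (seven ports). v1 AUDMUX (i.MX21)
	 * has one control register per port, hence reg_max = 6.
	 */

	/* Hypothetical debugging helper: name the v2 register in slot i */
	static const char *audmux_slot_name(unsigned int i, char *buf, size_t len)
	{
		snprintf(buf, len, "%s%u", (i & 1) ? "PDCR" : "PTCR", i / 2);
		return buf;
	}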

Signed-off-by: Shengjiu Wang 
---
 sound/soc/fsl/imx-audmux.c | 54 +-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/sound/soc/fsl/imx-audmux.c b/sound/soc/fsl/imx-audmux.c
index 7595f24a006e..3ce85a43e08f 100644
--- a/sound/soc/fsl/imx-audmux.c
+++ b/sound/soc/fsl/imx-audmux.c
@@ -23,6 +23,8 @@
 
 static struct clk *audmux_clk;
 static void __iomem *audmux_base;
+static u32 *regcache;
+static u32 reg_max;
 
 #define IMX_AUDMUX_V2_PTCR(x)  ((x) * 8)
 #define IMX_AUDMUX_V2_PDCR(x)  ((x) * 8 + 4)
@@ -315,8 +317,23 @@ static int imx_audmux_probe(struct platform_device *pdev)
if (of_id)
pdev->id_entry = of_id->data;
audmux_type = pdev->id_entry->driver_data;
-   if (audmux_type == IMX31_AUDMUX)
+
+   switch (audmux_type) {
+   case IMX31_AUDMUX:
audmux_debugfs_init();
+   reg_max = 14;
+   break;
+   case IMX21_AUDMUX:
+   reg_max = 6;
+   break;
+   default:
+   dev_err(&pdev->dev, "unsupported version!\n");
+   return -EINVAL;
+   }
+
+   regcache = devm_kzalloc(&pdev->dev, sizeof(u32) * reg_max, GFP_KERNEL);
+   if (!regcache)
+   return -ENOMEM;
 
if (of_id)
imx_audmux_parse_dt_defaults(pdev, pdev->dev.of_node);
@@ -332,12 +349,47 @@ static int imx_audmux_remove(struct platform_device *pdev)
return 0;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int imx_audmux_suspend(struct device *dev)
+{
+   int i;
+
+   clk_prepare_enable(audmux_clk);
+
+   for (i = 0; i < reg_max; i++)
+   regcache[i] = readl(audmux_base + i * 4);
+
+   clk_disable_unprepare(audmux_clk);
+
+   return 0;
+}
+
+static int imx_audmux_resume(struct device *dev)
+{
+   int i;
+
+   clk_prepare_enable(audmux_clk);
+
+   for (i = 0; i < reg_max; i++)
+   writel(regcache[i], audmux_base + i * 4);
+
+   clk_disable_unprepare(audmux_clk);
+
+   return 0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+static const struct dev_pm_ops imx_audmux_pm = {
+   SET_SYSTEM_SLEEP_PM_OPS(imx_audmux_suspend, imx_audmux_resume)
+};
+
 static struct platform_driver imx_audmux_driver = {
.probe  = imx_audmux_probe,
.remove = imx_audmux_remove,
.id_table   = imx_audmux_ids,
.driver = {
.name   = DRIVER_NAME,
+   .pm = &imx_audmux_pm,
.of_match_table = imx_audmux_dt_ids,
}
 };
-- 
2.21.0



Re: [PATCH v4 19/25] powernv/fadump: add support to preserve crash data on FADUMP disabled kernel

2019-08-15 Thread Mahesh J Salgaonkar
On 2019-07-16 17:04:16 Tue, Hari Bathini wrote:
> Add a new kernel config option, CONFIG_PRESERVE_FA_DUMP that ensures
> that crash data, from previously crash'ed kernel, is preserved. This
> helps in cases where FADump is not enabled but the subsequent memory
> preserving kernel boot is likely to process this crash data. One
> typical usecase for this config option is petitboot kernel.
> 
> As OPAL allows registering address with it in the first kernel and
> retrieving it after MPIPL, use it to store the top of boot memory.
> A kernel that intends to preserve crash data retrieves it and avoids
> using memory beyond this address.
> 
> Signed-off-by: Hari Bathini 
> ---
>  arch/powerpc/Kconfig |9 ++
>  arch/powerpc/include/asm/fadump.h|9 +-
>  arch/powerpc/kernel/Makefile |6 +
>  arch/powerpc/kernel/fadump-common.h  |   13 ++-
>  arch/powerpc/kernel/fadump.c |  128 --
>  arch/powerpc/kernel/prom.c   |4 -
>  arch/powerpc/platforms/powernv/Makefile  |1 
>  arch/powerpc/platforms/powernv/opal-fadump.c |   59 
>  arch/powerpc/platforms/powernv/opal-fadump.h |3 +
>  9 files changed, 176 insertions(+), 56 deletions(-)
> 
[...]
>  #include "../../kernel/fadump-common.h"
>  #include "opal-fadump.h"
>  
> +
> +#ifdef CONFIG_PRESERVE_FA_DUMP
> +/*
> + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
> + * ensure crash data is preserved in hope that the subsequent memory
> + * preserving kernel boot is going to process this crash data.
> + */
> +int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node)
> +{
> + unsigned long dn;
> + const __be32 *prop;
> +
> + dn = of_get_flat_dt_subnode_by_name(node, "dump");
> + if (dn == -FDT_ERR_NOTFOUND)
> + return 1;
> +
> + /*
> +  * Check if dump has been initiated on last reboot.
> +  */
> + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
> + if (prop) {
> + u64 addr = 0;
> + s64 ret;
> +
> + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
> + if ((ret != OPAL_SUCCESS) || !addr) {
> + pr_err("Failed to get boot memory tag (%lld)\n", ret);
> + return 1;
> + }
> +
> + /*
> +  * Anything below this address can be used for booting a
> +  * capture kernel or petitboot kernel. Preserve everything
> +  * above this address for processing crashdump.
> +  */
> + fadump_conf->boot_mem_top = be64_to_cpu(addr);
> + pr_debug("Preserve everything above %lx\n",
> +  fadump_conf->boot_mem_top);
> +
> + pr_info("Firmware-assisted dump is active.\n");
> + fadump_conf->dump_active = 1;
> + }
> +
> + return 1;
> +}
> +
> +#else /* CONFIG_PRESERVE_FA_DUMP */
>  static const struct opal_fadump_mem_struct *opal_fdm_active;
>  static const struct opal_mpipl_fadump *opal_cpu_metadata;
>  static struct opal_fadump_mem_struct *opal_fdm;
> @@ -155,6 +202,17 @@ static int opal_fadump_setup_kernel_metadata(struct fw_dump *fadump_conf)
>   err = -EPERM;
>   }
>  
> + /*
> +  * Register boot memory top address with f/w. Should be retrieved
> +  * by a kernel that intends to preserve crash'ed kernel's memory.
> +  */
> + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
> +   fadump_conf->boot_mem_top);

Looks like we only register the tags but never de-register or set them
to NULL when we don't need them. Same for the kernel tag: if we kexec
into a new kernel which may not do fadump, and OPAL crashes, it will
present stale tags to the next kernel. I think we should set the
bootmem/kernel tags to NULL in the fadump_cleanup() path so that the
kexec path is taken care of.
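A minimal sketch of that cleanup (tag names as used elsewhere in this
series; the placement in an opal_fadump_cleanup() helper, and the
assumption that registering a zero address resets a tag, are both
hypothetical):

	static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
	{
		s64 ret;

		/* Invalidate the tags so a kexec'ed non-fadump kernel
		 * does not see stale values if OPAL crashes later.
		 */
		ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM, 0);
		if (ret != OPAL_SUCCESS)
			pr_warn("Failed to reset boot memory tag (%lld)\n", ret);

		ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0);
		if (ret != OPAL_SUCCESS)
			pr_warn("Failed to reset kernel metadata tag (%lld)\n", ret);
	}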

Thanks,
-Mahesh.



[PATCH v5 11/12] powerpc/eeh: Remove unused return path from eeh_pe_dev_traverse()

2019-08-15 Thread Sam Bobroff
There are no users of the early-out return value from
eeh_pe_dev_traverse(), so remove it.

Signed-off-by: Sam Bobroff 
---
v5 * New in this version.

 arch/powerpc/include/asm/eeh.h   |  6 +++---
 arch/powerpc/kernel/eeh.c| 16 +---
 arch/powerpc/kernel/eeh_driver.c | 26 +++---
 arch/powerpc/kernel/eeh_pe.c | 25 +++--
 4 files changed, 26 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index e1023a556721..10cb6342f60b 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -261,7 +261,7 @@ static inline bool eeh_state_active(int state)
== (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 }
 
-typedef void *(*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
+typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag);
 typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
@@ -275,8 +275,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
 void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_traverse(struct eeh_pe *root,
  eeh_pe_traverse_func fn, void *flag);
-void *eeh_pe_dev_traverse(struct eeh_pe *root,
- eeh_edev_traverse_func fn, void *flag);
+void eeh_pe_dev_traverse(struct eeh_pe *root,
+eeh_edev_traverse_func fn, void *flag);
 void eeh_pe_restore_bars(struct eeh_pe *pe);
 const char *eeh_pe_loc_get(struct eeh_pe *pe);
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index b6683f367f7f..b7a884305ae5 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -696,7 +696,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
return rc;
 }
 
-static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
+static void eeh_disable_and_save_dev_state(struct eeh_dev *edev,
void *userdata)
 {
struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
@@ -707,7 +707,7 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 * state for the specified device
 */
if (!pdev || pdev == dev)
-   return NULL;
+   return;
 
/* Ensure we have D0 power state */
pci_set_power_state(pdev, PCI_D0);
@@ -720,18 +720,16 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev,
 * interrupt from the device
 */
pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
-
-   return NULL;
 }
 
-static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
+static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 {
struct pci_dn *pdn = eeh_dev_to_pdn(edev);
struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
struct pci_dev *dev = userdata;
 
if (!pdev)
-   return NULL;
+   return;
 
/* Apply customization from firmware */
if (pdn && eeh_ops->restore_config)
@@ -740,8 +738,6 @@ static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
/* The caller should restore state for the specified device */
if (pdev != dev)
pci_restore_state(pdev);
-
-   return NULL;
 }
 
 int eeh_restore_vf_config(struct pci_dn *pdn)
@@ -956,7 +952,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
  * the indicated device and its children so that the bunch of the
  * devices could be reset properly.
  */
-static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
+static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
 {
struct pci_dev *dev;
unsigned int *freset = (unsigned int *)flag;
@@ -964,8 +960,6 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag)
dev = eeh_dev_to_pci_dev(edev);
if (dev)
*freset |= dev->needs_freset;
-
-   return NULL;
 }
 
 static void eeh_pe_refreeze_passed(struct eeh_pe *root)
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index d0d175f22129..56841c6451e5 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -198,12 +198,12 @@ static void eeh_enable_irq(struct eeh_dev *edev)
}
 }
 
-static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
+static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 {
struct pci_dev *pdev;
 
if (!edev)
-   return NULL;
+   return;
 
/*
 * We cannot access the config space on some adapters.
@@ -213,14 +213,13 @@ static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
 * device is created.
 */
if (edev->pe && 

[PATCH v5 09/12] powerpc/eeh: Convert log messages to eeh_edev_* macros

2019-08-15 Thread Sam Bobroff
Convert existing messages, where appropriate, to use the eeh_edev_*
logging macros.

The only effect should be minor adjustments to the log messages, apart
from:

- A new "Probing device" message in pseries_eeh_probe(), to match the
powernv case.
- The "Probing device" message in pnv_eeh_probe() is now generated
slightly later, which means it is no longer emitted for devices that
aren't probed due to the initial checks.

Signed-off-by: Sam Bobroff 
---
 arch/powerpc/include/asm/ppc-pci.h   |  5 --
 arch/powerpc/kernel/eeh.c| 19 +++-
 arch/powerpc/kernel/eeh_cache.c  |  8 +--
 arch/powerpc/kernel/eeh_driver.c |  7 +--
 arch/powerpc/kernel/eeh_pe.c | 51 +---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 17 ++-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 21 +++-
 7 files changed, 39 insertions(+), 89 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 72860de205a0..7f4be5a05eb3 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -62,11 +62,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode);
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
 
-static inline const char *eeh_pci_name(struct pci_dev *pdev)
-{
-   return pdev ? pci_name(pdev) : "<null>";
-}
-
 static inline const char *eeh_driver_name(struct pci_dev *pdev)
 {
return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index c0ec1b6b1e69..b6683f367f7f 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -461,8 +461,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
/* Access to IO BARs might get this far and still not want checking. */
if (!pe) {
eeh_stats.ignored_check++;
-   pr_debug("EEH: Ignored check for %s\n",
-   eeh_pci_name(dev));
+   eeh_edev_dbg(edev, "Ignored check\n");
return 0;
}
 
@@ -502,12 +501,11 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
if (dn)
location = of_get_property(dn, "ibm,loc-code",
NULL);
-   printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
-   "location=%s driver=%s pci addr=%s\n",
+   eeh_edev_err(edev, "%d reads ignored for recovering device at location=%s driver=%s\n",
pe->check_count,
location ? location : "unknown",
-   eeh_driver_name(dev), eeh_pci_name(dev));
-   printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+   eeh_edev_err(edev, "Might be infinite loop in %s driver\n",
eeh_driver_name(dev));
eeh_driver_name(dev));
dump_stack();
}
@@ -1268,12 +1266,11 @@ void eeh_add_device_late(struct pci_dev *dev)
if (!dev)
return;
 
-   pr_debug("EEH: Adding device %s\n", pci_name(dev));
-
pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
edev = pdn_to_eeh_dev(pdn);
+   eeh_edev_dbg(edev, "Adding device\n");
if (edev->pdev == dev) {
-   pr_debug("EEH: Device %s already referenced!\n", pci_name(dev));
+   eeh_edev_dbg(edev, "Device already referenced!\n");
return;
}
 
@@ -1374,10 +1371,10 @@ void eeh_remove_device(struct pci_dev *dev)
edev = pci_dev_to_eeh_dev(dev);
 
/* Unregister the device with the EEH/PCI address search system */
-   pr_debug("EEH: Removing device %s\n", pci_name(dev));
+   dev_dbg(&dev->dev, "EEH: Removing device\n");
 
if (!edev || !edev->pdev || !edev->pe) {
-   pr_debug("EEH: Not referenced !\n");
+   dev_dbg(&dev->dev, "EEH: Device not referenced!\n");
return;
}
 
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 8c8649172e97..45360b9eab90 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -145,8 +145,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
piar->pcidev = dev;
piar->flags = flags;
 
-   pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
-   &alo, &ahi, pci_name(dev));
+   eeh_edev_dbg(piar->edev, "PIAR: insert range=[%pap:%pap]\n",
+   &alo, &ahi);
 
rb_link_node(&piar->rb_node, parent, p);
rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
@@ -226,8 +226,8 @@ static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
piar = rb_entry(n, struct pci_io_addr_range, 

[PATCH v5 05/12] powerpc/eeh: EEH for pSeries hot plug

2019-08-15 Thread Sam Bobroff
On PowerNV and pSeries, devices currently acquire EEH support from
several different places: Boot-time devices from eeh_probe_devices()
and eeh_addr_cache_build(), Virtual Function devices from the pcibios
bus add device hooks and hot plugged devices from pci_hp_add_devices()
(with other platforms using other methods as well).  Unfortunately,
pSeries machines currently discover hot plugged devices using
pci_rescan_bus(), not pci_hp_add_devices(), and so those devices do
not receive EEH support.

Rather than adding another case for pci_rescan_bus(), this change
widens the scope of the pcibios bus add device hooks so that they can
handle all devices. As a side effect this also supports devices
discovered after manually rescanning via /sys/bus/pci/rescan.

Note that on PowerNV, this change allows the EEH subsystem to become
enabled after boot as long as it has not been forced off, which was
not previously possible (it was already possible on pSeries).
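All of those discovery paths funnel through the generic PCI core's
arch hook, so the coverage can be sketched as follows (hook names from
this series; pci_bus_add_device() is the generic PCI core entry
point):

	/*
	 * pci_bus_add_device(pdev)			generic PCI core
	 *   -> pcibios_bus_add_device(pdev)		arch hook
	 *	-> pnv_pcibios_bus_add_device() or
	 *	   pseries_pcibios_bus_add_device()
	 *	     -> eeh_add_device_early() / eeh_add_device_late()
	 *	     -> eeh_sysfs_add_device()
	 *
	 * This runs for the boot-time scan, pci_rescan_bus(),
	 * pci_hp_add_devices() and /sys/bus/pci/rescan alike, which is
	 * why widening the hook covers hot plugged pSeries devices.
	 */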

Signed-off-by: Sam Bobroff 
---
 arch/powerpc/kernel/eeh.c|  2 +-
 arch/powerpc/kernel/of_platform.c|  3 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c | 39 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 54 ++--
 4 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ca8b0c58a6a7..87edac6f2fd9 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1272,7 +1272,7 @@ void eeh_add_device_late(struct pci_dev *dev)
struct pci_dn *pdn;
struct eeh_dev *edev;
 
-   if (!dev || !eeh_enabled())
+   if (!dev)
return;
 
pr_debug("EEH: Adding device %s\n", pci_name(dev));
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index 427fc22f72b6..11c807468ab5 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -81,7 +81,8 @@ static int of_pci_phb_probe(struct platform_device *dev)
pcibios_claim_one_bus(phb->bus);
 
/* Finish EEH setup */
-   eeh_add_device_tree_late(phb->bus);
+   if (!eeh_has_flag(EEH_FORCE_DISABLED))
+   eeh_add_device_tree_late(phb->bus);
 
/* Add probed PCI devices to the device model */
pci_bus_add_devices(phb->bus);
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 629f9390d9af..77cc2f51c2ea 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -43,7 +43,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 {
struct pci_dn *pdn = pci_get_pdn(pdev);
 
-   if (!pdev->is_virtfn)
+   if (eeh_has_flag(EEH_FORCE_DISABLED))
return;
 
pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
@@ -222,6 +222,25 @@ static const struct file_operations eeh_tree_state_debugfs_ops = {
 
 #endif /* CONFIG_DEBUG_FS */
 
+void pnv_eeh_enable_phbs(void)
+{
+   struct pci_controller *hose;
+   struct pnv_phb *phb;
+
+   list_for_each_entry(hose, &hose_list, list_node) {
+   phb = hose->private_data;
+   /*
+* If EEH is enabled, we're going to rely on that.
+* Otherwise, we restore to conventional mechanism
+* to clear frozen PE during PCI config access.
+*/
+   if (eeh_enabled())
+   phb->flags |= PNV_PHB_FLAG_EEH;
+   else
+   phb->flags &= ~PNV_PHB_FLAG_EEH;
+   }
+}
+
 /**
  * pnv_eeh_post_init - EEH platform dependent post initialization
  *
@@ -260,19 +279,11 @@ int pnv_eeh_post_init(void)
if (!eeh_enabled())
disable_irq(eeh_event_irq);
 
+   pnv_eeh_enable_phbs();
+
list_for_each_entry(hose, &hose_list, list_node) {
phb = hose->private_data;
 
-   /*
-* If EEH is enabled, we're going to rely on that.
-* Otherwise, we restore to conventional mechanism
-* to clear frozen PE during PCI config access.
-*/
-   if (eeh_enabled())
-   phb->flags |= PNV_PHB_FLAG_EEH;
-   else
-   phb->flags &= ~PNV_PHB_FLAG_EEH;
-
/* Create debugfs entries */
 #ifdef CONFIG_DEBUG_FS
if (phb->has_dbgfs || !phb->dbgfs)
@@ -483,7 +494,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 * Enable EEH explicitly so that we will do EEH check
 * while accessing I/O stuff
 */
-   eeh_add_flag(EEH_ENABLED);
+   if (!eeh_has_flag(EEH_ENABLED)) {
+   enable_irq(eeh_event_irq);
+   pnv_eeh_enable_phbs();
+   eeh_add_flag(EEH_ENABLED);
+   }
 
/* Save memory bars */
eeh_save_bars(edev);
diff --git 

[PATCH v5 03/12] powerpc/eeh: Improve debug messages around device addition

2019-08-15 Thread Sam Bobroff
Also remove a useless comment.

Signed-off-by: Sam Bobroff 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/kernel/eeh.c|  2 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c | 14 
 arch/powerpc/platforms/pseries/eeh_pseries.c | 23 +++-
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index d44533bba642..846cc697030c 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1278,7 +1278,7 @@ void eeh_add_device_late(struct pci_dev *dev)
pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
edev = pdn_to_eeh_dev(pdn);
if (edev->pdev == dev) {
-   pr_debug("EEH: Already referenced !\n");
+   pr_debug("EEH: Device %s already referenced!\n", pci_name(dev));
return;
}
 
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 1cd5ebd7299c..629f9390d9af 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -46,10 +46,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
if (!pdev->is_virtfn)
return;
 
-   /*
-* The following operations will fail if VF's sysfs files
-* aren't created or its resources aren't finalized.
-*/
+   pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
eeh_add_device_early(pdn);
eeh_add_device_late(pdev);
eeh_sysfs_add_device(pdev);
@@ -393,6 +390,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
int ret;
int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
+   pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
+   __func__, hose->global_number, pdn->busno,
+   PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+
/*
 * When probing the root bridge, which doesn't have any
 * subordinate PCI devices. We don't have OF node for
@@ -487,6 +488,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
/* Save memory bars */
eeh_save_bars(edev);
 
+   pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
+   __func__, pdn->busno, PCI_SLOT(pdn->devfn),
+   PCI_FUNC(pdn->devfn), edev->pe->phb->global_number,
+   edev->pe->addr);
+
return NULL;
 }
 
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 07e3fc2667aa..31733f6d642c 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -52,6 +52,8 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
if (!pdev->is_virtfn)
return;
 
+   pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev));
+
pdn->device_id  =  pdev->device;
pdn->vendor_id  =  pdev->vendor;
pdn->class_code =  pdev->class;
@@ -238,6 +240,10 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
int enable = 0;
int ret;
 
+   pr_debug("%s: probing %04x:%02x:%02x.%01x\n",
+   __func__, pdn->phb->global_number, pdn->busno,
+   PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+
/* Retrieve OF node and eeh device */
edev = pdn_to_eeh_dev(pdn);
if (!edev || edev->pe)
@@ -281,7 +287,12 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
 
/* Enable EEH on the device */
ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE);
-   if (!ret) {
+   if (ret) {
+   pr_debug("%s: EEH failed to enable on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n",
+   __func__, pdn->busno, PCI_SLOT(pdn->devfn),
+   PCI_FUNC(pdn->devfn), pe.phb->global_number,
+   pe.addr, ret);
+   } else {
/* Retrieve PE address */
edev->pe_config_addr = eeh_ops->get_pe_addr();
pe.addr = edev->pe_config_addr;
@@ -297,11 +308,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
if (enable) {
eeh_add_flag(EEH_ENABLED);
eeh_add_to_parent_pe(edev);
-
-   pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
-   __func__, pdn->busno, PCI_SLOT(pdn->devfn),
-   PCI_FUNC(pdn->devfn), pe.phb->global_number,
-   pe.addr);
} else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
   (pdn_to_eeh_dev(pdn->parent))->pe) {
/* This device doesn't support EEH, but it may have an
@@ -310,6 +316,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
edev->pe_config_addr = 

[PATCH v5 10/12] powerpc/eeh: Fix crash when edev->pdev changes

2019-08-15 Thread Sam Bobroff
If a PCI device is removed during eeh_pe_report_edev(), between the
calls to device_lock() and device_unlock(), edev->pdev will change and
cause a crash as the wrong mutex is released.

To correct this, hold the PCI rescan/remove lock while taking a copy
of edev->pdev and performing a get_device() on it.  Use this value to
release the mutex, but also pass it through to the device driver's EEH
handlers so that they always see the same device.
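The crash, in miniature (a sketch of the race this closes):

	device_lock(&edev->pdev->dev);		/* locks device A */
	/* concurrent remove/rescan swaps edev->pdev to device B */
	device_unlock(&edev->pdev->dev);	/* unlocks B; A stays locked */

Taking a counted reference under the rescan/remove lock pins a single
struct pci_dev for the whole lock/report/unlock sequence.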

Signed-off-by: Sam Bobroff 
---
v5 * New in this version.

 arch/powerpc/kernel/eeh_driver.c | 44 +---
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 670b6159cef1..d0d175f22129 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -258,20 +258,27 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
 }
 
 typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *,
+struct pci_dev *,
 struct pci_driver *);
 static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
   enum pci_ers_result *result)
 {
+   struct pci_dev *pdev;
struct pci_driver *driver;
enum pci_ers_result new_result;
 
-   if (!edev->pdev) {
+   pci_lock_rescan_remove();
+   pdev = edev->pdev;
+   if (pdev)
+   get_device(&pdev->dev);
+   pci_unlock_rescan_remove();
+   if (!pdev) {
eeh_edev_info(edev, "no device");
return;
}
-   device_lock(&edev->pdev->dev);
+   device_lock(&pdev->dev);
if (eeh_edev_actionable(edev)) {
-   driver = eeh_pcid_get(edev->pdev);
+   driver = eeh_pcid_get(pdev);
 
if (!driver)
eeh_edev_info(edev, "no driver");
@@ -280,7 +287,7 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
else if (edev->mode & EEH_DEV_NO_HANDLER)
eeh_edev_info(edev, "driver bound too late");
else {
-   new_result = fn(edev, driver);
+   new_result = fn(edev, pdev, driver);
eeh_edev_info(edev, "%s driver reports: '%s'",
  driver->name,
  pci_ers_result_name(new_result));
@@ -289,12 +296,15 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
   new_result);
}
if (driver)
-   eeh_pcid_put(edev->pdev);
+   eeh_pcid_put(pdev);
} else {
-   eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev,
+   eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev,
  !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
}
-   device_unlock(&edev->pdev->dev);
+   device_unlock(&pdev->dev);
+   if (edev->pdev != pdev)
+   eeh_edev_warn(edev, "Device changed during processing!\n");
+   put_device(&pdev->dev);
 }
 
 static void eeh_pe_report(const char *name, struct eeh_pe *root,
@@ -321,20 +331,20 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root,
  * Report an EEH error to each device driver.
  */
 static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
+   struct pci_dev *pdev,
struct pci_driver *driver)
 {
enum pci_ers_result rc;
-   struct pci_dev *dev = edev->pdev;
 
if (!driver->err_handler->error_detected)
return PCI_ERS_RESULT_NONE;
 
eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
  driver->name);
-   rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+   rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen);
 
edev->in_error = true;
-   pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
+   pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE);
return rc;
 }
 
@@ -347,12 +357,13 @@ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
  * are now enabled.
  */
 static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
+  struct pci_dev *pdev,
   struct pci_driver *driver)
 {
if (!driver->err_handler->mmio_enabled)
return PCI_ERS_RESULT_NONE;
eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
-   return driver->err_handler->mmio_enabled(edev->pdev);
+   return driver->err_handler->mmio_enabled(pdev);
 }
 
 /**
@@ -366,12 +377,13 @@ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
  * 

[PATCH v5 06/12] powerpc/eeh: Refactor around eeh_probe_devices()

2019-08-15 Thread Sam Bobroff
Now that EEH support for all devices (on PowerNV and pSeries) is
provided by the pcibios bus add device hooks, eeh_probe_devices() and
eeh_addr_cache_build() are redundant and can be removed.

Move the EEH enabled message into its own function so that it can be
called from multiple places.

Note that previously on pSeries, useless EEH sysfs files were created
for some devices that did not have EEH support, and this change
prevents them from being created.

Signed-off-by: Sam Bobroff 
---
 arch/powerpc/include/asm/eeh.h   |  7 ++---
 arch/powerpc/kernel/eeh.c| 27 ++---
 arch/powerpc/kernel/eeh_cache.c  | 32 
 arch/powerpc/platforms/powernv/eeh-powernv.c |  4 +--
 arch/powerpc/platforms/pseries/pci.c |  3 +-
 5 files changed, 14 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 20105964287a..7f9404a0c3bb 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -270,13 +270,12 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
-void eeh_probe_devices(void);
+void eeh_show_enabled(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
 void eeh_addr_cache_init(void);
-void eeh_addr_cache_build(void);
 void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
 void eeh_add_device_late(struct pci_dev *);
@@ -320,7 +319,7 @@ static inline bool eeh_enabled(void)
 return false;
 }
 
-static inline void eeh_probe_devices(void) { }
+static inline void eeh_show_enabled(void) { }
 
 static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
 {
@@ -338,8 +337,6 @@ static inline int eeh_check_failure(const volatile void __iomem *token)
 
 static inline void eeh_addr_cache_init(void) { }
 
-static inline void eeh_addr_cache_build(void) { }
-
 static inline void eeh_add_device_early(struct pci_dn *pdn) { }
 
 static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { }
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 87edac6f2fd9..c0ec1b6b1e69 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -150,6 +150,16 @@ static int __init eeh_setup(char *str)
 }
 __setup("eeh=", eeh_setup);
 
+void eeh_show_enabled(void)
+{
+   if (eeh_has_flag(EEH_FORCE_DISABLED))
+   pr_info("EEH: Recovery disabled by kernel parameter.\n");
+   else if (eeh_has_flag(EEH_ENABLED))
+   pr_info("EEH: Capable adapter found: recovery enabled.\n");
+   else
+   pr_info("EEH: No capable adapters found: recovery disabled.\n");
+}
+
 /*
  * This routine captures assorted PCI configuration space data
  * for the indicated PCI device, and puts them into a buffer
@@ -1143,23 +1153,6 @@ static struct notifier_block eeh_reboot_nb = {
.notifier_call = eeh_reboot_notifier,
 };
 
-void eeh_probe_devices(void)
-{
-   struct pci_controller *hose, *tmp;
-   struct pci_dn *pdn;
-
-   /* Enable EEH for all adapters */
-   list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-   pdn = hose->pci_data;
-   traverse_pci_dn(pdn, eeh_ops->probe, NULL);
-   }
-   if (eeh_enabled())
-   pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
-   else
-   pr_info("EEH: No capable adapters found\n");
-
-}
-
 /**
  * eeh_init - EEH initialization
  *
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index a790fa49c62d..8c8649172e97 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -265,38 +265,6 @@ void eeh_addr_cache_init(void)
spin_lock_init(&pci_io_addr_cache_root.piar_lock);
 }
 
-/**
- * eeh_addr_cache_build - Build a cache of I/O addresses
- *
- * Build a cache of pci i/o addresses.  This cache will be used to
- * find the pci device that corresponds to a given address.
- * This routine scans all pci busses to build the cache.
- * Must be run late in boot process, after the pci controllers
- * have been scanned for devices (after all device resources are known).
- */
-void eeh_addr_cache_build(void)
-{
-   struct pci_dn *pdn;
-   struct eeh_dev *edev;
-   struct pci_dev *dev = NULL;
-
-   for_each_pci_dev(dev) {
-   pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
-   if (!pdn)
-   continue;
-
-   edev = pdn_to_eeh_dev(pdn);
-   if (!edev)
-   continue;
-
-   dev->dev.archdata.edev = edev;
-   edev->pdev = dev;
-
-   eeh_addr_cache_insert_dev(dev);
-   

[PATCH v5 08/12] powerpc/eeh: Introduce EEH edev logging macros

2019-08-15 Thread Sam Bobroff
Now that struct eeh_dev includes the BDFN of its PCI device, make use
of it to replace eeh_edev_info() with a set of dev_dbg()-style macros
that only need a struct edev.

With the BDFN available without the struct pci_dev, eeh_pci_name() is
now unnecessary, so remove it.

While only the "info" level function is used here, the others will be
used in followup work.
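Usage, for illustration (the expansion shown is a sketch with made-up
example values):

	eeh_edev_dbg(edev, "Ignored check\n");
	/* prints roughly:
	 *	PCI 0001:03:00.0#00ff: EEH: Ignored check
	 * i.e. domain:bus:device.function followed by the PE config
	 * address (a placeholder when the edev has no PE yet).
	 */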

Signed-off-by: Sam Bobroff 
---
 arch/powerpc/include/asm/eeh.h   | 11 +++
 arch/powerpc/kernel/eeh_driver.c | 17 -
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index bbe0798f6624..e1023a556721 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -138,6 +138,17 @@ struct eeh_dev {
struct pci_dev *physfn; /* Associated SRIOV PF  */
 };
 
+/* "fmt" must be a simple literal string */
+#define EEH_EDEV_PRINT(level, edev, fmt, ...) \
+   pr_##level("PCI %04x:%02x:%02x.%x#%04x: EEH: " fmt, \
+   (edev)->controller->global_number, PCI_BUSNO((edev)->bdfn), \
+   PCI_SLOT((edev)->bdfn), PCI_FUNC((edev)->bdfn), \
+   ((edev)->pe ? (edev)->pe_config_addr : 0xffff), ##__VA_ARGS__)
+#define eeh_edev_dbg(edev, fmt, ...) EEH_EDEV_PRINT(debug, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_info(edev, fmt, ...) EEH_EDEV_PRINT(info, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_warn(edev, fmt, ...) EEH_EDEV_PRINT(warn, (edev), fmt, ##__VA_ARGS__)
+#define eeh_edev_err(edev, fmt, ...) EEH_EDEV_PRINT(err, (edev), fmt, ##__VA_ARGS__)
+
 static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev)
 {
return edev ? edev->pdn : NULL;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index d6f54840a3a9..29424d5e5fea 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -82,23 +82,6 @@ static const char *pci_ers_result_name(enum pci_ers_result result)
}
 };
 
-static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev,
-const char *fmt, ...)
-{
-   struct va_format vaf;
-   va_list args;
-
-   va_start(args, fmt);
-
-   vaf.fmt = fmt;
-   vaf.va = &args;
-
-   printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr,
-  edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf);
-
-   va_end(args);
-}
-
 static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
enum pci_ers_result new)
 {
-- 
2.22.0.216.g00a2a96fc9



[PATCH v5 12/12] powerpc/eeh: Slightly simplify eeh_add_to_parent_pe()

2019-08-15 Thread Sam Bobroff
Simplify some needlessly complicated boolean logic in
eeh_add_to_parent_pe().

Signed-off-by: Sam Bobroff 
---
v5 * New in this version.

 arch/powerpc/kernel/eeh_pe.c | 52 +++-
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 9c38fa7c33aa..1a6254bcf056 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -383,32 +383,34 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 * components.
 */
pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
-   if (pe && !(pe->type & EEH_PE_INVALID)) {
-   /* Mark the PE as type of PCI bus */
-   pe->type = EEH_PE_BUS;
-   edev->pe = pe;
-
-   /* Put the edev to PE */
-   list_add_tail(>entry, >edevs);
-   eeh_edev_dbg(edev, "Added to bus PE\n");
-   return 0;
-   } else if (pe && (pe->type & EEH_PE_INVALID)) {
-   list_add_tail(>entry, >edevs);
-   edev->pe = pe;
-   /*
-* We're running to here because of PCI hotplug caused by
-* EEH recovery. We need clear EEH_PE_INVALID until the top.
-*/
-   parent = pe;
-   while (parent) {
-   if (!(parent->type & EEH_PE_INVALID))
-   break;
-   parent->type &= ~EEH_PE_INVALID;
-   parent = parent->parent;
-   }
+   if (pe) {
+   if (pe->type & EEH_PE_INVALID) {
+   list_add_tail(>entry, >edevs);
+   edev->pe = pe;
+   /*
+* We're running to here because of PCI hotplug caused by
+* EEH recovery. We need clear EEH_PE_INVALID until the top.
+*/
+   parent = pe;
+   while (parent) {
+   if (!(parent->type & EEH_PE_INVALID))
+   break;
+   parent->type &= ~EEH_PE_INVALID;
+   parent = parent->parent;
+   }
 
-   eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
-pe->parent->addr);
+   eeh_edev_dbg(edev,
+"Added to device PE (parent: PE#%x)\n",
+pe->parent->addr);
+   } else {
+   /* Mark the PE as type of PCI bus */
+   pe->type = EEH_PE_BUS;
+   edev->pe = pe;
+
+   /* Put the edev to PE */
+   list_add_tail(>entry, >edevs);
+   eeh_edev_dbg(edev, "Added to bus PE\n");
+   }
return 0;
}
 
-- 
2.22.0.216.g00a2a96fc9



[PATCH v5 07/12] powerpc/eeh: Add bdfn field to eeh_dev

2019-08-15 Thread Sam Bobroff
From: Oliver O'Halloran 

Preparation for removing pci_dn from the powernv EEH code. The only
thing we really use pci_dn for is to get the bdfn of the device for
config space accesses, so adding that information to eeh_dev reduces
the need to carry around the pci_dn.
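The packing is the standard PCI one (a sketch; PCI_SLOT() and
PCI_FUNC() are the existing kernel helpers, and their masks make them
safe to apply to a full bdfn):

	/* bdfn layout: bits 15..8 bus, 7..3 device, 2..0 function */
	edev->bdfn = (pdn->busno << 8) | pdn->devfn;	/* as in eeh_dev_init() */

	bus  = PCI_BUSNO(edev->bdfn);	/* (bdfn >> 8) & 0xff, added below */
	slot = PCI_SLOT(edev->bdfn);
	func = PCI_FUNC(edev->bdfn);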

Signed-off-by: Oliver O'Halloran 
[SB: Re-wrapped commit message, fixed whitespace damage.]
Signed-off-by: Sam Bobroff 
---
 arch/powerpc/include/asm/eeh.h | 2 ++
 arch/powerpc/include/asm/ppc-pci.h | 2 ++
 arch/powerpc/kernel/eeh_dev.c  | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7f9404a0c3bb..bbe0798f6624 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -121,6 +121,8 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 struct eeh_dev {
int mode;   /* EEH mode */
int class_code; /* Class code of the device */
+   int bdfn;   /* bdfn of device (for cfg ops) */
+   struct pci_controller *controller;
int pe_config_addr; /* PE config address*/
u32 config_space[16];   /* Saved PCI config space   */
int pcix_cap;   /* Saved PCIx capability*/
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index cec2d6409515..72860de205a0 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -74,6 +74,8 @@ static inline const char *eeh_driver_name(struct pci_dev *pdev)
 
 #endif /* CONFIG_EEH */
 
+#define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff)
+
 #else /* CONFIG_PCI */
 static inline void init_pci_config_tokens(void) { }
 #endif /* !CONFIG_PCI */
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index c4317c452d98..7370185c7a05 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -47,6 +47,8 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
/* Associate EEH device with OF node */
pdn->edev = edev;
edev->pdn = pdn;
+   edev->bdfn = (pdn->busno << 8) | pdn->devfn;
+   edev->controller = pdn->phb;
 
return edev;
 }
-- 
2.22.0.216.g00a2a96fc9



[PATCH v5 02/12] powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag

2019-08-15 Thread Sam Bobroff
The EEH_DEV_NO_HANDLER flag is used by the EEH system to prevent the
use of driver callbacks in drivers that have been bound part way
through the recovery process. This is necessary to prevent later stage
handlers from being called when the earlier stage handlers haven't,
which can be confusing for drivers.

However, the flag is set for all devices that are added after boot
time and only cleared at the end of the EEH recovery process. This
results in hot plugged devices erroneously having the flag set during
the first recovery after they are added (causing their driver's
handlers to be incorrectly ignored).

To remedy this, clear the flag at the beginning of recovery
processing. The flag is still cleared at the end of recovery
processing, although it is no longer really necessary.

Also clear the flag during eeh_handle_special_event(), for the same
reasons.

Signed-off-by: Sam Bobroff 
---
 arch/powerpc/kernel/eeh_driver.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 6f3ee30565dd..d6f54840a3a9 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -819,6 +819,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
result = PCI_ERS_RESULT_DISCONNECT;
}
 
+   eeh_for_each_pe(pe, tmp_pe)
+   eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+   edev->mode &= ~EEH_DEV_NO_HANDLER;
+
/* Walk the various device drivers attached to this slot through
 * a reset sequence, giving each an opportunity to do what it needs
 * to accomplish the reset.  Each child gets a report of the
@@ -1009,7 +1013,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
  */
 void eeh_handle_special_event(void)
 {
-   struct eeh_pe *pe, *phb_pe;
+   struct eeh_pe *pe, *phb_pe, *tmp_pe;
+   struct eeh_dev *edev, *tmp_edev;
struct pci_bus *bus;
struct pci_controller *hose;
unsigned long flags;
@@ -1078,6 +1083,10 @@ void eeh_handle_special_event(void)
(phb_pe->state & EEH_PE_RECOVERING))
continue;
 
+   eeh_for_each_pe(pe, tmp_pe)
+   eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
+   edev->mode &= ~EEH_DEV_NO_HANDLER;
+
/* Notify all devices to be down */
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
eeh_set_channel_state(pe, pci_channel_io_perm_failure);
-- 
2.22.0.216.g00a2a96fc9



[PATCH v5 04/12] powerpc/eeh: Initialize EEH address cache earlier

2019-08-15 Thread Sam Bobroff
The EEH address cache is currently initialized and populated by a
single function: eeh_addr_cache_build().  While the initial population
of the cache can only be done once resources are allocated,
initialization (just setting up a spinlock) could be done much
earlier.

So move the initialization step into a separate function and call it
from a core_initcall (rather than a subsys initcall).

This will allow future work to make use of the cache during boot time
PCI scanning.
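For reference, the initcall ordering this relies on (a kernel-wide
convention, unchanged by this patch):

	/* initcall levels run in order:
	 *   core_initcall (1) ... subsys_initcall (4) ...
	 * so a spinlock set up at core_initcall time is ready well before
	 * the subsys-level PCI scanning that populates the cache.
	 */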

Signed-off-by: Sam Bobroff 
Reviewed-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/eeh.h  |  3 +++
 arch/powerpc/kernel/eeh.c   |  2 ++
 arch/powerpc/kernel/eeh_cache.c | 13 +++--
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 45c9b26e3cce..20105964287a 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -275,6 +275,7 @@ int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
 int eeh_dev_check_failure(struct eeh_dev *edev);
+void eeh_addr_cache_init(void);
 void eeh_addr_cache_build(void);
 void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
@@ -335,6 +336,8 @@ static inline int eeh_check_failure(const volatile void __iomem *token)
 
 #define eeh_dev_check_failure(x) (0)
 
+static inline void eeh_addr_cache_init(void) { }
+
 static inline void eeh_addr_cache_build(void) { }
 
 static inline void eeh_add_device_early(struct pci_dn *pdn) { }
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 846cc697030c..ca8b0c58a6a7 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1200,6 +1200,8 @@ static int eeh_init(void)
list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
eeh_dev_phb_init_dynamic(hose);
 
+   eeh_addr_cache_init();
+
/* Initialize EEH event */
return eeh_event_init();
 }
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index 320472373122..a790fa49c62d 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -254,6 +254,17 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
 }
 
+/**
+ * eeh_addr_cache_init - Initialize a cache of I/O addresses
+ *
+ * Initialize a cache of pci i/o addresses.  This cache will be used to
+ * find the pci device that corresponds to a given address.
+ */
+void eeh_addr_cache_init(void)
+{
+   spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+}
+
 /**
  * eeh_addr_cache_build - Build a cache of I/O addresses
  *
@@ -269,8 +280,6 @@ void eeh_addr_cache_build(void)
struct eeh_dev *edev;
struct pci_dev *dev = NULL;
 
-   spin_lock_init(&pci_io_addr_cache_root.piar_lock);
-
for_each_pci_dev(dev) {
pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn);
if (!pdn)
-- 
2.22.0.216.g00a2a96fc9



[PATCH v5 01/12] powerpc/64: Adjust order in pcibios_init()

2019-08-15 Thread Sam Bobroff
The pcibios_init() function for PowerPC 64 currently calls
pci_bus_add_devices() before pcibios_resource_survey(). This means
that at boot time, when the pcibios_bus_add_device() hooks are called
by pci_bus_add_devices(), device resources have not been allocated and
they are unable to perform EEH setup, so a separate pass is needed.

This patch adjusts that order so that it will become possible to
consolidate the EEH setup work into a single location.

The only functional change is to execute pcibios_resource_survey()
(excepting ppc_md.pcibios_fixup(), see below) before
pci_bus_add_devices() instead of after it.

Because pcibios_scan_phb() and pci_bus_add_devices() are called
together in a loop, this must be broken into one loop for each call.
Then the call to pcibios_resource_survey() is moved up in between
them. This changes the ordering but because pcibios_resource_survey()
also calls ppc_md.pcibios_fixup(), that call is extracted out into
pcibios_init() to where pcibios_resource_survey() was, so that it is
not moved.

The only other caller of pcibios_resource_survey() is the PowerPC 32
version of pcibios_init(), and therefore, that is modified to call
ppc_md.pcibios_fixup() right after pcibios_resource_survey() so that
there is no functional change there at all.

The re-arrangement will cause very few side-effects because at this
stage in the boot, pci_bus_add_devices() does very little:
- pci_create_sysfs_dev_files() does nothing (no sysfs yet)
- pci_proc_attach_device() does nothing (no proc yet)
- device_attach() does nothing (no drivers yet)
This leaves only the pci_final_fixup calls, D3 support, and marking
the device as added. Of those, only the pci_final_fixup calls have the
potential to be affected by resource allocation.

The only pci_final_fixup handlers that touch resources seem to be one
for x86 (pci_amd_enable_64bit_bar()), and a PowerPC 32 platform driver
(quirk_final_uli1575()), neither of which use this pcibios_init()
function. Even if they did, it would almost certainly be a bug, under
the current ordering, to rely on or make changes to resources before
they were allocated.
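To make the reordering concrete, a side-by-side sketch of the ppc64
pcibios_init() flow (paraphrased from the diff below):

	/* before:                        after:
	 *   for each phb:                  for each phb:
	 *     pcibios_scan_phb()             pcibios_scan_phb()
	 *     pci_bus_add_devices()        pcibios_resource_survey()
	 *   pcibios_resource_survey()      for each phb:
	 *     -> ppc_md.pcibios_fixup()      pci_bus_add_devices()
	 *                                  ppc_md.pcibios_fixup()
	 */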

Signed-off-by: Sam Bobroff 
Reviewed-by: Alexey Kardashevskiy 
---
v5 - Complete rewrite of commit message based on more research.

 arch/powerpc/kernel/pci-common.c |  4 
 arch/powerpc/kernel/pci_32.c |  4 
 arch/powerpc/kernel/pci_64.c | 12 +---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index f627e15bb43c..1c448cf25506 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1379,10 +1379,6 @@ void __init pcibios_resource_survey(void)
pr_debug("PCI: Assigning unassigned resources...\n");
pci_assign_unassigned_resources();
}
-
-   /* Call machine dependent fixup */
-   if (ppc_md.pcibios_fixup)
-   ppc_md.pcibios_fixup();
 }
 
 /* This is used by the PCI hotplug driver to allocate resource
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index 50942a1d1a5f..b49e1060a3bf 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -263,6 +263,10 @@ static int __init pcibios_init(void)
/* Call common code to handle resource allocation */
pcibios_resource_survey();
 
+   /* Call machine dependent fixup */
+   if (ppc_md.pcibios_fixup)
+   ppc_md.pcibios_fixup();
+
/* Call machine dependent post-init code */
if (ppc_md.pcibios_after_init)
ppc_md.pcibios_after_init();
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index b7030b1189d0..f83d1f69b1dd 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -54,14 +54,20 @@ static int __init pcibios_init(void)
pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0);
 
/* Scan all of the recorded PCI controllers.  */
-   list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+   list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
pcibios_scan_phb(hose);
-   pci_bus_add_devices(hose->bus);
-   }
 
/* Call common code to handle resource allocation */
pcibios_resource_survey();
 
+   /* Add devices. */
+   list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+   pci_bus_add_devices(hose->bus);
+
+   /* Call machine dependent fixup */
+   if (ppc_md.pcibios_fixup)
+   ppc_md.pcibios_fixup();
+
printk(KERN_DEBUG "PCI: Probing PCI hardware done\n");
 
return 0;
-- 
2.22.0.216.g00a2a96fc9



[PATCH v5 00/12]

2019-08-15 Thread Sam Bobroff
Hi all,

Here is v5, with a complete rewrite of the commit message for patch 1, and the
inclusion of three patches from another set which are based on this set
(previously titled "EEH fixes 4").

Cover letter:

This patch set adds support for EEH recovery of hot plugged devices on pSeries
machines. Specifically, devices discovered by PCI rescanning using
/sys/bus/pci/rescan, which includes devices hotplugged by QEMU's device_add
command. (Upstream Linux pSeries guests running under QEMU/KVM don't currently
use slot power control for hotplugging.)

As a side effect this also provides EEH support for devices removed by
/sys/bus/pci/devices/*/remove and re-discovered by writing to 
/sys/bus/pci/rescan,
on all platforms.

The approach I've taken is to use the fact that the existing
pcibios_bus_add_device() platform hooks (which are used to set up EEH on
Virtual Function devices (VFs)) are actually called for all devices, so I've
widened their scope and made other adjustments necessary to allow them to work
for hotplugged and boot-time devices as well.

Because some of the changes are in generic PowerPC code, it's
possible that I've disturbed something for another PowerPC platform. I've tried
to minimize this by leaving that code alone as much as possible and so there
are a few cases where eeh_add_device_{early,late}() or eeh_add_sysfs_files() is
called more than once. I think these can be looked at later, as duplicate calls
are not harmful.

The first patch is a rework of the pcibios_init reordering patch I posted
earlier, which I've included here because it's necessary for this set.

I have done some testing for PowerNV on Power9 using a modified pnv_php module
and some testing on pSeries with slot power control using a modified rpaphp
module, and the EEH-related parts seem to work.

Cheers,
Sam.

Patch set changelog follows:

Patch set v5: 
Patch 1/12: powerpc/64: Adjust order in pcibios_init()
- Complete rewrite of commit message based on more research.
Patch 2/12: powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag
Patch 3/12: powerpc/eeh: Improve debug messages around device addition
Patch 4/12: powerpc/eeh: Initialize EEH address cache earlier
Patch 5/12: powerpc/eeh: EEH for pSeries hot plug
Patch 6/12: powerpc/eeh: Refactor around eeh_probe_devices()
Patch 7/12: powerpc/eeh: Add bdfn field to eeh_dev
Patch 8/12: powerpc/eeh: Introduce EEH edev logging macros
Patch 9/12: powerpc/eeh: Convert log messages to eeh_edev_* macros
Patch 10/12 (new in this version): powerpc/eeh: Fix crash when edev->pdev 
changes
Patch 11/12 (new in this version): powerpc/eeh: Remove unused return path from 
eeh_pe_dev_traverse()
Patch 12/12 (new in this version): powerpc/eeh: Slightly simplify 
eeh_add_to_parent_pe()

Patch set v4: 
Patch 1/9: powerpc/64: Adjust order in pcibios_init()
Patch 2/9: powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag
Patch 3/9: powerpc/eeh: Improve debug messages around device addition
Patch 4/9: powerpc/eeh: Initialize EEH address cache earlier
Patch 5/9: powerpc/eeh: EEH for pSeries hot plug
Patch 6/9: powerpc/eeh: Refactor around eeh_probe_devices()
Patch 7/9: powerpc/eeh: Add bdfn field to eeh_dev
Patch 8/9: powerpc/eeh: Introduce EEH edev logging macros
Patch 9/9: powerpc/eeh: Convert log messages to eeh_edev_* macros
- Fixed compile warning when compiling without CONFIG_IOV.

Patch set v3: 
Patch 1/9: powerpc/64: Adjust order in pcibios_init()
Patch 2/9: powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag
Patch 3/9: powerpc/eeh: Improve debug messages around device addition
Patch 4/9: powerpc/eeh: Initialize EEH address cache earlier
Patch 5/9: powerpc/eeh: EEH for pSeries hot plug
Patch 6/9: powerpc/eeh: Refactor around eeh_probe_devices()
Patch 7/9 (new in this version): powerpc/eeh: Add bdfn field to eeh_dev
Patch 8/9 (new in this version): powerpc/eeh: Introduce EEH edev logging macros
Patch 9/9 (new in this version): powerpc/eeh: Convert log messages to 
eeh_edev_* macros

Patch set v2: 
Patch 1/6: powerpc/64: Adjust order in pcibios_init()
Patch 2/6: powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag
* Also clear EEH_DEV_NO_HANDLER in eeh_handle_special_event().
Patch 3/6 (was 4/8): powerpc/eeh: Improve debug messages around device addition
Patch 4/6 (was 6/8): powerpc/eeh: Initialize EEH address cache earlier
Patch 5/6 (was 3/8 and 7/8): powerpc/eeh: EEH for pSeries hot plug
- Dropped changes to the PowerNV PHB EEH flag, instead refactor just enough to
  use the existing flag from multiple places.
- Merge the little remaining work from the above change into the patch where
  it's used.
Patch 6/6 (was 5/8 and 8/8): powerpc/eeh: Refactor around eeh_probe_devices()
- As it's so small, merged the enablement message patch into this one (where 
it's used).
- Reworked enablement messages.

Patch set v1:
Patch 1/8: powerpc/64: Adjust order in pcibios_init()
Patch 2/8: powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag
Patch 3/8: powerpc/eeh: Convert PNV_PHB_FLAG_EEH to global flag
Patch 4/8: 

RE: [PATCHv3 2/3] arm64: dts: ls1028a: Add PCIe controller DT nodes

2019-08-15 Thread Z.q. Hou
Hi Xiaowei,

> -Original Message-
> From: Xiaowei Bao 
> Sent: 2019年8月6日 14:16
> To: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.com; a...@arndb.de; gre...@linuxfoundation.org;
> M.h. Lian ; Mingkai Hu ;
> Z.q. Hou ; Roy Zang ;
> kstew...@linuxfoundation.org; pombreda...@nexb.com;
> shawn@rock-chips.com; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-ker...@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-dev@lists.ozlabs.org
> Cc: Xiaowei Bao ; Z.q. Hou
> 
> Subject: [PATCHv3 2/3] arm64: dts: ls1028a: Add PCIe controller DT nodes
> 
> LS1028a implements 2 PCIe 3.0 controllers.
> 
> Signed-off-by: Xiaowei Bao 
> Signed-off-by: Hou Zhiqiang 
> ---
> v2:
>  - Fix up the legacy INTx allocate failed issue.
> v3:
>  - no change.
> 
>  arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 52
> ++
>  1 file changed, 52 insertions(+)
> 
> diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
> b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
> index aef5b06..0b542ed 100644
> --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
> +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi
> @@ -503,6 +503,58 @@
>   status = "disabled";
>   };
> 
> + pcie@340 {
> + compatible = "fsl,ls1028a-pcie";
> + reg = <0x00 0x0340 0x0 0x0010   /* controller
> registers */
> +0x80 0x 0x0 0x2000>; /* configuration
> space */
> + reg-names = "regs", "config";
> + interrupts = , /* PME
> interrupt */
> +  ; /* aer
> interrupt */
> + interrupt-names = "pme", "aer";
> + #address-cells = <3>;
> + #size-cells = <2>;
> + device_type = "pci";
> + dma-coherent;
> + num-lanes = <4>;

Remove the num-lanes property; it is not needed by Layerscape PCIe
controllers. See:
http://patchwork.ozlabs.org/project/linux-pci/list/?series=124488

> + bus-range = <0x0 0xff>;
> + ranges = <0x8100 0x0 0x 0x80 0x0001 0x0
> 0x0001   /* downstream I/O */
> +   0x8200 0x0 0x4000 0x80 0x4000 0x0
> 0x4000>; /* non-prefetchable memory */
> + msi-parent = <>;
> + #interrupt-cells = <1>;
> + interrupt-map-mask = <0 0 0 7>;
> + interrupt-map = < 0 0 1  0 0 GIC_SPI 109
> IRQ_TYPE_LEVEL_HIGH>,
> + < 0 0 2  0 0 GIC_SPI 110
> IRQ_TYPE_LEVEL_HIGH>,
> + < 0 0 3  0 0 GIC_SPI 111
> IRQ_TYPE_LEVEL_HIGH>,
> + < 0 0 4  0 0 GIC_SPI 112
> IRQ_TYPE_LEVEL_HIGH>;
> + status = "disabled";
> + };
> +
> + pcie@350 {
> + compatible = "fsl,ls1028a-pcie";
> + reg = <0x00 0x0350 0x0 0x0010   /* controller
> registers */
> +0x88 0x 0x0 0x2000>; /* configuration
> space */
> + reg-names = "regs", "config";
> + interrupts = ,
> +  ;
> + interrupt-names = "pme", "aer";
> + #address-cells = <3>;
> + #size-cells = <2>;
> + device_type = "pci";
> + dma-coherent;
> + num-lanes = <4>;

Ditto

> + bus-range = <0x0 0xff>;
> + ranges = <0x8100 0x0 0x 0x88 0x0001 0x0
> 0x0001   /* downstream I/O */
> +   0x8200 0x0 0x4000 0x88 0x4000 0x0
> 0x4000>; /* non-prefetchable memory */
> + msi-parent = <>;
> + #interrupt-cells = <1>;
> + interrupt-map-mask = <0 0 0 7>;
> + interrupt-map = < 0 0 1  0 0 GIC_SPI 114
> IRQ_TYPE_LEVEL_HIGH>,
> + < 0 0 2  0 0 GIC_SPI 115
> IRQ_TYPE_LEVEL_HIGH>,
> + < 0 0 3  0 0 GIC_SPI 116
> IRQ_TYPE_LEVEL_HIGH>,
> + < 0 0 4  0 0 GIC_SPI 117
> IRQ_TYPE_LEVEL_HIGH>;
> + status = "disabled";
> + };
> +
>   pcie@1f000 { /* Integrated Endpoint Root Complex */
>   compatible = "pci-host-ecam-generic";
>   reg = <0x01 0xf000 0x0 0x10>;
> --
> 2.9.5



Re: [PATCH 0/4] powerpc: KASAN for 64-bit Book3S on Radix

2019-08-15 Thread Daniel Axtens
Christophe Leroy  writes:

> Le 07/08/2019 à 01:38, Daniel Axtens a écrit :
>> Building on the work of Christophe, Aneesh and Balbir, I've ported
>> KASAN to 64-bit Book3S kernels running on the Radix MMU.
>> 
>> It builds on top Christophe's work on 32bit. It also builds on my
>> generic KASAN_VMALLOC series, available at:
>> https://patchwork.kernel.org/project/linux-mm/list/?series=153209
>
> Would be good to send that one to the powerpc list as well.
>

Done for v4.

>> 
>> This provides full inline instrumentation on radix, but does require
>> that you be able to specify the amount of memory on the system at
>> compile time. More details in patch 4.
>> 
>> Notable changes from the RFC:
>> 
>>   - I've dropped Book3E 64-bit for now.
>> 
>>   - Now instead of hacking into the KASAN core to disable module
>> allocations, we use KASAN_VMALLOC.
>> 
>>   - More testing, including on real hardware. This revealed that
>> discontiguous memory is a bit of a headache, at the moment we
>> must disable memory not contiguous from 0.
>> 
>>   - Update to deal with kasan bitops instrumentation that landed
>> between RFC and now.
>
> This is rather independant and also applies to PPC32. Could it be a 
> separate series that Michael could apply earlier ?
>

Will do this and address your feedback on the rest of the series later.

Regards,
Daniel

> Christophe
>
>> 
>>   - Documentation!
>> 
>>   - Various cleanups and tweaks.
>> 
>> I am getting occasional problems on boot of real hardware where it
>> seems vmalloc space mappings don't get installed in time. (We get a
>> BUG that memory is not accessible, but by the time we hit xmon the
>> memory then is accessible!) It happens once every few boots. I haven't
>> yet been able to figure out what is happening and why. I'm going to
>> look in to it, but I think the patches are in good enough shape to
>> review while I work on it.
>> 
>> Regards,
>> Daniel
>> 
>> Daniel Axtens (4):
>>kasan: allow arches to provide their own early shadow setup
>>kasan: support instrumented bitops with generic non-atomic bitops
>>powerpc: support KASAN instrumentation of bitops
>>powerpc: Book3S 64-bit "heavyweight" KASAN support
>> 
>>   Documentation/dev-tools/kasan.rst|   7 +-
>>   Documentation/powerpc/kasan.txt  | 111 ++
>>   arch/powerpc/Kconfig |   4 +
>>   arch/powerpc/Kconfig.debug   |  21 +++
>>   arch/powerpc/Makefile|   7 +
>>   arch/powerpc/include/asm/bitops.h|  25 ++--
>>   arch/powerpc/include/asm/book3s/64/radix.h   |   5 +
>>   arch/powerpc/include/asm/kasan.h |  35 -
>>   arch/powerpc/kernel/process.c|   8 ++
>>   arch/powerpc/kernel/prom.c   |  57 +++-
>>   arch/powerpc/mm/kasan/Makefile   |   1 +
>>   arch/powerpc/mm/kasan/kasan_init_book3s_64.c |  76 ++
>>   include/asm-generic/bitops-instrumented.h| 144 ++-
>>   include/linux/kasan.h|   2 +
>>   lib/Kconfig.kasan|   3 +
>>   mm/kasan/init.c  |  10 ++
>>   16 files changed, 431 insertions(+), 85 deletions(-)
>>   create mode 100644 Documentation/powerpc/kasan.txt
>>   create mode 100644 arch/powerpc/mm/kasan/kasan_init_book3s_64.c
>> 


[PATCH 3/3] powerpc/64s: introduce options to disable use of the tlbie instruction

2019-08-15 Thread Nicholas Piggin
Introduce two options to control the use of the tlbie instruction. A
boot time option which completely disables the kernel using the
instruction, this is currently incompatible with HASH MMU, KVM, and
coherent accelerators.

And a debugfs option can be switched at runtime and avoids using tlbie
for invalidating CPU TLBs for normal process and kernel address
mappings. Coherent accelerators are still managed with tlbie, as will
KVM partition scope translations.

Cross-CPU TLB flushing is implemented with IPIs and tlbiel. This is a
basic implementation which does not attempt to make any optimisation
beyond the tlbie implementation.

This is useful for performance testing among other things. For example
in certain situations on large systems, using IPIs may be faster than
tlbie as they can be directed rather than broadcast. Later we may also
take advantage of the IPIs to do more interesting things such as trim
the mm cpumask more aggressively.
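As a rough sketch of the IPI fallback's shape (do_tlbiel_kernel and
_tlbiel_kernel_broadcast follow the radix_tlb.c additions below;
flush_kernel_tlb() is a hypothetical caller added for illustration):

	static void do_tlbiel_kernel(void *info)
	{
		_tlbiel_pid(0, RIC_FLUSH_ALL);	/* tlbiel: this CPU only */
	}

	static void _tlbiel_kernel_broadcast(void)
	{
		/* run the local flush on every online CPU and wait */
		on_each_cpu(do_tlbiel_kernel, NULL, 1);
	}

	static void flush_kernel_tlb(void)
	{
		if (use_tlbie())
			_tlbie_pid(0, RIC_FLUSH_ALL);	/* one broadcast insn */
		else
			_tlbiel_kernel_broadcast();	/* IPIs + local tlbiel */
	}

At runtime the choice is flipped with the debugfs knob added below (with
powerpc_debugfs_root that should be /sys/kernel/debug/powerpc/tlbie_enabled),
while the disable_tlbie boot parameter turns the instruction off entirely.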

Signed-off-by: Nicholas Piggin 
---
 .../admin-guide/kernel-parameters.txt |   4 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   9 +
 arch/powerpc/kvm/book3s_hv.c  |   6 +
 arch/powerpc/mm/book3s64/pgtable.c|  47 +
 arch/powerpc/mm/book3s64/radix_tlb.c  | 182 --
 drivers/misc/cxl/main.c   |   4 +
 drivers/misc/ocxl/main.c  |   4 +
 7 files changed, 238 insertions(+), 18 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7ccd158b3894..5e5206acc64e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -860,6 +860,10 @@
disable_radix   [PPC]
Disable RADIX MMU mode on POWER9
 
+   disable_tlbie   [PPC]
+   Disable TLBIE instruction. Currently does not work
+   with KVM, with HASH MMU, or with coherent accelerators.
+
disable_cpu_apicid= [X86,APIC,SMP]
Format: 
The number of initial APIC ID for the
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index ebf572ea621e..4b4568b7d4f2 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long address)
 
radix__flush_tlb_pwc(tlb, address);
 }
+
+extern bool tlbie_capable;
+extern bool tlbie_enabled;
+
+static inline bool use_tlbie(void)
+{
+   return tlbie_enabled;
+}
+
 #endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6ec2e4fe86d3..886d92639f1e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5462,6 +5462,12 @@ static int kvmppc_radix_possible(void)
 static int kvmppc_book3s_init_hv(void)
 {
int r;
+
+   if (!tlbie_capable) {
+   pr_err("KVM-HV: Host does not support TLBIE\n");
+   return -ENODEV;
+   }
+
/*
 * FIXME!! Do we need to check on all cpus ?
 */
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index bbedd7f7ab3f..73ba67d6ca37 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -437,3 +438,49 @@ int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid)
 
return 0;
 }
+
+/*
+ * Can tlbie be used.
+ */
+bool tlbie_capable __read_mostly = true;
+EXPORT_SYMBOL(tlbie_capable);
+
+/*
+ * Avoid using tlbie for management of CPU TLBs for kernel and process address
+ * spaces. tlbie may still be used for accelerators, and by KVM.
+ */
+bool tlbie_enabled __read_mostly = true;
+
+
+static int __init setup_disable_tlbie(char *str)
+{
+   if (!radix_enabled()) {
+   pr_err("disable_tlbie: Unable to disable TLBIE with Hash 
MMU.\n");
+   return 1;
+   }
+
+   tlbie_capable = false;
+   tlbie_enabled = false;
+
+   return 1;
+}
+__setup("disable_tlbie", setup_disable_tlbie);
+
+static int __init pgtable_debugfs_setup(void)
+{
+   if (!tlbie_capable)
+   return 0;
+
+   /*
+* There is no locking vs tlb flushing when changing this value.
+* The tlb flushers will see one value or another, and use either
+* tlbie or tlbiel with IPIs. In both cases the TLBs will be
+* invalidated as expected.
+*/
+   debugfs_create_bool("tlbie_enabled", 0600,
+   powerpc_debugfs_root,
+   &tlbie_enabled);
+
+   return 0;
+}
+arch_initcall(pgtable_debugfs_setup);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 

[PATCH 2/3] powerpc/64s/radix: all CPUs should flush local translation structure before turning MMU on

2019-08-15 Thread Nicholas Piggin
Rather than sprinkle various translation structure invalidations
around different places in early boot, have each CPU flush everything
from its local translation structures before enabling its MMU.

Radix guests can execute tlbie(l), so have them tlbiel_all in the same
place as radix host does.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index d60cfa05447a..839e01795211 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -382,11 +382,6 @@ static void __init radix_init_pgtable(void)
 */
register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
pr_info("Process table %p and radix root for kernel: %p\n", process_tb, 
init_mm.pgd);
-   asm volatile("ptesync" : : : "memory");
-   asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
-"r" (TLBIEL_INVAL_SET_LPID), "r" (0));
-   asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-   trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);
 
/*
 * The init_mm context is given the first available (non-zero) PID,
@@ -633,8 +628,7 @@ void __init radix__early_init_mmu(void)
radix_init_pgtable();
/* Switch to the guard PID before turning on MMU */
radix__switch_mmu_context(NULL, &init_mm);
-   if (cpu_has_feature(CPU_FTR_HVMODE))
-   tlbiel_all();
+   tlbiel_all();
 }
 
 void radix__early_init_mmu_secondary(void)
@@ -653,8 +647,7 @@ void radix__early_init_mmu_secondary(void)
}
 
radix__switch_mmu_context(NULL, &init_mm);
-   if (cpu_has_feature(CPU_FTR_HVMODE))
-   tlbiel_all();
+   tlbiel_all();
 }
 
 void radix__mmu_cleanup_all(void)
-- 
2.22.0



[PATCH 1/3] powerpc/64s: Remove mmu_partition_table_set

2019-08-15 Thread Nicholas Piggin
This was an awkward interface that didn't quite suit bare metal boot,
because the host page table base is stored into the partition table at
a different place than the process table base, and the TLB flushing is
unnecessary at boot with the MMU off, provided all CPUs invalidate
all their translation structures before turning their MMU on.

So inline this into kvmhv_set_ptbl_entry, where it pairs up nicely
with the nested case.

Bare metal modifies the partition table directly, and the TLBIEs are
dropped because in HVMODE, all CPUs invalidate their translation
structures before enabling their MMU.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/mmu.h   |  2 --
 arch/powerpc/kvm/book3s_hv.c |  4 +--
 arch/powerpc/kvm/book3s_hv_nested.c  | 35 +++-
 arch/powerpc/mm/book3s64/hash_utils.c|  4 ++-
 arch/powerpc/mm/book3s64/mmu_context.c   |  4 +--
 arch/powerpc/mm/book3s64/pgtable.c   | 30 
 arch/powerpc/mm/book3s64/radix_pgtable.c | 11 +++-
 arch/powerpc/mm/pgtable_64.c |  5 ++--
 8 files changed, 42 insertions(+), 53 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index ba94ce8c22d7..b973a708e8b4 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -256,8 +256,6 @@ extern void radix__mmu_cleanup_all(void);
 
 /* Functions for creating and updating partition table on POWER9 */
 extern void mmu_partition_table_init(void);
-extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
- unsigned long dw1);
 #endif /* CONFIG_PPC64 */
 
 struct mm_struct;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cde3f5a4b3e4..6ec2e4fe86d3 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4786,8 +4786,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 * Since we don't flush the TLB when tearing down a VM,
 * and this lpid might have previously been used,
 * make sure we flush on each core before running the new VM.
-* On POWER9, the tlbie in mmu_partition_table_set_entry()
-* does this flush for us.
+* On POWER9, the flush setting the partition table entry
+* in kvmhv_set_ptbl_entry takes care of this.
 */
if (!cpu_has_feature(CPU_FTR_ARCH_300))
cpumask_setall(&kvm->arch.need_tlb_flush);
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 735e0ac6f5b2..76f37dd8a327 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -411,14 +411,35 @@ static void kvmhv_flush_lpid(unsigned int lpid)
 void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
 {
if (!kvmhv_on_pseries()) {
-   mmu_partition_table_set_entry(lpid, dw0, dw1);
-   return;
-   }
+   unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+   partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+   partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+   /*
+* Global flush of TLBs and partition table caches for this
+* lpid. The type of flush (hash or radix) depends on what the
+* previous use of this partition ID was, not the new use.
+*/
+   asm volatile("ptesync" : : : "memory");
+   if (old & PATB_HR) {
+   asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+"r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+   asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+"r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+   } else {
+   asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+"r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+   }
+   /* do we need fixup here ?*/
+   asm volatile("eieio; tlbsync; ptesync" : : : "memory");
 
-   pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
-   pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
-   /* L0 will do the necessary barriers */
-   kvmhv_flush_lpid(lpid);
+   } else {
+   pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+   pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+   /* L0 will do the necessary barriers */
+   kvmhv_flush_lpid(lpid);
+   }
 }
 
 static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index b8ad14bb1170..e19bc3a5d9a5 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -823,7 +823,9 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
 * For now, UPRT 

[PATCH 0/3] series to optionally disable tlbie for 64s/radix

2019-08-15 Thread Nicholas Piggin
Since the RFC I accounted for feedback, and also made the patch more
complete so we can actually boot and run a 64s/radix kernel without
using tlbie at all.

KVM and bare metal hash are harder to support. Bare metal hash because
it does some TLB invalidation with interrupts disabled (so it can't use
IPIs). It might be possible that those invalidates could be avoided or
the paths changed to enable interrupts, but it would be a much bigger
change.

KVM with radix guests might actually be quite simple to support, so I
can look at that next if we get this merged.

KVM with hash guests might not be feasible to do in a performant way,
because of the design of virtualised hash architecture (host flushes
guest effective addresses), at least it would need special real mode
IPI handlers for TLB flushes.

Thanks,
Nick

Nicholas Piggin (3):
  powerpc/64s: Remove mmu_partition_table_set
  powerpc/64s/radix: all CPUs should flush local translation structure
before turning MMU on
  powerpc/64s: introduce options to disable use of the tlbie instruction

 .../admin-guide/kernel-parameters.txt |   4 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   9 +
 arch/powerpc/include/asm/mmu.h|   2 -
 arch/powerpc/kvm/book3s_hv.c  |  10 +-
 arch/powerpc/kvm/book3s_hv_nested.c   |  35 +++-
 arch/powerpc/mm/book3s64/hash_utils.c |   4 +-
 arch/powerpc/mm/book3s64/mmu_context.c|   4 +-
 arch/powerpc/mm/book3s64/pgtable.c|  77 +---
 arch/powerpc/mm/book3s64/radix_pgtable.c  |  22 +--
 arch/powerpc/mm/book3s64/radix_tlb.c  | 182 --
 arch/powerpc/mm/pgtable_64.c  |   5 +-
 drivers/misc/cxl/main.c   |   4 +
 drivers/misc/ocxl/main.c  |   4 +
 13 files changed, 282 insertions(+), 80 deletions(-)

-- 
2.22.0



RE: [PATCH 05/10] PCI: layerscape: Modify the way of getting capability with different PEX

2019-08-15 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: 2019年8月15日 20:51
> To: Xiaowei Bao 
> Cc: jingooh...@gmail.com; gustavo.pimen...@synopsys.com;
> bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.com; a...@arndb.de; gre...@linuxfoundation.org;
> M.h. Lian ; Mingkai Hu ;
> Roy Zang ; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-ker...@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-dev@lists.ozlabs.org
> Subject: Re: [PATCH 05/10] PCI: layerscape: Modify the way of getting
> capability with different PEX
> 
> On Thu, Aug 15, 2019 at 04:37:11PM +0800, Xiaowei Bao wrote:
> > Different PCIe controllers in one board may have different MSI or
> > MSI-X capabilities, so change the way of getting the MSI capability
> > to make it more flexible.
> >
> > Signed-off-by: Xiaowei Bao 
> > ---
> >  drivers/pci/controller/dwc/pci-layerscape-ep.c | 28
> > +++---
> >  1 file changed, 21 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > index be61d96..9404ca0 100644
> > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > @@ -22,6 +22,7 @@
> >
> >  struct ls_pcie_ep {
> > struct dw_pcie  *pci;
> > +   struct pci_epc_features *ls_epc;
> >  };
> >
> >  #define to_ls_pcie_ep(x)   dev_get_drvdata((x)->dev)
> > @@ -40,25 +41,26 @@ static const struct of_device_id
> ls_pcie_ep_of_match[] = {
> > { },
> >  };
> >
> > -static const struct pci_epc_features ls_pcie_epc_features = {
> > -   .linkup_notifier = false,
> > -   .msi_capable = true,
> > -   .msix_capable = false,
> > -};
> > -
> >  static const struct pci_epc_features*  ls_pcie_ep_get_features(struct
> > dw_pcie_ep *ep)  {
> > -   return _pcie_epc_features;
> > +   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +   struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
> > +
> > +   return pcie->ls_epc;
> >  }
> >
> >  static void ls_pcie_ep_init(struct dw_pcie_ep *ep)  {
> > struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +   struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
> > enum pci_barno bar;
> >
> > for (bar = BAR_0; bar <= BAR_5; bar++)
> > dw_pcie_ep_reset_bar(pci, bar);
> > +
> > +   pcie->ls_epc->msi_capable = ep->msi_cap ? true : false;
> > +   pcie->ls_epc->msix_capable = ep->msix_cap ? true : false;
> >  }
> >
> >  static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no, @@
> > -118,6 +120,7 @@ static int __init ls_pcie_ep_probe(struct platform_device
> *pdev)
> > struct device *dev = >dev;
> > struct dw_pcie *pci;
> > struct ls_pcie_ep *pcie;
> > +   struct pci_epc_features *ls_epc;
> > struct resource *dbi_base;
> > int ret;
> >
> > @@ -129,6 +132,10 @@ static int __init ls_pcie_ep_probe(struct
> platform_device *pdev)
> > if (!pci)
> > return -ENOMEM;
> >
> > +   ls_epc = devm_kzalloc(dev, sizeof(*ls_epc), GFP_KERNEL);
> > +   if (!ls_epc)
> > +   return -ENOMEM;
> > +
> > dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM,
> "regs");
> > pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_base);
> > if (IS_ERR(pci->dbi_base))
> > @@ -139,6 +146,13 @@ static int __init ls_pcie_ep_probe(struct
> platform_device *pdev)
> > pci->ops = _pcie_ep_ops;
> > pcie->pci = pci;
> >
> > +   ls_epc->linkup_notifier = false,
> > +   ls_epc->msi_capable = true,
> > +   ls_epc->msix_capable = true,
> 
> As [msi,msix]_capable is shortly set from ls_pcie_ep_init - is there any 
> reason
> to set them here (to potentially incorrect values)?
This is an initial value; false may indeed be better for msi_capable and
msix_capable, and of course we don't need to set them here.
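For illustration, the probe path with the redundant assignments dropped
could read like this (a sketch based on the quoted patch; devm_kzalloc()
already zeroes the struct, so both capability flags start out false):

	ls_epc->linkup_notifier = false;
	ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4);
	/* msi_capable/msix_capable stay false here; ls_pcie_ep_init()
	 * sets them for real from ep->msi_cap / ep->msix_cap */
	pcie->ls_epc = ls_epc;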
> 
> Thanks,
> 
> Andrew Murray
> 
> > +   ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4),
> > +
> > +   pcie->ls_epc = ls_epc;
> > +
> > platform_set_drvdata(pdev, pcie);
> >
> > ret = ls_add_pcie_ep(pcie, pdev);
> > --
> > 2.9.5
> >


RE: [PATCH 02/10] PCI: designware-ep: Add the doorbell mode of MSI-X in EP mode

2019-08-15 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: 2019年8月15日 19:54
> To: Xiaowei Bao 
> Cc: jingooh...@gmail.com; gustavo.pimen...@synopsys.com;
> bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.com; a...@arndb.de; gre...@linuxfoundation.org;
> M.h. Lian ; Mingkai Hu ;
> Roy Zang ; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-ker...@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-dev@lists.ozlabs.org
> Subject: Re: [PATCH 02/10] PCI: designware-ep: Add the doorbell mode of
> MSI-X in EP mode
> 
> On Thu, Aug 15, 2019 at 04:37:08PM +0800, Xiaowei Bao wrote:
> > Add the doorbell mode of MSI-X in EP mode.
> >
> > Signed-off-by: Xiaowei Bao 
> > ---
> >  drivers/pci/controller/dwc/pcie-designware-ep.c | 14 ++
> >  drivers/pci/controller/dwc/pcie-designware.h| 14 ++
> >  2 files changed, 28 insertions(+)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > index 75e2955..e3a7cdf 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > @@ -454,6 +454,20 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep
> *ep, u8 func_no,
> > return 0;
> >  }
> >
> > +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8
> func_no,
> > +  u16 interrupt_num)
> > +{
> > +   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +   u32 msg_data;
> > +
> > +   msg_data = (func_no << PCIE_MSIX_DOORBELL_PF_SHIFT) |
> > +  (interrupt_num - 1);
> > +
> > +   dw_pcie_writel_dbi(pci, PCIE_MSIX_DOORBELL, msg_data);
> > +
> > +   return 0;
> > +}
> > +
> >  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> >   u16 interrupt_num)
> 
> Have I understood correctly that the hardware provides an alternative
> mechanism that allows for raising MSI-X interrupts without the bother of
> reading the capabilities registers?
Yes, the hardware provides two ways to raise MSI-X; please check page 492
of the DWC_pcie_dm_registers_4.30 manual: MSIX_DOORBELL_OFF, offset 0x948,
description "MSI-X Doorbell Register".
> 
> If so is there any good reason to keep dw_pcie_ep_raise_msix_irq? (And thus
> use it in dw_plat_pcie_ep_raise_irq also)?
I am not sure, but I think the dw_pcie_ep_raise_msix_irq function is not
correct, because I think we can't get the MSI-X table from the address
ep->phys_base + tbl_addr, but I also don't know where I can get the
correct MSI-X table.
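For what it's worth, a platform raise_irq hook could route MSI-X through
the doorbell helper like this (modelled on dw_plat_pcie_ep_raise_irq();
the wiring here is hypothetical, not part of this patch):

	static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
					enum pci_epc_irq_type type,
					u16 interrupt_num)
	{
		switch (type) {
		case PCI_EPC_IRQ_LEGACY:
			return dw_pcie_ep_raise_legacy_irq(ep, func_no);
		case PCI_EPC_IRQ_MSI:
			return dw_pcie_ep_raise_msi_irq(ep, func_no,
							interrupt_num);
		case PCI_EPC_IRQ_MSIX:
			/* doorbell write, no MSI-X table lookup needed */
			return dw_pcie_ep_raise_msix_irq_doorbell(ep, func_no,
								  interrupt_num);
		default:
			return -EINVAL;
		}
	}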
> 
> 
> >  {
> > diff --git a/drivers/pci/controller/dwc/pcie-designware.h
> > b/drivers/pci/controller/dwc/pcie-designware.h
> > index 2b291e8..cd903e9 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware.h
> > +++ b/drivers/pci/controller/dwc/pcie-designware.h
> > @@ -88,6 +88,11 @@
> >  #define PCIE_MISC_CONTROL_1_OFF0x8BC
> >  #define PCIE_DBI_RO_WR_EN  BIT(0)
> >
> > +#define PCIE_MSIX_DOORBELL 0x948
> > +#define PCIE_MSIX_DOORBELL_PF_SHIFT24
> > +#define PCIE_MSIX_DOORBELL_VF_SHIFT16
> > +#define PCIE_MSIX_DOORBELL_VF_ACTIVE   BIT(15)
> 
> The _VF defines are not used, I'd suggest removing them.
In fact, I will add the SRIOV support in this file, the SRIOV feature have 
verified 
In my board, but I need wait the EP framework SRIOV patch merge, 
so I defined these two macros.
> 
> Thanks,
> 
> Andrew Murray
> 
> > +
> >  /*
> >   * iATU Unroll-specific register definitions
> >   * From 4.80 core version the address translation will be made by
> > unroll @@ -399,6 +404,8 @@ int dw_pcie_ep_raise_msi_irq(struct
> dw_pcie_ep *ep, u8 func_no,
> >  u8 interrupt_num);
> >  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> >  u16 interrupt_num);
> > +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8
> func_no,
> > +  u16 interrupt_num);
> >  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar);
> > #else  static inline void dw_pcie_ep_linkup(struct dw_pcie_ep *ep) @@
> > -431,6 +438,13 @@ static inline int dw_pcie_ep_raise_msix_irq(struct
> dw_pcie_ep *ep, u8 func_no,
> > return 0;
> >  }
> >
> > +static inline int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep
> *ep,
> > +u8 func_no,
> > +u16 interrupt_num)
> > +{
> > +   return 0;
> > +}
> > +
> >  static inline void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum
> > pci_barno bar)  {  }
> > --
> > 2.9.5
> >
> >

RE: [PATCH 01/10] PCI: designware-ep: Add multiple PFs support for DWC

2019-08-15 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: 2019年8月15日 19:32
> To: Xiaowei Bao 
> Cc: jingooh...@gmail.com; gustavo.pimen...@synopsys.com;
> bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.com; a...@arndb.de; gre...@linuxfoundation.org;
> M.h. Lian ; Mingkai Hu ;
> Roy Zang ; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-ker...@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-dev@lists.ozlabs.org
> Subject: Re: [PATCH 01/10] PCI: designware-ep: Add multiple PFs support for
> DWC
> 
> On Thu, Aug 15, 2019 at 04:37:07PM +0800, Xiaowei Bao wrote:
> > Add multiple-PF support for DWC: different PFs have different config
> > spaces, and we use the pf-offset property obtained from the DTS to
> > access each PF's config space.
> 
> Thanks for the patch. I haven't seen a cover letter for this series, is there 
> one
> missing?
Maybe I missed that; I will add you to the review next time. Thanks a lot
for your comments.
> 
> 
> >
> > Signed-off-by: Xiaowei Bao 
> > ---
> >  drivers/pci/controller/dwc/pcie-designware-ep.c |  97
> +-
> >  drivers/pci/controller/dwc/pcie-designware.c| 105
> ++--
> >  drivers/pci/controller/dwc/pcie-designware.h|  10 ++-
> >  include/linux/pci-epc.h |   1 +
> >  4 files changed, 164 insertions(+), 49 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > index 2bf5a35..75e2955 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > @@ -19,12 +19,14 @@ void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
> > pci_epc_linkup(epc);
> >  }
> >
> > -static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno
> bar,
> > -  int flags)
> > +static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, u8 func_no,
> > +  enum pci_barno bar, int flags)
> >  {
> > u32 reg;
> > +   struct pci_epc *epc = pci->ep.epc;
> > +   u32 pf_base = func_no * epc->pf_offset;
> >
> > -   reg = PCI_BASE_ADDRESS_0 + (4 * bar);
> > +   reg = pf_base + PCI_BASE_ADDRESS_0 + (4 * bar);
> 
> I think I'd rather see this arithmetic (and the one for determining pf_base)
> inside a macro or inline header function. This would make this code more
> readable and reduce the chances of an error by avoiding duplication of code.
> 
> For example look at cdns_pcie_ep_fn_writeb and
> ROCKCHIP_PCIE_EP_FUNC_BASE for examples of other EP drivers that do
> this.
Agreed, this looks fine. Thanks a lot for your comments; I will use this
way to access the registers in the next version of the patch.
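For the record, the accessor style suggested above might look like this
(names are illustrative, compare cdns_pcie_ep_fn_writeb(); not code from
the patch):

	static inline void dw_pcie_ep_fn_writew_dbi(struct dw_pcie *pci,
						    u8 func_no, u32 reg,
						    u16 val)
	{
		u32 pf_base = func_no * pci->ep.epc->pf_offset;

		dw_pcie_writew_dbi(pci, pf_base + reg, val);
	}

dw_pcie_ep_write_header() would then shrink to calls like
dw_pcie_ep_fn_writew_dbi(pci, func_no, PCI_VENDOR_ID, hdr->vendorid).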
> 
> 
> > dw_pcie_dbi_ro_wr_en(pci);
> > dw_pcie_writel_dbi2(pci, reg, 0x0);
> > dw_pcie_writel_dbi(pci, reg, 0x0);
> > @@ -37,7 +39,12 @@ static void __dw_pcie_ep_reset_bar(struct dw_pcie
> > *pci, enum pci_barno bar,
> >
> >  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar)  {
> > -   __dw_pcie_ep_reset_bar(pci, bar, 0);
> > +   u8 func_no, funcs;
> > +
> > +   funcs = pci->ep.epc->max_functions;
> > +
> > +   for (func_no = 0; func_no < funcs; func_no++)
> > +   __dw_pcie_ep_reset_bar(pci, func_no, bar, 0);
> >  }
> >
> >  static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
> > @@ -78,28 +85,29 @@ static int dw_pcie_ep_write_header(struct pci_epc
> > *epc, u8 func_no,  {
> > struct dw_pcie_ep *ep = epc_get_drvdata(epc);
> > struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +   u32 pf_base = func_no * epc->pf_offset;
> >
> > dw_pcie_dbi_ro_wr_en(pci);
> > -   dw_pcie_writew_dbi(pci, PCI_VENDOR_ID, hdr->vendorid);
> > -   dw_pcie_writew_dbi(pci, PCI_DEVICE_ID, hdr->deviceid);
> > -   dw_pcie_writeb_dbi(pci, PCI_REVISION_ID, hdr->revid);
> > -   dw_pcie_writeb_dbi(pci, PCI_CLASS_PROG, hdr->progif_code);
> > -   dw_pcie_writew_dbi(pci, PCI_CLASS_DEVICE,
> > +   dw_pcie_writew_dbi(pci, pf_base + PCI_VENDOR_ID, hdr->vendorid);
> > +   dw_pcie_writew_dbi(pci, pf_base + PCI_DEVICE_ID, hdr->deviceid);
> > +   dw_pcie_writeb_dbi(pci, pf_base + PCI_REVISION_ID, hdr->revid);
> > +   dw_pcie_writeb_dbi(pci, pf_base + PCI_CLASS_PROG, hdr->progif_code);
> > +   dw_pcie_writew_dbi(pci, pf_base + PCI_CLASS_DEVICE,
> >hdr->subclass_code | hdr->baseclass_code << 8);
> > -   dw_pcie_writeb_dbi(pci, PCI_CACHE_LINE_SIZE,
> > +   dw_pcie_writeb_dbi(pci, pf_base + PCI_CACHE_LINE_SIZE,
> >hdr->cache_line_size);
> > -   dw_pcie_writew_dbi(pci, PCI_SUBSYSTEM_VENDOR_ID,
> > +   dw_pcie_writew_dbi(pci, pf_base + PCI_SUBSYSTEM_VENDOR_ID,
> >hdr->subsys_vendor_id);
> > -   dw_pcie_writew_dbi(pci, PCI_SUBSYSTEM_ID, hdr->subsys_id);
> > -   dw_pcie_writeb_dbi(pci, PCI_INTERRUPT_PIN,
> > +   dw_pcie_writew_dbi(pci, pf_base + 

Re: [PATCH v4 18/25] powernv/fadump: process architected register state data provided by firmware

2019-08-15 Thread Hari Bathini



On 14/08/19 10:45 PM, Mahesh J Salgaonkar wrote:
> On 2019-07-16 17:04:08 Tue, Hari Bathini wrote:
>> From: Hari Bathini 
>>
>> Firmware provides architected register state data at the time of crash.
>> Process this data and build CPU notes to append to ELF core.
>>
>> Signed-off-by: Hari Bathini 
>> Signed-off-by: Vasant Hegde 
>> ---
>>  arch/powerpc/kernel/fadump-common.h  |4 +
>>  arch/powerpc/platforms/powernv/opal-fadump.c |  197 
>> --
>>  arch/powerpc/platforms/powernv/opal-fadump.h |   39 +
>>  3 files changed, 228 insertions(+), 12 deletions(-)
>>
> [...]
>> @@ -430,6 +577,32 @@ int __init opal_fadump_dt_scan(struct fw_dump 
>> *fadump_conf, ulong node)
>>  return 1;
>>  }
>>  
>> +ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, );
>> +if ((ret != OPAL_SUCCESS) || !addr) {
>> +pr_err("Failed to get CPU metadata (%lld)\n", ret);
>> +return 1;
>> +}
>> +
>> +addr = be64_to_cpu(addr);
>> +pr_debug("CPU metadata addr: %llx\n", addr);
>> +
>> +opal_cpu_metadata = __va(addr);
>> +r_opal_cpu_metadata = (void *)addr;
>> +fadump_conf->cpu_state_data_version =
>> +be32_to_cpu(r_opal_cpu_metadata->cpu_data_version);
>> +if (fadump_conf->cpu_state_data_version !=
>> +HDAT_FADUMP_CPU_DATA_VERSION) {
>> +pr_err("CPU data format version (%lu) mismatch!\n",
>> +   fadump_conf->cpu_state_data_version);
>> +return 1;
>> +}

I think the CPU data version mismatch check should still be done early on?

>> +fadump_conf->cpu_state_entry_size =
>> +be32_to_cpu(r_opal_cpu_metadata->cpu_data_size);
>> +fadump_conf->cpu_state_destination_addr =
>> +be64_to_cpu(r_opal_cpu_metadata->region[0].dest);
>> +fadump_conf->cpu_state_data_size =
>> +be64_to_cpu(r_opal_cpu_metadata->region[0].size);
>> +
> 
> opal_fadump_dt_scan isn't the right place to do this. Can you please
> move the above CPU-related data processing to
> opal_fadump_build_cpu_notes()?

I will move the above cpu related data processing to 
opal_fadump_build_cpu_notes().

Thanks
Hari



RE: [PATCH 4/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-08-15 Thread Alastair D'Silva
On Thu, 2019-08-15 at 09:36 +0200, christophe leroy wrote:
> 
> Le 15/08/2019 à 06:10, Alastair D'Silva a écrit :
> > From: Alastair D'Silva 
> > 
> > When presented with large amounts of memory being hotplugged
> > (in my test case, ~890GB), the call to flush_dcache_range takes
> > a while (~50 seconds), triggering RCU stalls.
> > 
> > This patch breaks up the call into 16GB chunks, calling
> > cond_resched() in between to allow the scheduler to run.
> 
> Is 16GB small enough ? If 890GB takes 50s, 16GB still takes about 1s.
> I'd use 1GB chunks to remain below 100ms.
> 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/mm/mem.c | 16 ++--
> >   1 file changed, 14 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> > index 5400da87a804..fb0d5e9aa11b 100644
> > --- a/arch/powerpc/mm/mem.c
> > +++ b/arch/powerpc/mm/mem.c
> > @@ -104,11 +104,14 @@ int __weak remove_section_mapping(unsigned
> > long start, unsigned long end)
> > return -ENODEV;
> >   }
> >   
> > +#define FLUSH_CHUNK_SIZE (16ull * 1024ull * 1024ull * 1024ull)
> 
> Can we use SZ_16GB ?
Sure, I'll go with 1GB as you recommended above
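A sketch of what the agreed follow-up loop might look like (SZ_1G is from
include/linux/sizes.h; this is assumed, not the posted patch):

	#define FLUSH_CHUNK_SIZE SZ_1G

	for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
		flush_dcache_range(start + i,
				   min(start + size,
				       start + i + FLUSH_CHUNK_SIZE));
		cond_resched();
	}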

> > +
> >   int __ref arch_add_memory(int nid, u64 start, u64 size,
> > struct mhp_restrictions *restrictions)
> >   {
> > unsigned long start_pfn = start >> PAGE_SHIFT;
> > unsigned long nr_pages = size >> PAGE_SHIFT;
> > +   unsigned long i;
> > int rc;
> >   
> > resize_hpt_for_hotplug(memblock_phys_mem_size());
> > @@ -120,7 +123,11 @@ int __ref arch_add_memory(int nid, u64 start,
> > u64 size,
> > start, start + size, rc);
> > return -EFAULT;
> > }
> > -   flush_dcache_range(start, start + size);
> > +
> > +   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
> > +   flush_dcache_range(start + i, min(start + size, start +
> > i + FLUSH_CHUNK_SIZE));
> 
> Isn't the line a bit long (I have not checked).
> 
> > +   cond_resched();
> > +   }
> >   
> > return __add_pages(nid, start_pfn, nr_pages, restrictions);
> >   }
> > @@ -131,13 +138,18 @@ void __ref arch_remove_memory(int nid, u64
> > start, u64 size,
> > unsigned long start_pfn = start >> PAGE_SHIFT;
> > unsigned long nr_pages = size >> PAGE_SHIFT;
> > struct page *page = pfn_to_page(start_pfn) +
> > vmem_altmap_offset(altmap);
> > +   unsigned long i;
> > int ret;
> >   
> > __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
> >   
> > /* Remove htab bolted mappings for this section of memory */
> > start = (unsigned long)__va(start);
> > -   flush_dcache_range(start, start + size);
> > +   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
> > +   flush_dcache_range(start + i, min(start + size, start +
> > i + FLUSH_CHUNK_SIZE));
> > +   cond_resched();
> > +   }
> > +
> > ret = remove_section_mapping(start, start + size);
> > WARN_ON_ONCE(ret);
> >   
> > 
> 
> Christophe
> 
-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819



Re: [PATCH 3/6] powerpc: Convert flush_icache_range & friends to C

2019-08-15 Thread Alastair D'Silva
On Thu, 2019-08-15 at 09:29 +0200, christophe leroy wrote:
> 
> Le 15/08/2019 à 06:10, Alastair D'Silva a écrit :
> > From: Alastair D'Silva 
> > 
> > Similar to commit 22e9c88d486a
> > ("powerpc/64: reuse PPC32 static inline flush_dcache_range()")
> > this patch converts flush_icache_range() to C, and reimplements the
> > following functions as wrappers around it:
> > __flush_dcache_icache
> > __flush_dcache_icache_phys
> 
> Not sure you can do that for __flush_dcache_icache_phys(), see
> detailed 
> comments below
> 
> > This was done as we discovered a long-standing bug where the length
> > of the
> > range was truncated due to using a 32 bit shift instead of a 64 bit
> > one.
> > 
> > By converting these functions to C, it becomes easier to maintain.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/include/asm/cache.h  |  26 +++---
> >   arch/powerpc/include/asm/cacheflush.h |  32 ---
> >   arch/powerpc/kernel/misc_32.S | 117 ---
> > ---
> >   arch/powerpc/kernel/misc_64.S |  97 -
> >   arch/powerpc/mm/mem.c |  71 +++-
> >   5 files changed, 102 insertions(+), 241 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/cache.h
> > b/arch/powerpc/include/asm/cache.h
> > index f852d5cd746c..728f154204db 100644
> > --- a/arch/powerpc/include/asm/cache.h
> > +++ b/arch/powerpc/include/asm/cache.h
> > @@ -98,20 +98,7 @@ static inline u32 l1_icache_bytes(void)
> >   #endif
> >   #endif /* ! __ASSEMBLY__ */
> >   
> > -#if defined(__ASSEMBLY__)
> > -/*
> > - * For a snooping icache, we still need a dummy icbi to purge all
> > the
> > - * prefetched instructions from the ifetch buffers. We also need a
> > sync
> > - * before the icbi to order the the actual stores to memory that
> > might
> > - * have modified instructions with the icbi.
> > - */
> > -#define PURGE_PREFETCHED_INS   \
> > -   sync;   \
> > -   icbi0,r3;   \
> > -   sync;   \
> > -   isync
> 
> Is this still used anywhere now ?
No, this patch removes all users of it.

> > -
> > -#else
> > +#if !defined(__ASSEMBLY__)
> >   #define __read_mostly
> > __attribute__((__section__(".data..read_mostly")))
> >   
> >   #ifdef CONFIG_PPC_BOOK3S_32
> > @@ -145,6 +132,17 @@ static inline void dcbst(void *addr)
> >   {
> > __asm__ __volatile__ ("dcbst %y0" : : "Z"(*(u8 *)addr) :
> > "memory");
> >   }
> > +
> > +static inline void icbi(void *addr)
> > +{
> > +   __asm__ __volatile__ ("icbi 0, %0" : : "r"(addr) : "memory");
> > +}
> > +
> > +static inline void iccci(void)
> > +{
> > +   __asm__ __volatile__ ("iccci 0, r0");
> 
> I think you need the "memory" clobber too here.
> 
Thanks, I'll add that.
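So the helper would become (a sketch of the agreed change only):

	static inline void iccci(void)
	{
		__asm__ __volatile__ ("iccci 0, r0" : : : "memory");
	}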

> > +}
> > +
> >   #endif /* !__ASSEMBLY__ */
> >   #endif /* __KERNEL__ */
> >   #endif /* _ASM_POWERPC_CACHE_H */
> > diff --git a/arch/powerpc/include/asm/cacheflush.h
> > b/arch/powerpc/include/asm/cacheflush.h
> > index ed57843ef452..4c3377aff8ed 100644
> > --- a/arch/powerpc/include/asm/cacheflush.h
> > +++ b/arch/powerpc/include/asm/cacheflush.h
> > @@ -42,24 +42,18 @@ extern void flush_dcache_page(struct page
> > *page);
> >   #define flush_dcache_mmap_lock(mapping)   do { } while
> > (0)
> >   #define flush_dcache_mmap_unlock(mapping) do { } while (0)
> >   
> > -extern void flush_icache_range(unsigned long, unsigned long);
> > +void flush_icache_range(unsigned long start, unsigned long stop);
> >   extern void flush_icache_user_range(struct vm_area_struct *vma,
> > struct page *page, unsigned long
> > addr,
> > int len);
> > -extern void __flush_dcache_icache(void *page_va);
> >   extern void flush_dcache_icache_page(struct page *page);
> > -#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
> > -extern void __flush_dcache_icache_phys(unsigned long physaddr);
> > -#else
> > -static inline void __flush_dcache_icache_phys(unsigned long
> > physaddr)
> > -{
> > -   BUG();
> > -}
> > -#endif
> >   
> > -/*
> > - * Write any modified data cache blocks out to memory and
> > invalidate them.
> > +/**
> > + * flush_dcache_range(): Write any modified data cache blocks out
> > to memory and invalidate them.
> >* Does not invalidate the corresponding instruction cache
> > blocks.
> > + *
> > + * @start: the start address
> > + * @stop: the stop address (exclusive)
> >*/
> >   static inline void flush_dcache_range(unsigned long start,
> > unsigned long stop)
> >   {
> > @@ -82,6 +76,20 @@ static inline void flush_dcache_range(unsigned
> > long start, unsigned long stop)
> > isync();
> >   }
> >   
> > +/**
> > + * __flush_dcache_icache(): Flush a particular page from the data
> > cache to RAM.
> > + * Note: this is necessary because the instruction cache does
> > *not*
> > + * snoop from the data cache.
> > + *
> > + * @page: the address of the page to flush
> > + */
> > +static inline void 

Re: [PATCH] powerpc: Allow flush_(inval_)dcache_range to work across ranges >4GB

2019-08-15 Thread Michael Ellerman
Greg Kroah-Hartman  writes:
> On Thu, Aug 15, 2019 at 02:55:42PM +1000, Alastair D'Silva wrote:
>> From: Alastair D'Silva 
>> 
>> Heads Up: This patch cannot be submitted to Linus's tree, as the affected
>> assembler functions have already been converted to C.

That was done in upstream commit:

22e9c88d486a ("powerpc/64: reuse PPC32 static inline flush_dcache_range()")

Which is a larger change that we don't want to backport. This patch is a
minimal fix for stable trees.


>> When calling flush_(inval_)dcache_range with a size >4GB, we were masking
>> off the upper 32 bits, so we would incorrectly flush a range smaller
>> than intended.
>> 
>> This patch replaces the 32 bit shifts with 64 bit ones, so that
>> the full size is accounted for.
>> 
>> Signed-off-by: Alastair D'Silva 
>> ---
>>  arch/powerpc/kernel/misc_64.S | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
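For readers without the stable tree handy, the bug boils down to
computing the cache-line count with a 32-bit shift on a 64-bit byte
count. A minimal C model of the two behaviours (illustrative only; the
real code is powerpc asm in misc_64.S):

#include <stdint.h>

/* srw-style: only the low 32 bits of the length are shifted, so any
 * size >= 4GB yields a line count that is far too small. */
static uint64_t line_count_32bit(uint64_t len, unsigned int log_block)
{
	return (uint32_t)len >> log_block;
}

/* srd-style: a full 64-bit shift, as the fix uses. */
static uint64_t line_count_64bit(uint64_t len, unsigned int log_block)
{
	return len >> log_block;
}

With len = 5GB, the first form sees only the low 1GB's worth of cache
lines, while the second covers the full range.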

Acked-by: Michael Ellerman 

> 
>
> This is not the correct way to submit patches for inclusion in the
> stable kernel tree.  Please read:
> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> for how to do this properly.
>
> 

Hi Greg,

This is "option 3", submit the patch directly, and the patch "deviates
from the original upstream patch" because the upstream patch was a
wholesale conversion from asm to C.

This patch applies cleanly to v4.14 and v4.19.

The change log should have mentioned which upstream patch it is not a
backport of. Is there anything else we should have done differently to
avoid the formletter bot? :)

cheers


Re: [PATCH v3 08/16] powerpc/pseries/svm: Use shared memory for LPPACA structures

2019-08-15 Thread Thiago Jung Bauermann


Michael Ellerman  writes:

> Thiago Jung Bauermann  writes:
>> Michael Ellerman  writes:
>>> Thiago Jung Bauermann  writes:
 From: Anshuman Khandual 

 LPPACA structures need to be shared with the host. Hence they need to be in
 shared memory. Instead of allocating individual chunks of memory for a
 given structure from memblock, a contiguous chunk of memory is allocated
 and then converted into shared memory. Subsequent allocation requests will
 come from the contiguous chunk which will be always shared memory for all
 structures.

 While we are able to use a kmem_cache constructor for the Debug Trace Log,
 LPPACAs are allocated very early in the boot process (before SLUB is
 available) so we need to use a simpler scheme here.

 Introduce helper is_svm_platform() which uses the S bit of the MSR to tell
 whether we're running as a secure guest.

 Signed-off-by: Anshuman Khandual 
 Signed-off-by: Thiago Jung Bauermann 
 ---
  arch/powerpc/include/asm/svm.h | 26 
  arch/powerpc/kernel/paca.c | 43 +-
  2 files changed, 68 insertions(+), 1 deletion(-)

 diff --git a/arch/powerpc/include/asm/svm.h 
 b/arch/powerpc/include/asm/svm.h
 new file mode 100644
 index ..fef3740f46a6
 --- /dev/null
 +++ b/arch/powerpc/include/asm/svm.h
 @@ -0,0 +1,26 @@
 +/* SPDX-License-Identifier: GPL-2.0+ */
 +/*
 + * SVM helper functions
 + *
 + * Copyright 2019 Anshuman Khandual, IBM Corporation.
>>>
>>> Are we sure this copyright date is correct?
>>
>> I may be confused about which year the copyright refers to. I thought it
>> was the year when the patch was committed. If it is the first time the
>> patch was published then this one should be 2018.
>
> I'm not a lawyer etc. but AIUI the date above is about the authorship,
> ie. when it was originally written, not when it was published or
> committed.
>
> In general I don't think it matters too much, but in this case I'm
> pretty sure Anshuman can't have possibly written it in 2019 on behalf of
> IBM :)
>
> So we can either change the date to 2018, or drop his name and just say
> it's copyright 2019 by IBM.

I think it's better to change the date to 2018. The same should be done
for svm.c, svm.h and mem_encrypt.h. I'll send a new patch series with
the correction.
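
As an aside, the is_svm_platform() helper mentioned in the quoted
changelog amounts to a one-line MSR test; a sketch, with MSR_S being
the secure-mode bit this series introduces:

static inline bool is_svm_platform(void)
{
	/* The S bit is set in the MSR when running as a secure guest */
	return mfmsr() & MSR_S;
}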

-- 
Thiago Jung Bauermann
IBM Linux Technology Center



Re: [PATCH v3 11/16] powerpc/pseries/svm: Export guest SVM status to user space via sysfs

2019-08-15 Thread Thiago Jung Bauermann


Michael Ellerman  writes:

> Thiago Jung Bauermann  writes:
>> Michael Ellerman  writes:
>>> Thiago Jung Bauermann  writes:
 From: Ryan Grimm 
 User space might want to know it's running in a secure VM.  It can't do
 a mfmsr because mfmsr is a privileged instruction.

 The solution here is to create a cpu attribute:

 /sys/devices/system/cpu/svm

 which will read 0 or 1 based on the S bit of the guest's CPU 0.
>>>
>>> Why CPU 0?
>>>
>>> If we have different CPUs running with different MSR_S then something
>>> has gone badly wrong, no?
>>
>> Yes, that would be very bad.
>>
>>> So can't we just read the MSR on whatever CPU the sysfs code happens to
>>> run on.
>>
>> Good point. I made the change in the patch below.
>
> The patch looks good. Although, it raises the question of whether it
> should be an attribute of the CPU at all.
>
> I guess there's not obviously anywhere better for it.

Ok. TBH this patch is not as urgent as the others. It was added so that
tests have an easy way to tell if they're in an SVM. I can leave it out
for now to figure out if there's a better place for this information.

> Still you should document the attribute in 
> Documentation/ABI/testing/sysfs-devices-system-cpu

Indeed, will do that.
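
The per-CPU reading agreed on above boils down to something like the
following (a sketch; the attribute plumbing is illustrative, not the
revised patch itself):

static ssize_t svm_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	/* Read the MSR on whichever CPU the sysfs read happens to run on */
	return sprintf(buf, "%u\n", !!(mfmsr() & MSR_S));
}
static DEVICE_ATTR(svm, 0444, svm_show, NULL);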

--
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v3 0/7] soc/fsl/qbman: Enable Kexec for DPAA1 devices

2019-08-15 Thread Li Yang
On Thu, Aug 1, 2019 at 3:20 PM Roy Pledge  wrote:
>
> Most DPAA1 devices do not support a soft reset which is an issue if
> Kexec starts a new kernel. This patch series allows Kexec to function
> by detecting that the QBMan device was previously initialized.
>
> The patches fix some issues with device cleanup as well as ensuring
> that the location of the QBMan private memories has not changed
> after the execution of the Kexec.
>
> Changes since v1:
> - Removed a bug fix and sent it separately to ease backporting
> Changes since v2:
> - Explicitly flush FQD memory from cache on PPC before unmapping
>
> Roy Pledge (7):
>   soc/fsl/qbman: Rework QBMan private memory setup
>   soc/fsl/qbman: Cleanup buffer pools if BMan was initialized prior to
> bootup
>   soc/fsl/qbman: Cleanup QMan queues if device was already initialized
>   soc/fsl/qbman: Fix drain_mr_fqni()
>   soc/fsl/qbman: Disable interrupts during portal recovery
>   soc/fsl/qbman: Fixup qman_shutdown_fq()
>   soc/fsl/qbman: Update device tree with reserved memory

Series applied for next.  Thanks!

>
>  drivers/soc/fsl/qbman/bman.c| 17 
>  drivers/soc/fsl/qbman/bman_ccsr.c   | 36 +++-
>  drivers/soc/fsl/qbman/bman_portal.c | 18 +++-
>  drivers/soc/fsl/qbman/bman_priv.h   |  5 +++
>  drivers/soc/fsl/qbman/dpaa_sys.c| 63 
>  drivers/soc/fsl/qbman/qman.c| 83 
> +
>  drivers/soc/fsl/qbman/qman_ccsr.c   | 68 +++---
>  drivers/soc/fsl/qbman/qman_portal.c | 18 +++-
>  drivers/soc/fsl/qbman/qman_priv.h   |  8 
>  9 files changed, 255 insertions(+), 61 deletions(-)
>
> --
> 2.7.4
>


Re: [PATCH 0/6] drm+dma: cache support for arm, etc

2019-08-15 Thread Koenig, Christian
Sorry if the style is messed up, replying from my phone.

On 15.08.2019 20:27, Christoph Hellwig wrote:
On Thu, Aug 15, 2019 at 06:21:00PM +0000, Koenig, Christian wrote:
> >   (2) Add support for DMA_ATTR_NO_KERNEL_MAPPING to this new API instead
> >   of dma_alloc_attrs.  The initial difference with that flag is just
> >   that we allow highmem, but in the future we could also unmap this
> >   memory from the kernel linear mapping entirely on architectures
> >   where we can easily do that.
>
> Mhm, why would we want to do this?

To avoid the CPU misspeculating into this memory.  For example NVMe SSDs
have a feature called host memory buffer that is a lot like your stolen
main ram for the GPU case.  We currently hand the SSD a
DMA_ATTR_NO_KERNEL_MAPPING allocation if it requests such a buffer.  If
possible we'd really like to make sure no speculative execution bug
(or intentional attacker with a kernel exploit for that matter) can easily
access that memory.

Well, for the graphics case I absolutely need to keep the linear kernel
mapping, because for certain use cases the memory is accessed by the kernel
all the time as well.

Why should accessing uncached memory be more of a security problem than 
accessing cached?

Regards,
Christian



Re: 5.2.7 kernel doesn't boot on G5

2019-08-15 Thread Christian Marillat
On 15 August 2019 19:05, Mathieu Malaterre  wrote:

> Does that ring a bell to anyone here ? Thanks

Didn't reply to all. Second reply:

Nothing. Fans turn 100% after 1 or 2 minutes.

Christian


Re: [PATCH 0/6] drm+dma: cache support for arm, etc

2019-08-15 Thread Rob Clark
On Wed, Aug 14, 2019 at 11:51 PM Christoph Hellwig  wrote:
>
> As said before I don't think these low-level helpers are the
> right API to export, but even if they did you'd just cover a tiny
> subset of the architectures.

Are you thinking instead something like:

void dma_sync_sg_for_{cpu,device}(struct device *dev, struct scatterlist *sgl,
				  int nents, enum dma_data_direction dir)
{
	for_each_sg(sgl, sg, nents, i) {
		arch_sync_dma_for_..(dev, sg_phys(sg), sg->length, dir);
	}
}
EXPORT_SYMBOL_GPL(dma_sync_sg_for_..)

or did you have something else in mind?

I guess something like this would avoid figuring out *which* archs
actually build drm..


> Also to distil the previous thread - if you remap memory to uncached
> the helper to use is arch_dma_prep_coherent, which does a writeback+
> invalidate everywhere, and there is no need to clean up after a
> long-term uncached mapping.  We might still get speculations into
> that area, if we don't remap the direct mapping, but it isn't like
> invalidting that just before freeing the memory is going to help
> anyone.

hmm, IIUC the aarch64 cache instructions, what I'm doing now is equiv
to what I would get with dma_map_sg(DMA_BIDIRECTIONAL) and
arch_dma_prep_coherent() is equiv to what I'd get w/ DMA_FROM_DEVICE
(but a single pass instead of separate inv+clean passes)..

but I can respin this with a single dma_prep_coherent_sg() which uses
arch_dma_prep_coherent()..

> Also it seems like patches 5 and 6 are missing in my inbox.

Hmm, not entirely sure why.. you should be on the cc list for each
individual patch.

But here is the patchwork link if you want to see them:

https://patchwork.freedesktop.org/series/65211/

BR,
-R


Re: [PATCH v5 3/4] mm/nvdimm: Use correct #defines instead of open coding

2019-08-15 Thread Dan Williams
On Fri, Aug 9, 2019 at 12:45 AM Aneesh Kumar K.V
 wrote:
>
> Use PAGE_SIZE instead of SZ_4K and sizeof(struct page) instead of 64.
> If we have a kernel built with different struct page size the previous
> patch should handle marking the namespace disabled.

Each of these changes carries independent, non-overlapping regression
risk, so let's split them into separate patches. Others might

> Signed-off-by: Aneesh Kumar K.V 
> ---
>  drivers/nvdimm/label.c  | 2 +-
>  drivers/nvdimm/namespace_devs.c | 6 +++---
>  drivers/nvdimm/pfn_devs.c   | 3 ++-
>  drivers/nvdimm/region_devs.c| 8 
>  4 files changed, 10 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
> index 73e197babc2f..7ee037063be7 100644
> --- a/drivers/nvdimm/label.c
> +++ b/drivers/nvdimm/label.c
> @@ -355,7 +355,7 @@ static bool slot_valid(struct nvdimm_drvdata *ndd,
>
> /* check that DPA allocations are page aligned */
> if ((__le64_to_cpu(nd_label->dpa)
> -   | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
> > +   | __le64_to_cpu(nd_label->rawsize)) % PAGE_SIZE)

The UEFI label specification has no concept of PAGE_SIZE, so this
check is a pure Linux-ism. There's no strict requirement why
slot_valid() needs to check for page alignment and it would seem to
actively hurt cross-page-size compatibility, so let's delete the check
and rely on checksum validation.

> return false;
>
> /* check checksum */
> diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
> index a16e52251a30..a9c76df12cb9 100644
> --- a/drivers/nvdimm/namespace_devs.c
> +++ b/drivers/nvdimm/namespace_devs.c
> @@ -1006,10 +1006,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
> return -ENXIO;
> }
>
> -   div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
> +   div_u64_rem(val, PAGE_SIZE * nd_region->ndr_mappings, &remainder);
> if (remainder) {
> -   dev_dbg(dev, "%llu is not %dK aligned\n", val,
> -   (SZ_4K * nd_region->ndr_mappings) / SZ_1K);
> +   dev_dbg(dev, "%llu is not %ldK aligned\n", val,
> +   (PAGE_SIZE * nd_region->ndr_mappings) / SZ_1K);
> return -EINVAL;

Yes, looks good, but this deserves its own independent patch.

> }
>
> diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
> index 37e96811c2fc..c1d9be609322 100644
> --- a/drivers/nvdimm/pfn_devs.c
> +++ b/drivers/nvdimm/pfn_devs.c
> @@ -725,7 +725,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
>  * when populating the vmemmap. This *should* be equal to
>  * PMD_SIZE for most architectures.
>  */
> -   offset = ALIGN(start + SZ_8K + 64 * npfns, align) - start;
> +   offset = ALIGN(start + SZ_8K + sizeof(struct page) * npfns,

I'd prefer if this was not dynamic and was instead set to the maximum
size of 'struct page' across all archs just to enhance cross-arch
compatibility. I think that answer is '64'.
> +  align) - start;
> } else if (nd_pfn->mode == PFN_MODE_RAM)
> offset = ALIGN(start + SZ_8K, align) - start;
> else
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index af30cbe7a8ea..20e265a534f8 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -992,10 +992,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
> struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
> struct nvdimm *nvdimm = mapping->nvdimm;
>
> -   if ((mapping->start | mapping->size) % SZ_4K) {
> -   dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
> -   caller, dev_name(&nvdimm->dev), i);
> -
> +   if ((mapping->start | mapping->size) % PAGE_SIZE) {
> +   dev_err(&nvdimm_bus->dev,
> +   "%s: %s mapping%d is not %ld aligned\n",
> +   caller, dev_name(&nvdimm->dev), i, PAGE_SIZE);
> return NULL;
> }
>
> --
> 2.21.0
>
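
Dan's suggestion for the pfn_devs.c hunk above would look roughly like
this (MAX_STRUCT_PAGE_SIZE is a hypothetical name for the fixed '64'):

/* Reserve room for the largest struct page across all architectures so
 * the on-media layout does not depend on the kernel that wrote it. */
#define MAX_STRUCT_PAGE_SIZE	64

offset = ALIGN(start + SZ_8K + MAX_STRUCT_PAGE_SIZE * npfns, align) - start;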


Re: next take at setting up a dma mask by default for platform devices

2019-08-15 Thread Alan Stern
On Thu, 15 Aug 2019, Christoph Hellwig wrote:

> On Thu, Aug 15, 2019 at 03:23:18PM +0200, Greg Kroah-Hartman wrote:
> > I've taken the first 2 patches for 5.3-final.  Given that patch 3 needs
> > to be fixed, I'll wait for a respin of these before considering them.
> 
> I have a respun version ready, but I'd really like to hear some
> comments from usb developers about the approach before spamming
> everyone again..

I didn't see any problems with your approach at first glance; it looked 
like a good idea.

Alan Stern



Re: [PATCH v5 1/4] nvdimm: Consider probe return -EOPNOTSUPP as success

2019-08-15 Thread Dan Williams
On Tue, Aug 13, 2019 at 9:22 PM Dan Williams  wrote:
>
> Hi Aneesh, logic looks correct but there are some cleanups I'd like to
> see and a lead-in patch that I attached.
>
> I've started prefixing nvdimm patches with:
>
> libnvdimm/$component:
>
> ...since this patch mostly impacts the pmem driver lets prefix it
> "libnvdimm/pmem: "
>
> On Fri, Aug 9, 2019 at 12:45 AM Aneesh Kumar K.V
>  wrote:
> >
> > This patch add -EOPNOTSUPP as return from probe callback to
>
> s/This patch add/Add/
>
> No need to say "this patch" it's obviously a patch.
>
> > indicate we were not able to initialize a namespace due to pfn superblock
> > feature/version mismatch. We want to consider this a probe success so that
> > we can create new namespace seed and thereby avoid marking the failed
> > namespace as the seed namespace.
>
> Please replace usage of "we" with the exact agent involved as which
> "we" is being referred to gets confusing for the reader.
>
> i.e. "indicate that the pmem driver was not..." "The nvdimm core wants
> to consider this...".
>
> >
> > Signed-off-by: Aneesh Kumar K.V 
> > ---
> >  drivers/nvdimm/bus.c  |  2 +-
> >  drivers/nvdimm/pmem.c | 26 ++
> >  2 files changed, 23 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
> > index 798c5c4aea9c..16c35e6446a7 100644
> > --- a/drivers/nvdimm/bus.c
> > +++ b/drivers/nvdimm/bus.c
> > @@ -95,7 +95,7 @@ static int nvdimm_bus_probe(struct device *dev)
> > rc = nd_drv->probe(dev);
> > debug_nvdimm_unlock(dev);
> >
> > -   if (rc == 0)
> > +   if (rc == 0 || rc == -EOPNOTSUPP)
> > nd_region_probe_success(nvdimm_bus, dev);
>
> This now makes the nd_region_probe_success() helper obviously misnamed
> since it now wants to take actions on non-probe success. I attached a
> lead-in cleanup that you can pull into your series that renames that
> routine to nd_region_advance_seeds().
>
> When you rebase this needs a comment about why EOPNOTSUPP has special 
> handling.
>
> > else
> > nd_region_disable(nvdimm_bus, dev);
> > diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> > index 4c121dd03dd9..3f498881dd28 100644
> > --- a/drivers/nvdimm/pmem.c
> > +++ b/drivers/nvdimm/pmem.c
> > @@ -490,6 +490,7 @@ static int pmem_attach_disk(struct device *dev,
> >
> >  static int nd_pmem_probe(struct device *dev)
> >  {
> > +   int ret;
> > struct nd_namespace_common *ndns;
> >
> > ndns = nvdimm_namespace_common_probe(dev);
> > @@ -505,12 +506,29 @@ static int nd_pmem_probe(struct device *dev)
> > if (is_nd_pfn(dev))
> > return pmem_attach_disk(dev, ndns);
> >
> > -   /* if we find a valid info-block we'll come back as that personality */
> > -   if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
> > -   || nd_dax_probe(dev, ndns) == 0)
>
> Similar need for an updated comment here to explain the special
> translation of error codes.
>
> > +   ret = nd_btt_probe(dev, ndns);
> > +   if (ret == 0)
> > return -ENXIO;
> > +   else if (ret == -EOPNOTSUPP)
>
> Are there cases where the btt driver needs to return EOPNOTSUPP? I'd
> otherwise like to keep this special casing constrained to the pfn /
> dax info block cases.

In fact I think EOPNOTSUPP is only something that the device-dax case
would be concerned with because that's the only interface that
attempts to guarantee a given mapping granularity.
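
As an illustration of the comment being asked for, the rebased hunk
might carry something like this (a sketch; nd_region_advance_seeds() is
the name from the lead-in cleanup mentioned above):

	/*
	 * A probe return of -EOPNOTSUPP means a valid info block was
	 * found but is incompatible with this kernel (e.g. a pfn
	 * superblock recording a different page size). Treat it as
	 * success so the region advances its namespace seed and a new
	 * namespace can be created instead of reusing the failed one.
	 */
	if (rc == 0 || rc == -EOPNOTSUPP)
		nd_region_advance_seeds(nvdimm_bus, dev);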


Re: [PATCH 0/6] drm+dma: cache support for arm, etc

2019-08-15 Thread Christoph Hellwig
On Thu, Aug 15, 2019 at 06:48:39PM +0000, Koenig, Christian wrote:
> Well, for the graphics case I absolutely need to keep the linear kernel 
> mapping. Because for certain use cases the memory is accessed by the kernel 
> all the time as well.

Then don't set DMA_ATTR_NO_KERNEL_MAPPING.  At least for x86 and arm64
we should be able to support uncached allocations easily even without
that, and I plan to support that for your use case.  But if the driver
is explicitly saying it doesn't want a permanent kernel mapping it makes
sense to (optionally if the architecture can easily do it) unmap the
memory from the kernel linear mapping.


[Bug 204371] BUG kmalloc-4k (Tainted: G W ): Object padding overwritten

2019-08-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=204371

--- Comment #31 from Christophe Leroy (christophe.le...@c-s.fr) ---
Problem 1: test_add_free_space_entry() contains a kzalloc() to allocate a
bitmap. That's the problem.


Problem 2: btrfs_free_dummy_fs_info() has 3 kfree(). Need to know which one is
creating your last warning (kernel BUG at mm/slub.c:3952!)

-- 
You are receiving this mail because:
You are on the CC list for the bug.

Re: [PATCH 0/6] drm+dma: cache support for arm, etc

2019-08-15 Thread Christoph Hellwig
On Thu, Aug 15, 2019 at 06:21:00PM +0000, Koenig, Christian wrote:
> >   (2) Add support for DMA_ATTR_NO_KERNEL_MAPPING to this new API instead
> >   of dma_alloc_attrs.  The initial difference with that flag is just
> >   that we allow highmem, but in the future we could also unmap this
> >   memory from the kernel linear mapping entirely on architectures
> >   where we can easily do that.
> 
> Mhm, why would we want to do this?

To avoid the CPU misspeculating into this memory.  For example NVMe SSDs
have a feature called host memory buffer that is a lot like your stolen
main ram for the GPU case.  We currently hand the SSD a
DMA_ATTR_NO_KERNEL_MAPPING allocation if it requests such a buffer.  If
possible we'd really like to make sure no speculative execution bug
(or intentional attacker with a kernel exploit for that matter) can easily
access that memory.


Re: [PATCH 0/6] drm+dma: cache support for arm, etc

2019-08-15 Thread Koenig, Christian
On 15.08.19 at 19:53, Christoph Hellwig wrote:
> On Thu, Aug 15, 2019 at 06:54:39AM -0700, Rob Clark wrote:
>> On Wed, Aug 14, 2019 at 11:51 PM Christoph Hellwig  wrote:
>>> As said before I don't think these low-level helpers are the
>>> right API to export, but even if they did you'd just cover a tiny
>>> subset of the architectures.
>> Are you thinking instead something like:
>>
>> void dma_sync_sg_for_{cpu,device}(struct device *dev, struct scatterlist 
>> *sgl,
>>int nents, enum dma_data_direction dir)
>> {
>>  for_each_sg(sgl, sg, nents, i) {
>>  arch_sync_dma_for_..(dev, sg_phys(sg), sg->length, dir);
>>  }
>> }
>> EXPORT_SYMBOL_GPL(dma_sync_sg_for_..)
>>
>> or did you have something else in mind?
> No.  We really need an interface thay says please give me uncached
> memory (for some definition of uncached that includes that grapics
> drivers call write combine), and then let the architecture do the right
> thing.  Basically dma_alloc_coherent with DMA_ATTR_NO_KERNEL_MAPPING
> is superficially close to what you want, except that the way the drm
> drivers work you can't actually use it.

The terminology graphics driver use is USWC (Uncached Speculative Write 
Combine).

Essentially it is a leftover from the AGP days where the graphics 
devices couldn't snoop the CPU caches.

Nowadays they either don't want to snoop the CPU caches for performance
reasons, or because of special requirements that certain areas of system
memory are not cached for certain functionality.

For example the "VRAM" on AMD GPUs which are integrated into the CPU is 
just stolen system memory. Then you can scanout your picture to the 
display from this memory or "normal" system memory, but only if the 
system memory is mapped as USWC.

> The reason for that is if we can we really need to not create another
> uncachable alias, but instead change the page attributes in place.
> On x86 we can and must do that for example, and based on the
> conversation with Will arm64 could do that fairly easily.  arm32 can
> right now only do that for CMA, though.
>
> The big question is what API do we want.  I had a pretty similar
> discussion with Christian on doing such an allocation for amdgpu,
> where the device normally is cache coherent, but they actually want
> to turn it into non-coherent by using PCIe unsnooped transactions.
>
> Here is my high level plan, which still has a few lose end:
>
>   (1) provide a new API:
>
>   struct page *dma_alloc_pages(struct device *dev, unsigned nr_pages,
>   gfp_t gfp, unsigned long flags);
>   void dma_free_pages(struct device *dev, unsigned nr_pages,
>   unsigned long flags);
>
>   These give you back page backed memory that is guaranteed to be
>   addressable by the device (no swiotlb or similar).  The memory can
>   then be mapped using dma_map*, including unmap and dma_sync to
>   bounce ownership around.  This is the replacement for the current
>   dma_alloc_attrs with DMA_ATTR_NON_CONSISTENT API, that is rather
>   badly defined.
>
>   (2) Add support for DMA_ATTR_NO_KERNEL_MAPPING to this new API instead
>   of dma_alloc_attrs.  The initial difference with that flag is just
>   that we allow highmem, but in the future we could also unmap this
>   memory from the kernel linear mapping entirely on architectures
>   where we can easily do that.

Mhm, why would we want to do this?

>
>   (3) Add a dma_pages_map/dma_pages_unmap or similar API that allows you
>   to get a kernel mapping for parts or all of a
>   DMA_ATTR_NO_KERNEL_MAPPING allocation.  This is to replace things
>   like your open-coded vmap in msm (or similarly elsewhere in dma-buf
>   providers).
>
>   (4) Add support for a DMA_ATTR_UNCACHABLE flags (or similar) to the new
>   API, that maps the pages as uncachable iff they have a kernel
>   mapping, including invalidating the caches at time of this page
>   attribute change (or creation of a new mapping).  This API will fail
>   if the architecture does not allow in-place remapping.  Note that for
>   arm32 we could always dip into the CMA pool if one is present to not
>   fail.  We'll also need some helper to map from the DMA_ATTR_* flags
>   to a pgprot for mapping the page to userspace.  There is also a few
>   other weird bits here, e.g. on architectures like mips that use an
>   uncached segment we'll have to fail use with the plain
>   DMA_ATTR_UNCACHABLE flag, but it could be supported with
>   DMA_ATTR_UNCACHABLE | DMA_ATTR_NO_KERNEL_MAPPING.
>
> I was hoping to get most of this done for this merge window, but I'm
> probably lucky if I get at least parts done.  Too much distraction.

Thanks again for taking care of this,
Christian.

>
>> Hmm, not entirely sure why.. you should be on the cc list for each
>> individual patch.
> They finally made it, although even with the 

Re: [PATCH v5 06/18] compat_ioctl: move WDIOC handling into wdt drivers

2019-08-15 Thread Guenter Roeck
On Wed, Aug 14, 2019 at 10:49:18PM +0200, Arnd Bergmann wrote:
> All watchdog drivers implement the same set of ioctl commands, and
> fortunately all of them are compatible between 32-bit and 64-bit
> architectures.
> 
> Modern drivers always go through drivers/watchdog/wdt.c as an abstraction
> layer, but older ones implement their own file_operations on a character
> device for this.
> 
> Move the handling from fs/compat_ioctl.c into the individual drivers.
> 
> Note that most of the legacy drivers will never be used on 64-bit
> hardware, because they are for an old 32-bit SoC implementation, but
> doing them all at once is safer than trying to guess which ones do
> or do not need the compat_ioctl handling.
> 
> Signed-off-by: Arnd Bergmann 

Reviewed-by: Guenter Roeck 

This patch doesn't seem to have a useful base (or at least git says so).
It does not apply to mainline nor to my own watchdog-next branch.
I assume you plan to apply the entire series together. Please note
that there will be conflicts against watchdog-next when you do so.

Guenter
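
For readers following along, the per-driver change in this series is
typically a one-line addition to the legacy driver's file_operations; a
representative sketch (the wdt_* names are placeholders), with
compat_ptr_ioctl being the generic helper the series relies on:

static const struct file_operations wdt_fops = {
	.owner		= THIS_MODULE,
	.write		= wdt_write,
	.unlocked_ioctl	= wdt_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,	/* WDIOC ioctls are compat-clean */
	.open		= wdt_open,
	.release	= wdt_release,
};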


[Bug 204371] BUG kmalloc-4k (Tainted: G W ): Object padding overwritten

2019-08-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=204371

--- Comment #30 from Erhard F. (erhar...@mailbox.org) ---
On Thu, 15 Aug 2019 17:11:36 +
bugzilla-dae...@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=204371
> 
> --- Comment #29 from Christophe Leroy (christophe.le...@c-s.fr) ---
> Looks good. Does it work better ?
Had some trouble getting the dmesg. With these modifications the btrfs module
hiccups as soon as it gets loaded, during btrfs selftests:

[...]
[  167.258266] Btrfs loaded, crc32c=crc32c-generic, debug=on
[  167.259388] BTRFS: selftest: sectorsize: 4096  nodesize: 4096
[  167.259602] BTRFS: selftest: running btrfs free space cache tests
[  167.259943] BTRFS: selftest: running extent only tests
[  167.260201] BTRFS: selftest: running bitmap only tests
[  167.260501] BTRFS: selftest: running bitmap and extent tests
[  167.260963] WARNING: CPU: 0 PID: 266 at mm/slub.c:1846
___slab_alloc.constprop.75+0x2ac/0x380
[  167.261277] Modules linked in: btrfs(+) auth_rpcgss nfsv4 dns_resolver nfs
lockd grace sunrpc input_leds joydev b43legacy led_class hid_generic mac80211
usbhid hid snd_aoa_codec_tas snd_aoa_fabric_layout snd_aoa cfg80211 rfkill
libarc4 evdev ohci_pci xor zstd_decompress zstd_compress zlib_deflate radeon
raid6_pq zlib_inflate therm_windtunnel ehci_pci ohci_hcd hwmon i2c_algo_bit
ehci_hcd backlight drm_kms_helper sungem firewire_ohci syscopyarea sungem_phy
sysfillrect sr_mod firewire_core sysimgblt fb_sys_fops cdrom crc_itu_t
snd_aoa_i2sbus snd_aoa_soundbus ttm snd_pcm usbcore snd_timer drm usb_common
snd soundcore ssb uninorth_agp drm_panel_orientation_quirks agpgart lzo
lzo_compress lzo_decompress zram zsmalloc
[  167.263795] CPU: 0 PID: 266 Comm: modprobe Tainted: GW
5.3.0-rc4+ #3
[  167.264074] NIP:  c0196ddc LR: c0196dd4 CTR: c019711c
[  167.264236] REGS: ecde9a70 TRAP: 0700   Tainted: GW 
(5.3.0-rc4+)
[  167.264488] MSR:  00021032   CR: 28224222  XER: 
[  167.264709] 
   GPR00: c0196dd4 ecde9b28 eb353380  ef3d3600 0003000e
 0007000f 
   GPR08: 0001 0001 ef42488c ecde9b28 48244222 00a9eff4
00a64e74 0004 
   GPR16:  ee800810  c080  ee800824
f203de58 c07fea34 
   GPR24: 00210d00 ef424888 0d40 ee800800 ef3d3600 ee8032e0
 eedb1af8 
[  167.265857] NIP [c0196ddc] ___slab_alloc.constprop.75+0x2ac/0x380
[  167.266052] LR [c0196dd4] ___slab_alloc.constprop.75+0x2a4/0x380
[  167.266248] Call Trace:
[  167.266308] [ecde9b28] [c0196dd4] ___slab_alloc.constprop.75+0x2a4/0x380
(unreliable)
[  167.266567] [ecde9bb8] [c0196ef0] __slab_alloc.constprop.74+0x40/0x6c
[  167.266776] [ecde9be8] [c0197198] kmem_cache_alloc_trace+0x7c/0x1a0
[  167.267321] [ecde9c28] [f203de58] test_add_free_space_entry+0xf0/0x214
[btrfs]
[  167.267672] [ecde9c78] [f207e238] btrfs_test_free_space_cache+0x918/0x1308
[btrfs]
[  167.268012] [ecde9cd8] [f207ad3c] btrfs_run_sanity_tests+0x8c/0x144 [btrfs]
[  167.268327] [ecde9d08] [f1678cd0] init_btrfs_fs+0xd4/0x12c [btrfs]
[  167.268536] [ecde9d28] [c00052cc] do_one_initcall+0x54/0x288
[  167.282204] [ecde9d98] [c00bebf8] do_init_module+0x60/0x1dc
[  167.295710] [ecde9dc8] [c00c0ad4] load_module+0x1ca4/0x1e18
[  167.308943] [ecde9ea8] [c00c0df4] sys_finit_module+0x98/0xb8
[  167.322086] [ecde9f38] [c0014274] ret_from_syscall+0x0/0x34
[  167.335151] --- interrupt: c01 at 0x8ed2c4
   LR = 0xa757c4
[  167.361356] Instruction dump:
[  167.374279] 7e048378 7f83e378 60e70001 90e10030 4bffda0d 2f83 41beff20
7f84e378 
[  167.387497] 7f63db78 4bffdd35 7e090034 5529d97e <0f09> 2f90 41beff00
7e527a14 
[  167.400820] irq event stamp: 0
[  167.414008] hardirqs last  enabled at (0): [<>] 0x0
[  167.427196] hardirqs last disabled at (0): []
copy_process+0x474/0x1368
[  167.440311] softirqs last  enabled at (0): []
copy_process+0x474/0x1368
[  167.453198] softirqs last disabled at (0): [<>] 0x0
[  167.465906] ---[ end trace 74450a6aa18e595d ]---
[  167.481091] BTRFS: selftest: running space stealing from bitmap to extent
tests
[  167.496220] BTRFS: selftest: running extent buffer operation tests
[  167.510905] BTRFS: selftest: running btrfs_split_item tests
[  167.525961] BTRFS: selftest: running extent I/O tests
[  167.540569] BTRFS: selftest: running find delalloc tests
[  168.104731] BTRFS: selftest: running find_first_clear_extent_bit test
[  168.119119] BTRFS: selftest: running extent buffer bitmap tests
[  168.463591] BTRFS: selftest: running inode tests
[  168.477316] BTRFS: selftest: running btrfs_get_extent tests
[  168.491784] BTRFS: selftest: running hole first btrfs_get_extent test
[  168.506234] BTRFS: selftest: running outstanding_extents tests
[  168.520925] BTRFS: selftest: running qgroup tests
[  168.535039] BTRFS: selftest: running qgroup add/remove tests
[  168.549344] BTRFS: selftest: running qgroup multiple refs test
[  168.563713] BTRFS: 

Re: [PATCH 0/6] drm+dma: cache support for arm, etc

2019-08-15 Thread Christoph Hellwig
On Thu, Aug 15, 2019 at 06:54:39AM -0700, Rob Clark wrote:
> On Wed, Aug 14, 2019 at 11:51 PM Christoph Hellwig  wrote:
> >
> > As said before I don't think these low-level helpers are the
> > right API to export, but even if they did you'd just cover a tiny
> > subset of the architectures.
> 
> Are you thinking instead something like:
> 
> void dma_sync_sg_for_{cpu,device}(struct device *dev, struct scatterlist *sgl,
>   int nents, enum dma_data_direction dir)
> {
> for_each_sg(sgl, sg, nents, i) {
> arch_sync_dma_for_..(dev, sg_phys(sg), sg->length, dir);
> }
> }
> EXPORT_SYMBOL_GPL(dma_sync_sg_for_..)
> 
> or did you have something else in mind?

No.  We really need an interface thay says please give me uncached
memory (for some definition of uncached that includes that grapics
drivers call write combine), and then let the architecture do the right
thing.  Basically dma_alloc_coherent with DMA_ATTR_NO_KERNEL_MAPPING
is superficially close to what you want, except that the way the drm
drivers work you can't actually use it.

The reason for that is if we can we really need to not create another
uncachable alias, but instead change the page attributes in place.
On x86 we can and must do that for example, and based on the
conversation with Will arm64 could do that fairly easily.  arm32 can
right now only do that for CMA, though.

The big question is what API do we want.  I had a pretty similar
discussion with Christian on doing such an allocation for amdgpu,
where the device normally is cache coherent, but they actually want
to turn it into non-coherent by using PCIe unsnooped transactions.

Here is my high level plan, which still has a few lose end:

 (1) provide a new API:

struct page *dma_alloc_pages(struct device *dev, unsigned nr_pages,
gfp_t gfp, unsigned long flags);
void dma_free_pages(struct device *dev, unsigned nr_pages,
unsigned long flags);

 These give you back page backed memory that is guaranteed to be
 addressable by the device (no swiotlb or similar).  The memory can
 then be mapped using dma_map*, including unmap and dma_sync to
 bounce ownership around.  This is the replacement for the current
 dma_alloc_attrs with DMA_ATTR_NON_CONSISTENT API, that is rather
 badly defined.

 (2) Add support for DMA_ATTR_NO_KERNEL_MAPPING to this new API instead
 of dma_alloc_attrs.  The initial difference with that flag is just
 that we allow highmem, but in the future we could also unmap this
 memory from the kernel linear mapping entirely on architectures
 where we can easily do that.

 (3) Add a dma_pages_map/dma_pages_unmap or similar API that allows you
 to get a kernel mapping for parts or all of a
 DMA_ATTR_NO_KERNEL_MAPPING allocation.  This is to replace things
 like your open-coded vmap in msm (or similarly elsewhere in dma-buf
 providers).

 (4) Add support for a DMA_ATTR_UNCACHABLE flags (or similar) to the new
 API, that maps the pages as uncachable iff they have a kernel
 mapping, including invalidating the caches at time of this page
 attribute change (or creation of a new mapping).  This API will fail
 if the architecture does not allow in-place remapping.  Note that for
 arm32 we could always dip into the CMA pool if one is present to not
 fail.  We'll also need some helper to map from the DMA_ATTR_* flags
 to a pgprot for mapping the page to userspace.  There is also a few
 other weird bits here, e.g. on architectures like mips that use an
 uncached segment we'll have to fail use with the plain
 DMA_ATTR_UNCACHABLE flag, but it could be supported with
 DMA_ATTR_UNCACHABLE | DMA_ATTR_NO_KERNEL_MAPPING.

I was hoping to get most of this done for this merge window, but I'm
probably lucky if I get at least parts done.  Too much distraction.
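
To make the proposal concrete, usage of the API sketched in (1) and (2)
might look like this (entirely hypothetical -- dma_alloc_pages() is the
proposed interface, not an existing one):

struct page *pages;
dma_addr_t handle;

/* device-addressable pages, no kernel linear mapping required */
pages = dma_alloc_pages(dev, nr_pages, GFP_KERNEL,
			DMA_ATTR_NO_KERNEL_MAPPING);
if (!pages)
	return -ENOMEM;

/* ownership is then bounced around with the regular streaming API */
handle = dma_map_page(dev, pages, 0, nr_pages * PAGE_SIZE,
		      DMA_TO_DEVICE);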

> Hmm, not entirely sure why.. you should be on the cc list for each
> individual patch.

They finally made it, although even with the delay they only ended up
in the spam mailbox.  I still can't see them on the various mailing
lists.


Re: 5.2.7 kernel doesn't boot on G5

2019-08-15 Thread christophe leroy




On 15/08/2019 at 19:48, Christian Marillat wrote:

On 15 August 2019 19:29, christophe leroy  wrote:


On 15/08/2019 at 19:05, Mathieu Malaterre wrote:

Does that ring a bell to anyone here ? Thanks


Apparently that's 5.2.0, not 5.2.7


Yes, 5.2.7 is the Debian package version. Sorry for the mistake.



Can you test with latest stable version, ie 5.2.8 ?

Christophe




Re: 5.2.7 kernel doesn't boot on G5

2019-08-15 Thread christophe leroy



On 15/08/2019 at 19:05, Mathieu Malaterre wrote:

Does that ring a bell to anyone here ? Thanks


Apparently that's 5.2.0, not 5.2.7

Christophe



-- Forwarded message -
To: 


Hi,

No log only a photo :

https://www.deb-multimedia.org/tests/20190812_143628.jpg

Christian






[Bug 204371] BUG kmalloc-4k (Tainted: G W ): Object padding overwritten

2019-08-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=204371

--- Comment #29 from Christophe Leroy (christophe.le...@c-s.fr) ---
Looks good. Does it work better ?

-- 
You are receiving this mail because:
You are on the CC list for the bug.

[Bug 204371] BUG kmalloc-4k (Tainted: G W ): Object padding overwritten

2019-08-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=204371

--- Comment #28 from Erhard F. (erhar...@mailbox.org) ---
On Thu, 15 Aug 2019 16:45:11 +
bugzilla-dae...@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=204371
> 
> --- Comment #27 from Christophe Leroy (christophe.le...@c-s.fr) ---
> Can you post the changes you did ?
> 
> Did you replace the two kzalloc() by get_zeroed_page()  as suggested ?
> If so, it looks like you missed one kfree() (in free_bitmap()) to be replaced
> by free_page().

Ah yes, I added the (unsigned long) part but forgot to replace kfree() with
free_page(). Now looks like this:

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 062be9dde4c6..c3eed8c3d3fe 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -764,7 +764,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
} else {
ASSERT(num_bitmaps);
num_bitmaps--;
-   e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
+   e->bitmap = (void *)get_zeroed_page(GFP_NOFS);
if (!e->bitmap) {
kmem_cache_free(
btrfs_free_space_cachep, e);
@@ -1881,7 +1881,7 @@ static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info)
 {
unlink_free_space(ctl, bitmap_info);
-   kfree(bitmap_info->bitmap);
+   free_page((unsigned long)bitmap_info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
ctl->total_bitmaps--;
ctl->op->recalc_thresholds(ctl);
@@ -2135,7 +2135,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
}

/* allocate the bitmap */
-   info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
+   info->bitmap = (void *)get_zeroed_page(GFP_NOFS);
spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
@@ -2146,7 +2146,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl
*ctl,

 out:
if (info) {
-   kfree(info->bitmap);
+   free_page((unsigned long)info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, info);
}

@@ -2802,7 +2802,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
if (entry->bytes == 0) {
ctl->free_extents--;
if (entry->bitmap) {
-   kfree(entry->bitmap);
+   free_page((unsigned long)entry->bitmap);
ctl->total_bitmaps--;
ctl->op->recalc_thresholds(ctl);
}

-- 
You are receiving this mail because:
You are on the CC list for the bug.

5.2.7 kernel doesn't boot on G5

2019-08-15 Thread Mathieu Malaterre
Does that ring a bell to anyone here ? Thanks

-- Forwarded message -
To: 


Hi,

No log only a photo :

https://www.deb-multimedia.org/tests/20190812_143628.jpg

Christian


Re: [PATCH 1/3] powerpc/xmon: Check for HV mode when dumping XIVE info from OPAL

2019-08-15 Thread Cédric Le Goater
On 15/08/2019 10:15, Cédric Le Goater wrote:
> On 15/08/2019 09:30, Jordan Niethe wrote:
>> On Wed, 2019-08-14 at 17:47 +0200, Cédric Le Goater wrote:
>>> Currently, the xmon 'dx' command calls OPAL to dump the XIVE state in
>>> the OPAL logs and also outputs some of the fields of the internal
>>> XIVE
>>> structures in Linux. The OPAL calls can only be done on baremetal
>>> (PowerNV) and they crash a pseries machine. Fix by checking the
>>> hypervisor feature of the CPU.
>>>
>>> Signed-off-by: Cédric Le Goater 
>>> ---
>>>  arch/powerpc/xmon/xmon.c | 17 ++---
>>>  1 file changed, 10 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
>>> index 14e56c25879f..25d4adccf750 100644
>>> --- a/arch/powerpc/xmon/xmon.c
>>> +++ b/arch/powerpc/xmon/xmon.c
>>> @@ -2534,13 +2534,16 @@ static void dump_pacas(void)
>>>  static void dump_one_xive(int cpu)
>>>  {
>>> unsigned int hwid = get_hard_smp_processor_id(cpu);
>>> -
>>> -   opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
>>> -   opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
>>> -   opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
>>> -   opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
>>> -   opal_xive_dump(XIVE_DUMP_VP, hwid);
>>> -   opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
>>> +   bool hv = cpu_has_feature(CPU_FTR_HVMODE);
>>> +
>>> +   if (hv) {
>>> +   opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
>>> +   opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
>>> +   opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
>>> +   opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
>>> +   opal_xive_dump(XIVE_DUMP_VP, hwid);
>>> +   opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
>>> +   }
>>>  
>>> if (setjmp(bus_error_jmp) != 0) {
>>> catch_memory_errors = 0;
>> dump_one_xive() / other xive functions are guarded by #ifdef
>> CONFIG_PPC_POWERNV in xmon.c aren't they? With this series would it be
>> that these guards can be removed?
> 
> One could compile without CONFIG_PPC_POWERNV but we would still want 
> these commands to be available for pseries. I missed that.


The changes below should be enough to activate support for the xive 
commands on pseries and powernv. The patch can come as a follow up.

Cheers,

C. 

--- linux.git.orig/arch/powerpc/xmon/xmon.c
+++ linux.git/arch/powerpc/xmon/xmon.c
@@ -238,7 +238,7 @@ Commands:\n\
   dt   dump the tracing buffers (uses printk)\n\
   dtc  dump the tracing buffers for current CPU (uses printk)\n\
 "
-#ifdef CONFIG_PPC_POWERNV
+#ifdef CONFIG_PPC_BOOK3S_64
 "  dx#   dump xive on CPU #\n\
   dxi#  dump xive irq state #\n\
   dxa   dump xive on all CPUs\n"
@@ -2530,7 +2530,7 @@ static void dump_pacas(void)
 }
 #endif
 
-#ifdef CONFIG_PPC_POWERNV
+#ifdef CONFIG_PPC_BOOK3S_64
 static void dump_one_xive(int cpu)
 {
unsigned int hwid = get_hard_smp_processor_id(cpu);
@@ -2632,7 +2632,7 @@ static void dump_xives(void)
else
dump_one_xive(xmon_owner);
 }
-#endif /* CONFIG_PPC_POWERNV */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 static void dump_by_size(unsigned long addr, long count, int size)
 {
@@ -2682,7 +2682,7 @@ dump(void)
return;
}
 #endif
-#ifdef CONFIG_PPC_POWERNV
+#ifdef CONFIG_PPC_BOOK3S_64
if (c == 'x') {
xmon_start_pagination();
dump_xives();



[Bug 204371] BUG kmalloc-4k (Tainted: G W ): Object padding overwritten

2019-08-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=204371

--- Comment #27 from Christophe Leroy (christophe.le...@c-s.fr) ---
Can you post the changes you did ?

Did you replace the two kzalloc() by get_zeroed_page()  as suggested ?
If so, it looks like you missed one kfree() (in free_bitmap()) to be replaced
by free_page().

-- 
You are receiving this mail because:
You are on the CC list for the bug.

[Bug 204371] BUG kmalloc-4k (Tainted: G W ): Object padding overwritten

2019-08-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=204371

--- Comment #26 from Erhard F. (erhar...@mailbox.org) ---
On Wed, 14 Aug 2019 20:33:51 +
bugzilla-dae...@bugzilla.kernel.org wrote:


> https://bugzilla.kernel.org/show_bug.cgi?id=204371
>
>--- Comment #24 from Christophe Leroy (christophe.le...@c-s.fr) ---
>It confirms what I suspected: due to some debug options, kzalloc() doesn't
>provide aligned areas.
>
>In __load_free_space_cache() can you replace 
>e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
>By
>e->bitmap = (void *)__get_free_page(GFP_NOFS | __GFP_ZERO);
>
>And same in insert_into_bitmap()
>
>Then replace the three kfree() which free bitmaps by something like
>free_page((unsigned long)entry->bitmap)
> 
> --- Comment #25 from Christophe Leroy (christophe.le...@c-s.fr) ---
> You can use get_zeroed_page(GFP_NOFS) instead of __get_free_page(GFP_NOFS |
> __GFP_ZERO)
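
The point of the suggested conversion, in short (a sketch): with slab
debug options enabled, kmalloc-class allocations may carry red zones
and padding, so kzalloc(PAGE_SIZE, ...) is no longer guaranteed to be
page aligned, while get_zeroed_page() hands out a whole page by
construction:

void *a = kzalloc(PAGE_SIZE, GFP_NOFS);      /* alignment not guaranteed */
void *b = (void *)get_zeroed_page(GFP_NOFS); /* always page-aligned */

free_page((unsigned long)b);  /* pair with free_page(), not kfree() */
kfree(a);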
Think I got everything right after a bit of searching...

Now I get this nice compact output:
[..]
[   46.579181] [ cut here ]
[   46.579378] kernel BUG at mm/slub.c:3952!
[   46.579513] Oops: Exception in kernel mode, sig: 5 [#1]
[   46.579699] BE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2 PowerMac
[   46.579898] Modules linked in: b43legacy led_class mac80211 cfg80211
snd_aoa_codec_tas snd_aoa_fabric_layout snd_aoa rfkill libarc4 evdev btrfs
therm_windtunnel ohci_pci xor zstd_decompress zstd_compress zlib_deflate
raid6_pq zlib_inflate radeon sr_mod firewire_ohci sungem hwmon snd_aoa_i2sbus
i2c_algo_bit backlight cdrom firewire_core sungem_phy crc_itu_t
snd_aoa_soundbus snd_pcm drm_kms_helper ohci_hcd syscopyarea ehci_pci snd_timer
sysfillrect snd sysimgblt fb_sys_fops ttm ehci_hcd soundcore drm
drm_panel_orientation_quirks usbcore uninorth_agp usb_common agpgart ssb lzo
lzo_compress lzo_decompress zram zsmalloc
[   46.582252] CPU: 0 PID: 261 Comm: umount Tainted: GW
5.3.0-rc4+ #2
[   46.582533] NIP:  c0198228 LR: c0198204 CTR: c01981a4
[   46.582708] REGS: dbbc1c10 TRAP: 0700   Tainted: GW 
(5.3.0-rc4+)
[   46.582990] MSR:  00029032   CR: 22008824  XER: 
[   46.583243] 
   GPR00: f110b444 dbbc1cc8 ec2944a0 ef4329f4 c07fe5f8 8950b76e
1032  
   GPR08: 2854c000 0001  dbbc1d18 c01981a4 008f5ff4
  
   GPR16:   bff5d9dc bff5d9c4 0001 
  
   GPR24:  f110b444 0100 dba800b8 f11b c07fe5f8
ed3d5000 ef4329f4 
[   46.584505] NIP [c0198228] kfree+0x84/0x29c
[   46.584642] LR [c0198204] kfree+0x60/0x29c
[   46.584774] Call Trace:
[   46.585254] [dbbc1cc8] [f110b4e0]
__btrfs_remove_free_space_cache_locked+0x58/0x6c [btrfs] (unreliable)
[   46.585717] [dbbc1d18] [f110b444] free_bitmap+0x24/0x68 [btrfs]
[   46.586008] [dbbc1d38] [f110b4f0]
__btrfs_remove_free_space_cache_locked+0x68/0x6c [btrfs]
[   46.586388] [dbbc1d58] [f110e6ac] btrfs_remove_free_space_cache+0x38/0x84
[btrfs]
[   46.586732] [dbbc1d78] [f10a77a4] btrfs_free_block_groups+0x164/0x24c
[btrfs]
[   46.587073] [dbbc1db8] [f10b993c] close_ctree+0x230/0x2c4 [btrfs]
[   46.587303] [dbbc1df8] [c01ab508] generic_shutdown_super+0x80/0x110
[   46.587531] [dbbc1e18] [c01ab718] kill_anon_super+0x18/0x30
[   46.587802] [dbbc1e38] [f10908b4] btrfs_kill_super+0x18/0x30 [btrfs]
[   46.588039] [dbbc1e58] [c01abdbc] deactivate_locked_super+0x54/0xa4
[   46.588269] [dbbc1e78] [c01cbcb4] cleanup_mnt+0x6c/0xe4
[   46.588456] [dbbc1ea8] [c0054f50] task_work_run+0xa0/0xc0
[   46.588645] [dbbc1ed8] [c000bc44] do_notify_resume+0x160/0x2c8
[   46.588857] [dbbc1f38] [c0014800] do_user_signal+0x2c/0x34
[   46.589052] --- interrupt: c00 at 0x7593d4
   LR = 0x7593b8
[   46.589252] Instruction dump:
[   46.589340] 4bffade1 7c7f1b78 4bffadbd 8123 71290200 40a200f8 813f
552987ff 
[   46.589644] 4082000c 813f0004 552907fe 69290001 <0f09> 7fe3fb78 4bffadcd
7c641b78 
[   46.589961] ---[ end trace 0164244520bfd23a ]---

-- 
You are receiving this mail because:
You are on the CC list for the bug.

Re: [PATCH 7/8] parisc: don't set ARCH_NO_COHERENT_DMA_MMAP

2019-08-15 Thread James Bottomley
On Thu, 2019-08-08 at 19:00 +0300, Christoph Hellwig wrote:
> parisc is the only architecture that sets ARCH_NO_COHERENT_DMA_MMAP
> when an MMU is enabled.  AFAIK this is because parisc CPUs use VIVT
> caches,

We're actually VIPT but the same principle applies.

>  which means exporting normally cachable memory to userspace is
> relatively dangrous due to cache aliasing.
> 
> But normally cachable memory is only allocated by dma_alloc_coherent
> on parisc when using the sba_iommu or ccio_iommu drivers, so just
> remove the .mmap implementation for them so that we don't have to set
> ARCH_NO_COHERENT_DMA_MMAP, which I plan to get rid of.

So I don't think this is quite right.  We have three architectural
variants essentially (hidden behind about 12 cpu types):

   1. pa70xx: These can't turn off page caching, so they were the
      non-coherent problem case
   2. pa71xx: These can manufacture coherent memory simply by turning
      off the cache on a per-page basis
   3. pa8xxx: these have a full cache flush coherence mechanism.

(I might have this slightly wrong: I vaguely remember the pa71xxlc
variants have some weird cache quirks for DMA as well)

So I think pa70xx can't mmap; pa71xx can, provided we mark the page as
uncached (which should already have happened at allocation); and pa8xxx
can always mmap dma memory without any special tricks.

James



Re: next take at setting up a dma mask by default for platform devices

2019-08-15 Thread Greg Kroah-Hartman
On Thu, Aug 15, 2019 at 03:25:31PM +0200, Christoph Hellwig wrote:
> On Thu, Aug 15, 2019 at 03:23:18PM +0200, Greg Kroah-Hartman wrote:
> > I've taken the first 2 patches for 5.3-final.  Given that patch 3 needs
> > to be fixed, I'll wait for a respin of these before considering them.
> 
> I have a respun version ready, but I'd really like to hear some
> comments from usb developers about the approach before spamming
> everyone again..

Spam away, we can take it :)


Re: [PATCH 6/6] driver core: initialize a default DMA mask for platform device

2019-08-15 Thread Greg Kroah-Hartman
On Thu, Aug 15, 2019 at 03:38:12PM +0200, Christoph Hellwig wrote:
> On Thu, Aug 15, 2019 at 03:03:25PM +0200, Greg Kroah-Hartman wrote:
> > > --- a/include/linux/platform_device.h
> > > +++ b/include/linux/platform_device.h
> > > @@ -24,6 +24,7 @@ struct platform_device {
> > >   int id;
> > >   boolid_auto;
> > >   struct device   dev;
> > > + u64 dma_mask;
> > 
> > Why is the dma_mask in 'struct device' which is part of this structure,
> > not sufficient here?  Shouldn't the "platform" be setting that up
> > correctly already in the "archdata" type callback?
> 
> Because the dma_mask in struct device is a pointer that needs to point
> to something, and this is the best space we can allocate for 'something'.
> m68k and powerpc currently do something roughly equivalent at the moment,
> while everyone else just has horrible, horrible hacks.  As mentioned in
> the changelog the intent of this patch is that we treat platform devices
> like any other bus, where the bus allocates the space for the dma_mask.
> The long term plan is to eventually kill that weird pointer indirection
> that doesn't help anyone, but for that we need to sort out the basics
> first.

Ah, missed that, sorry.  Ok, no objection from me.  Might as well respin
this series and I can queue it up after 5.3-rc5 is out (which will have
your first 2 patches in it.)

thanks,

greg k-h


Re: [PATCH 3/3] papr/scm: Add bad memory ranges to nvdimm bad ranges

2019-08-15 Thread Santosh Sivaraj
"Oliver O'Halloran"  writes:

> On Wed, Aug 14, 2019 at 6:25 PM Santosh Sivaraj  wrote:
>>
>> Subscribe to the MCE notification and add the physical address which
>> generated a memory error to nvdimm bad range.
>>
>> Signed-off-by: Santosh Sivaraj 
>> ---
>>  arch/powerpc/platforms/pseries/papr_scm.c | 65 +++
>>  1 file changed, 65 insertions(+)
>>
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> index a5ac371a3f06..4d25c98a9835 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -12,6 +12,8 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>> +#include 
>>
>>  #include 
>>
>> @@ -39,8 +41,12 @@ struct papr_scm_priv {
>> struct resource res;
>> struct nd_region *region;
>> struct nd_interleave_set nd_set;
>> +   struct list_head list;
>
> list is not a meaningful name. call it something more descriptive.
>
>>  };
>>
>> +LIST_HEAD(papr_nd_regions);
>> +DEFINE_MUTEX(papr_ndr_lock);
>
> Should this be a mutex or a spinlock? I don't know what context the
> mce notifier is called from, but if it's not sleepable then a mutex
> will cause problems. Did you test this with lockdep enabled?

This should be a mutex; we are called from a blocking notifier.

>
>> +
>>  static int drc_pmem_bind(struct papr_scm_priv *p)
>>  {
>> unsigned long ret[PLPAR_HCALL_BUFSIZE];
>> @@ -364,6 +370,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>> dev_info(dev, "Region registered with target node %d and online node %d",
>>  target_nid, online_nid);
>>
>> +   mutex_lock(&papr_ndr_lock);
>> +   list_add_tail(&p->list, &papr_nd_regions);
>> +   mutex_unlock(&papr_ndr_lock);
>> +
>
> Where's the matching remove when we unbind the driver?

Missed it completely. Will fix it.

>
>> return 0;
>>
>>  err:   nvdimm_bus_unregister(p->bus);
>> @@ -371,6 +381,60 @@ err:   nvdimm_bus_unregister(p->bus);
>> return -ENXIO;
>>  }
>>
>> +static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
>> +void *data)
>> +{
>> +   struct machine_check_event *evt = data;
>> +   struct papr_scm_priv *p;
>> +   u64 phys_addr;
>> +
>> +   if (evt->error_type != MCE_ERROR_TYPE_UE)
>> +   return NOTIFY_DONE;
>> +
>> +   if (list_empty(_nd_regions))
>> +   return NOTIFY_DONE;
>> +
>> +   phys_addr = evt->u.ue_error.physical_address +
>> +   (evt->u.ue_error.effective_address & ~PAGE_MASK);
>
> Wait what? Why is physical_address page aligned, but effective_address
> not? Not a problem with this patch, but still, what the hell?

Not sure why, but it's the way now. I can see if I can update it if it makes
sense in a later patch.

>
>> +   if (!evt->u.ue_error.physical_address_provided ||
>> +   !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
>> +   return NOTIFY_DONE;
>> +
>> +   mutex_lock(&papr_ndr_lock);
>> +   list_for_each_entry(p, &papr_nd_regions, list) {
>> +   struct resource res = p->res;
>> +   u64 aligned_addr;
>> +
>
>> +   if (res.start > phys_addr)
>> +   continue;
>> +
>> +   if (res.end < phys_addr)
>> +   continue;
>
> surely there's a helper for this
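
One existing helper that could replace the open-coded comparison (a
sketch, wrapping the address in a one-byte resource so that
resource_contains() from <linux/ioport.h> applies):

	struct resource addr = DEFINE_RES_MEM(phys_addr, 1);

	if (!resource_contains(&p->res, &addr))
		continue;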
>
>> +
>> +   aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
>> +   pr_debug("Add memory range (0x%llx -- 0x%llx) as bad 
>> range\n",
>> +aligned_addr, aligned_addr + L1_CACHE_BYTES);
>> +
>> +   if (nvdimm_bus_add_badrange(p->bus,
>> +   aligned_addr, L1_CACHE_BYTES))
>> +   pr_warn("Failed to add bad range (0x%llx -- 
>> 0x%llx)\n",
>> +   aligned_addr, aligned_addr + L1_CACHE_BYTES);
>> +
>> +   nvdimm_region_notify(p->region,
>> +NVDIMM_REVALIDATE_POISON);
>> +
>> +   break;
>
> nit: you can avoid stacking indetation levels by breaking out of the
> loop as soon as you've found the region you're looking for.

True.

>
>> +   }
>> +   mutex_unlock(&papr_ndr_lock);
>> +
>> +   return NOTIFY_OK;
>> +}
>> +
>> +static struct notifier_block mce_ue_nb = {
>> +   .notifier_call = handle_mce_ue
>> +};
>> +
>>  static int papr_scm_probe(struct platform_device *pdev)
>>  {
>> struct device_node *dn = pdev->dev.of_node;
>> @@ -456,6 +520,7 @@ static int papr_scm_probe(struct platform_device *pdev)
>> goto err2;
>>
>> platform_set_drvdata(pdev, p);
>> +   mce_register_notifier(&mce_ue_nb);
>
> Either get rid of the global region list and have a notifier block in
> each device's driver private data, or keep the global list and
> register the notifier in module_init(). Re-registering the 

Re: [PATCH v10 2/7] powerpc/mce: Fix MCE handling for huge pages

2019-08-15 Thread Sasha Levin
Hi,

[This is an automated email]

This commit has been processed because it contains a "Fixes:" tag,
fixing commit: ba41e1e1ccb9 powerpc/mce: Hookup derror (load/store) UE errors.

The bot has tested the following trees: v5.2.8, v4.19.66.

v5.2.8: Build OK!
v4.19.66: Failed to apply! Possible dependencies:
360cae313702 ("KVM: PPC: Book3S HV: Nested guest entry via hypercall")
41f4e631daf8 ("KVM: PPC: Book3S HV: Extract PMU save/restore operations as 
C-callable functions")
884dfb722db8 ("KVM: PPC: Book3S HV: Simplify machine check handling")
89329c0be8bd ("KVM: PPC: Book3S HV: Clear partition table entry on vm 
teardown")
8e3f5fc1045d ("KVM: PPC: Book3S HV: Framework and hcall stubs for nested 
virtualization")
95a6432ce903 ("KVM: PPC: Book3S HV: Streamlined guest entry/exit path on P9 
for radix guests")
a43c1590426c ("powerpc/pseries: Flush SLB contents on SLB MCE errors.")
c05772018491 ("powerpc/64s: Better printing of machine check info for guest 
MCEs")
d24ea8a7336a ("KVM: PPC: Book3S: Simplify external interrupt handling")
df709a296ef7 ("KVM: PPC: Book3S HV: Simplify real-mode interrupt handling")
f7035ce9f1df ("KVM: PPC: Book3S HV: Move interrupt delivery on guest entry 
to C code")


NOTE: The patch will not be queued to stable trees until it is upstream.

How should we proceed with this patch?

--
Thanks,
Sasha


Re: [PATCH 6/6] driver core: initialize a default DMA mask for platform device

2019-08-15 Thread Christoph Hellwig
On Thu, Aug 15, 2019 at 03:03:25PM +0200, Greg Kroah-Hartman wrote:
> > --- a/include/linux/platform_device.h
> > +++ b/include/linux/platform_device.h
> > @@ -24,6 +24,7 @@ struct platform_device {
> > int id;
> > boolid_auto;
> > struct device   dev;
> > +   u64 dma_mask;
> 
> Why is the dma_mask in 'struct device' which is part of this structure,
> not sufficient here?  Shouldn't the "platform" be setting that up
> correctly already in the "archdata" type callback?

Because the dma_mask in struct device is a pointer that needs to point
to something, and this is the best space we can allocate for 'something'.
m68k and powerpc currently do something roughly equivalent at the moment,
while everyone else just has horrible, horrible hacks.  As mentioned in
the changelog the intent of this patch is that we treat platform devices
like any other bus, where the bus allocates the space for the dma_mask.
The long term plan is to eventually kill that weird pointer indirection
that doesn't help anyone, but for that we need to sort out the basics
first.
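
In other words, reduced to the two relevant members (simplified sketch,
not the full structs):

	struct device {
		u64 *dma_mask;		/* pointer: needs storage elsewhere */
		u64 coherent_dma_mask;	/* plain scalar, no indirection */
	};

	struct platform_device {
		struct device dev;
		u64 dma_mask;		/* the storage: dev.dma_mask points here */
	};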


Re: [PATCH 6/6] driver core: initialize a default DMA mask for platform device

2019-08-15 Thread Christoph Hellwig
On Wed, Aug 14, 2019 at 04:49:13PM +0100, Robin Murphy wrote:
>> because we have to support platform_device structures that are
>> statically allocated.
>
> This would be a good point to also get rid of the long-standing bodge in 
> platform_device_register_full().

platform_device_register_full looks odd to start with, especially
as the documentation is rather lacking...

>>   +static void setup_pdev_archdata(struct platform_device *pdev)
>
> Bikeshed: painting the generic DMA API properties as "archdata" feels a bit 
> off-target :/
>
>> +{
>> +if (!pdev->dev.coherent_dma_mask)
>> +pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
>> +if (!pdev->dma_mask)
>> +pdev->dma_mask = DMA_BIT_MASK(32);
>> +if (!pdev->dev.dma_mask)
>> +pdev->dev.dma_mask = &pdev->dma_mask;
>> +arch_setup_pdev_archdata(pdev);
>
> AFAICS m68k's implementation of that arch hook becomes entirely redundant 
> after this change, so may as well go. That would just leave powerpc's 
> actual archdata, which at a glance looks like it could probably be cleaned 
> up with not *too* much trouble.

Actually I think we can just kill both off.  At that point archdata
is indeed entirely misnamed.


Re: next take at setting up a dma mask by default for platform devices

2019-08-15 Thread Christoph Hellwig
On Thu, Aug 15, 2019 at 03:23:18PM +0200, Greg Kroah-Hartman wrote:
> I've taken the first 2 patches for 5.3-final.  Given that patch 3 needs
> to be fixed, I'll wait for a respin of these before considering them.

I have a respun version ready, but I'd really like to hear some
comments from usb developers about the approach before spamming
everyone again..


Re: next take at setting up a dma mask by default for platform devices

2019-08-15 Thread Greg Kroah-Hartman
On Sun, Aug 11, 2019 at 10:05:14AM +0200, Christoph Hellwig wrote:
> Hi all,
> 
> this is another attempt to make sure the dma_mask pointer is always
> initialized for platform devices.  Not doing so leads to lots of
> boilerplate code, and makes platform devices different from all our
> major busses like PCI where we always set up a dma_mask.  In the long
> run this should also help to eventually make dma_mask a scalar value
> instead of a pointer and remove even more cruft.
> 
> The bigger blocker for this last time was the fact that the usb
> subsystem uses the presence or lack of a dma_mask to check if the core
> should do dma mapping for the driver, which is highly unusual.  So we
> fix this first.  Note that this has some overlap with the pending
> desire to use the proper dma_mmap_coherent helper for mapping usb
> buffers.  The first two patches from this series should probably
> go into 5.3 and then be used as the basis for the decision to use
> dma_mmap_coherent.

I've taken the first 2 patches for 5.3-final.  Given that patch 3 needs
to be fixed, I'll wait for a respin of these before considering them.

thanks,

greg k-h


Re: [PATCH 3/3] papr/scm: Add bad memory ranges to nvdimm bad ranges

2019-08-15 Thread Oliver O'Halloran
On Wed, Aug 14, 2019 at 6:25 PM Santosh Sivaraj  wrote:
>
> Subscribe to the MCE notification and add the physical address which
> generated a memory error to nvdimm bad range.
>
> Signed-off-by: Santosh Sivaraj 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 65 +++
>  1 file changed, 65 insertions(+)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index a5ac371a3f06..4d25c98a9835 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -12,6 +12,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>
>  #include 
>
> @@ -39,8 +41,12 @@ struct papr_scm_priv {
> struct resource res;
> struct nd_region *region;
> struct nd_interleave_set nd_set;
> +   struct list_head list;

'list' is not a meaningful name. Call it something more descriptive.

>  };
>
> +LIST_HEAD(papr_nd_regions);
> +DEFINE_MUTEX(papr_ndr_lock);

Should this be a mutex or a spinlock? I don't know what context the
mce notifier is called from, but if it's not sleepable then a mutex
will cause problems. Did you test this with lockdep enabled?
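
A cheap way to catch that during testing (sketch, not for merging): put a
might_sleep() at the top of the handler, which will splat under
CONFIG_DEBUG_ATOMIC_SLEEP if the notifier ever runs in atomic context:

	static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
				 void *data)
	{
		/* splats under CONFIG_DEBUG_ATOMIC_SLEEP if we're atomic */
		might_sleep();

		/* ... rest of the handler as in the patch ... */
		return NOTIFY_DONE;
	}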

> +
>  static int drc_pmem_bind(struct papr_scm_priv *p)
>  {
> unsigned long ret[PLPAR_HCALL_BUFSIZE];
> @@ -364,6 +370,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
> dev_info(dev, "Region registered with target node %d and 
> online node %d",
>  target_nid, online_nid);
>
> +   mutex_lock(&papr_ndr_lock);
> +   list_add_tail(&p->list, &papr_nd_regions);
> +   mutex_unlock(&papr_ndr_lock);
> +

Where's the matching remove when we unbind the driver?
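
Presumably something along these lines in papr_scm_remove(), before the
region is torn down (untested sketch):

	mutex_lock(&papr_ndr_lock);
	list_del(&p->list);
	mutex_unlock(&papr_ndr_lock);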

> return 0;
>
>  err:   nvdimm_bus_unregister(p->bus);
> @@ -371,6 +381,60 @@ err:   nvdimm_bus_unregister(p->bus);
> return -ENXIO;
>  }
>
> +static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
> +void *data)
> +{
> +   struct machine_check_event *evt = data;
> +   struct papr_scm_priv *p;
> +   u64 phys_addr;
> +
> +   if (evt->error_type != MCE_ERROR_TYPE_UE)
> +   return NOTIFY_DONE;
> +
> +   if (list_empty(_nd_regions))
> +   return NOTIFY_DONE;
> +
> +   phys_addr = evt->u.ue_error.physical_address +
> +   (evt->u.ue_error.effective_address & ~PAGE_MASK);

Wait what? Why is physical_address page aligned, but effective_address
not? Not a problem with this patch, but still, what the hell?
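
(The arithmetic itself at least works out: the EA contributes only the byte
offset within the page, e.g. with made-up values physical_address =
0x20004000 and effective_address = 0xc000000000000123 this gives
phys_addr = 0x20004000 + 0x123 = 0x20004123.)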

> +   if (!evt->u.ue_error.physical_address_provided ||
> +   !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
> +   return NOTIFY_DONE;
> +
> +   mutex_lock(&papr_ndr_lock);
> +   list_for_each_entry(p, &papr_nd_regions, list) {
> +   struct resource res = p->res;
> +   u64 aligned_addr;
> +

> +   if (res.start > phys_addr)
> +   continue;
> +
> +   if (res.end < phys_addr)
> +   continue;

surely there's a helper for this
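
e.g. the two checks collapse to a single condition, and using p->res
directly also avoids copying the whole struct (sketch):

	if (phys_addr < p->res.start || phys_addr > p->res.end)
		continue;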

> +
> +   aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
> +   pr_debug("Add memory range (0x%llx -- 0x%llx) as bad range\n",
> +aligned_addr, aligned_addr + L1_CACHE_BYTES);
> +
> +   if (nvdimm_bus_add_badrange(p->bus,
> +   aligned_addr, L1_CACHE_BYTES))
> +   pr_warn("Failed to add bad range (0x%llx -- 0x%llx)\n",
> +   aligned_addr, aligned_addr + L1_CACHE_BYTES);
> +
> +   nvdimm_region_notify(p->region,
> +NVDIMM_REVALIDATE_POISON);
> +
> +   break;

nit: you can avoid stacking indentation levels by breaking out of the
loop as soon as you've found the region you're looking for.

> +   }
> +   mutex_unlock(&papr_ndr_lock);
> +
> +   return NOTIFY_OK;
> +}
> +
> +static struct notifier_block mce_ue_nb = {
> +   .notifier_call = handle_mce_ue
> +};
> +
>  static int papr_scm_probe(struct platform_device *pdev)
>  {
> struct device_node *dn = pdev->dev.of_node;
> @@ -456,6 +520,7 @@ static int papr_scm_probe(struct platform_device *pdev)
> goto err2;
>
> platform_set_drvdata(pdev, p);
> +   mce_register_notifier(&mce_ue_nb);

Either get rid of the global region list and have a notifier block in
each device's driver private data, or keep the global list and
register the notifier in module_init(). Re-registering the notifier
each time a separate device is probed seems very sketchy.

> return 0;
>
> --
> 2.21.0
>


Re: [PATCH 6/6] driver core: initialize a default DMA mask for platform device

2019-08-15 Thread Greg Kroah-Hartman
On Sun, Aug 11, 2019 at 10:05:20AM +0200, Christoph Hellwig wrote:
> We still treat devices without a DMA mask as defaulting to 32-bits for
> both mask, but a few releases ago we've started warning about such
> cases, as they require special cases to work around this sloppiness.
> Add a dma_mask field to struct platform_device so that we can initialize
> the dma_mask pointer in struct device and initialize both masks to
> 32-bits by default.  Architectures can still override this in
> arch_setup_pdev_archdata if needed.
> 
> Note that the code looks a little odd with the various conditionals
> because we have to support platform_device structures that are
> statically allocated.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/base/platform.c | 15 +--
>  include/linux/platform_device.h |  1 +
>  2 files changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/base/platform.c b/drivers/base/platform.c
> index ec974ba9c0c4..b216fcb0a8af 100644
> --- a/drivers/base/platform.c
> +++ b/drivers/base/platform.c
> @@ -264,6 +264,17 @@ struct platform_object {
>   char name[];
>  };
>  
> +static void setup_pdev_archdata(struct platform_device *pdev)
> +{
> + if (!pdev->dev.coherent_dma_mask)
> + pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
> + if (!pdev->dma_mask)
> + pdev->dma_mask = DMA_BIT_MASK(32);
> + if (!pdev->dev.dma_mask)
> + pdev->dev.dma_mask = &pdev->dma_mask;
> + arch_setup_pdev_archdata(pdev);
> +};
> +
>  /**
>   * platform_device_put - destroy a platform device
>   * @pdev: platform device to free
> @@ -310,7 +321,7 @@ struct platform_device *platform_device_alloc(const char 
> *name, int id)
>   pa->pdev.id = id;
>   device_initialize(&pa->pdev.dev);
>   pa->pdev.dev.release = platform_device_release;
> - arch_setup_pdev_archdata(&pa->pdev);
> + setup_pdev_archdata(&pa->pdev);
>   }
>  
>   return pa ? &pa->pdev : NULL;
> @@ -512,7 +523,7 @@ EXPORT_SYMBOL_GPL(platform_device_del);
>  int platform_device_register(struct platform_device *pdev)
>  {
>   device_initialize(&pdev->dev);
> - arch_setup_pdev_archdata(pdev);
> + setup_pdev_archdata(pdev);
>   return platform_device_add(pdev);
>  }
>  EXPORT_SYMBOL_GPL(platform_device_register);
> diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
> index 9bc36b589827..a2abde2aef25 100644
> --- a/include/linux/platform_device.h
> +++ b/include/linux/platform_device.h
> @@ -24,6 +24,7 @@ struct platform_device {
>   int id;
>   boolid_auto;
>   struct device   dev;
> + u64 dma_mask;

Why is the dma_mask in 'struct device' which is part of this structure,
not sufficient here?  Shouldn't the "platform" be setting that up
correctly already in the "archdata" type callback?

confused,

greg k-h


ppc64le kernel panic on 5.2.9-rc1

2019-08-15 Thread Major Hayden
Hello there,

The CKI Project just found a kernel panic while running the blktests test suite 
on stable 5.2.9-rc1[0]. Michael Ellerman requested that this list be copied
on these ppc64le failures.

We have some logs[1] for these failures and they start with 
"ppc64le_host_2_Storage_blktests*". We hope this helps!

[0] 
https://lore.kernel.org/stable/255f9af4-6087-7f56-5860-5aa0397a7...@redhat.com/T/#t
[1] https://artifacts.cki-project.org/pipelines/100875/logs/

--
Major Hayden


Re: [PATCH 05/10] PCI: layerscape: Modify the way of getting capability with different PEX

2019-08-15 Thread Andrew Murray
On Thu, Aug 15, 2019 at 04:37:11PM +0800, Xiaowei Bao wrote:
> The different PCIe controllers in one board may have different
> MSI or MSI-X capabilities, so change the way of getting the MSI
> capability to make it more flexible.
> 
> Signed-off-by: Xiaowei Bao 
> ---
>  drivers/pci/controller/dwc/pci-layerscape-ep.c | 28 
> +++---
>  1 file changed, 21 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
> b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> index be61d96..9404ca0 100644
> --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> @@ -22,6 +22,7 @@
>  
>  struct ls_pcie_ep {
>   struct dw_pcie  *pci;
> + struct pci_epc_features *ls_epc;
>  };
>  
>  #define to_ls_pcie_ep(x) dev_get_drvdata((x)->dev)
> @@ -40,25 +41,26 @@ static const struct of_device_id ls_pcie_ep_of_match[] = {
>   { },
>  };
>  
> -static const struct pci_epc_features ls_pcie_epc_features = {
> - .linkup_notifier = false,
> - .msi_capable = true,
> - .msix_capable = false,
> -};
> -
>  static const struct pci_epc_features*
>  ls_pcie_ep_get_features(struct dw_pcie_ep *ep)
>  {
> - return &ls_pcie_epc_features;
> + struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> + struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
> +
> + return pcie->ls_epc;
>  }
>  
>  static void ls_pcie_ep_init(struct dw_pcie_ep *ep)
>  {
>   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> + struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
>   enum pci_barno bar;
>  
>   for (bar = BAR_0; bar <= BAR_5; bar++)
>   dw_pcie_ep_reset_bar(pci, bar);
> +
> + pcie->ls_epc->msi_capable = ep->msi_cap ? true : false;
> + pcie->ls_epc->msix_capable = ep->msix_cap ? true : false;
>  }
>  
>  static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
> @@ -118,6 +120,7 @@ static int __init ls_pcie_ep_probe(struct platform_device 
> *pdev)
>   struct device *dev = &pdev->dev;
>   struct dw_pcie *pci;
>   struct ls_pcie_ep *pcie;
> + struct pci_epc_features *ls_epc;
>   struct resource *dbi_base;
>   int ret;
>  
> @@ -129,6 +132,10 @@ static int __init ls_pcie_ep_probe(struct 
> platform_device *pdev)
>   if (!pci)
>   return -ENOMEM;
>  
> + ls_epc = devm_kzalloc(dev, sizeof(*ls_epc), GFP_KERNEL);
> + if (!ls_epc)
> + return -ENOMEM;
> +
>   dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs");
>   pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_base);
>   if (IS_ERR(pci->dbi_base))
> @@ -139,6 +146,13 @@ static int __init ls_pcie_ep_probe(struct 
> platform_device *pdev)
>   pci->ops = &ls_pcie_ep_ops;
>   pcie->pci = pci;
>  
> + ls_epc->linkup_notifier = false,
> + ls_epc->msi_capable = true,
> + ls_epc->msix_capable = true,

As [msi,msix]_capable is shortly set from ls_pcie_ep_init - is there any
reason to set them here (to potentially incorrect values)?

Thanks,

Andrew Murray

> + ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4),
> +
> + pcie->ls_epc = ls_epc;
> +
>   platform_set_drvdata(pdev, pcie);
>  
>   ret = ls_add_pcie_ep(pcie, pdev);
> -- 
> 2.9.5
> 


Re: [PATCH 02/10] PCI: designware-ep: Add the doorbell mode of MSI-X in EP mode

2019-08-15 Thread Andrew Murray
On Thu, Aug 15, 2019 at 04:37:08PM +0800, Xiaowei Bao wrote:
> Add the doorbell mode of MSI-X in EP mode.
> 
> Signed-off-by: Xiaowei Bao 
> ---
>  drivers/pci/controller/dwc/pcie-designware-ep.c | 14 ++
>  drivers/pci/controller/dwc/pcie-designware.h| 14 ++
>  2 files changed, 28 insertions(+)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> b/drivers/pci/controller/dwc/pcie-designware-ep.c
> index 75e2955..e3a7cdf 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> @@ -454,6 +454,20 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 
> func_no,
>   return 0;
>  }
>  
> +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no,
> +u16 interrupt_num)
> +{
> + struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> + u32 msg_data;
> +
> + msg_data = (func_no << PCIE_MSIX_DOORBELL_PF_SHIFT) |
> +(interrupt_num - 1);
> +
> + dw_pcie_writel_dbi(pci, PCIE_MSIX_DOORBELL, msg_data);
> +
> + return 0;
> +}
> +
>  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> u16 interrupt_num)

Have I understood correctly that the hardware provides an alternative mechanism
that allows for raising MSI-X interrupts without the bother of reading the
capabilities registers?

If so is there any good reason to keep dw_pcie_ep_raise_msix_irq? (And thus use
it in dw_plat_pcie_ep_raise_irq also)?


>  {
> diff --git a/drivers/pci/controller/dwc/pcie-designware.h 
> b/drivers/pci/controller/dwc/pcie-designware.h
> index 2b291e8..cd903e9 100644
> --- a/drivers/pci/controller/dwc/pcie-designware.h
> +++ b/drivers/pci/controller/dwc/pcie-designware.h
> @@ -88,6 +88,11 @@
>  #define PCIE_MISC_CONTROL_1_OFF  0x8BC
>  #define PCIE_DBI_RO_WR_ENBIT(0)
>  
> +#define PCIE_MSIX_DOORBELL   0x948
> +#define PCIE_MSIX_DOORBELL_PF_SHIFT  24
> +#define PCIE_MSIX_DOORBELL_VF_SHIFT  16
> +#define PCIE_MSIX_DOORBELL_VF_ACTIVE BIT(15)

The _VF defines are not used, I'd suggest removing them.

Thanks,

Andrew Murray

> +
>  /*
>   * iATU Unroll-specific register definitions
>   * From 4.80 core version the address translation will be made by unroll
> @@ -399,6 +404,8 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 
> func_no,
>u8 interrupt_num);
>  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
>u16 interrupt_num);
> +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no,
> +u16 interrupt_num);
>  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar);
>  #else
>  static inline void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
> @@ -431,6 +438,13 @@ static inline int dw_pcie_ep_raise_msix_irq(struct 
> dw_pcie_ep *ep, u8 func_no,
>   return 0;
>  }
>  
> +static inline int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep,
> +  u8 func_no,
> +  u16 interrupt_num)
> +{
> + return 0;
> +}
> +
>  static inline void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno 
> bar)
>  {
>  }
> -- 
> 2.9.5
> 
> 
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel


Re: [PATCH v1 4/4] mmc: sdhci-of-esdhc: add erratum A011334 support in ls1028a 1.0 SoC

2019-08-15 Thread Adrian Hunter
On 14/08/19 10:26 AM, Yinbo Zhu wrote:
> This patch adds erratum A011334 support to the ls1028a 1.0 SoC
> 
> 
> Signed-off-by: Yinbo Zhu 

Acked-by: Adrian Hunter 

> ---
>  drivers/mmc/host/sdhci-of-esdhc.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/mmc/host/sdhci-of-esdhc.c 
> b/drivers/mmc/host/sdhci-of-esdhc.c
> index b16f7d440f78..eb2b290447fc 100644
> --- a/drivers/mmc/host/sdhci-of-esdhc.c
> +++ b/drivers/mmc/host/sdhci-of-esdhc.c
> @@ -1006,6 +1006,7 @@ static struct soc_device_attribute 
> soc_incorrect_hostver[] = {
>  static struct soc_device_attribute soc_fixup_sdhc_clkdivs[] = {
>   { .family = "QorIQ LX2160A", .revision = "1.0", },
>   { .family = "QorIQ LX2160A", .revision = "2.0", },
> + { .family = "QorIQ LS1028A", .revision = "1.0", },
>   { },
>  };
>  
> 



[PATCH 2/2] selftests/powerpc: Ignore generated files

2019-08-15 Thread Gustavo Romero
Currently some binary files which are generated when tests are compiled
are not ignored by git, so 'git status' catches them.

For copyloops test, fix wrong binary names already in .gitignore. For
ptrace, security, and stringloops tests add missing binary names to the
.gitignore file.

Signed-off-by: Gustavo Romero 
---
 tools/testing/selftests/powerpc/copyloops/.gitignore   | 8 
 tools/testing/selftests/powerpc/ptrace/.gitignore  | 3 +++
 tools/testing/selftests/powerpc/security/.gitignore| 1 +
 tools/testing/selftests/powerpc/stringloops/.gitignore | 5 -
 4 files changed, 12 insertions(+), 5 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/security/.gitignore

diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore 
b/tools/testing/selftests/powerpc/copyloops/.gitignore
index ce12cd0e2967..de158104912a 100644
--- a/tools/testing/selftests/powerpc/copyloops/.gitignore
+++ b/tools/testing/selftests/powerpc/copyloops/.gitignore
@@ -1,13 +1,13 @@
 copyuser_64_t0
 copyuser_64_t1
 copyuser_64_t2
-copyuser_power7_t0
-copyuser_power7_t1
+copyuser_p7_t0
+copyuser_p7_t1
 memcpy_64_t0
 memcpy_64_t1
 memcpy_64_t2
-memcpy_power7_t0
-memcpy_power7_t1
+memcpy_p7_t0
+memcpy_p7_t1
 copyuser_64_exc_t0
 copyuser_64_exc_t1
 copyuser_64_exc_t2
diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore 
b/tools/testing/selftests/powerpc/ptrace/.gitignore
index 07ec449a2767..dce19f221c46 100644
--- a/tools/testing/selftests/powerpc/ptrace/.gitignore
+++ b/tools/testing/selftests/powerpc/ptrace/.gitignore
@@ -10,3 +10,6 @@ ptrace-tm-spd-vsx
 ptrace-tm-spr
 ptrace-hwbreak
 perf-hwbreak
+core-pkey
+ptrace-pkey
+ptrace-syscall
diff --git a/tools/testing/selftests/powerpc/security/.gitignore 
b/tools/testing/selftests/powerpc/security/.gitignore
new file mode 100644
index ..0b969fba3beb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/.gitignore
@@ -0,0 +1 @@
+rfi_flush
diff --git a/tools/testing/selftests/powerpc/stringloops/.gitignore 
b/tools/testing/selftests/powerpc/stringloops/.gitignore
index 0b43da74ee46..31a17e0ba884 100644
--- a/tools/testing/selftests/powerpc/stringloops/.gitignore
+++ b/tools/testing/selftests/powerpc/stringloops/.gitignore
@@ -1 +1,4 @@
-memcmp
+memcmp_64
+memcmp_32
+strlen
+strlen_32
-- 
2.17.1



[PATCH 1/2] powerpc: Document xmon options

2019-08-15 Thread Gustavo Romero
Document all options currently supported by the xmon debugger.

Signed-off-by: Gustavo Romero 
---
 .../admin-guide/kernel-parameters.txt | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 7ccd158b3894..6d495aab4d0b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5305,3 +5305,22 @@
A hex value specifying bitmask with supplemental xhci
host controller quirks. Meaning of each bit can be
consulted in header drivers/usb/host/xhci.h.
+
+   xmon[PPC]
+   Format: { early | on | rw | ro | off }
+   Controls if xmon debugger is enabled. Default is off.
+   Passing only "xmon" is equivalent to "xmon=early".
+   early   Call xmon as early as possible on boot; xmon
+   debugger is called from setup_arch().
+   on  xmon debugger hooks will be installed so xmon
+   is only called on a kernel crash. Default mode,
+   i.e. either "ro" or "rw" mode, is controlled
+   with CONFIG_XMON_DEFAULT_RO_MODE.
+   rw  xmon debugger hooks will be installed so xmon
+   is called only on a kernel crash, mode is write,
+   meaning SPR registers, memory, and other data
+   can be written using xmon commands.
+   ro  same as "rw" option above but SPR registers,
+   memory, and other data can't be written using
+   xmon commands.
+   off xmon is disabled.
-- 
2.17.1



[PATCH] powerpc/selftests: Fix and enhance TM signal context tests

2019-08-15 Thread Gustavo Romero
Currently TM signal context tests for GPR, FPR, VMX, and VSX registers
print wrong register numbers (wrongly starting from register 0 instead of
the first register in the non-volatile subset). Besides that, the output
when a mismatch happens is poor, giving little information about which
context and which register mismatches: it prints both contexts at the same
time rather than comparing the mismatching value with the expected value,
and it stops printing on the first mismatch, although it's important to
know if there are further mismatches beyond the first one.

For instance, this is the current output when a mismatch happens:

test: tm_signal_context_chk_gpr
tags: git_version:v5.2-8249-g02e970fae465-dirty
Failed on 0 GPR 1 or 18446744073709551615
failure: tm_signal_context_chk_gpr

test: tm_signal_context_chk_fpu
tags: git_version:v5.2-8248-g09c289e3ef80
Failed on 0 FP -1 or -1
failure: tm_signal_context_chk_fpu

test: tm_signal_context_chk_vmx
tags: git_version:v5.2-8248-g09c289e3ef80
Failed on 0 vmx 0xfffefffdfffc vs 
0xfffefffdfffc
failure: tm_signal_context_chk_vmx

test: tm_signal_context_chk_vsx
tags: git_version:v5.2-8248-g09c289e3ef80
Failed on 0 vsx 0xfefffdfffcff vs 
0xfefffdfffcff
failure: tm_signal_context_chk_vsx

This commit fixes the register numbers printed and enhances the error
output by providing a full list of mismatching registers separated by the
context (non-speculative or speculative context), for example:

test: tm_signal_context_chk_gpr
tags: git_version:v5.2-8249-g02e970fae465-dirty
GPR14 (1st context) == 1 instead of -1 (expected)
GPR15 (1st context) == 2 instead of -2 (expected)
GPR14 (2nd context) == 0 instead of 18446744073709551615 (expected)
GPR15 (2nd context) == 0 instead of 18446744073709551614 (expected)
failure: tm_signal_context_chk_gpr

test: tm_signal_context_chk_fpu
tags: git_version:v5.2-8249-g02e970fae465-dirty
FPR14 (1st context) == -1 instead of 1 (expected)
FPR15 (1st context) == -2 instead of 2 (expected)
failure: tm_signal_context_chk_fpu

test: tm_signal_context_chk_vmx
tags: git_version:v5.2-8249-g02e970fae465-dirty
VMX20 (1st context) == 0xfffefffdfffc instead of 
0x0001000200030004 (expected)
VMX21 (1st context) == 0xfffbfffafff9fff8 instead of 
0x0005000600070008 (expected)
failure: tm_signal_context_chk_vmx

test: tm_signal_context_chk_vsx
tags: git_version:v5.2-8249-g02e970fae465-dirty
VSX20 (1st context) == 0xfefffdfffcff instead of 
0x0001000200030004 (expected)
VSX21 (1st context) == 0xfbfffafff9fff8ff instead of 
0x0005000600070008 (expected)
failure: tm_signal_context_chk_vsx

Finally, this commit adds comments to the tests in the hope that it will
help people not so familiar with TM understand the tests.

Signed-off-by: Gustavo Romero 
---
 .../powerpc/tm/tm-signal-context-chk-fpu.c|  49 +--
 .../powerpc/tm/tm-signal-context-chk-gpr.c|  59 +---
 .../powerpc/tm/tm-signal-context-chk-vmx.c|  74 +++---
 .../powerpc/tm/tm-signal-context-chk-vsx.c| 130 +-
 4 files changed, 228 insertions(+), 84 deletions(-)

diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c 
b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
index d57c2d2ab6ec..254f912ad611 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
@@ -5,10 +5,11 @@
  * Test the kernel's signal frame code.
  *
  * The kernel sets up two sets of ucontexts if the signal was to be
- * delivered while the thread was in a transaction.
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
  * Expected behaviour is that the checkpointed state is in the user
- * context passed to the signal handler. The speculated state can be
- * accessed with the uc_link pointer.
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
  *
  * The rationale for this is that if TM unaware code (which linked
  * against TM libs) installs a signal handler it will not know of the
@@ -28,17 +29,20 @@
 
 #define MAX_ATTEMPT 50
 
-#define NV_FPU_REGS 18
+#define NV_FPU_REGS 18 /* Number of non-volatile FP registers */
+#define FPR14 14 /* First non-volatile FP register to check in f14-31 subset */
 
 long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector 
int *vms, vector int *vss);
 
-/* Be sure there are 2x as many as there are NV FPU regs (2x18) */
+/* Test only non-volatile registers, i.e. 18 fpr registers from f14 to f31 */
 static double fps[] = {
+   /* First context will be set with these values, i.e. non-speculative */
 1, 2, 3, 4, 5, 6, 7, 8, 

Re: [PATCH 01/10] PCI: designware-ep: Add multiple PFs support for DWC

2019-08-15 Thread Andrew Murray
On Thu, Aug 15, 2019 at 04:37:07PM +0800, Xiaowei Bao wrote:
> Add multiple PFs support for DWC. Different PFs have different config
> spaces; we use the pf-offset property from the DTS to access each PF's
> config space.

Thanks for the patch. I haven't seen a cover letter for this series, is there
one missing?


> 
> Signed-off-by: Xiaowei Bao 
> ---
>  drivers/pci/controller/dwc/pcie-designware-ep.c |  97 +-
>  drivers/pci/controller/dwc/pcie-designware.c| 105 
> ++--
>  drivers/pci/controller/dwc/pcie-designware.h|  10 ++-
>  include/linux/pci-epc.h |   1 +
>  4 files changed, 164 insertions(+), 49 deletions(-)
> 
> diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
> b/drivers/pci/controller/dwc/pcie-designware-ep.c
> index 2bf5a35..75e2955 100644
> --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> @@ -19,12 +19,14 @@ void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
>   pci_epc_linkup(epc);
>  }
>  
> -static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar,
> -int flags)
> +static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, u8 func_no,
> +enum pci_barno bar, int flags)
>  {
>   u32 reg;
> + struct pci_epc *epc = pci->ep.epc;
> + u32 pf_base = func_no * epc->pf_offset;
>  
> - reg = PCI_BASE_ADDRESS_0 + (4 * bar);
> + reg = pf_base + PCI_BASE_ADDRESS_0 + (4 * bar);

I think I'd rather see this arithmetic (and the one for determining pf_base)
inside a macro or inline header function. This would make this code more
readable and reduce the chances of an error by avoiding duplication of code.

For example look at cdns_pcie_ep_fn_writeb and ROCKCHIP_PCIE_EP_FUNC_BASE for
examples of other EP drivers that do this.
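
e.g. something like this (hypothetical helper, name made up for
illustration):

	static inline u32 dw_pcie_ep_func_base(struct dw_pcie_ep *ep, u8 func_no)
	{
		return func_no * ep->epc->pf_offset;
	}

so the call sites become

	dw_pcie_writew_dbi(pci, dw_pcie_ep_func_base(ep, func_no) +
			   PCI_VENDOR_ID, hdr->vendorid);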


>   dw_pcie_dbi_ro_wr_en(pci);
>   dw_pcie_writel_dbi2(pci, reg, 0x0);
>   dw_pcie_writel_dbi(pci, reg, 0x0);
> @@ -37,7 +39,12 @@ static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, 
> enum pci_barno bar,
>  
>  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar)
>  {
> - __dw_pcie_ep_reset_bar(pci, bar, 0);
> + u8 func_no, funcs;
> +
> + funcs = pci->ep.epc->max_functions;
> +
> + for (func_no = 0; func_no < funcs; func_no++)
> + __dw_pcie_ep_reset_bar(pci, func_no, bar, 0);
>  }
>  
>  static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
> @@ -78,28 +85,29 @@ static int dw_pcie_ep_write_header(struct pci_epc *epc, 
> u8 func_no,
>  {
>   struct dw_pcie_ep *ep = epc_get_drvdata(epc);
>   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> + u32 pf_base = func_no * epc->pf_offset;
>  
>   dw_pcie_dbi_ro_wr_en(pci);
> - dw_pcie_writew_dbi(pci, PCI_VENDOR_ID, hdr->vendorid);
> - dw_pcie_writew_dbi(pci, PCI_DEVICE_ID, hdr->deviceid);
> - dw_pcie_writeb_dbi(pci, PCI_REVISION_ID, hdr->revid);
> - dw_pcie_writeb_dbi(pci, PCI_CLASS_PROG, hdr->progif_code);
> - dw_pcie_writew_dbi(pci, PCI_CLASS_DEVICE,
> + dw_pcie_writew_dbi(pci, pf_base + PCI_VENDOR_ID, hdr->vendorid);
> + dw_pcie_writew_dbi(pci, pf_base + PCI_DEVICE_ID, hdr->deviceid);
> + dw_pcie_writeb_dbi(pci, pf_base + PCI_REVISION_ID, hdr->revid);
> + dw_pcie_writeb_dbi(pci, pf_base + PCI_CLASS_PROG, hdr->progif_code);
> + dw_pcie_writew_dbi(pci, pf_base + PCI_CLASS_DEVICE,
>  hdr->subclass_code | hdr->baseclass_code << 8);
> - dw_pcie_writeb_dbi(pci, PCI_CACHE_LINE_SIZE,
> + dw_pcie_writeb_dbi(pci, pf_base + PCI_CACHE_LINE_SIZE,
>  hdr->cache_line_size);
> - dw_pcie_writew_dbi(pci, PCI_SUBSYSTEM_VENDOR_ID,
> + dw_pcie_writew_dbi(pci, pf_base + PCI_SUBSYSTEM_VENDOR_ID,
>  hdr->subsys_vendor_id);
> - dw_pcie_writew_dbi(pci, PCI_SUBSYSTEM_ID, hdr->subsys_id);
> - dw_pcie_writeb_dbi(pci, PCI_INTERRUPT_PIN,
> + dw_pcie_writew_dbi(pci, pf_base + PCI_SUBSYSTEM_ID, hdr->subsys_id);
> + dw_pcie_writeb_dbi(pci, pf_base + PCI_INTERRUPT_PIN,
>  hdr->interrupt_pin);
>   dw_pcie_dbi_ro_wr_dis(pci);
>  
>   return 0;
>  }
>  
> -static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, enum pci_barno bar,
> -   dma_addr_t cpu_addr,
> +static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, u8 func_no,
> +   enum pci_barno bar, dma_addr_t cpu_addr,
> enum dw_pcie_as_type as_type)
>  {
>   int ret;
> @@ -112,7 +120,7 @@ static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, 
> enum pci_barno bar,
>   return -EINVAL;
>   }
>  
> - ret = dw_pcie_prog_inbound_atu(pci, free_win, bar, cpu_addr,
> + ret = dw_pcie_prog_inbound_atu(pci, func_no, free_win, bar, cpu_addr,
>  as_type);
>   if 

Re: [PATCH v4 0/3] kasan: support backing vmalloc space with real shadow memory

2019-08-15 Thread Mark Rutland
On Thu, Aug 15, 2019 at 10:16:33AM +1000, Daniel Axtens wrote:
> Currently, vmalloc space is backed by the early shadow page. This
> means that kasan is incompatible with VMAP_STACK, and it also provides
> a hurdle for architectures that do not have a dedicated module space
> (like powerpc64).
> 
> This series provides a mechanism to back vmalloc space with real,
> dynamically allocated memory. I have only wired up x86, because that's
> the only currently supported arch I can work with easily, but it's
> very easy to wire up other architectures.

I'm happy to send patches for arm64 once we've settled some conflicting
rework going on for 52-bit VA support.

> 
> This has been discussed before in the context of VMAP_STACK:
>  - https://bugzilla.kernel.org/show_bug.cgi?id=202009
>  - https://lkml.org/lkml/2018/7/22/198
>  - https://lkml.org/lkml/2019/7/19/822
> 
> In terms of implementation details:
> 
> Most mappings in vmalloc space are small, requiring less than a full
> page of shadow space. Allocating a full shadow page per mapping would
> therefore be wasteful. Furthermore, to ensure that different mappings
> use different shadow pages, mappings would have to be aligned to
> KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE.
> 
> Instead, share backing space across multiple mappings. Allocate
> a backing page the first time a mapping in vmalloc space uses a
> particular page of the shadow region. Keep this page around
> regardless of whether the mapping is later freed - in the mean time
> the page could have become shared by another vmalloc mapping.
> 
> This can in theory lead to unbounded memory growth, but the vmalloc
> allocator is pretty good at reusing addresses, so the practical memory
> usage appears to grow at first but then stay fairly stable.
> 
> If we run into practical memory exhaustion issues, I'm happy to
> consider hooking into the book-keeping that vmap does, but I am not
> convinced that it will be an issue.

FWIW, I haven't spotted such memory exhaustion after a week of Syzkaller
fuzzing with the last patchset, across 3 machines, so that sounds fine
to me.

Otherwise, this looks good to me now! For the x86 and fork patch, feel
free to add:

Acked-by: Mark Rutland 

Mark.
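
For reference, the mem-to-shadow translation the sharing scheme above
relies on (as in include/linux/kasan.h); with KASAN_SHADOW_SCALE_SHIFT == 3
one shadow page covers 8 * PAGE_SIZE of vmalloc space:

	static inline void *kasan_mem_to_shadow(const void *addr)
	{
		return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET;
	}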

> 
> v1: https://lore.kernel.org/linux-mm/20190725055503.19507-1-...@axtens.net/
> v2: https://lore.kernel.org/linux-mm/20190729142108.23343-1-...@axtens.net/
>  Address review comments:
>  - Patch 1: use kasan_unpoison_shadow's built-in handling of
> ranges that do not align to a full shadow byte
>  - Patch 3: prepopulate pgds rather than faulting things in
> v3: https://lore.kernel.org/linux-mm/20190731071550.31814-1-...@axtens.net/
>  Address comments from Mark Rutland:
>  - kasan_populate_vmalloc is a better name
>  - handle concurrency correctly
>  - various nits and cleanups
>  - relax module alignment in KASAN_VMALLOC case
> v4: Changes to patch 1 only:
>  - Integrate Mark's rework, thanks Mark!
>  - handle the case where kasan_populate_shadow might fail
>  - poison shadow on free, allowing the alloc path to just
>  unpoison memory that it uses
> 
> Daniel Axtens (3):
>   kasan: support backing vmalloc space with real shadow memory
>   fork: support VMAP_STACK with KASAN_VMALLOC
>   x86/kasan: support KASAN_VMALLOC
> 
>  Documentation/dev-tools/kasan.rst | 60 +++
>  arch/Kconfig  |  9 +++--
>  arch/x86/Kconfig  |  1 +
>  arch/x86/mm/kasan_init_64.c   | 61 
>  include/linux/kasan.h | 24 +++
>  include/linux/moduleloader.h  |  2 +-
>  include/linux/vmalloc.h   | 12 ++
>  kernel/fork.c |  4 ++
>  lib/Kconfig.kasan | 16 
>  lib/test_kasan.c  | 26 
>  mm/kasan/common.c | 67 +++
>  mm/kasan/generic_report.c |  3 ++
>  mm/kasan/kasan.h  |  1 +
>  mm/vmalloc.c  | 28 -
>  14 files changed, 308 insertions(+), 6 deletions(-)
> 
> -- 
> 2.20.1
> 


Re: [PATCH 7/8] parisc: don't set ARCH_NO_COHERENT_DMA_MMAP

2019-08-15 Thread Christoph Hellwig
On Thu, Aug 15, 2019 at 10:25:52AM +0100, James Bottomley wrote:
> >  which means exporting normally cachable memory to userspace is
> > relatively dangerous due to cache aliasing.
> > 
> > But normally cachable memory is only allocated by dma_alloc_coherent
> > on parisc when using the sba_iommu or ccio_iommu drivers, so just
> > remove the .mmap implementation for them so that we don't have to set
> > ARCH_NO_COHERENT_DMA_MMAP, which I plan to get rid of.
> 
> So I don't think this is quite right.  We have three architectural
> variants essentially (hidden behind about 12 cpu types):
> 
>1. pa70xx: These can't turn off page caching, so they were the non
>   coherent problem case
>2. pa71xx: These can manufacture coherent memory simply by turning off
>   the cache on a per page basis
>3. pa8xxx: these have a full cache flush coherence mechanism.
> 
> (I might have this slightly wrong: I vaguely remember the pa71xxlc
> variants have some weird cache quirks for DMA as well)
> 
> So I think pa70xx we can't mmap.  pa71xx we can provided we mark the
> page as uncached ... which should already have happened in the allocate
> and pa8xxx which can always mmap dma memory without any special tricks.

Except for the different naming scheme vs the code this matches my
assumptions.

In the code we have three cases (and a fourth EISA case mention in
comments, but not actually implemented as far as I can tell):

arch/parisc/kernel/pci-dma.c says in the top of file comments:

** AFAIK, all PA7100LC and PA7300LC platforms can use this code.

and then handles two different cases: for cpu_type == pcxl or pcxl2
it maps the memory as uncached for dma_alloc_coherent, and for all
other cpu types it fails the coherent allocations.

In addition to that there are the ccio and sba iommu drivers, of which
according to your above comment one is always present for pa8xxx.

Which brings us back to this patch, which ensures that no cacheable
memory is exported to userspace by removing ->mmap from ccio and sba.
It then enables dma_mmap_coherent for the pcxl or pcxl2 case that
allocates uncached memory. For the !pcxl && !pcxl2 case dma_mmap_coherent
cannot work anyway, because dma_alloc_coherent already failed and thus
there is no memory to mmap.

So if the description is too confusing please suggest a better
one, I'm a little lost between all these code names and product
names (arch/parisc/include/asm/dma-mapping.h uses yet another set).
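
The pcxl/pcxl2 special case described above boils down to a check of this
shape (sketch only, helper name made up):

	static bool parisc_can_alloc_uncached(void)
	{
		/* Only PA7100LC ("pcxl") and PA7300LC ("pcxl2") can turn off
		 * caching per page, so only they can back dma_alloc_coherent()
		 * with normal RAM. */
		return boot_cpu_data.cpu_type == pcxl ||
		       boot_cpu_data.cpu_type == pcxl2;
	}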


[PATCH 10/10] misc: pci_endpoint_test: Add LS1088a in pci_device_id table

2019-08-15 Thread Xiaowei Bao
Add LS1088a to the pci_device_id table so that pci-epf-test can be used
for testing PCIe EP in LS1088a.

Signed-off-by: Xiaowei Bao 
---
 drivers/misc/pci_endpoint_test.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 6e208a0..d531951 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -793,6 +793,7 @@ static const struct pci_device_id pci_endpoint_test_tbl[] = 
{
{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x81c0) },
+   { PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x80c0) },
{ PCI_DEVICE_DATA(SYNOPSYS, EDDA, NULL) },
{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_AM654),
  .driver_data = (kernel_ulong_t)&am654_data
-- 
2.9.5



[PATCH 09/10] arm64: dts: layerscape: Add PCIe EP node for ls1088a

2019-08-15 Thread Xiaowei Bao
Add PCIe EP node for ls1088a to support EP mode.

Signed-off-by: Xiaowei Bao 
---
 arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi | 32 ++
 1 file changed, 32 insertions(+)

diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi 
b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
index dfbead4..434a76c 100644
--- a/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
+++ b/arch/arm64/boot/dts/freescale/fsl-ls1088a.dtsi
@@ -471,6 +471,18 @@
status = "disabled";
};
 
+   pcie_ep@340 {
+   compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
+   reg = <0x00 0x0340 0x0 0x0010
+  0x20 0x 0x8 0x>;
+   reg-names = "regs", "addr_space";
+   num-ib-windows = <24>;
+   num-ob-windows = <128>;
+   max-functions = /bits/ 8 <2>;
+   pf-offset = <0x2>;
+   status = "disabled";
+   };
+
pcie@350 {
compatible = "fsl,ls1088a-pcie";
reg = <0x00 0x0350 0x0 0x0010   /* controller 
registers */
@@ -497,6 +509,16 @@
status = "disabled";
};
 
+   pcie_ep@350 {
+   compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
+   reg = <0x00 0x0350 0x0 0x0010
+  0x28 0x 0x8 0x>;
+   reg-names = "regs", "addr_space";
+   num-ib-windows = <6>;
+   num-ob-windows = <8>;
+   status = "disabled";
+   };
+
pcie@360 {
compatible = "fsl,ls1088a-pcie";
reg = <0x00 0x0360 0x0 0x0010   /* controller 
registers */
@@ -523,6 +545,16 @@
status = "disabled";
};
 
+   pcie_ep@360 {
+   compatible = "fsl,ls1088a-pcie-ep","fsl,ls-pcie-ep";
+   reg = <0x00 0x0360 0x0 0x0010
+  0x30 0x 0x8 0x>;
+   reg-names = "regs", "addr_space";
+   num-ib-windows = <6>;
+   num-ob-windows = <8>;
+   status = "disabled";
+   };
+
smmu: iommu@500 {
compatible = "arm,mmu-500";
reg = <0 0x500 0 0x80>;
-- 
2.9.5



[PATCH 08/10] dt-bindings: PCI: Add the pf-offset property

2019-08-15 Thread Xiaowei Bao
Add the pf-offset property for multiple PFs.

Signed-off-by: Xiaowei Bao 
---
 Documentation/devicetree/bindings/pci/designware-pcie.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt 
b/Documentation/devicetree/bindings/pci/designware-pcie.txt
index 5561a1c..d658687 100644
--- a/Documentation/devicetree/bindings/pci/designware-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt
@@ -43,6 +43,7 @@ RC mode:
 
 EP mode:
 - max-functions: maximum number of functions that can be configured
+- pf-offset: the offset of each PF's config space
 
 Example configuration:
 
-- 
2.9.5



[PATCH 07/10] PCI: layerscape: Fix some format issue of the code

2019-08-15 Thread Xiaowei Bao
Fix some formatting issues in the EP driver code.

Signed-off-by: Xiaowei Bao 
---
 drivers/pci/controller/dwc/pci-layerscape-ep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index a0cd5ff..2ada445 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -64,7 +64,7 @@ static void ls_pcie_ep_init(struct dw_pcie_ep *ep)
 }
 
 static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
- enum pci_epc_irq_type type, u16 interrupt_num)
+   enum pci_epc_irq_type type, u16 interrupt_num)
 {
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
@@ -89,7 +89,7 @@ static const struct dw_pcie_ep_ops pcie_ep_ops = {
 };
 
 static int __init ls_add_pcie_ep(struct ls_pcie_ep *pcie,
-   struct platform_device *pdev)
+struct platform_device *pdev)
 {
struct dw_pcie *pci = pcie->pci;
struct device *dev = pci->dev;
-- 
2.9.5



[PATCH 06/10] PCI: layerscape: Modify the MSIX to the doorbell way

2019-08-15 Thread Xiaowei Bao
The layerscape platform uses the doorbell mode to trigger MSI-X
interrupts in EP mode.

Signed-off-by: Xiaowei Bao 
---
 drivers/pci/controller/dwc/pci-layerscape-ep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index 9404ca0..a0cd5ff 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -74,7 +74,8 @@ static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 
func_no,
case PCI_EPC_IRQ_MSI:
return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
case PCI_EPC_IRQ_MSIX:
-   return dw_pcie_ep_raise_msix_irq(ep, func_no, interrupt_num);
+   return dw_pcie_ep_raise_msix_irq_doorbell(ep, func_no,
+ interrupt_num);
default:
dev_err(pci->dev, "UNKNOWN IRQ type\n");
return -EINVAL;
-- 
2.9.5



[PATCH 05/10] PCI: layerscape: Modify the way of getting capability with different PEX

2019-08-15 Thread Xiaowei Bao
The different PCIe controllers in one board may have different
MSI or MSI-X capabilities, so change the way of getting the MSI
capability to make it more flexible.

Signed-off-by: Xiaowei Bao 
---
 drivers/pci/controller/dwc/pci-layerscape-ep.c | 28 +++---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c 
b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index be61d96..9404ca0 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -22,6 +22,7 @@
 
 struct ls_pcie_ep {
struct dw_pcie  *pci;
+   struct pci_epc_features *ls_epc;
 };
 
 #define to_ls_pcie_ep(x)   dev_get_drvdata((x)->dev)
@@ -40,25 +41,26 @@ static const struct of_device_id ls_pcie_ep_of_match[] = {
{ },
 };
 
-static const struct pci_epc_features ls_pcie_epc_features = {
-   .linkup_notifier = false,
-   .msi_capable = true,
-   .msix_capable = false,
-};
-
 static const struct pci_epc_features*
 ls_pcie_ep_get_features(struct dw_pcie_ep *ep)
 {
-   return &ls_pcie_epc_features;
+   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
+
+   return pcie->ls_epc;
 }
 
 static void ls_pcie_ep_init(struct dw_pcie_ep *ep)
 {
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
enum pci_barno bar;
 
for (bar = BAR_0; bar <= BAR_5; bar++)
dw_pcie_ep_reset_bar(pci, bar);
+
+   pcie->ls_epc->msi_capable = ep->msi_cap ? true : false;
+   pcie->ls_epc->msix_capable = ep->msix_cap ? true : false;
 }
 
 static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
@@ -118,6 +120,7 @@ static int __init ls_pcie_ep_probe(struct platform_device 
*pdev)
struct device *dev = &pdev->dev;
struct dw_pcie *pci;
struct ls_pcie_ep *pcie;
+   struct pci_epc_features *ls_epc;
struct resource *dbi_base;
int ret;
 
@@ -129,6 +132,10 @@ static int __init ls_pcie_ep_probe(struct platform_device 
*pdev)
if (!pci)
return -ENOMEM;
 
+   ls_epc = devm_kzalloc(dev, sizeof(*ls_epc), GFP_KERNEL);
+   if (!ls_epc)
+   return -ENOMEM;
+
dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs");
pci->dbi_base = devm_pci_remap_cfg_resource(dev, dbi_base);
if (IS_ERR(pci->dbi_base))
@@ -139,6 +146,13 @@ static int __init ls_pcie_ep_probe(struct platform_device 
*pdev)
pci->ops = &ls_pcie_ep_ops;
pcie->pci = pci;
 
+   ls_epc->linkup_notifier = false,
+   ls_epc->msi_capable = true,
+   ls_epc->msix_capable = true,
+   ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4),
+
+   pcie->ls_epc = ls_epc;
+
platform_set_drvdata(pdev, pcie);
 
ret = ls_add_pcie_ep(pcie, pdev);
-- 
2.9.5



[PATCH 04/10] dt-bindings: pci: layerscape-pci: add compatible strings for ls1088a and ls2088a

2019-08-15 Thread Xiaowei Bao
Add compatible strings for ls1088a and ls2088a.

Signed-off-by: Xiaowei Bao 
---
 Documentation/devicetree/bindings/pci/layerscape-pci.txt | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/pci/layerscape-pci.txt 
b/Documentation/devicetree/bindings/pci/layerscape-pci.txt
index e20ceaa..16f592e 100644
--- a/Documentation/devicetree/bindings/pci/layerscape-pci.txt
+++ b/Documentation/devicetree/bindings/pci/layerscape-pci.txt
@@ -22,7 +22,10 @@ Required properties:
 "fsl,ls1043a-pcie"
 "fsl,ls1012a-pcie"
   EP mode:
-   "fsl,ls1046a-pcie-ep", "fsl,ls-pcie-ep"
+   "fsl,ls-pcie-ep"
+   "fsl,ls1046a-pcie-ep"
+   "fsl,ls1088a-pcie-ep"
+   "fsl,ls2088a-pcie-ep"
 - reg: base addresses and lengths of the PCIe controller register blocks.
 - interrupts: A list of interrupt outputs of the controller. Must contain an
   entry for each entry in the interrupt-names property.
-- 
2.9.5



[PATCH 03/10] PCI: designware-ep: Move the function of getting MSI capability forward

2019-08-15 Thread Xiaowei Bao
Move the code that gets the MSI capability to the front of the init
function, because the init function of the EP platform driver will use
the value it returns.

Signed-off-by: Xiaowei Bao 
---
 drivers/pci/controller/dwc/pcie-designware-ep.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
b/drivers/pci/controller/dwc/pcie-designware-ep.c
index e3a7cdf..0c27c7b 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -631,6 +631,10 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
if (ret < 0)
epc->pf_offset = 0;
 
+   ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI);
+
+   ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX);
+
if (ep->ops->ep_init)
ep->ops->ep_init(ep);
 
@@ -647,9 +651,6 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n");
return -ENOMEM;
}
-   ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI);
-
-   ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX);
 
offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR);
if (offset) {
-- 
2.9.5



[PATCH 01/10] PCI: designware-ep: Add multiple PFs support for DWC

2019-08-15 Thread Xiaowei Bao
Add multiple PFs support for DWC. Different PFs have different config
spaces; we use the pf-offset property from the DTS to access each PF's
config space.

Signed-off-by: Xiaowei Bao 
---
 drivers/pci/controller/dwc/pcie-designware-ep.c |  97 +-
 drivers/pci/controller/dwc/pcie-designware.c| 105 ++--
 drivers/pci/controller/dwc/pcie-designware.h|  10 ++-
 include/linux/pci-epc.h |   1 +
 4 files changed, 164 insertions(+), 49 deletions(-)

diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 2bf5a35..75e2955 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -19,12 +19,14 @@ void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
pci_epc_linkup(epc);
 }
 
-static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar,
-  int flags)
+static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, u8 func_no,
+  enum pci_barno bar, int flags)
 {
u32 reg;
+   struct pci_epc *epc = pci->ep.epc;
+   u32 pf_base = func_no * epc->pf_offset;
 
-   reg = PCI_BASE_ADDRESS_0 + (4 * bar);
+   reg = pf_base + PCI_BASE_ADDRESS_0 + (4 * bar);
dw_pcie_dbi_ro_wr_en(pci);
dw_pcie_writel_dbi2(pci, reg, 0x0);
dw_pcie_writel_dbi(pci, reg, 0x0);
@@ -37,7 +39,12 @@ static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum 
pci_barno bar,
 
 void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar)
 {
-   __dw_pcie_ep_reset_bar(pci, bar, 0);
+   u8 func_no, funcs;
+
+   funcs = pci->ep.epc->max_functions;
+
+   for (func_no = 0; func_no < funcs; func_no++)
+   __dw_pcie_ep_reset_bar(pci, func_no, bar, 0);
 }
 
 static u8 __dw_pcie_ep_find_next_cap(struct dw_pcie *pci, u8 cap_ptr,
@@ -78,28 +85,29 @@ static int dw_pcie_ep_write_header(struct pci_epc *epc, u8 
func_no,
 {
struct dw_pcie_ep *ep = epc_get_drvdata(epc);
struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   u32 pf_base = func_no * epc->pf_offset;
 
dw_pcie_dbi_ro_wr_en(pci);
-   dw_pcie_writew_dbi(pci, PCI_VENDOR_ID, hdr->vendorid);
-   dw_pcie_writew_dbi(pci, PCI_DEVICE_ID, hdr->deviceid);
-   dw_pcie_writeb_dbi(pci, PCI_REVISION_ID, hdr->revid);
-   dw_pcie_writeb_dbi(pci, PCI_CLASS_PROG, hdr->progif_code);
-   dw_pcie_writew_dbi(pci, PCI_CLASS_DEVICE,
+   dw_pcie_writew_dbi(pci, pf_base + PCI_VENDOR_ID, hdr->vendorid);
+   dw_pcie_writew_dbi(pci, pf_base + PCI_DEVICE_ID, hdr->deviceid);
+   dw_pcie_writeb_dbi(pci, pf_base + PCI_REVISION_ID, hdr->revid);
+   dw_pcie_writeb_dbi(pci, pf_base + PCI_CLASS_PROG, hdr->progif_code);
+   dw_pcie_writew_dbi(pci, pf_base + PCI_CLASS_DEVICE,
   hdr->subclass_code | hdr->baseclass_code << 8);
-   dw_pcie_writeb_dbi(pci, PCI_CACHE_LINE_SIZE,
+   dw_pcie_writeb_dbi(pci, pf_base + PCI_CACHE_LINE_SIZE,
   hdr->cache_line_size);
-   dw_pcie_writew_dbi(pci, PCI_SUBSYSTEM_VENDOR_ID,
+   dw_pcie_writew_dbi(pci, pf_base + PCI_SUBSYSTEM_VENDOR_ID,
   hdr->subsys_vendor_id);
-   dw_pcie_writew_dbi(pci, PCI_SUBSYSTEM_ID, hdr->subsys_id);
-   dw_pcie_writeb_dbi(pci, PCI_INTERRUPT_PIN,
+   dw_pcie_writew_dbi(pci, pf_base + PCI_SUBSYSTEM_ID, hdr->subsys_id);
+   dw_pcie_writeb_dbi(pci, pf_base + PCI_INTERRUPT_PIN,
   hdr->interrupt_pin);
dw_pcie_dbi_ro_wr_dis(pci);
 
return 0;
 }
 
-static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, enum pci_barno bar,
- dma_addr_t cpu_addr,
+static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, u8 func_no,
+ enum pci_barno bar, dma_addr_t cpu_addr,
  enum dw_pcie_as_type as_type)
 {
int ret;
@@ -112,7 +120,7 @@ static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, 
enum pci_barno bar,
return -EINVAL;
}
 
-   ret = dw_pcie_prog_inbound_atu(pci, free_win, bar, cpu_addr,
+   ret = dw_pcie_prog_inbound_atu(pci, func_no, free_win, bar, cpu_addr,
   as_type);
if (ret < 0) {
dev_err(pci->dev, "Failed to program IB window\n");
@@ -125,7 +133,8 @@ static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, 
enum pci_barno bar,
return 0;
 }
 
-static int dw_pcie_ep_outbound_atu(struct dw_pcie_ep *ep, phys_addr_t 
phys_addr,
+static int dw_pcie_ep_outbound_atu(struct dw_pcie_ep *ep, u8 func_no,
+  phys_addr_t phys_addr,
   u64 pci_addr, size_t size)
 {
u32 free_win;
@@ -137,8 +146,8 @@ static int dw_pcie_ep_outbound_atu(struct dw_pcie_ep *ep, 
phys_addr_t phys_addr,
   

[PATCH 02/10] PCI: designware-ep: Add the doorbell mode of MSI-X in EP mode

2019-08-15 Thread Xiaowei Bao
Add the doorbell mode of MSI-X in EP mode.

Signed-off-by: Xiaowei Bao 
---
 drivers/pci/controller/dwc/pcie-designware-ep.c | 14 ++
 drivers/pci/controller/dwc/pcie-designware.h| 14 ++
 2 files changed, 28 insertions(+)

diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c 
b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 75e2955..e3a7cdf 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -454,6 +454,20 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 
func_no,
return 0;
 }
 
+int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no,
+  u16 interrupt_num)
+{
+   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   u32 msg_data;
+
+   msg_data = (func_no << PCIE_MSIX_DOORBELL_PF_SHIFT) |
+  (interrupt_num - 1);
+
+   dw_pcie_writel_dbi(pci, PCIE_MSIX_DOORBELL, msg_data);
+
+   return 0;
+}
+
 int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
  u16 interrupt_num)
 {
diff --git a/drivers/pci/controller/dwc/pcie-designware.h 
b/drivers/pci/controller/dwc/pcie-designware.h
index 2b291e8..cd903e9 100644
--- a/drivers/pci/controller/dwc/pcie-designware.h
+++ b/drivers/pci/controller/dwc/pcie-designware.h
@@ -88,6 +88,11 @@
 #define PCIE_MISC_CONTROL_1_OFF0x8BC
 #define PCIE_DBI_RO_WR_EN  BIT(0)
 
+#define PCIE_MSIX_DOORBELL 0x948
+#define PCIE_MSIX_DOORBELL_PF_SHIFT24
+#define PCIE_MSIX_DOORBELL_VF_SHIFT16
+#define PCIE_MSIX_DOORBELL_VF_ACTIVE   BIT(15)
+
 /*
  * iATU Unroll-specific register definitions
  * From 4.80 core version the address translation will be made by unroll
@@ -399,6 +404,8 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep *ep, u8 
func_no,
 u8 interrupt_num);
 int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
 u16 interrupt_num);
+int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8 func_no,
+  u16 interrupt_num);
 void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar);
 #else
 static inline void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
@@ -431,6 +438,13 @@ static inline int dw_pcie_ep_raise_msix_irq(struct 
dw_pcie_ep *ep, u8 func_no,
return 0;
 }
 
+static inline int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep,
+u8 func_no,
+u16 interrupt_num)
+{
+   return 0;
+}
+
 static inline void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno 
bar)
 {
 }
-- 
2.9.5



Re: [PATCH 1/3] powerpc/xmon: Check for HV mode when dumping XIVE info from OPAL

2019-08-15 Thread Cédric Le Goater
On 15/08/2019 09:30, Jordan Niethe wrote:
> On Wed, 2019-08-14 at 17:47 +0200, Cédric Le Goater wrote:
>> Currently, the xmon 'dx' command calls OPAL to dump the XIVE state in
>> the OPAL logs and also outputs some of the fields of the internal
>> XIVE
>> structures in Linux. The OPAL calls can only be done on baremetal
>> (PowerNV) and they crash a pseries machine. Fix by checking the
>> hypervisor feature of the CPU.
>>
>> Signed-off-by: Cédric Le Goater 
>> ---
>>  arch/powerpc/xmon/xmon.c | 17 ++---
>>  1 file changed, 10 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
>> index 14e56c25879f..25d4adccf750 100644
>> --- a/arch/powerpc/xmon/xmon.c
>> +++ b/arch/powerpc/xmon/xmon.c
>> @@ -2534,13 +2534,16 @@ static void dump_pacas(void)
>>  static void dump_one_xive(int cpu)
>>  {
>>  unsigned int hwid = get_hard_smp_processor_id(cpu);
>> -
>> -opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
>> -opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
>> -opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
>> -opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
>> -opal_xive_dump(XIVE_DUMP_VP, hwid);
>> -opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
>> +bool hv = cpu_has_feature(CPU_FTR_HVMODE);
>> +
>> +if (hv) {
>> +opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
>> +opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
>> +opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
>> +opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
>> +opal_xive_dump(XIVE_DUMP_VP, hwid);
>> +opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
>> +}
>>  
>>  if (setjmp(bus_error_jmp) != 0) {
>>  catch_memory_errors = 0;
> dump_one_xive() / other xive functions are guarded by #ifdef
> CONFIG_PPC_POWERNV in xmon.c aren't they? With this series would it be
> that these guards can be removed?

One could compile without CONFIG_PPC_POWERNV but we would still want 
these commands to be available for pseries. I missed that.

I think we should move the opal_xive_dump calls to some other command,
maybe 'dxo' for 'dump_xive_opal', as they only output in the OPAL logs.
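
Roughly something like this, with the OPAL calls split out of
dump_one_xive() (sketch only; the 'dxo' name and the exact split are
hypothetical at this point):

static void dump_one_xive_opal(int cpu)
{
	unsigned int hwid = get_hard_smp_processor_id(cpu);

	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return;

	opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
	opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
	opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
	opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
	opal_xive_dump(XIVE_DUMP_VP, hwid);
	opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
}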

This needs some rework.

Thanks,

C.


Re: [RFC PATCH v4 1/2] powerpc/xmon: Allow listing active breakpoints in read-only mode

2019-08-15 Thread Andrew Donnellan

On 15/8/19 3:06 pm, Christopher M. Riedl wrote:

> 	case 'c':
> +		if (xmon_is_ro) {
> +			printf(xmon_ro_msg);
> +			break;
> +		}
> 		if (!scanhex(&a)) {
> 			/* clear all breakpoints */
> 			for (i = 0; i < NBPTS; ++i)


Clearing breakpoints is probably alright too.
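
If so, one possible shape is to let the clear-all path run before the
read-only check (sketch against mainline xmon; untested):

	case 'c':
		if (!scanhex(&a)) {
			/* clearing only disables the breakpoint table
			 * entries, so arguably fine in read-only mode */
			for (i = 0; i < NBPTS; ++i)
				bpts[i].enabled = 0;
			printf("All breakpoints cleared\n");
			break;
		}
		/* installing a breakpoint patches memory: keep it blocked */
		if (xmon_is_ro) {
			printf(xmon_ro_msg);
			break;
		}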

--
Andrew Donnellan  OzLabs, ADL Canberra
a...@linux.ibm.com IBM Australia Limited



Re: [PATCH 7/8] parisc: don't set ARCH_NO_COHERENT_DMA_MMAP

2019-08-15 Thread Christoph Hellwig
Helge, or other parisc folks: can you take a look at this patch
in particular and the series in general?  Thanks!


Re: [PATCH 4/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-08-15 Thread christophe leroy




Le 15/08/2019 à 06:10, Alastair D'Silva a écrit :

From: Alastair D'Silva 

When presented with large amounts of memory being hotplugged
(in my test case, ~890GB), the call to flush_dcache_range takes
a while (~50 seconds), triggering RCU stalls.

This patch breaks up the call into 16GB chunks, calling
cond_resched() in between to allow the scheduler to run.


Is 16GB small enough? If 890GB takes 50s, 16GB still takes about 1s.
I'd use 1GB chunks to remain below 100ms.
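
For illustration, a 1GB-chunk version of the loop below could reuse the
existing SZ_1G from linux/sizes.h (sketch):

	for (i = 0; i < size; i += SZ_1G) {
		flush_dcache_range(start + i,
				   min(start + size, start + i + SZ_1G));
		cond_resched();
	}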



Signed-off-by: Alastair D'Silva 
---
  arch/powerpc/mm/mem.c | 16 ++--
  1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 5400da87a804..fb0d5e9aa11b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -104,11 +104,14 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
return -ENODEV;
  }
  
+#define FLUSH_CHUNK_SIZE (16ull * 1024ull * 1024ull * 1024ull)


Can we use SZ_16GB ?


+
  int __ref arch_add_memory(int nid, u64 start, u64 size,
struct mhp_restrictions *restrictions)
  {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
+   unsigned long i;
int rc;
  
  	resize_hpt_for_hotplug(memblock_phys_mem_size());

@@ -120,7 +123,11 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
start, start + size, rc);
return -EFAULT;
}
-   flush_dcache_range(start, start + size);
+
+   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
+		flush_dcache_range(start + i, min(start + size, start + i + FLUSH_CHUNK_SIZE));


Isn't the line a bit long? (I have not checked.)


+   cond_resched();
+   }
  
  	return __add_pages(nid, start_pfn, nr_pages, restrictions);

  }
@@ -131,13 +138,18 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
+   unsigned long i;
int ret;
  
  	__remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
  
  	/* Remove htab bolted mappings for this section of memory */

start = (unsigned long)__va(start);
-   flush_dcache_range(start, start + size);
+   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
+	flush_dcache_range(start + i, min(start + size, start + i + FLUSH_CHUNK_SIZE));
+   cond_resched();
+   }
+
ret = remove_section_mapping(start, start + size);
WARN_ON_ONCE(ret);
  



Christophe




Re: [PATCH 1/3] powerpc/xmon: Check for HV mode when dumping XIVE info from OPAL

2019-08-15 Thread Jordan Niethe
On Wed, 2019-08-14 at 17:47 +0200, Cédric Le Goater wrote:
> Currently, the xmon 'dx' command calls OPAL to dump the XIVE state in
> the OPAL logs and also outputs some of the fields of the internal
> XIVE
> structures in Linux. The OPAL calls can only be done on baremetal
> (PowerNV) and they crash a pseries machine. Fix by checking the
> hypervisor feature of the CPU.
> 
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/xmon/xmon.c | 17 ++---
>  1 file changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 14e56c25879f..25d4adccf750 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -2534,13 +2534,16 @@ static void dump_pacas(void)
>  static void dump_one_xive(int cpu)
>  {
>   unsigned int hwid = get_hard_smp_processor_id(cpu);
> -
> - opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
> - opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
> - opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
> - opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
> - opal_xive_dump(XIVE_DUMP_VP, hwid);
> - opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
> + bool hv = cpu_has_feature(CPU_FTR_HVMODE);
> +
> + if (hv) {
> + opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
> + opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
> + opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
> + opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
> + opal_xive_dump(XIVE_DUMP_VP, hwid);
> + opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
> + }
>  
>   if (setjmp(bus_error_jmp) != 0) {
>   catch_memory_errors = 0;
dump_one_xive() / other xive functions are guarded by #ifdef
CONFIG_PPC_POWERNV in xmon.c aren't they? With this series would it be
that these guards can be removed?



Re: [PATCH 3/6] powerpc: Convert flush_icache_range & friends to C

2019-08-15 Thread christophe leroy




Le 15/08/2019 à 06:10, Alastair D'Silva a écrit :

From: Alastair D'Silva 

Similar to commit 22e9c88d486a
("powerpc/64: reuse PPC32 static inline flush_dcache_range()")
this patch converts flush_icache_range() to C, and reimplements the
following functions as wrappers around it:
__flush_dcache_icache
__flush_dcache_icache_phys


Not sure you can do that for __flush_dcache_icache_phys(); see detailed
comments below.




This was done as we discovered a long-standing bug where the length of the
range was truncated due to using a 32 bit shift instead of a 64 bit one.

By converting these functions to C, it becomes easier to maintain.

Signed-off-by: Alastair D'Silva 
---
  arch/powerpc/include/asm/cache.h  |  26 +++---
  arch/powerpc/include/asm/cacheflush.h |  32 ---
  arch/powerpc/kernel/misc_32.S | 117 --
  arch/powerpc/kernel/misc_64.S |  97 -
  arch/powerpc/mm/mem.c |  71 +++-
  5 files changed, 102 insertions(+), 241 deletions(-)

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index f852d5cd746c..728f154204db 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -98,20 +98,7 @@ static inline u32 l1_icache_bytes(void)
  #endif
  #endif /* ! __ASSEMBLY__ */
  
-#if defined(__ASSEMBLY__)

-/*
- * For a snooping icache, we still need a dummy icbi to purge all the
- * prefetched instructions from the ifetch buffers. We also need a sync
- * before the icbi to order the the actual stores to memory that might
- * have modified instructions with the icbi.
- */
-#define PURGE_PREFETCHED_INS   \
-   sync;   \
-   icbi0,r3;   \
-   sync;   \
-   isync


Is this still used anywhere now?


-
-#else
+#if !defined(__ASSEMBLY__)
  #define __read_mostly __attribute__((__section__(".data..read_mostly")))
  
  #ifdef CONFIG_PPC_BOOK3S_32

@@ -145,6 +132,17 @@ static inline void dcbst(void *addr)
  {
__asm__ __volatile__ ("dcbst %y0" : : "Z"(*(u8 *)addr) : "memory");
  }
+
+static inline void icbi(void *addr)
+{
+   __asm__ __volatile__ ("icbi 0, %0" : : "r"(addr) : "memory");
+}
+
+static inline void iccci(void)
+{
+   __asm__ __volatile__ ("iccci 0, r0");


I think you need the "memory" clobber too here.
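
i.e. something like this (sketch):

static inline void iccci(void)
{
	__asm__ __volatile__ ("iccci 0, r0" : : : "memory");
}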


+}
+
  #endif /* !__ASSEMBLY__ */
  #endif /* __KERNEL__ */
  #endif /* _ASM_POWERPC_CACHE_H */
diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
index ed57843ef452..4c3377aff8ed 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -42,24 +42,18 @@ extern void flush_dcache_page(struct page *page);
  #define flush_dcache_mmap_lock(mapping)   do { } while (0)
  #define flush_dcache_mmap_unlock(mapping) do { } while (0)
  
-extern void flush_icache_range(unsigned long, unsigned long);

+void flush_icache_range(unsigned long start, unsigned long stop);
  extern void flush_icache_user_range(struct vm_area_struct *vma,
struct page *page, unsigned long addr,
int len);
-extern void __flush_dcache_icache(void *page_va);
  extern void flush_dcache_icache_page(struct page *page);
-#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
-extern void __flush_dcache_icache_phys(unsigned long physaddr);
-#else
-static inline void __flush_dcache_icache_phys(unsigned long physaddr)
-{
-   BUG();
-}
-#endif
  
-/*

- * Write any modified data cache blocks out to memory and invalidate them.
+/**
+ * flush_dcache_range(): Write any modified data cache blocks out to memory and invalidate them.
   * Does not invalidate the corresponding instruction cache blocks.
+ *
+ * @start: the start address
+ * @stop: the stop address (exclusive)
   */
  static inline void flush_dcache_range(unsigned long start, unsigned long stop)
  {
@@ -82,6 +76,20 @@ static inline void flush_dcache_range(unsigned long start, unsigned long stop)
isync();
  }
  
+/**

+ * __flush_dcache_icache(): Flush a particular page from the data cache to RAM.
+ * Note: this is necessary because the instruction cache does *not*
+ * snoop from the data cache.
+ *
+ * @page: the address of the page to flush
+ */
+static inline void __flush_dcache_icache(void *page)
+{
+   unsigned long page_addr = (unsigned long)page;


The function is small enough that you can just call this addr instead of
page_addr without ambiguity.



+
+   flush_icache_range(page_addr, page_addr + PAGE_SIZE);


Finally, I think that's not so simple. For the 44x, if MMU_FTR_TYPE_44x
is set, that's just a clean_dcache_range(). If that feature is not set,
it looks like a standard flush_icache_range(), but without the special
CONFIG_4xx handling with iccci that we have in flush_icache_range().
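
A rough sketch of the distinction described above (assuming
mmu_has_feature() and clean_dcache_range() as used elsewhere in this
series; it deliberately leaves the CONFIG_4xx/iccci question open):

static inline void __flush_dcache_icache(void *addr)
{
	unsigned long start = (unsigned long)addr;

	if (mmu_has_feature(MMU_FTR_TYPE_44x)) {
		/* 44x: cleaning the dcache is enough here */
		clean_dcache_range(start, start + PAGE_SIZE);
		return;
	}

	flush_icache_range(start, start + PAGE_SIZE);
}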



+}
+
  /*
   * Write any modified data cache blocks out to memory.
    * Does not invalidate the corresponding cache lines (especially for
    * any corresponding instruction cache).

Re: [PATCH] powerpc: Allow flush_(inval_)dcache_range to work across ranges >4GB

2019-08-15 Thread Greg Kroah-Hartman
On Thu, Aug 15, 2019 at 02:55:42PM +1000, Alastair D'Silva wrote:
> From: Alastair D'Silva 
> 
> Heads Up: This patch cannot be submitted to Linus's tree, as the affected
> assembler functions have already been converted to C.
> 
> When calling flush_(inval_)dcache_range with a size >4GB, we were masking
> off the upper 32 bits, so we would incorrectly flush a range smaller
> than intended.
> 
> This patch replaces the 32 bit shifts with 64 bit ones, so that
> the full size is accounted for.
> 
> Signed-off-by: Alastair D'Silva 
> ---
>  arch/powerpc/kernel/misc_64.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
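
For reference: assuming the shifts in question are the line-count
computations in flush_dcache_range() / flush_inval_dcache_range(), the
change presumably has this shape (reconstructed sketch, not quoted from
the patch itself):

 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
-	srw.	r8,r8,r9		/* compute line count */
+	srd.	r8,r8,r9		/* compute line count */
 	beqlr				/* nothing to do? */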



This is not the correct way to submit patches for inclusion in the
stable kernel tree.  Please read:
https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.




Re: [PATCH 4/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-08-15 Thread Mike Rapoport
On Thu, Aug 15, 2019 at 02:10:49PM +1000, Alastair D'Silva wrote:
> From: Alastair D'Silva 
> 
> When presented with large amounts of memory being hotplugged
> (in my test case, ~890GB), the call to flush_dcache_range takes
> a while (~50 seconds), triggering RCU stalls.
> 
> This patch breaks up the call into 16GB chunks, calling
> cond_resched() in between to allow the scheduler to run.
> 
> Signed-off-by: Alastair D'Silva 
> ---
>  arch/powerpc/mm/mem.c | 16 ++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 5400da87a804..fb0d5e9aa11b 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -104,11 +104,14 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
>   return -ENODEV;
>  }
> 
> +#define FLUSH_CHUNK_SIZE (16ull * 1024ull * 1024ull * 1024ull)

IMHO this begs for adding SZ_16G to include/linux/sizes.h and using it here
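
Something along these lines, mirroring the existing SZ_4G definition
(sketch, not a submitted patch):

/* include/linux/sizes.h */
#define SZ_16G				_AC(0x400000000, ULL)

FLUSH_CHUNK_SIZE above would then simply be SZ_16G.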

> +
>  int __ref arch_add_memory(int nid, u64 start, u64 size,
>   struct mhp_restrictions *restrictions)
>  {
>   unsigned long start_pfn = start >> PAGE_SHIFT;
>   unsigned long nr_pages = size >> PAGE_SHIFT;
> + unsigned long i;
>   int rc;
> 
>   resize_hpt_for_hotplug(memblock_phys_mem_size());
> @@ -120,7 +123,11 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
>   start, start + size, rc);
>   return -EFAULT;
>   }
> - flush_dcache_range(start, start + size);
> +
> + for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
> +	flush_dcache_range(start + i, min(start + size, start + i + FLUSH_CHUNK_SIZE));
> + cond_resched();
> + }
> 
>   return __add_pages(nid, start_pfn, nr_pages, restrictions);
>  }
> @@ -131,13 +138,18 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
>   unsigned long start_pfn = start >> PAGE_SHIFT;
>   unsigned long nr_pages = size >> PAGE_SHIFT;
>   struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
> + unsigned long i;
>   int ret;
> 
>   __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
> 
>   /* Remove htab bolted mappings for this section of memory */
>   start = (unsigned long)__va(start);
> - flush_dcache_range(start, start + size);
> + for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
> +	flush_dcache_range(start + i, min(start + size, start + i + FLUSH_CHUNK_SIZE));
> + cond_resched();
> + }
> +
>   ret = remove_section_mapping(start, start + size);
>   WARN_ON_ONCE(ret);
> 
> -- 
> 2.21.0
> 

-- 
Sincerely yours,
Mike.



Re: [PATCH 0/6] drm+dma: cache support for arm, etc

2019-08-15 Thread Christoph Hellwig
As said before I don't think these low-level helpers are the
right API to export, but even if they were you'd just cover a tiny
subset of the architectures.

Also to distil the previous thread - if you remap memory to uncached
the helper to use is arch_dma_prep_coherent, which does a writeback+
invalidate everywhere, and there is no need to clean up after a
long-term uncached mapping.  We might still get speculations into
that area, if we don't remap the direct mapping, but it isn't like
invalidating that just before freeing the memory is going to help
anyone.
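
For a single page, the pattern described above boils down to something
like this (sketch; the page and vaddr names are assumed):

	/* writeback + invalidate all cached aliases of the buffer ... */
	arch_dma_prep_coherent(page, PAGE_SIZE);

	/* ... then hand out the long-term uncached mapping */
	vaddr = vmap(&page, 1, VM_MAP, pgprot_noncached(PAGE_KERNEL));

with no further cache maintenance needed when that mapping is torn down.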

Also it seems like patches 5 and 6 are missing in my inbox.


Re: [PATCH v3 11/16] powerpc/pseries/svm: Export guest SVM status to user space via sysfs

2019-08-15 Thread Michael Ellerman
Thiago Jung Bauermann  writes:
> Michael Ellerman  writes:
>> Thiago Jung Bauermann  writes:
>>> From: Ryan Grimm 
>>> User space might want to know it's running in a secure VM.  It can't do
>>> a mfmsr because mfmsr is a privileged instruction.
>>>
>>> The solution here is to create a cpu attribute:
>>>
>>> /sys/devices/system/cpu/svm
>>>
>>> which will read 0 or 1 based on the S bit of the guest's CPU 0.
>>
>> Why CPU 0?
>>
>> If we have different CPUs running with different MSR_S then something
>> has gone badly wrong, no?
>
> Yes, that would be very bad.
>
>> So can't we just read the MSR on whatever CPU the sysfs code happens to
>> run on.
>
> Good point. I made the change in the patch below.

The patch looks good, although it raises the question of whether it
should be an attribute of the CPU at all.

I guess there's not obviously anywhere better for it.

Still, you should document the attribute in
Documentation/ABI/testing/sysfs-devices-system-cpu.
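
Something like the following entry would do (sketch; Date and Contact are
placeholders):

What:		/sys/devices/system/cpu/svm
Date:		August 2019
Contact:	linuxppc-dev@lists.ozlabs.org
Description:	Secure Virtual Machine status of the guest.
		Reads "1" when the kernel runs as a secure VM (MSR_S set),
		"0" otherwise. Only present with CONFIG_PPC_SVM.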

cheers

> From 2d951305e118bf286f8e83cbf396448085186357 Mon Sep 17 00:00:00 2001
> From: Ryan Grimm 
> Date: Tue, 15 Jan 2019 11:56:29 -0600
> Subject: [PATCH] powerpc/pseries/svm: Export guest SVM status to user space
>  via sysfs
>
> User space might want to know it's running in a secure VM.  It can't do
> a mfmsr because mfmsr is a privileged instruction.
>
> The solution here is to create a cpu attribute:
>
> /sys/devices/system/cpu/svm
>
> which will read 0 or 1 based on the S bit of the current CPU.
>
> Signed-off-by: Ryan Grimm 
> Signed-off-by: Thiago Jung Bauermann 
> ---
>  arch/powerpc/kernel/sysfs.c | 20 
>  1 file changed, 20 insertions(+)
>
> diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
> index e2147d7c9e72..80a676da11cb 100644
> --- a/arch/powerpc/kernel/sysfs.c
> +++ b/arch/powerpc/kernel/sysfs.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  #include 
> +#include <asm/svm.h>
>  
>  #include "cacheinfo.h"
>  #include "setup.h"
> @@ -715,6 +716,23 @@ static struct device_attribute pa6t_attrs[] = {
>  #endif /* HAS_PPC_PMC_PA6T */
>  #endif /* HAS_PPC_PMC_CLASSIC */
>  
> +#ifdef CONFIG_PPC_SVM
> +static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf)
> +{
> + return sprintf(buf, "%u\n", is_secure_guest());
> +}
> +static DEVICE_ATTR(svm, 0444, show_svm, NULL);
> +
> +static void create_svm_file(void)
> +{
> +	device_create_file(cpu_subsys.dev_root, &dev_attr_svm);
> +}
> +#else
> +static void create_svm_file(void)
> +{
> +}
> +#endif /* CONFIG_PPC_SVM */
> +
>  static int register_cpu_online(unsigned int cpu)
>  {
> 	struct cpu *c = &per_cpu(cpu_devices, cpu);
> @@ -1058,6 +1076,8 @@ static int __init topology_init(void)
>   sysfs_create_dscr_default();
>  #endif /* CONFIG_PPC64 */
>  
> + create_svm_file();
> +
>   return 0;
>  }
>  subsys_initcall(topology_init);
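
For reference, the helper behind the attribute just reads the S bit of
the MSR in kernel mode; a sketch of the is_secure_guest() definition this
series relies on (from asm/svm.h, introduced earlier in the series):

static inline bool is_secure_guest(void)
{
	return mfmsr() & MSR_S;
}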