[PATCH kernel] powerpc/powernv/tce: Implement pnv_phb::tce_invalidate

2018-10-15 Thread Alexey Kardashevskiy
At the moment there are 5 types of TCE invalidation:
- IODA1/POWER7 - uses raw PHB registers;
- IODA2 "direct" - uses raw PHB3 registers;
- IODA2 "opal" - calls OPAL for invalidation;
- NPU - always invalidates the entire cache;
- NPU2 - calls OPAL for invalidation but uses wrong OPAL_PCI_TCE_KILL
(which is an OPAL token and @kill_type should be one of
OPAL_PCI_TCE_KILL_xxx).

We also have separate helpers for invalidating an entire PE or some pages,
with ambiguous names such as pnv_pci_phb3_tce_invalidate_pe() and
pnv_pci_ioda2_tce_invalidate_pe(), whose difference is not clear at all.

This defines a new powernv-phb hook: tce_invalidate(). This adds a new
helper - pnv_pci_tce_invalidate() - which walks through all attached
PEs and calls tce_invalidate(). Depending on parameters, new hook
invalidates TCE for specific pages or for the entire PE.

This defines a pnv_pci_ioda_tce_invalidate_pe() helper to invalidate a PE,
and uses it instead of pnv_pci_ioda2_tce_invalidate_entire() (NPU) and
pnv_pci_ioda2_tce_invalidate_pe() (IODA2). This does not change behaviour
because, for NPU2, skiboot has fallen through to OPAL_PCI_TCE_KILL_ALL
ever since NPU2 was introduced.

While we are at it, this changes IODA1's iommu_table_ops to use the same
get/clear/exchange handlers, which now get a more generic name (i.e.
s/ioda2/ioda/).

As a result, the redesigned code now uses the correct kill_type
for NPU2 PHBs.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/pci.h  |  10 +-
 arch/powerpc/platforms/powernv/npu-dma.c  |   8 +-
 arch/powerpc/platforms/powernv/pci-ioda-tce.c |  16 ++
 arch/powerpc/platforms/powernv/pci-ioda.c | 310 ++
 4 files changed, 147 insertions(+), 197 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 2131373..a4b4863 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -119,6 +119,9 @@ struct pnv_phb {
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
+   void (*tce_invalidate)(struct pnv_phb *phb, struct pnv_ioda_pe *pe,
+   struct iommu_table *tbl, unsigned long index,
+   unsigned long npages, bool realmode);
 
struct {
/* Global bridge info */
@@ -229,7 +232,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, 
const char *level,
 
 /* Nvlink functions */
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
 extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
struct iommu_table *tbl);
@@ -266,5 +268,11 @@ extern void pnv_pci_unlink_table_and_group(struct 
iommu_table *tbl,
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift);
+extern void pnv_pci_tce_invalidate(struct iommu_table *tbl,
+   unsigned long index, unsigned long npages, bool realmode);
+static inline void pnv_pci_ioda_tce_invalidate_pe(struct pnv_ioda_pe *pe)
+{
+   pe->phb->tce_invalidate(pe->phb, pe, NULL, 0, 0, false);
+}
 
 #endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 8006c54..7931d42 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -223,7 +223,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
return rc;
}
-   pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+   pnv_pci_ioda_tce_invalidate_pe(npe);
 
/* Add the table to the list so its TCE cache will get invalidated */
pnv_pci_link_table_and_group(phb->hose->node, num,
@@ -247,7 +247,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
return rc;
}
-   pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+   pnv_pci_ioda_tce_invalidate_pe(npe);
 
pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
>table_group);
@@ -313,7 +313,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
0 /* bypass base */, top);
 
if (rc == OPAL_SUCCESS)
-   pnv_pci_ioda2_tce_invalidate_entire(phb, false);
+   pnv_pci_ioda_tce_invalidate_pe(npe);
 
return rc;
 }
@@ -377,7 +377,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
pe_err(npe, "Failed to disable bypass, err %lld\n", rc);

Re: [PATCH 8/8] kconfig: remove CONFIG_MCA leftovers

2018-10-15 Thread Masahiro Yamada
On Sun, Oct 14, 2018 at 12:11 AM Christoph Hellwig  wrote:
>
> Signed-off-by: Christoph Hellwig 
> ---


Can you use "powerpc:" or something
for the subject line?

I'd like to see "kconfig:" only for patches
that touch the scripts/kconfig/ directory.




>  arch/powerpc/Kconfig | 4 
>  drivers/scsi/Kconfig | 6 +++---
>  2 files changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index f0ea460653cd..8b9edf7caf96 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -944,10 +944,6 @@ config FSL_GTM
> help
>   Freescale General-purpose Timers support
>
> -# Yes MCA RS/6000s exist but Linux-PPC does not currently support any
> -config MCA
> -   bool
> -
>  config PCI_DOMAINS
> def_bool PCI
>
> diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
> index 7c097006c54d..d3734c54aec9 100644
> --- a/drivers/scsi/Kconfig
> +++ b/drivers/scsi/Kconfig
> @@ -535,7 +535,7 @@ config SCSI_HPTIOP
>
>  config SCSI_BUSLOGIC
> tristate "BusLogic SCSI support"
> -   depends on (PCI || ISA || MCA) && SCSI && ISA_DMA_API && VIRT_TO_BUS
> +   depends on (PCI || ISA) && SCSI && ISA_DMA_API && VIRT_TO_BUS
> ---help---
>   This is support for BusLogic MultiMaster and FlashPoint SCSI Host
>   Adapters. Consult the SCSI-HOWTO, available from
> @@ -1142,12 +1142,12 @@ config SCSI_LPFC_DEBUG_FS
>
>  config SCSI_SIM710
> tristate "Simple 53c710 SCSI support (Compaq, NCR machines)"
> -   depends on (EISA || MCA) && SCSI
> +   depends on EISA && SCSI
> select SCSI_SPI_ATTRS
> ---help---
>   This driver is for NCR53c710 based SCSI host adapters.
>
> - It currently supports Compaq EISA cards and NCR MCA cards
> + It currently supports Compaq EISA cards.
>
>  config SCSI_DC395x
> tristate "Tekram DC395(U/UW/F) and DC315(U) SCSI support"
> --
> 2.19.1
>


-- 
Best Regards
Masahiro Yamada


[PATCH v8 9/9] powerpc: clean stack pointers naming

2018-10-15 Thread Christophe Leroy
Some stack pointers used to also be thread_info pointers
and were called tp. Now that they are only stack pointers,
rename them sp.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/irq.c  | 17 +++--
 arch/powerpc/kernel/setup_64.c | 20 ++--
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 62cfccf4af89..754f0efc507b 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -659,21 +659,21 @@ void __do_irq(struct pt_regs *regs)
 void do_IRQ(struct pt_regs *regs)
 {
struct pt_regs *old_regs = set_irq_regs(regs);
-   void *curtp, *irqtp, *sirqtp;
+   void *cursp, *irqsp, *sirqsp;
 
/* Switch to the irq stack to handle this */
-   curtp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
-   irqtp = hardirq_ctx[raw_smp_processor_id()];
-   sirqtp = softirq_ctx[raw_smp_processor_id()];
+   cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
+   irqsp = hardirq_ctx[raw_smp_processor_id()];
+   sirqsp = softirq_ctx[raw_smp_processor_id()];
 
/* Already there ? */
-   if (unlikely(curtp == irqtp || curtp == sirqtp)) {
+   if (unlikely(cursp == irqsp || cursp == sirqsp)) {
__do_irq(regs);
set_irq_regs(old_regs);
return;
}
/* Switch stack and call */
-   call_do_irq(regs, irqtp);
+   call_do_irq(regs, irqsp);
 
set_irq_regs(old_regs);
 }
@@ -732,10 +732,7 @@ void irq_ctx_init(void)
 
 void do_softirq_own_stack(void)
 {
-   void *irqtp;
-
-   irqtp = softirq_ctx[smp_processor_id()];
-   call_do_softirq(irqtp);
+   call_do_softirq(softirq_ctx[smp_processor_id()]);
 }
 
 irq_hw_number_t virq_to_hw(unsigned int virq)
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 6792e9c90689..4912ec0320b8 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -717,22 +717,22 @@ void __init emergency_stack_init(void)
limit = min(ppc64_bolted_size(), ppc64_rma_size);
 
for_each_possible_cpu(i) {
-   void *ti;
+   void *sp;
 
-   ti = alloc_stack(limit, i);
-   memset(ti, 0, THREAD_SIZE);
-   paca_ptrs[i]->emergency_sp = ti + THREAD_SIZE;
+   sp = alloc_stack(limit, i);
+   memset(sp, 0, THREAD_SIZE);
+   paca_ptrs[i]->emergency_sp = sp + THREAD_SIZE;
 
 #ifdef CONFIG_PPC_BOOK3S_64
/* emergency stack for NMI exception handling. */
-   ti = alloc_stack(limit, i);
-   memset(ti, 0, THREAD_SIZE);
-   paca_ptrs[i]->nmi_emergency_sp = ti + THREAD_SIZE;
+   sp = alloc_stack(limit, i);
+   memset(sp, 0, THREAD_SIZE);
+   paca_ptrs[i]->nmi_emergency_sp = sp + THREAD_SIZE;
 
/* emergency stack for machine check exception handling. */
-   ti = alloc_stack(limit, i);
-   memset(ti, 0, THREAD_SIZE);
-   paca_ptrs[i]->mc_emergency_sp = ti + THREAD_SIZE;
+   sp = alloc_stack(limit, i);
+   memset(sp, 0, THREAD_SIZE);
+   paca_ptrs[i]->mc_emergency_sp = sp + THREAD_SIZE;
 #endif
}
 }
-- 
2.13.3



[PATCH v8 8/9] powerpc/64: Remove CURRENT_THREAD_INFO

2018-10-15 Thread Christophe Leroy
Now that current_thread_info is located at the beginning of 'current'
task struct, CURRENT_THREAD_INFO macro is not really needed any more.

This patch replaces it by loads of the value at PACACURRENT(r13).

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/exception-64s.h   |  4 ++--
 arch/powerpc/include/asm/thread_info.h |  4 
 arch/powerpc/kernel/entry_64.S | 10 +-
 arch/powerpc/kernel/exceptions-64e.S   |  2 +-
 arch/powerpc/kernel/exceptions-64s.S   |  2 +-
 arch/powerpc/kernel/idle_book3e.S  |  2 +-
 arch/powerpc/kernel/idle_power4.S  |  2 +-
 arch/powerpc/kernel/trace/ftrace_64_mprofile.S |  6 +++---
 8 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h 
b/arch/powerpc/include/asm/exception-64s.h
index 3b4767ed3ec5..dd6a5ae7a769 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -671,7 +671,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 
 #define RUNLATCH_ON\
 BEGIN_FTR_SECTION  \
-   CURRENT_THREAD_INFO(r3, r1);\
+   ld  r3, PACACURRENT(r13);   \
ld  r4,TI_LOCAL_FLAGS(r3);  \
andi.   r0,r4,_TLF_RUNLATCH;\
beqlppc64_runlatch_on_trampoline;   \
@@ -721,7 +721,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 #ifdef CONFIG_PPC_970_NAP
 #define FINISH_NAP \
 BEGIN_FTR_SECTION  \
-   CURRENT_THREAD_INFO(r11, r1);   \
+   ld  r11, PACACURRENT(r13);  \
ld  r9,TI_LOCAL_FLAGS(r11); \
andi.   r10,r9,_TLF_NAPPING;\
bnelpower4_fixup_nap;   \
diff --git a/arch/powerpc/include/asm/thread_info.h 
b/arch/powerpc/include/asm/thread_info.h
index c959b8d66cac..8e1d0195ac36 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -17,10 +17,6 @@
 
 #define THREAD_SIZE(1 << THREAD_SHIFT)
 
-#ifdef CONFIG_PPC64
-#define CURRENT_THREAD_INFO(dest, sp)  stringify_in_c(ld dest, 
PACACURRENT(r13))
-#endif
-
 #ifndef __ASSEMBLY__
 #include 
 #include 
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 03cbf409c3f8..b017bd3da1ed 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -158,7 +158,7 @@ system_call:/* label this so stack 
traces look sane */
li  r10,IRQS_ENABLED
std r10,SOFTE(r1)
 
-   CURRENT_THREAD_INFO(r11, r1)
+   ld  r11, PACACURRENT(r13)
ld  r10,TI_FLAGS(r11)
andi.   r11,r10,_TIF_SYSCALL_DOTRACE
bne .Lsyscall_dotrace   /* does not return */
@@ -205,7 +205,7 @@ system_call:/* label this so stack 
traces look sane */
ld  r3,RESULT(r1)
 #endif
 
-   CURRENT_THREAD_INFO(r12, r1)
+   ld  r12, PACACURRENT(r13)
 
ld  r8,_MSR(r1)
 #ifdef CONFIG_PPC_BOOK3S
@@ -336,7 +336,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
/* Repopulate r9 and r10 for the syscall path */
addir9,r1,STACK_FRAME_OVERHEAD
-   CURRENT_THREAD_INFO(r10, r1)
+   ld  r10, PACACURRENT(r13)
ld  r10,TI_FLAGS(r10)
 
cmpldi  r0,NR_syscalls
@@ -734,7 +734,7 @@ _GLOBAL(ret_from_except_lite)
mtmsrd  r10,1 /* Update machine state */
 #endif /* CONFIG_PPC_BOOK3E */
 
-   CURRENT_THREAD_INFO(r9, r1)
+   ld  r9, PACACURRENT(r13)
ld  r3,_MSR(r1)
 #ifdef CONFIG_PPC_BOOK3E
ld  r10,PACACURRENT(r13)
@@ -848,7 +848,7 @@ resume_kernel:
 1: bl  preempt_schedule_irq
 
/* Re-test flags and eventually loop */
-   CURRENT_THREAD_INFO(r9, r1)
+   ld  r9, PACACURRENT(r13)
ld  r4,TI_FLAGS(r9)
andi.   r0,r4,_TIF_NEED_RESCHED
bne 1b
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 231d066b4a3d..dfafcd0af009 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -469,7 +469,7 @@ exc_##n##_bad_stack:
\
  * interrupts happen before the wait instruction.
  */
 #define CHECK_NAPPING()
\
-   CURRENT_THREAD_INFO(r11, r1);   \
+   ld  r11, PACACURRENT(r13);  \
ld  r10,TI_LOCAL_FLAGS(r11);\
andi.   r9,r10,_TLF_NAPPING;\
beq+1f; \
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 89d32bb79d5e..1cbe1a78df57 100644

[PATCH v8 7/9] powerpc/32: Remove CURRENT_THREAD_INFO and rename TI_CPU

2018-10-15 Thread Christophe Leroy
Now that thread_info is similar to task_struct, its address is in r2,
so the CURRENT_THREAD_INFO() macro is useless. This patch removes it.

At the same time, as the 'cpu' field is no longer in thread_info,
this patch renames TI_CPU to TASK_CPU.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Makefile  |  2 +-
 arch/powerpc/include/asm/thread_info.h |  2 --
 arch/powerpc/kernel/asm-offsets.c  |  2 +-
 arch/powerpc/kernel/entry_32.S | 43 --
 arch/powerpc/kernel/epapr_hcalls.S |  5 ++--
 arch/powerpc/kernel/head_fsl_booke.S   |  5 ++--
 arch/powerpc/kernel/idle_6xx.S |  8 +++
 arch/powerpc/kernel/idle_e500.S|  8 +++
 arch/powerpc/kernel/misc_32.S  |  3 +--
 arch/powerpc/mm/hash_low_32.S  | 14 ---
 arch/powerpc/sysdev/6xx-suspend.S  |  5 ++--
 11 files changed, 35 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index c363e765ee38..a81e5b5aa37a 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -430,7 +430,7 @@ ifdef CONFIG_SMP
 prepare: task_cpu_prepare
 
 task_cpu_prepare: prepare0
-   $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TI_CPU") 
print $$3;}' include/generated/asm-offsets.h))
+   $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == 
"TASK_CPU") print $$3;}' include/generated/asm-offsets.h))
 endif
 
 # Check toolchain versions:
diff --git a/arch/powerpc/include/asm/thread_info.h 
b/arch/powerpc/include/asm/thread_info.h
index d91523c2c7d8..c959b8d66cac 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -19,8 +19,6 @@
 
 #ifdef CONFIG_PPC64
 #define CURRENT_THREAD_INFO(dest, sp)  stringify_in_c(ld dest, 
PACACURRENT(r13))
-#else
-#define CURRENT_THREAD_INFO(dest, sp)  stringify_in_c(mr dest, r2)
 #endif
 
 #ifndef __ASSEMBLY__
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 93040c19942e..1191791edc7e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -96,7 +96,7 @@ int main(void)
 #endif /* CONFIG_PPC64 */
OFFSET(TASK_STACK, task_struct, stack);
 #ifdef CONFIG_SMP
-   OFFSET(TI_CPU, task_struct, cpu);
+   OFFSET(TASK_CPU, task_struct, cpu);
 #endif
 
 #ifdef CONFIG_LIVEPATCH
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index bd3b146e18a3..d0c546ce387e 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -168,8 +168,7 @@ transfer_to_handler:
tophys(r11,r11)
addir11,r11,global_dbcr0@l
 #ifdef CONFIG_SMP
-   CURRENT_THREAD_INFO(r9, r1)
-   lwz r9,TI_CPU(r9)
+   lwz r9,TASK_CPU(r2)
slwir9,r9,3
add r11,r11,r9
 #endif
@@ -180,8 +179,7 @@ transfer_to_handler:
stw r12,4(r11)
 #endif
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-   CURRENT_THREAD_INFO(r9, r1)
-   tophys(r9, r9)
+   tophys(r9, r2)
ACCOUNT_CPU_USER_ENTRY(r9, r11, r12)
 #endif
 
@@ -195,8 +193,7 @@ transfer_to_handler:
ble-stack_ovf   /* then the kernel stack overflowed */
 5:
 #if defined(CONFIG_6xx) || defined(CONFIG_E500)
-   CURRENT_THREAD_INFO(r9, r1)
-   tophys(r9,r9)   /* check local flags */
+   tophys(r9,r2)   /* check local flags */
lwz r12,TI_LOCAL_FLAGS(r9)
mtcrf   0x01,r12
bt- 31-TLF_NAPPING,4f
@@ -345,8 +342,7 @@ _GLOBAL(DoSyscall)
mtmsr   r11
 1:
 #endif /* CONFIG_TRACE_IRQFLAGS */
-   CURRENT_THREAD_INFO(r10, r1)
-   lwz r11,TI_FLAGS(r10)
+   lwz r11,TI_FLAGS(r2)
andi.   r11,r11,_TIF_SYSCALL_DOTRACE
bne-syscall_dotrace
 syscall_dotrace_cont:
@@ -379,13 +375,12 @@ ret_from_syscall:
lwz r3,GPR3(r1)
 #endif
mr  r6,r3
-   CURRENT_THREAD_INFO(r12, r1)
/* disable interrupts so current_thread_info()->flags can't change */
LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */
/* Note: We don't bother telling lockdep about it */
SYNC
MTMSRD(r10)
-   lwz r9,TI_FLAGS(r12)
+   lwz r9,TI_FLAGS(r2)
li  r8,-MAX_ERRNO
andi.   
r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
bne-syscall_exit_work
@@ -432,8 +427,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
andi.   r4,r8,MSR_PR
beq 3f
-   CURRENT_THREAD_INFO(r4, r1)
-   ACCOUNT_CPU_USER_EXIT(r4, r5, r7)
+   ACCOUNT_CPU_USER_EXIT(r2, r5, r7)
 3:
 #endif
lwz r4,_LINK(r1)
@@ -526,7 +520,7 @@ syscall_exit_work:
/* Clear per-syscall TIF flags if any are set.  */
 
li  r11,_TIF_PERSYSCALL_MASK
-   addir12,r12,TI_FLAGS
+   addir12,r2,TI_FLAGS
 3: lwarx   r8,0,r12

[PATCH v8 6/9] powerpc: 'current_set' is now a table of task_struct pointers

2018-10-15 Thread Christophe Leroy
The table of pointers 'current_set' has been used for retrieving
the stack and current. They used to be thread_info pointers as
they were pointing to the stack and current was taken from the
'task' field of the thread_info.

The pointers in the 'current_set' table are now both pointers
to task_struct and pointers to thread_info.

As they are used to get current, and the stack pointer is
retrieved from current's stack field, this patch changes
their type to task_struct, and renames secondary_ti to
secondary_current.

Reviewed-by: Nicholas Piggin 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/asm-prototypes.h |  4 ++--
 arch/powerpc/kernel/head_32.S |  6 +++---
 arch/powerpc/kernel/head_44x.S|  4 ++--
 arch/powerpc/kernel/head_fsl_booke.S  |  4 ++--
 arch/powerpc/kernel/smp.c | 10 --
 5 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index 2741831482f4..d6fdc7e79d09 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -23,8 +23,8 @@
 #include 
 
 /* SMP */
-extern struct thread_info *current_set[NR_CPUS];
-extern struct thread_info *secondary_ti;
+extern struct task_struct *current_set[NR_CPUS];
+extern struct task_struct *secondary_current;
 void start_secondary(void *unused);
 
 /* kexec */
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 44dfd73b2a62..ba0341bd5a00 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -842,9 +842,9 @@ __secondary_start:
 #endif /* CONFIG_6xx */
 
/* get current's stack and current */
-   lis r1,secondary_ti@ha
-   tophys(r1,r1)
-   lwz r2,secondary_ti@l(r1)
+   lis r2,secondary_current@ha
+   tophys(r2,r2)
+   lwz r2,secondary_current@l(r2)
tophys(r1,r2)
lwz r1,TASK_STACK(r1)
 
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 2c7e90f36358..48e4de4dfd0c 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -1021,8 +1021,8 @@ _GLOBAL(start_secondary_47x)
/* Now we can get our task struct and real stack pointer */
 
/* Get current's stack and current */
-   lis r1,secondary_ti@ha
-   lwz r2,secondary_ti@l(r1)
+   lis r2,secondary_current@ha
+   lwz r2,secondary_current@l(r2)
lwz r1,TASK_STACK(r2)
 
/* Current stack pointer */
diff --git a/arch/powerpc/kernel/head_fsl_booke.S 
b/arch/powerpc/kernel/head_fsl_booke.S
index b8a2b789677e..0d27bfff52dd 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -1076,8 +1076,8 @@ __secondary_start:
bl  call_setup_cpu
 
/* get current's stack and current */
-   lis r1,secondary_ti@ha
-   lwz r2,secondary_ti@l(r1)
+   lis r2,secondary_current@ha
+   lwz r2,secondary_current@l(r2)
lwz r1,TASK_STACK(r2)
 
/* stack */
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 0f9f7f1f4e2f..2df2bc3ea17d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -75,7 +75,7 @@
 static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 #endif
 
-struct thread_info *secondary_ti;
+struct task_struct *secondary_current;
 bool has_big_cores;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
@@ -663,7 +663,7 @@ void smp_send_stop(void)
 }
 #endif /* CONFIG_NMI_IPI */
 
-struct thread_info *current_set[NR_CPUS];
+struct task_struct *current_set[NR_CPUS];
 
 static void smp_store_cpu_info(int id)
 {
@@ -928,7 +928,7 @@ void smp_prepare_boot_cpu(void)
paca_ptrs[boot_cpuid]->__current = current;
 #endif
set_numa_node(numa_cpu_lookup_table[boot_cpuid]);
-   current_set[boot_cpuid] = task_thread_info(current);
+   current_set[boot_cpuid] = current;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1013,8 +1013,6 @@ static bool secondaries_inhibited(void)
 
 static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
 {
-   struct thread_info *ti = task_thread_info(idle);
-
 #ifdef CONFIG_STACKPROTECTOR
idle->stack_canary = get_random_canary();
 #endif
@@ -1028,7 +1026,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct 
task_struct *idle)
 #endif
 #endif
idle->cpu = cpu;
-   secondary_ti = current_set[cpu] = ti;
+   secondary_current = current_set[cpu] = idle;
 }
 
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
-- 
2.13.3



[PATCH v8 5/9] powerpc: regain entire stack space

2018-10-15 Thread Christophe Leroy
thread_info is no longer in the stack, so the entire stack
can now be used.

There is also no longer any risk of corrupting task_cpu(p) with a
stack overflow, so the patch removes the test.

When doing this, an explicit test for a NULL stack pointer is
needed in validate_sp(), as it is no longer implicitly covered
by the sizeof(thread_info) gap.

In the meantime, with the previous patch, the pointers to the stacks
are no longer pointers to thread_info, so this patch changes them
to void*.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/irq.h   | 10 +-
 arch/powerpc/include/asm/processor.h |  3 +--
 arch/powerpc/kernel/asm-offsets.c|  1 -
 arch/powerpc/kernel/entry_32.S   | 14 --
 arch/powerpc/kernel/irq.c| 19 +--
 arch/powerpc/kernel/misc_32.S|  6 ++
 arch/powerpc/kernel/process.c| 32 +---
 arch/powerpc/kernel/setup_64.c   |  8 
 8 files changed, 38 insertions(+), 55 deletions(-)

diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 2efbae8d93be..966ddd4d2414 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -48,9 +48,9 @@ struct pt_regs;
  * Per-cpu stacks for handling critical, debug and machine check
  * level interrupts.
  */
-extern struct thread_info *critirq_ctx[NR_CPUS];
-extern struct thread_info *dbgirq_ctx[NR_CPUS];
-extern struct thread_info *mcheckirq_ctx[NR_CPUS];
+extern void *critirq_ctx[NR_CPUS];
+extern void *dbgirq_ctx[NR_CPUS];
+extern void *mcheckirq_ctx[NR_CPUS];
 extern void exc_lvl_ctx_init(void);
 #else
 #define exc_lvl_ctx_init()
@@ -59,8 +59,8 @@ extern void exc_lvl_ctx_init(void);
 /*
  * Per-cpu stacks for handling hard and soft interrupts.
  */
-extern struct thread_info *hardirq_ctx[NR_CPUS];
-extern struct thread_info *softirq_ctx[NR_CPUS];
+extern void *hardirq_ctx[NR_CPUS];
+extern void *softirq_ctx[NR_CPUS];
 
 extern void irq_ctx_init(void);
 void call_do_softirq(void *sp);
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 53649b9b7dc4..238f0938c859 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -331,8 +331,7 @@ struct thread_struct {
 #define ARCH_MIN_TASKALIGN 16
 
 #define INIT_SP(sizeof(init_stack) + (unsigned long) 
_stack)
-#define INIT_SP_LIMIT \
-   (_ALIGN_UP(sizeof(struct thread_info), 16) + (unsigned long)_stack)
+#define INIT_SP_LIMIT  ((unsigned long)_stack)
 
 #ifdef CONFIG_SPE
 #define SPEFSCR_INIT \
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 5ef6db2d8a9e..93040c19942e 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -92,7 +92,6 @@ int main(void)
DEFINE(SIGSEGV, SIGSEGV);
DEFINE(NMI_MASK, NMI_MASK);
 #else
-   DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16));
OFFSET(KSP_LIMIT, thread_struct, ksp_limit);
 #endif /* CONFIG_PPC64 */
OFFSET(TASK_STACK, task_struct, stack);
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index fa7a69ffb37a..bd3b146e18a3 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -97,14 +97,11 @@ crit_transfer_to_handler:
mfspr   r0,SPRN_SRR1
stw r0,_SRR1(r11)
 
-   /* set the stack limit to the current stack
-* and set the limit to protect the thread_info
-* struct
-*/
+   /* set the stack limit to the current stack */
mfspr   r8,SPRN_SPRG_THREAD
lwz r0,KSP_LIMIT(r8)
stw r0,SAVED_KSP_LIMIT(r11)
-   rlwimi  r0,r1,0,0,(31-THREAD_SHIFT)
+   rlwinm  r0,r1,0,0,(31 - THREAD_SHIFT)
stw r0,KSP_LIMIT(r8)
/* fall through */
 #endif
@@ -121,14 +118,11 @@ crit_transfer_to_handler:
mfspr   r0,SPRN_SRR1
stw r0,crit_srr1@l(0)
 
-   /* set the stack limit to the current stack
-* and set the limit to protect the thread_info
-* struct
-*/
+   /* set the stack limit to the current stack */
mfspr   r8,SPRN_SPRG_THREAD
lwz r0,KSP_LIMIT(r8)
stw r0,saved_ksp_limit@l(0)
-   rlwimi  r0,r1,0,0,(31-THREAD_SHIFT)
+   rlwinm  r0,r1,0,0,(31 - THREAD_SHIFT)
stw r0,KSP_LIMIT(r8)
/* fall through */
 #endif
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 3fdb6b6973cf..62cfccf4af89 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -618,9 +618,8 @@ static inline void check_stack_overflow(void)
sp = current_stack_pointer() & (THREAD_SIZE-1);
 
/* check for stack overflow: is there less than 2KB free? */
-   if (unlikely(sp < (sizeof(struct thread_info) + 2048))) {
-   pr_err("do_IRQ: stack overflow: %ld\n",
-   sp - sizeof(struct thread_info));
+  

[PATCH v8 4/9] powerpc: Activate CONFIG_THREAD_INFO_IN_TASK

2018-10-15 Thread Christophe Leroy
This patch activates CONFIG_THREAD_INFO_IN_TASK which
moves the thread_info into task_struct.

Moving thread_info into task_struct has the following advantages:
- It protects thread_info from corruption in the case of stack
overflows.
- Its address is harder to determine if stack addresses are
leaked, making a number of attacks more difficult.

This has the following consequences:
- thread_info is now located at the beginning of task_struct.
- The 'cpu' field is now in task_struct, and only exists when
CONFIG_SMP is active.
- thread_info no longer has the 'task' field.

This patch:
- Removes all copying of the thread_info struct when the stack changes.
- Changes the CURRENT_THREAD_INFO() macro to point to current.
- Selects CONFIG_THREAD_INFO_IN_TASK.
- Modifies raw_smp_processor_id() to get ->cpu from current without
including linux/sched.h to avoid circular inclusion and without
including asm/asm-offsets.h to avoid symbol names duplication
between ASM constants and C constants.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/Makefile  |  7 +
 arch/powerpc/include/asm/ptrace.h  |  2 +-
 arch/powerpc/include/asm/smp.h | 17 +++-
 arch/powerpc/include/asm/thread_info.h | 17 ++--
 arch/powerpc/kernel/asm-offsets.c  |  7 +++--
 arch/powerpc/kernel/entry_32.S |  9 +++
 arch/powerpc/kernel/exceptions-64e.S   | 11 
 arch/powerpc/kernel/head_32.S  |  6 ++---
 arch/powerpc/kernel/head_44x.S |  4 +--
 arch/powerpc/kernel/head_64.S  |  1 +
 arch/powerpc/kernel/head_booke.h   |  8 +-
 arch/powerpc/kernel/head_fsl_booke.S   |  7 +++--
 arch/powerpc/kernel/irq.c  | 47 +-
 arch/powerpc/kernel/kgdb.c | 28 
 arch/powerpc/kernel/machine_kexec_64.c |  6 ++---
 arch/powerpc/kernel/setup_64.c | 21 ---
 arch/powerpc/kernel/smp.c  |  2 +-
 arch/powerpc/net/bpf_jit32.h   |  5 ++--
 19 files changed, 52 insertions(+), 154 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3d008115fe18..96cb2cee4a5e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -239,6 +239,7 @@ config PPC
select RTC_LIB
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
+   select THREAD_INFO_IN_TASK
select VIRT_TO_BUS  if !PPC64
#
# Please keep this list sorted alphabetically.
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 6c4f8a099bbb..c363e765ee38 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -426,6 +426,13 @@ else
 endif
 endif
 
+ifdef CONFIG_SMP
+prepare: task_cpu_prepare
+
+task_cpu_prepare: prepare0
+   $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TI_CPU") 
print $$3;}' include/generated/asm-offsets.h))
+endif
+
 # Check toolchain versions:
 # - gcc-4.6 is the minimum kernel-wide version so nothing required.
 checkbin:
diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index 2ba2a1e52291..69fd72834a41 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -151,7 +151,7 @@ extern int ptrace_put_reg(struct task_struct *task, int 
regno,
  unsigned long data);
 
 #define current_pt_regs() \
-   ((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) 
- 1)
+   ((struct pt_regs *)((unsigned long)task_stack_page(current) + 
THREAD_SIZE) - 1)
 /*
  * We use the least-significant bit of the trap field to indicate
  * whether we have saved the full set of registers, or only a
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 41695745032c..0de717e16dd6 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -83,7 +83,22 @@ int is_cpu_dead(unsigned int cpu);
 /* 32-bit */
 extern int smp_hw_index[];
 
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+/*
+ * This is particularly ugly: it appears we can't actually get the definition
+ * of task_struct here, but we need access to the CPU this task is running on.
+ * Instead of using task_struct we're using _TASK_CPU which is extracted from
+ * asm-offsets.h by kbuild to get the current processor ID.
+ *
+ * This also needs to be safeguarded when building asm-offsets.s because at
+ * that time _TASK_CPU is not defined yet. It could have been guarded by
+ * _TASK_CPU itself, but we want the build to fail if _TASK_CPU is missing
+ * when building something else than asm-offsets.s
+ */
+#ifdef GENERATING_ASM_OFFSETS
+#define raw_smp_processor_id() (0)
+#else
+#define raw_smp_processor_id() (*(unsigned int *)((void *)current + 
_TASK_CPU))
+#endif
 #define hard_smp_processor_id()(smp_hw_index[smp_processor_id()])
 
 static inline int 

[PATCH v8 3/9] powerpc: Prepare for moving thread_info into task_struct

2018-10-15 Thread Christophe Leroy
This patch cleans the powerpc kernel before activating
CONFIG_THREAD_INFO_IN_TASK:
- The purpose of the pointer given to call_do_softirq() and
call_do_irq() is to point the new stack ==> change it to void* and
rename it 'sp'
- Don't use CURRENT_THREAD_INFO() to locate the stack.
- Fix a few comments.
- Replace current_thread_info()->task by current
- Remove unnecessary casts to thread_info, as they'll become invalid
once thread_info is not in stack anymore.
- Rename THREAD_INFO to TASK_STACK: as it is in fact the offset of the
pointer to the stack in task_struct, this pointer will not be impacted
by the move of THREAD_INFO.
- Makes TASK_STACK available to PPC64. PPC64 will need it to get the
stack pointer from current once the thread_info has been moved.
- Modifies klp_init_thread_info() to take task_struct pointer argument.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/irq.h   |  4 ++--
 arch/powerpc/include/asm/livepatch.h |  7 ---
 arch/powerpc/include/asm/processor.h |  4 ++--
 arch/powerpc/include/asm/reg.h   |  2 +-
 arch/powerpc/kernel/asm-offsets.c|  2 +-
 arch/powerpc/kernel/entry_32.S   |  2 +-
 arch/powerpc/kernel/entry_64.S   |  2 +-
 arch/powerpc/kernel/head_32.S|  4 ++--
 arch/powerpc/kernel/head_40x.S   |  4 ++--
 arch/powerpc/kernel/head_44x.S   |  2 +-
 arch/powerpc/kernel/head_8xx.S   |  2 +-
 arch/powerpc/kernel/head_booke.h |  4 ++--
 arch/powerpc/kernel/head_fsl_booke.S |  4 ++--
 arch/powerpc/kernel/irq.c|  2 +-
 arch/powerpc/kernel/misc_32.S|  4 ++--
 arch/powerpc/kernel/process.c|  8 
 arch/powerpc/kernel/setup-common.c   |  2 +-
 arch/powerpc/kernel/setup_32.c   | 15 +--
 arch/powerpc/kernel/smp.c|  4 +++-
 19 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index ee39ce56b2a2..2efbae8d93be 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -63,8 +63,8 @@ extern struct thread_info *hardirq_ctx[NR_CPUS];
 extern struct thread_info *softirq_ctx[NR_CPUS];
 
 extern void irq_ctx_init(void);
-extern void call_do_softirq(struct thread_info *tp);
-extern void call_do_irq(struct pt_regs *regs, struct thread_info *tp);
+void call_do_softirq(void *sp);
+void call_do_irq(struct pt_regs *regs, void *sp);
 extern void do_IRQ(struct pt_regs *regs);
 extern void __init init_IRQ(void);
 extern void __do_irq(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/livepatch.h 
b/arch/powerpc/include/asm/livepatch.h
index 47a03b9b528b..8a81d10ccc82 100644
--- a/arch/powerpc/include/asm/livepatch.h
+++ b/arch/powerpc/include/asm/livepatch.h
@@ -43,13 +43,14 @@ static inline unsigned long 
klp_get_ftrace_location(unsigned long faddr)
return ftrace_location_range(faddr, faddr + 16);
 }
 
-static inline void klp_init_thread_info(struct thread_info *ti)
+static inline void klp_init_thread_info(struct task_struct *p)
 {
+   struct thread_info *ti = task_thread_info(p);
/* + 1 to account for STACK_END_MAGIC */
-   ti->livepatch_sp = (unsigned long *)(ti + 1) + 1;
+   ti->livepatch_sp = end_of_stack(p) + 1;
 }
 #else
-static void klp_init_thread_info(struct thread_info *ti) { }
+static inline void klp_init_thread_info(struct task_struct *p) { }
 #endif /* CONFIG_LIVEPATCH */
 
 #endif /* _ASM_POWERPC_LIVEPATCH_H */
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 2caa44db709a..53649b9b7dc4 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -40,7 +40,7 @@
 
 #ifndef __ASSEMBLY__
 #include 
-#include 
+#include 
 #include 
 #include 
 
@@ -332,7 +332,7 @@ struct thread_struct {
 
 #define INIT_SP (sizeof(init_stack) + (unsigned long) 
&init_stack)
 #define INIT_SP_LIMIT \
-   (_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack)
+   (_ALIGN_UP(sizeof(struct thread_info), 16) + (unsigned long)&init_stack)
 
 #ifdef CONFIG_SPE
 #define SPEFSCR_INIT \
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 640a4d818772..d2528a0b2f5b 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1058,7 +1058,7 @@
  * - SPRG9 debug exception scratch
  *
  * All 32-bit:
- * - SPRG3 current thread_info pointer
+ * - SPRG3 current thread_struct physical addr pointer
  *(virtual on BookE, physical on others)
  *
  * 32-bit classic:
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 10ef2e4db2fd..47adbb8673f1 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -90,10 +90,10 @@ int main(void)
DEFINE(SIGSEGV, SIGSEGV);
DEFINE(NMI_MASK, NMI_MASK);
 #else
-   OFFSET(THREAD_INFO, task_struct, stack);
DEFINE(THREAD_INFO_GAP, 

[PATCH v8 2/9] powerpc: Only use task_struct 'cpu' field on SMP

2018-10-15 Thread Christophe Leroy
When moving to CONFIG_THREAD_INFO_IN_TASK, the thread_info 'cpu' field
gets moved into task_struct and only defined when CONFIG_SMP is set.

This patch ensures that TI_CPU is only used when CONFIG_SMP is set and
that task_struct 'cpu' field is not used directly out of SMP code.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/head_fsl_booke.S | 2 ++
 arch/powerpc/kernel/misc_32.S| 4 
 arch/powerpc/xmon/xmon.c | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/head_fsl_booke.S 
b/arch/powerpc/kernel/head_fsl_booke.S
index e2750b856c8f..05b574f416b3 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -243,8 +243,10 @@ set_ivor:
li  r0,0
stwur0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1)
 
+#ifdef CONFIG_SMP
CURRENT_THREAD_INFO(r22, r1)
stw r24, TI_CPU(r22)
+#endif
 
bl  early_init
 
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 695b24a2d954..2f0fe8bfc078 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -183,10 +183,14 @@ _GLOBAL(low_choose_750fx_pll)
or  r4,r4,r5
mtspr   SPRN_HID1,r4
 
+#ifdef CONFIG_SMP
/* Store new HID1 image */
CURRENT_THREAD_INFO(r6, r1)
lwz r6,TI_CPU(r6)
slwir6,r6,2
+#else
+   li  r6, 0
+#endif
addis   r6,r6,nap_save_hid1@ha
stw r4,nap_save_hid1@l(r6)
 
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 58e67b67a97c..7fb265cca4fb 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2991,7 +2991,7 @@ static void show_task(struct task_struct *tsk)
printf("%px %016lx %6d %6d %c %2d %s\n", tsk,
tsk->thread.ksp,
tsk->pid, tsk->parent->pid,
-   state, task_thread_info(tsk)->cpu,
+   state, task_cpu(tsk),
tsk->comm);
 }
 
-- 
2.13.3



[PATCH v8 1/9] book3s/64: avoid circular header inclusion in mmu-hash.h

2018-10-15 Thread Christophe Leroy
When activating CONFIG_THREAD_INFO_IN_TASK, linux/sched.h
includes asm/current.h. This generates a circular dependency.
To avoid that, asm/processor.h shall not be included in mmu-hash.h

In order to do that, this patch moves into a new header called
asm/task_size_user64.h the information from asm/processor.h required
by mmu-hash.h

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  2 +-
 arch/powerpc/include/asm/processor.h  | 34 +-
 arch/powerpc/include/asm/task_size_user64.h   | 42 +++
 arch/powerpc/kvm/book3s_hv_hmi.c  |  1 +
 4 files changed, 45 insertions(+), 34 deletions(-)
 create mode 100644 arch/powerpc/include/asm/task_size_user64.h

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index fc7f056e9d97..026450f8304e 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -23,7 +23,7 @@
  */
 #include 
 #include 
-#include <asm/processor.h>
+#include <asm/task_size_user64.h>
 #include 
 
 /*
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 7d04d60a39c9..2caa44db709a 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -101,40 +101,8 @@ void release_thread(struct task_struct *);
 #endif
 
 #ifdef CONFIG_PPC64
-/*
- * 64-bit user address space can have multiple limits
- * For now supported values are:
- */
-#define TASK_SIZE_64TB  (0x0000400000000000UL)
-#define TASK_SIZE_128TB (0x0000800000000000UL)
-#define TASK_SIZE_512TB (0x0002000000000000UL)
-#define TASK_SIZE_1PB   (0x0004000000000000UL)
-#define TASK_SIZE_2PB   (0x0008000000000000UL)
-/*
- * With 52 bits in the address we can support
- * upto 4PB of range.
- */
-#define TASK_SIZE_4PB   (0x0010000000000000UL)
 
-/*
- * For now 512TB is only supported with book3s and 64K linux page size.
- */
-#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES)
-/*
- * Max value currently used:
- */
-#define TASK_SIZE_USER64   TASK_SIZE_4PB
-#define DEFAULT_MAP_WINDOW_USER64  TASK_SIZE_128TB
-#define TASK_CONTEXT_SIZE  TASK_SIZE_512TB
-#else
-#define TASK_SIZE_USER64   TASK_SIZE_64TB
-#define DEFAULT_MAP_WINDOW_USER64  TASK_SIZE_64TB
-/*
- * We don't need to allocate extended context ids for 4K page size, because
- * we limit the max effective address on this config to 64TB.
- */
-#define TASK_CONTEXT_SIZE  TASK_SIZE_64TB
-#endif
+#include <asm/task_size_user64.h>
 
 /*
  * 32-bit user address space is 4GB - 1 page
diff --git a/arch/powerpc/include/asm/task_size_user64.h 
b/arch/powerpc/include/asm/task_size_user64.h
new file mode 100644
index 000000000000..a4043075864b
--- /dev/null
+++ b/arch/powerpc/include/asm/task_size_user64.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_TASK_SIZE_USER64_H
+#define _ASM_POWERPC_TASK_SIZE_USER64_H
+
+#ifdef CONFIG_PPC64
+/*
+ * 64-bit user address space can have multiple limits
+ * For now supported values are:
+ */
+#define TASK_SIZE_64TB  (0x0000400000000000UL)
+#define TASK_SIZE_128TB (0x0000800000000000UL)
+#define TASK_SIZE_512TB (0x0002000000000000UL)
+#define TASK_SIZE_1PB   (0x0004000000000000UL)
+#define TASK_SIZE_2PB   (0x0008000000000000UL)
+/*
+ * With 52 bits in the address we can support
+ * upto 4PB of range.
+ */
+#define TASK_SIZE_4PB   (0x0010000000000000UL)
+
+/*
+ * For now 512TB is only supported with book3s and 64K linux page size.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES)
+/*
+ * Max value currently used:
+ */
+#define TASK_SIZE_USER64   TASK_SIZE_4PB
+#define DEFAULT_MAP_WINDOW_USER64  TASK_SIZE_128TB
+#define TASK_CONTEXT_SIZE  TASK_SIZE_512TB
+#else
+#define TASK_SIZE_USER64   TASK_SIZE_64TB
+#define DEFAULT_MAP_WINDOW_USER64  TASK_SIZE_64TB
+/*
+ * We don't need to allocate extended context ids for 4K page size, because
+ * we limit the max effective address on this config to 64TB.
+ */
+#define TASK_CONTEXT_SIZE  TASK_SIZE_64TB
+#endif
+
+#endif /* CONFIG_PPC64 */
+#endif /* _ASM_POWERPC_TASK_SIZE_USER64_H */
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
index e3f738eb1cac..64b5011475c7 100644
--- a/arch/powerpc/kvm/book3s_hv_hmi.c
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 void wait_for_subcore_guest_exit(void)
 {
-- 
2.13.3



[PATCH v8 0/9] powerpc: Switch to CONFIG_THREAD_INFO_IN_TASK

2018-10-15 Thread Christophe Leroy
The purpose of this series is to activate CONFIG_THREAD_INFO_IN_TASK which
moves the thread_info into task_struct.

Moving thread_info into task_struct has the following advantages:
- It protects thread_info from corruption in the case of stack
overflows.
- Its address is harder to determine if stack addresses are
leaked, making a number of attacks more difficult.

Changes since v7:
 - Rebased on fb6c6ce7907d ("Automatic merge of branches 'master', 'next' and 
'fixes' into merge")

Changes since v6:
 - Fixed validate_sp() to exclude NULL sp in 'regain entire stack space' patch 
(early crash with CONFIG_KMEMLEAK)

Changes since v5:
 - Fixed livepatch_sp setup by using end_of_stack() instead of hardcoding
 - Fixed PPC_BPF_LOAD_CPU() macro

Changes since v4:
 - Fixed a build failure on 32-bit SMP when include/generated/asm-offsets.h does
   not already exist; it was due to spaces instead of a tab in the Makefile

Changes since RFC v3: (based on Nick's review)
 - Renamed task_size.h to task_size_user64.h to better relate to what it 
contains.
 - Handling of the isolation of thread_info cpu field inside CONFIG_SMP #ifdefs 
moved to a separate patch.
 - Removed CURRENT_THREAD_INFO macro completely.
 - Added a guard in asm/smp.h to avoid build failure before _TASK_CPU is 
defined.
 - Added a patch at the end to rename 'tp' pointers to 'sp' pointers
 - Renamed 'tp' into 'sp' pointers in preparation patch when relevant
 - Fixed a few commit logs
 - Fixed checkpatch report.

Changes since RFC v2:
 - Removed the modification of names in asm-offsets
 - Created a rule in arch/powerpc/Makefile to append the offset of current->cpu 
in CFLAGS
 - Modified asm/smp.h to use the offset set in CFLAGS
 - Squashed the renaming of THREAD_INFO to TASK_STACK in the preparation patch
 - Moved the modification of current_pt_regs in the patch activating 
CONFIG_THREAD_INFO_IN_TASK

Changes since RFC v1:
 - Removed the first patch which was modifying header inclusion order in timer
 - Modified some names in asm-offsets to avoid conflicts when including 
asm-offsets in C files
 - Modified asm/smp.h to avoid having to include linux/sched.h (using 
asm-offsets instead)
 - Moved some changes from the activation patch to the preparation patch.

Christophe Leroy (9):
  book3s/64: avoid circular header inclusion in mmu-hash.h
  powerpc: Only use task_struct 'cpu' field on SMP
  powerpc: Prepare for moving thread_info into task_struct
  powerpc: Activate CONFIG_THREAD_INFO_IN_TASK
  powerpc: regain entire stack space
  powerpc: 'current_set' is now a table of task_struct pointers
  powerpc/32: Remove CURRENT_THREAD_INFO and rename TI_CPU
  powerpc/64: Remove CURRENT_THREAD_INFO
  powerpc: clean stack pointers naming

 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/Makefile  |  7 +++
 arch/powerpc/include/asm/asm-prototypes.h  |  4 +-
 arch/powerpc/include/asm/book3s/64/mmu-hash.h  |  2 +-
 arch/powerpc/include/asm/exception-64s.h   |  4 +-
 arch/powerpc/include/asm/irq.h | 14 ++---
 arch/powerpc/include/asm/livepatch.h   |  7 ++-
 arch/powerpc/include/asm/processor.h   | 39 +
 arch/powerpc/include/asm/ptrace.h  |  2 +-
 arch/powerpc/include/asm/reg.h |  2 +-
 arch/powerpc/include/asm/smp.h | 17 +-
 arch/powerpc/include/asm/task_size_user64.h| 42 ++
 arch/powerpc/include/asm/thread_info.h | 19 ---
 arch/powerpc/kernel/asm-offsets.c  | 10 ++--
 arch/powerpc/kernel/entry_32.S | 66 --
 arch/powerpc/kernel/entry_64.S | 12 ++--
 arch/powerpc/kernel/epapr_hcalls.S |  5 +-
 arch/powerpc/kernel/exceptions-64e.S   | 13 +
 arch/powerpc/kernel/exceptions-64s.S   |  2 +-
 arch/powerpc/kernel/head_32.S  | 14 ++---
 arch/powerpc/kernel/head_40x.S |  4 +-
 arch/powerpc/kernel/head_44x.S |  8 +--
 arch/powerpc/kernel/head_64.S  |  1 +
 arch/powerpc/kernel/head_8xx.S |  2 +-
 arch/powerpc/kernel/head_booke.h   | 12 +---
 arch/powerpc/kernel/head_fsl_booke.S   | 16 +++---
 arch/powerpc/kernel/idle_6xx.S |  8 +--
 arch/powerpc/kernel/idle_book3e.S  |  2 +-
 arch/powerpc/kernel/idle_e500.S|  8 +--
 arch/powerpc/kernel/idle_power4.S  |  2 +-
 arch/powerpc/kernel/irq.c  | 77 +-
 arch/powerpc/kernel/kgdb.c | 28 --
 arch/powerpc/kernel/machine_kexec_64.c |  6 +-
 arch/powerpc/kernel/misc_32.S  | 17 +++---
 arch/powerpc/kernel/process.c  | 40 ++---
 arch/powerpc/kernel/setup-common.c |  2 +-
 arch/powerpc/kernel/setup_32.c | 15 ++---
 arch/powerpc/kernel/setup_64.c | 41 --
 

Re: [PATCH V3 2/2] powerpc/mm/iommu: Allow migration of cma allocated pages during mm_iommu_get

2018-10-15 Thread Alexey Kardashevskiy



On 18/09/2018 21:58, Aneesh Kumar K.V wrote:
> Current code doesn't do page migration if the page allocated is a compound 
> page.
> With HugeTLB migration support, we can end up allocating hugetlb pages from
> CMA region. Also THP pages can be allocated from CMA region. This patch 
> updates
> the code to handle compound pages correctly.
> 
> This use the new helper get_user_pages_cma_migrate. It does one get_user_pages
> with right count, instead of doing one get_user_pages per page. That avoids
> reading page table multiple times.
> 
> The patch also convert the hpas member of mm_iommu_table_group_mem_t to a 
> union.
> We use the same storage location to store pointers to struct page. We cannot
> update all the code paths to use struct page *, because we access hpas in real
> mode and we can't do the struct page * to pfn conversion in real mode.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/mm/mmu_context_iommu.c | 120 
>  1 file changed, 35 insertions(+), 85 deletions(-)
> 
> diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
> b/arch/powerpc/mm/mmu_context_iommu.c
> index c9ee9e23845f..f0d8645872cb 100644
> --- a/arch/powerpc/mm/mmu_context_iommu.c
> +++ b/arch/powerpc/mm/mmu_context_iommu.c
> @@ -20,6 +20,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  static DEFINE_MUTEX(mem_list_mutex);
>  
> @@ -30,8 +31,18 @@ struct mm_iommu_table_group_mem_t {
>   atomic64_t mapped;
>   unsigned int pageshift;
>   u64 ua; /* userspace address */
> - u64 entries;/* number of entries in hpas[] */
> - u64 *hpas;  /* vmalloc'ed */
> + u64 entries;/* number of entries in hpages[] */
> + /*
> +  * in mm_iommu_get we temporarily use this to store
> +  * struct page address.
> +  *
> +  * We need to convert ua to hpa in real mode. Make it
> +  * simpler by storing physicall address.
> +  */
> + union {
> + struct page **hpages;   /* vmalloc'ed */
> + phys_addr_t *hpas;


It could always be hpages. Now it is slightly complicated though because
of MM_IOMMU_TABLE_GROUP_PAGE_DIRTY...

> + };
>  };
>  
>  static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
> @@ -74,63 +85,14 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
>  }
>  EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
>  
> -/*
> - * Taken from alloc_migrate_target with changes to remove CMA allocations
> - */
> -struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
> -{
> - gfp_t gfp_mask = GFP_USER;
> - struct page *new_page;
> -
> - if (PageCompound(page))
> - return NULL;
> -
> - if (PageHighMem(page))
> - gfp_mask |= __GFP_HIGHMEM;
> -
> - /*
> -  * We don't want the allocation to force an OOM if possibe
> -  */
> - new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
> - return new_page;
> -}
> -
> -static int mm_iommu_move_page_from_cma(struct page *page)
> -{
> - int ret = 0;
> - LIST_HEAD(cma_migrate_pages);
> -
> - /* Ignore huge pages for now */
> - if (PageCompound(page))
> - return -EBUSY;
> -
> - lru_add_drain();
> - ret = isolate_lru_page(page);
> - if (ret)
> - return ret;
> -
> - list_add(&page->lru, &cma_migrate_pages);
> - put_page(page); /* Drop the gup reference */
> -
> - ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
> - NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
> - if (ret) {
> - if (!list_empty(&cma_migrate_pages))
> - putback_movable_pages(&cma_migrate_pages);
> - }
> -
> - return 0;
> -}
> -
>  long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long 
> entries,
>   struct mm_iommu_table_group_mem_t **pmem)
>  {
>   struct mm_iommu_table_group_mem_t *mem;
> - long i, j, ret = 0, locked_entries = 0;
> + long i, ret = 0, locked_entries = 0;
>   unsigned int pageshift;
>   unsigned long flags;
>   unsigned long cur_ua;
> - struct page *page = NULL;
>  
>   mutex_lock(&mem_list_mutex);
>  
> @@ -177,41 +139,24 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long 
> ua, unsigned long entries,
>   goto unlock_exit;
>   }
>  
> + ret = get_user_pages_cma_migrate(ua, entries, 1, mem->hpages);
> + if (ret != entries) {
> + /* free the reference taken */
> + for (i = 0; i < ret; i++)
> + put_page(mem->hpages[i]);
> +
> + vfree(mem->hpas);
> + kfree(mem);
> + ret = -EFAULT;
> + goto unlock_exit;
> + } else

Do not need "else".


> + ret = 0;
> +
> + pageshift = PAGE_SHIFT;
>   for (i = 0; i < entries; ++i) {
> + struct page *page = mem->hpages[i];
>   cur_ua = ua + (i << PAGE_SHIFT);
> - if (1 != 

Re: [PATCH V3 1/2] mm: Add get_user_pages_cma_migrate

2018-10-15 Thread Alexey Kardashevskiy



On 18/09/2018 21:58, Aneesh Kumar K.V wrote:
> This helper does a get_user_pages_fast and if it find pages in the CMA area
> it will try to migrate them before taking page reference. This makes sure that
> we don't keep non-movable pages (due to page reference count) in the CMA area.
> Not able to move pages out of CMA area result in CMA allocation failures.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  include/linux/hugetlb.h |   2 +
>  include/linux/migrate.h |   3 +
>  mm/hugetlb.c|   4 +-
>  mm/migrate.c| 132 
>  4 files changed, 139 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 6b68e345f0ca..1abccb1a1ecc 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -357,6 +357,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, 
> int preferred_nid,
>   nodemask_t *nmask);
>  struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct 
> *vma,
>   unsigned long address);
> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +  int nid, nodemask_t *nmask);
>  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>   pgoff_t idx);
>  
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index f2b4abbca55e..d82b35afd2eb 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -286,6 +286,9 @@ static inline int migrate_vma(const struct 
> migrate_vma_ops *ops,
>  }
>  #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
>  
> +extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int 
> write,
> +   struct page **pages);
> +
>  #endif /* CONFIG_MIGRATION */
>  
>  #endif /* _LINUX_MIGRATE_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3c21775f196b..1abbfcb84f66 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1585,8 +1585,8 @@ static struct page *alloc_surplus_huge_page(struct 
> hstate *h, gfp_t gfp_mask,
>   return page;
>  }
>  
> -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> - int nid, nodemask_t *nmask)
> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +  int nid, nodemask_t *nmask)
>  {
>   struct page *page;
>  
> diff --git a/mm/migrate.c b/mm/migrate.c
> index d6a2e89b086a..2f92534ea7a1 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3006,3 +3006,135 @@ int migrate_vma(const struct migrate_vma_ops *ops,
>  }
>  EXPORT_SYMBOL(migrate_vma);
>  #endif /* defined(MIGRATE_VMA_HELPER) */
> +
> +static struct page *new_non_cma_page(struct page *page, unsigned long 
> private)
> +{
> + /*
> +  * We want to make sure we allocate the new page from the same node
> +  * as the source page.
> +  */
> + int nid = page_to_nid(page);
> + gfp_t gfp_mask = GFP_USER | __GFP_THISNODE;
> +
> + if (PageHighMem(page))
> + gfp_mask |= __GFP_HIGHMEM;
> +
> + if (PageTransHuge(page)) {
> + struct page *thp;
> + gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_THISNODE;
> +
> + /*
> +  * Remove the movable mask so that we don't allocate from
> +  * CMA area again.
> +  */
> + thp_gfpmask &= ~__GFP_MOVABLE;
> + thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);


Is HPAGE_PMD_ORDER 2MB or 1GB? Are THPs always of that PMD order?


> + if (!thp)
> + return NULL;
> + prep_transhuge_page(thp);
> + return thp;
> +
> +#ifdef  CONFIG_HUGETLB_PAGE
> + } else if (PageHuge(page)) {
> +
> + struct hstate *h = page_hstate(page);
> + /*
> +  * We don't want to dequeue from the pool because pool pages 
> will
> +  * mostly be from the CMA region.
> +  */
> + return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
> +#endif
> + }
> +
> + return __alloc_pages_node(nid, gfp_mask, 0);
> +}
> +
> +/**
> + * get_user_pages_cma_migrate() - pin user pages in memory by migrating 
> pages in CMA region
> + * @start:   starting user address
> + * @nr_pages:number of pages from start to pin
> + * @write:   whether pages will be written to
> + * @pages:   array that receives pointers to the pages pinned.
> + *   Should be at least nr_pages long.
> + *
> + * Attempt to pin user pages in memory without taking mm->mmap_sem.
> + * If not successful, it will fall back to taking the lock and
> + * calling get_user_pages().


I do not see any locking or get_user_pages() call here; is it hidden somewhere?

> + *
> + * If the pinned pages are backed by CMA region, we migrate those pages out,
> + * allocating new pages from non-CMA region. This helps in avoiding 

Re: linux-next: Tree for Oct 15

2018-10-15 Thread Benjamin Herrenschmidt
On Tue, 2018-10-16 at 13:19 +1100, Stephen Rothwell wrote:
> Hi all,
> 
> On Tue, 16 Oct 2018 13:02:16 +1100 Stephen Rothwell  
> wrote:
> > 
> > Reverting fe3d2a45e8079fdd7d4da1ff07f4b40bc3cb499f (and the following 2
> > commits) produces a kernel that boots.
> 
> Instead of that, I applied this patch on top of linux-next and it boots
> and produces a stack trace ...
> 
> From: Stephen Rothwell 
> Date: Tue, 16 Oct 2018 13:07:01 +1100
> Subject: [PATCH] mm/memblock.c: use dump_stack() instead of WARN_ON_ONCE for
>  the alignment checks
> 
> Using WARN_ON_ONCE too early causes the PowerPC kernel to fail.

Interesting ... I thought I had fixed that. Might need to be re-fixed.

> Signed-off-by: Stephen Rothwell 
> ---
>  mm/memblock.c | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 5fefc70253ee..f2ef3915a356 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -1298,8 +1298,10 @@ static phys_addr_t __init 
> memblock_alloc_range_nid(phys_addr_t size,
>  {
>   phys_addr_t found;
>  
> - if (WARN_ON_ONCE(!align))
> + if (!align) {
> + dump_stack();
>   align = SMP_CACHE_BYTES;
> + }
>  
>   found = memblock_find_in_range_node(size, align, start, end, nid,
>   flags);
> @@ -1423,8 +1425,10 @@ static void * __init memblock_alloc_internal(
>   if (WARN_ON_ONCE(slab_is_available()))
>   return kzalloc_node(size, GFP_NOWAIT, nid);
>  
> - if (WARN_ON_ONCE(!align))
> + if (!align) {
> + dump_stack();
>   align = SMP_CACHE_BYTES;
> + }
>  
>   if (max_addr > memblock.current_limit)
>   max_addr = memblock.current_limit;
> -- 
> 2.18.0
> 
> So, patch "memblock: stop using implicit alignment to SMP_CACHE_BYTES"
> should *not* remove the 0 -> SMP_CACHE_BYTES update from mm/memblock.c
> and just add the dump_stack().



Re: [PATCH v2 2/2] mm: speed up mremap by 500x on large regions

2018-10-15 Thread Joel Fernandes
On Mon, Oct 15, 2018 at 10:18:14AM +0200, Martin Schwidefsky wrote:
> On Mon, 15 Oct 2018 09:10:53 +0200
> Christian Borntraeger  wrote:
> 
> > On 10/12/2018 03:37 AM, Joel Fernandes (Google) wrote:
> > > Android needs to mremap large regions of memory during memory management
> > > related operations. The mremap system call can be really slow if THP is
> > > not enabled. The bottleneck is move_page_tables, which is copying each
> > > pte at a time, and can be really slow across a large map. Turning on THP
> > > may not be a viable option, and is not for us. This patch speeds up the
> > > performance for non-THP system by copying at the PMD level when possible.
> > > 
> > > The speed up is three orders of magnitude. On a 1GB mremap, the mremap
> > > > completion time drops from 160-250 milliseconds to 380-400 microseconds.
> > > 
> > > Before:
> > > Total mremap time for 1GB data: 242321014 nanoseconds.
> > > Total mremap time for 1GB data: 196842467 nanoseconds.
> > > Total mremap time for 1GB data: 167051162 nanoseconds.
> > > 
> > > After:
> > > Total mremap time for 1GB data: 385781 nanoseconds.
> > > Total mremap time for 1GB data: 388959 nanoseconds.
> > > Total mremap time for 1GB data: 402813 nanoseconds.
> > > 
> > > > In case THP is enabled, the optimization is skipped. I also flush the
> > > tlb every time we do this optimization since I couldn't find a way to
> > > determine if the low-level PTEs are dirty. It is seen that the cost of
> > > doing so is not much compared the improvement, on both x86-64 and arm64.
> > > 
> > > Cc: minc...@kernel.org
> > > Cc: pan...@google.com
> > > Cc: hu...@google.com
> > > Cc: lokeshgi...@google.com
> > > Cc: dan...@google.com
> > > Cc: mho...@kernel.org
> > > Cc: kir...@shutemov.name
> > > Cc: a...@linux-foundation.org
> > > Signed-off-by: Joel Fernandes (Google) 
> > > ---
> > >  mm/mremap.c | 62 +
> > >  1 file changed, 62 insertions(+)
> > > 
> > > diff --git a/mm/mremap.c b/mm/mremap.c
> > > index 9e68a02a52b1..d82c485822ef 100644
> > > --- a/mm/mremap.c
> > > +++ b/mm/mremap.c
> > > @@ -191,6 +191,54 @@ static void move_ptes(struct vm_area_struct *vma, 
> > > pmd_t *old_pmd,
> > >   drop_rmap_locks(vma);
> > >  }
> > >  
> > > +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long 
> > > old_addr,
> > > +   unsigned long new_addr, unsigned long old_end,
> > > +   pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
> > > +{
> > > + spinlock_t *old_ptl, *new_ptl;
> > > + struct mm_struct *mm = vma->vm_mm;
> > > +
> > > + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
> > > + || old_end - old_addr < PMD_SIZE)
> > > + return false;
> > > +
> > > + /*
> > > +  * The destination pmd shouldn't be established, free_pgtables()
> > > +  * should have release it.
> > > +  */
> > > + if (WARN_ON(!pmd_none(*new_pmd)))
> > > + return false;
> > > +
> > > + /*
> > > +  * We don't have to worry about the ordering of src and dst
> > > +  * ptlocks because exclusive mmap_sem prevents deadlock.
> > > +  */
> > > + old_ptl = pmd_lock(vma->vm_mm, old_pmd);
> > > + if (old_ptl) {
> > > + pmd_t pmd;
> > > +
> > > + new_ptl = pmd_lockptr(mm, new_pmd);
> > > + if (new_ptl != old_ptl)
> > > + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
> > > +
> > > + /* Clear the pmd */
> > > + pmd = *old_pmd;
> > > + pmd_clear(old_pmd);  
> > 
> > Adding Martin Schwidefsky.
> > Is this mapping maybe still in use on other CPUs? If yes, I think for
> > s390 we need to flush here as well (in other word we might need to introduce
> > pmd_clear_flush). On s390 you have to use instructions like CRDTE,IPTE or 
> > IDTE
> > to modify page table entries that are still in use. Otherwise you can get a 
> > delayed access exception which is - in contrast to page faults - not 
> > recoverable.
> 
> Just clearing an active pmd would be broken for s390. We need the equivalent
> of the ptep_get_and_clear() function for pmds. For s390 this function would
> look like this:
> 
> static inline pte_t pmdp_get_and_clear(struct mm_struct *mm,
>unsigned long addr, pmd_t *pmdp)
> {
> return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
> }
> 
> Just like pmdp_huge_get_and_clear() in fact.

I agree that architectures like s390 may need additional explicit instructions to
avoid an unrecoverable failure. The good news is that in the last patch I sent, I
put this behind an architecture flag (HAVE_MOVE_PMD), so we don't have
to enable it on architectures that cannot handle it:
https://www.spinics.net/lists/linux-mm/msg163621.html

Also we are triggering this optimization only if the page is not a transparent
huge page by calling pmd_trans_huge(). For regular pages, it should be safe to
not do the atomic get_and_clear AIUI because Linux doesn't use any bits from
the PMD like the dirty 

[PATCH v4 18/18] of: unittest: initialize args before calling of_*parse_*()

2018-10-15 Thread frowand . list
From: Frank Rowand 

Callers of of_irq_parse_one() blindly use the pointer args.np
without checking whether of_irq_parse_one() had an error and
thus did not set the value of args.np.  Initialize args to
zero so that using the format "%pOF" to show the value of
args.np will show "(null)" when of_irq_parse_one() has an
error.  This prevents the dereference of a random value.

Make the same fix for callers of of_parse_phandle_with_args()
and of_parse_phandle_with_args_map().

Reported-by: Guenter Roeck 
Signed-off-by: Frank Rowand 
---
 drivers/of/unittest.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 785985bdbfa6..5f4db23e4752 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -375,6 +375,7 @@ static void __init of_unittest_parse_phandle_with_args(void)
for (i = 0; i < 8; i++) {
bool passed = true;
 
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args(np, "phandle-list",
"#phandle-cells", i, &args);
 
@@ -428,6 +429,7 @@ static void __init of_unittest_parse_phandle_with_args(void)
}
 
/* Check for missing list property */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args(np, "phandle-list-missing",
"#phandle-cells", 0, &args);
unittest(rc == -ENOENT, "expected:%i got:%i\n", -ENOENT, rc);
@@ -436,6 +438,7 @@ static void __init of_unittest_parse_phandle_with_args(void)
unittest(rc == -ENOENT, "expected:%i got:%i\n", -ENOENT, rc);
 
/* Check for missing cells property */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args(np, "phandle-list",
"#phandle-cells-missing", 0, &args);
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
@@ -444,6 +447,7 @@ static void __init of_unittest_parse_phandle_with_args(void)
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
 
/* Check for bad phandle in list */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args(np, "phandle-list-bad-phandle",
"#phandle-cells", 0, &args);
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
@@ -452,6 +456,7 @@ static void __init of_unittest_parse_phandle_with_args(void)
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
 
/* Check for incorrectly formed argument list */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args(np, "phandle-list-bad-args",
"#phandle-cells", 1, &args);
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
@@ -502,6 +507,7 @@ static void __init 
of_unittest_parse_phandle_with_args_map(void)
for (i = 0; i < 8; i++) {
bool passed = true;
 
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args_map(np, "phandle-list",
"phandle", i, &args);
 
@@ -559,21 +565,25 @@ static void __init 
of_unittest_parse_phandle_with_args_map(void)
}
 
/* Check for missing list property */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args_map(np, "phandle-list-missing",
"phandle", 0, &args);
unittest(rc == -ENOENT, "expected:%i got:%i\n", -ENOENT, rc);
 
/* Check for missing cells,map,mask property */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args_map(np, "phandle-list",
"phandle-missing", 0, &args);
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
 
/* Check for bad phandle in list */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args_map(np, "phandle-list-bad-phandle",
"phandle", 0, &args);
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
 
/* Check for incorrectly formed argument list */
+   memset(&args, 0, sizeof(args));
rc = of_parse_phandle_with_args_map(np, "phandle-list-bad-args",
"phandle", 1, &args);
unittest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc);
@@ -780,7 +790,7 @@ static void __init of_unittest_parse_interrupts(void)
for (i = 0; i < 4; i++) {
bool passed = true;
 
-   args.args_count = 0;
+   memset(&args, 0, sizeof(args));
rc = of_irq_parse_one(np, i, &args);
 
passed &= !rc;
@@ -801,7 +811,7 @@ static void __init of_unittest_parse_interrupts(void)
for (i = 0; i < 4; i++) {
bool passed = true;
 
-   args.args_count = 0;
+   memset(&args, 0, sizeof(args));
rc = 

[PATCH v4 17/18] of: unittest: find overlays[] entry by name instead of index

2018-10-15 Thread frowand . list
From: Frank Rowand 

One accessor of overlays[] was using a hard coded index value to
find the correct array entry instead of searching for the entry
containing the correct name.

Signed-off-by: Frank Rowand 
---
 drivers/of/unittest.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 1c2bd8503095..785985bdbfa6 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -2178,7 +2178,7 @@ struct overlay_info {
 OVERLAY_INFO_EXTERN(overlay_bad_phandle);
 OVERLAY_INFO_EXTERN(overlay_bad_symbol);
 
-/* order of entries is hard-coded into users of overlays[] */
+/* entries found by name */
 static struct overlay_info overlays[] = {
OVERLAY_INFO(overlay_base, -9999),
OVERLAY_INFO(overlay, 0),
@@ -2201,7 +2201,8 @@ struct overlay_info {
OVERLAY_INFO(overlay_bad_add_dup_prop, -EINVAL),
OVERLAY_INFO(overlay_bad_phandle, -EINVAL),
OVERLAY_INFO(overlay_bad_symbol, -EINVAL),
-   {}
+   /* end marker */
+   {.dtb_begin = NULL, .dtb_end = NULL, .expected_result = 0, .name = NULL}
 };
 
 static struct device_node *overlay_base_root;
@@ -2231,6 +2232,19 @@ void __init unittest_unflatten_overlay_base(void)
u32 data_size;
void *new_fdt;
u32 size;
+   int found = 0;
+   const char *overlay_name = "overlay_base";
+
+   for (info = overlays; info && info->name; info++) {
+   if (!strcmp(overlay_name, info->name)) {
+   found = 1;
+   break;
+   }
+   }
+   if (!found) {
+   pr_err("no overlay data for %s\n", overlay_name);
+   return;
+   }
 
info = &overlays[0];
 
@@ -2278,11 +2292,10 @@ static int __init overlay_data_apply(const char 
*overlay_name, int *overlay_id)
 {
struct overlay_info *info;
int found = 0;
-   int k;
int ret;
u32 size;
 
-   for (k = 0, info = overlays; info && info->name; info++, k++) {
+   for (info = overlays; info && info->name; info++) {
if (!strcmp(overlay_name, info->name)) {
found = 1;
break;
-- 
Frank Rowand 



[PATCH v4 16/18] of: unittest: allow base devicetree to have symbol metadata

2018-10-15 Thread frowand . list
From: Frank Rowand 

The overlay metadata nodes in the FDT created from testcases.dts
are not handled properly.

The __fixups__ and __local_fixups__ nodes were added to the live
devicetree, but should not be.

Only the first property in the /__symbols__ node was added to the
live devicetree if the live devicetree already contained a
/__symbols__ node.  All of the node's properties must be added.

Signed-off-by: Frank Rowand 
---
 drivers/of/unittest.c | 43 +++
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 6d80f474c8f2..1c2bd8503095 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -1057,20 +1057,44 @@ static void __init of_unittest_platform_populate(void)
  * of np into dup node (present in live tree) and
  * updates parent of children of np to dup.
  *
- * @np:node already present in live tree
+ * @np:node whose properties are being added to the live tree
  * @dup:   node present in live tree to be updated
  */
 static void update_node_properties(struct device_node *np,
struct device_node *dup)
 {
struct property *prop;
+   struct property *save_next;
struct device_node *child;
-
-   for_each_property_of_node(np, prop)
-   of_add_property(dup, prop);
+   int ret;
 
for_each_child_of_node(np, child)
child->parent = dup;
+
+   /*
+* "unittest internal error: unable to add testdata property"
+*
+*If this message reports a property in node '/__symbols__' then
+*the respective unittest overlay contains a label that has the
+*same name as a label in the live devicetree.  The label will
+*be in the live devicetree only if the devicetree source was
+*compiled with the '-@' option.  If you encounter this error,
+*please consider renaming __all__ of the labels in the unittest
+*overlay dts files with an odd prefix that is unlikely to be
+*used in a real devicetree.
+*/
+
+   /*
+* open code for_each_property_of_node() because of_add_property()
+* sets prop->next to NULL
+*/
+   for (prop = np->properties; prop != NULL; prop = save_next) {
+   save_next = prop->next;
+   ret = of_add_property(dup, prop);
+   if (ret)
+   pr_err("unittest internal error: unable to add testdata 
property %pOF/%s",
+  np, prop->name);
+   }
 }
 
 /**
@@ -1079,18 +1103,23 @@ static void update_node_properties(struct device_node 
*np,
  *
  * @np:Node to attach to live tree
  */
-static int attach_node_and_children(struct device_node *np)
+static void attach_node_and_children(struct device_node *np)
 {
struct device_node *next, *dup, *child;
unsigned long flags;
const char *full_name;
 
full_name = kasprintf(GFP_KERNEL, "%pOF", np);
+
+   if (!strcmp(full_name, "/__local_fixups__") ||
+   !strcmp(full_name, "/__fixups__"))
+   return;
+
dup = of_find_node_by_path(full_name);
kfree(full_name);
if (dup) {
update_node_properties(np, dup);
-   return 0;
+   return;
}
 
child = np->child;
@@ -1111,8 +1140,6 @@ static int attach_node_and_children(struct device_node 
*np)
attach_node_and_children(child);
child = next;
}
-
-   return 0;
 }
 
 /**
-- 
Frank Rowand 



[PATCH v4 15/18] of: overlay: set node fields from properties when add new overlay node

2018-10-15 Thread frowand . list
From: Frank Rowand 

Overlay nodes added by add_changeset_node() do not have the node
fields name, phandle, and type set.

The node passed to __of_attach_node() when the add node changeset
entry is processed does not contain any properties.  The node's
properties are located in add property changeset entries that will
be processed after the add node changeset is applied.

Set the node's fields in the node contained in the add node
changeset entry and do not set them to incorrect values in
add_changeset_node().

A visible symptom that is fixed by this patch is the names of nodes
added by overlays that have an entry in /sys/bus/platform/drivers/*/
will contain the unit-address but the node-name will be <NULL>, for
example, "fc4ab000.<NULL>".  After applying the patch the name, in
this example, for node restart@fc4ab000 is "fc4ab000.restart".

Signed-off-by: Frank Rowand 
---
 drivers/of/dynamic.c | 27 ++-
 drivers/of/overlay.c | 29 -
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index a94f727ec3da..a9f5d5fb3f25 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -205,15 +205,24 @@ static void __of_attach_node(struct device_node *np)
const __be32 *phandle;
int sz;
 
-   np->name = __of_get_property(np, "name", NULL) ? : "";
-   np->type = __of_get_property(np, "device_type", NULL) ? : "";
-
-   phandle = __of_get_property(np, "phandle", &sz);
-   if (!phandle)
-   phandle = __of_get_property(np, "linux,phandle", &sz);
-   if (IS_ENABLED(CONFIG_PPC_PSERIES) && !phandle)
-   phandle = __of_get_property(np, "ibm,phandle", &sz);
-   np->phandle = (phandle && (sz >= 4)) ? be32_to_cpup(phandle) : 0;
+   if (!of_node_check_flag(np, OF_OVERLAY)) {
+   np->name = __of_get_property(np, "name", NULL);
+   np->type = __of_get_property(np, "device_type", NULL);
+   if (!np->name)
+   np->name = "";
+   if (!np->type)
+   np->type = "";
+
+   phandle = __of_get_property(np, "phandle", &sz);
+   if (!phandle)
+   phandle = __of_get_property(np, "linux,phandle", &sz);
+   if (IS_ENABLED(CONFIG_PPC_PSERIES) && !phandle)
+   phandle = __of_get_property(np, "ibm,phandle", &sz);
+   if (phandle && (sz >= 4))
+   np->phandle = be32_to_cpup(phandle);
+   else
+   np->phandle = 0;
+   }
 
np->child = NULL;
np->sibling = np->parent->child;
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 01afd22566ed..d011177e5aaa 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -307,10 +307,11 @@ static int add_changeset_property(struct 
overlay_changeset *ovcs,
int ret = 0;
bool check_for_non_overlay_node = false;
 
-   if (!of_prop_cmp(overlay_prop->name, "name") ||
-   !of_prop_cmp(overlay_prop->name, "phandle") ||
-   !of_prop_cmp(overlay_prop->name, "linux,phandle"))
-   return 0;
+   if (target->in_livetree)
+   if (!of_prop_cmp(overlay_prop->name, "name") ||
+   !of_prop_cmp(overlay_prop->name, "phandle") ||
+   !of_prop_cmp(overlay_prop->name, "linux,phandle"))
+   return 0;
 
if (target->in_livetree)
prop = of_find_property(target->np, overlay_prop->name, NULL);
@@ -331,6 +332,10 @@ static int add_changeset_property(struct overlay_changeset 
*ovcs,
if (!prop) {
 
check_for_non_overlay_node = true;
+   if (!target->in_livetree) {
+   new_prop->next = target->np->deadprops;
+   target->np->deadprops = new_prop;
+   }
ret = of_changeset_add_property(&ovcs->cset, target->np,
new_prop);
 
@@ -410,9 +415,10 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
struct target *target, struct device_node *node)
 {
const char *node_kbasename;
+   const __be32 *phandle;
struct device_node *tchild;
struct target target_child;
-   int ret = 0;
+   int ret = 0, size;
 
node_kbasename = kbasename(node->full_name);
 
@@ -426,6 +432,19 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
return -ENOMEM;
 
tchild->parent = target->np;
+   tchild->name = __of_get_property(node, "name", NULL);
+   tchild->type = __of_get_property(node, "device_type", NULL);
+
+   if (!tchild->name)
+   tchild->name = "";
+   if (!tchild->type)
+   tchild->type = "";
+
+   /* ignore obsolete "linux,phandle" */
+   phandle = 

[PATCH v4 14/18] of: unittest: remove unused of_unittest_apply_overlay() argument

2018-10-15 Thread frowand . list
From: Frank Rowand 

Argument unittest_nr is not used in of_unittest_apply_overlay(),
remove it.

Signed-off-by: Frank Rowand 
---
 drivers/of/unittest.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index efd9c947f192..6d80f474c8f2 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -1419,8 +1419,7 @@ static void of_unittest_destroy_tracked_overlays(void)
} while (defers > 0);
 }
 
-static int __init of_unittest_apply_overlay(int overlay_nr, int unittest_nr,
-   int *overlay_id)
+static int __init of_unittest_apply_overlay(int overlay_nr, int *overlay_id)
 {
const char *overlay_name;
 
@@ -1453,7 +1452,7 @@ static int __init of_unittest_apply_overlay_check(int 
overlay_nr,
}
 
ovcs_id = 0;
-   ret = of_unittest_apply_overlay(overlay_nr, unittest_nr, &ovcs_id);
+   ret = of_unittest_apply_overlay(overlay_nr, &ovcs_id);
if (ret != 0) {
/* of_unittest_apply_overlay already called unittest() */
return ret;
@@ -1489,7 +1488,7 @@ static int __init 
of_unittest_apply_revert_overlay_check(int overlay_nr,
 
/* apply the overlay */
ovcs_id = 0;
-   ret = of_unittest_apply_overlay(overlay_nr, unittest_nr, &ovcs_id);
+   ret = of_unittest_apply_overlay(overlay_nr, &ovcs_id);
if (ret != 0) {
/* of_unittest_apply_overlay already called unittest() */
return ret;
-- 
Frank Rowand 



[PATCH v4 13/18] of: overlay: check prevents multiple fragments touching same property

2018-10-15 Thread frowand . list
From: Frank Rowand 

Add test case of two fragments updating the same property.  After
adding the test case, the system hangs at end of boot, after
slub stack dumps from kfree() in crypto modprobe code.

Multiple overlay fragments adding, modifying, or deleting the same
property is not supported.  Add check to detect the attempt and fail
the overlay apply.

Before this patch, the first fragment error would terminate
processing.  Allow fragment checking to proceed and report all
of the fragment errors before terminating the overlay apply. This
is not a hot path, thus not a performance issue (the error is not
transient and requires fixing the overlay before attempting to
apply it again).

After applying this patch, the devicetree unittest messages will
include:

   OF: overlay: ERROR: multiple fragments add, update, and/or delete property 
/testcase-data-2/substation@100/motor-1/rpm_avail

   ...

   ### dt-test ### end of unittest - 212 passed, 0 failed

The check to detect two fragments updating the same property is
folded into the patch that created the test case to maintain
bisectability.

Signed-off-by: Frank Rowand 
---
Changes since v3:
  - Update patch comment header to state that this patch modifies the
previous patch to not return immediately on fragment error and
explain this is not a performance issue.
  - remove redundant "overlay" from two error messages.  "OF: overlay:"
is already present in pr_fmt()

 drivers/of/overlay.c   | 118 ++---
 drivers/of/unittest-data/Makefile  |   1 +
 .../of/unittest-data/overlay_bad_add_dup_prop.dts  |  24 +
 drivers/of/unittest-data/overlay_base.dts  |   1 +
 drivers/of/unittest.c  |   5 +
 5 files changed, 112 insertions(+), 37 deletions(-)
 create mode 100644 drivers/of/unittest-data/overlay_bad_add_dup_prop.dts

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 0dc7452e2ed5..01afd22566ed 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -518,52 +518,96 @@ static int build_changeset_symbols_node(struct 
overlay_changeset *ovcs,
return 0;
 }
 
+static int find_dup_cset_node_entry(struct overlay_changeset *ovcs,
+   struct of_changeset_entry *ce_1)
+{
+   struct of_changeset_entry *ce_2;
+   char *fn_1, *fn_2;
+   int node_path_match;
+
+   if (ce_1->action != OF_RECONFIG_ATTACH_NODE &&
+   ce_1->action != OF_RECONFIG_DETACH_NODE)
+   return 0;
+
+   ce_2 = ce_1;
+   list_for_each_entry_continue(ce_2, &ovcs->cset.entries, node) {
+   if ((ce_2->action != OF_RECONFIG_ATTACH_NODE &&
+ce_2->action != OF_RECONFIG_DETACH_NODE) ||
+   of_node_cmp(ce_1->np->full_name, ce_2->np->full_name))
+   continue;
+
+   fn_1 = kasprintf(GFP_KERNEL, "%pOF", ce_1->np);
+   fn_2 = kasprintf(GFP_KERNEL, "%pOF", ce_2->np);
+   node_path_match = !strcmp(fn_1, fn_2);
+   kfree(fn_1);
+   kfree(fn_2);
+   if (node_path_match) {
+   pr_err("ERROR: multiple fragments add and/or delete 
node %pOF\n",
+  ce_1->np);
+   return -EINVAL;
+   }
+   }
+
+   return 0;
+}
+
+static int find_dup_cset_prop(struct overlay_changeset *ovcs,
+   struct of_changeset_entry *ce_1)
+{
+   struct of_changeset_entry *ce_2;
+   char *fn_1, *fn_2;
+   int node_path_match;
+
+   if (ce_1->action != OF_RECONFIG_ADD_PROPERTY &&
+   ce_1->action != OF_RECONFIG_REMOVE_PROPERTY &&
+   ce_1->action != OF_RECONFIG_UPDATE_PROPERTY)
+   return 0;
+
+   ce_2 = ce_1;
+   list_for_each_entry_continue(ce_2, &ovcs->cset.entries, node) {
+   if ((ce_2->action != OF_RECONFIG_ADD_PROPERTY &&
+ce_2->action != OF_RECONFIG_REMOVE_PROPERTY &&
+ce_2->action != OF_RECONFIG_UPDATE_PROPERTY) ||
+   of_node_cmp(ce_1->np->full_name, ce_2->np->full_name))
+   continue;
+
+   fn_1 = kasprintf(GFP_KERNEL, "%pOF", ce_1->np);
+   fn_2 = kasprintf(GFP_KERNEL, "%pOF", ce_2->np);
+   node_path_match = !strcmp(fn_1, fn_2);
+   kfree(fn_1);
+   kfree(fn_2);
+   if (node_path_match &&
+   !of_prop_cmp(ce_1->prop->name, ce_2->prop->name)) {
+   pr_err("ERROR: multiple fragments add, update, and/or 
delete property %pOF/%s\n",
+  ce_1->np, ce_1->prop->name);
+   return -EINVAL;
+   }
+   }
+
+   return 0;
+}
+
 /**
- * check_changeset_dup_add_node() - changeset validation: duplicate add node
+ * changeset_dup_entry_check() - check for duplicate entries
  * @ovcs:  Overlay changeset
  *
- * Check changeset 

[PATCH v4 12/18] of: overlay: check prevents multiple fragments add or delete same node

2018-10-15 Thread frowand . list
From: Frank Rowand 

Multiple overlay fragments adding or deleting the same node is not
supported.  Replace code comment of such, with check to detect the
attempt and fail the overlay apply.

Devicetree unittest where multiple fragments added the same node was
added in the previous patch in the series.  After applying this patch
the unittest messages will no longer include:

   Duplicate name in motor-1, renamed to "controller#1"
   OF: overlay: of_overlay_apply() err=0
   ### dt-test ### of_overlay_fdt_apply() expected -22, ret=0, 
overlay_bad_add_dup_node
   ### dt-test ### FAIL of_unittest_overlay_high_level():2419 Adding overlay 
'overlay_bad_add_dup_node' failed

   ...

   ### dt-test ### end of unittest - 210 passed, 1 failed

but will instead include:

   OF: overlay: ERROR: multiple overlay fragments add and/or delete node 
/testcase-data-2/substation@100/motor-1/controller

   ...

   ### dt-test ### end of unittest - 211 passed, 0 failed

Signed-off-by: Frank Rowand 
---

checkpatch errors "line over 80 characters" are ok, they will be
fixed later in this series

 drivers/of/overlay.c | 58 
 1 file changed, 49 insertions(+), 9 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 34396d6db1b3..0dc7452e2ed5 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -402,14 +402,6 @@ static int add_changeset_property(struct overlay_changeset 
*ovcs,
  *   a live devicetree created from Open Firmware.
  *
  * NOTE_2: Multiple mods of created nodes not supported.
- *   If more than one fragment contains a node that does not already exist
- *   in the live tree, then for each fragment of_changeset_attach_node()
- *   will add a changeset entry to add the node.  When the changeset is
- *   applied, __of_attach_node() will attach the node twice (once for
- *   each fragment).  At this point the device tree will be corrupted.
- *
- *   TODO: add integrity check to ensure that multiple fragments do not
- * create the same node.
  *
  * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
  * invalid @overlay.
@@ -527,6 +519,54 @@ static int build_changeset_symbols_node(struct 
overlay_changeset *ovcs,
 }
 
 /**
+ * check_changeset_dup_add_node() - changeset validation: duplicate add node
+ * @ovcs:  Overlay changeset
+ *
+ * Check changeset @ovcs->cset for multiple add node entries for the same
+ * node.
+ *
+ * Returns 0 on success, -ENOMEM if memory allocation failure, or -EINVAL if
+ * invalid overlay in @ovcs->fragments[].
+ */
+static int check_changeset_dup_add_node(struct overlay_changeset *ovcs)
+{
+   struct of_changeset_entry *ce_1, *ce_2;
+   char *fn_1, *fn_2;
+   int name_match;
+
+   list_for_each_entry(ce_1, &ovcs->cset.entries, node) {
+
+   if (ce_1->action == OF_RECONFIG_ATTACH_NODE ||
+   ce_1->action == OF_RECONFIG_DETACH_NODE) {
+
+   ce_2 = ce_1;
+   list_for_each_entry_continue(ce_2, &ovcs->cset.entries, 
node) {
+   if (ce_2->action == OF_RECONFIG_ATTACH_NODE ||
+   ce_2->action == OF_RECONFIG_DETACH_NODE) {
+   /* inexpensive name compare */
+   if (!of_node_cmp(ce_1->np->full_name,
+   ce_2->np->full_name)) {
+   /* expensive full path name 
compare */
+   fn_1 = kasprintf(GFP_KERNEL, 
"%pOF", ce_1->np);
+   fn_2 = kasprintf(GFP_KERNEL, 
"%pOF", ce_2->np);
+   name_match = !strcmp(fn_1, 
fn_2);
+   kfree(fn_1);
+   kfree(fn_2);
+   if (name_match) {
+   pr_err("ERROR: multiple 
overlay fragments add and/or delete node %pOF\n",
+  ce_1->np);
+   return -EINVAL;
+   }
+   }
+   }
+   }
+   }
+   }
+
+   return 0;
+}
+
+/**
  * build_changeset() - populate overlay changeset in @ovcs from 
@ovcs->fragments
  * @ovcs:  Overlay changeset
  *
@@ -581,7 +621,7 @@ static int build_changeset(struct overlay_changeset *ovcs)
}
}
 
-   return 0;
+   return check_changeset_dup_add_node(ovcs);
 }
 
 /*
-- 
Frank Rowand 



[PATCH v4 11/18] of: overlay: test case of two fragments adding same node

2018-10-15 Thread frowand . list
From: Frank Rowand 

Multiple overlay fragments adding or deleting the same node is not
supported.  An attempt to do so results in an incorrect devicetree.
The node name will be munged for the second add.

After adding this patch, the unittest messages will show:

   Duplicate name in motor-1, renamed to "controller#1"
   OF: overlay: of_overlay_apply() err=0
   ### dt-test ### of_overlay_fdt_apply() expected -22, ret=0, 
overlay_bad_add_dup_node
   ### dt-test ### FAIL of_unittest_overlay_high_level():2419 Adding overlay 
'overlay_bad_add_dup_node' failed

   ...

   ### dt-test ### end of unittest - 210 passed, 1 failed

The incorrect (munged) node name "controller#1" can be seen in the
/proc filesystem:

   $ pwd
   /proc/device-tree/testcase-data-2/substation@100/motor-1
   $ ls
   compatible  controller  controller#1  name  phandle  spin
   $ ls controller
   power_bus
   $ ls controller#1
   power_bus_emergency

Signed-off-by: Frank Rowand 
---
 drivers/of/unittest-data/Makefile  |  1 +
 .../of/unittest-data/overlay_bad_add_dup_node.dts  | 28 ++
 drivers/of/unittest.c  |  5 
 3 files changed, 34 insertions(+)
 create mode 100644 drivers/of/unittest-data/overlay_bad_add_dup_node.dts

diff --git a/drivers/of/unittest-data/Makefile 
b/drivers/of/unittest-data/Makefile
index 013d85e694c6..166dbdbfd1c5 100644
--- a/drivers/of/unittest-data/Makefile
+++ b/drivers/of/unittest-data/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_OF_OVERLAY) += overlay.dtb.o \
overlay_12.dtb.o \
overlay_13.dtb.o \
overlay_15.dtb.o \
+   overlay_bad_add_dup_node.dtb.o \
overlay_bad_phandle.dtb.o \
overlay_bad_symbol.dtb.o \
overlay_base.dtb.o
diff --git a/drivers/of/unittest-data/overlay_bad_add_dup_node.dts 
b/drivers/of/unittest-data/overlay_bad_add_dup_node.dts
new file mode 100644
index 000000000000..145dfc3b1024
--- /dev/null
+++ b/drivers/of/unittest-data/overlay_bad_add_dup_node.dts
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/dts-v1/;
+/plugin/;
+
+/*
+ * &electric_1/motor-1 and &spin_ctrl_1 are the same node:
+ *   /testcase-data-2/substation@100/motor-1
+ *
+ * Thus the new node "controller" in each fragment will
+ * result in an attempt to add the same node twice.
+ * This will result in an error and the overlay apply
+ * will fail.
+ */
+
+&electric_1 {
+
+   motor-1 {
+   controller {
+   power_bus = < 0x1 0x2 >;
+   };
+   };
+};
+
+&spin_ctrl_1 {
+   controller {
+   power_bus_emergency = < 0x101 0x102 >;
+   };
+};
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 722537e14848..471b8eb6e842 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -2147,6 +2147,7 @@ struct overlay_info {
 OVERLAY_INFO_EXTERN(overlay_12);
 OVERLAY_INFO_EXTERN(overlay_13);
 OVERLAY_INFO_EXTERN(overlay_15);
+OVERLAY_INFO_EXTERN(overlay_bad_add_dup_node);
 OVERLAY_INFO_EXTERN(overlay_bad_phandle);
 OVERLAY_INFO_EXTERN(overlay_bad_symbol);
 
@@ -2169,6 +2170,7 @@ struct overlay_info {
OVERLAY_INFO(overlay_12, 0),
OVERLAY_INFO(overlay_13, 0),
OVERLAY_INFO(overlay_15, 0),
+   OVERLAY_INFO(overlay_bad_add_dup_node, -EINVAL),
OVERLAY_INFO(overlay_bad_phandle, -EINVAL),
OVERLAY_INFO(overlay_bad_symbol, -EINVAL),
{}
@@ -2413,6 +2415,9 @@ static __init void of_unittest_overlay_high_level(void)
unittest(overlay_data_apply("overlay", NULL),
 "Adding overlay 'overlay' failed\n");
 
+   unittest(overlay_data_apply("overlay_bad_add_dup_node", NULL),
+"Adding overlay 'overlay_bad_add_dup_node' failed\n");
+
unittest(overlay_data_apply("overlay_bad_phandle", NULL),
 "Adding overlay 'overlay_bad_phandle' failed\n");
 
-- 
Frank Rowand 



[PATCH v4 10/18] of: overlay: make all pr_debug() and pr_err() messages unique

2018-10-15 Thread frowand . list
From: Frank Rowand 

Make overlay.c debug and error messages unique so that they can be
unambiguously found by grep.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 3e1e519c12f0..34396d6db1b3 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -517,7 +517,7 @@ static int build_changeset_symbols_node(struct 
overlay_changeset *ovcs,
for_each_property_of_node(overlay_symbols_node, prop) {
ret = add_changeset_property(ovcs, target, prop, 1);
if (ret) {
-   pr_debug("Failed to apply prop @%pOF/%s, err=%d\n",
+   pr_debug("Failed to apply symbols prop @%pOF/%s, 
err=%d\n",
 target->np, prop->name, ret);
return ret;
}
@@ -561,7 +561,8 @@ static int build_changeset(struct overlay_changeset *ovcs)
ret = build_changeset_next_level(ovcs, &target,
 fragment->overlay);
if (ret) {
-   pr_debug("apply failed '%pOF'\n", fragment->target);
+   pr_debug("fragment apply failed '%pOF'\n",
+fragment->target);
return ret;
}
}
@@ -574,7 +575,8 @@ static int build_changeset(struct overlay_changeset *ovcs)
ret = build_changeset_symbols_node(ovcs, &target,
   fragment->overlay);
if (ret) {
-   pr_debug("apply failed '%pOF'\n", fragment->target);
+   pr_debug("symbols fragment apply failed '%pOF'\n",
+fragment->target);
return ret;
}
}
@@ -883,7 +885,7 @@ static int of_overlay_apply(const void *fdt, struct 
device_node *tree,
 
ret = __of_changeset_apply_notify(&ovcs->cset);
if (ret)
-   pr_err("overlay changeset entry notify error %d\n", ret);
+   pr_err("overlay apply changeset entry notify error %d\n", ret);
/* notify failure is not fatal, continue */
 
list_add_tail(&ovcs->ovcs_list, &ovcs_list);
@@ -1142,7 +1144,7 @@ int of_overlay_remove(int *ovcs_id)
 
ret = __of_changeset_revert_notify(&ovcs->cset);
if (ret)
-   pr_err("overlay changeset entry notify error %d\n", ret);
+   pr_err("overlay remove changeset entry notify error %d\n", ret);
/* notify failure is not fatal, continue */
 
*ovcs_id = 0;
-- 
Frank Rowand 



[PATCH v4 09/18] of: overlay: validate overlay properties #address-cells and #size-cells

2018-10-15 Thread frowand . list
From: Frank Rowand 

If overlay properties #address-cells or #size-cells are already in
the live devicetree for any given node, then the values in the
overlay must match the values in the live tree.

If the properties are already in the live tree then there is no
need to create a changeset entry to add them since they must
have the same value.  This reduces the memory used by the
changeset and eliminates a possible memory leak.  This is
verified by 12 fewer warnings during the devicetree unittest,
as the possible memory leak warnings about #address-cells and
#size-cells no longer occur.

Signed-off-by: Frank Rowand 
---
Changes since v3:
  - for errors of an overlay changing the value of #size-cells or
#address-cells, return -EINVAL so that overlay apply will fail
  - for errors of an overlay changing the value of #size-cells or
#address-cells, make the message more direct.
Old message:
  OF: overlay: ERROR: overlay and/or live tree #size-cells invalid in node 
/soc/base_fpga_region
New message:
  OF: overlay: ERROR: changing value of /soc/base_fpga_region/#size-cells 
not allowed

 drivers/of/overlay.c | 42 +++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 272a0d1a5e18..3e1e519c12f0 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -287,7 +287,12 @@ static struct property *dup_and_fixup_symbol_prop(
  * @target may be either in the live devicetree or in a new subtree that
  * is contained in the changeset.
  *
- * Some special properties are not updated (no error returned).
+ * Some special properties are not added or updated (no error returned):
+ * "name", "phandle", "linux,phandle".
+ *
+ * Properties "#address-cells" and "#size-cells" are not updated if they
+ * are already in the live tree, but if present in the live tree, the values
+ * in the overlay must match the values in the live tree.
  *
  * Update of property in symbols node is not allowed.
  *
@@ -300,6 +305,7 @@ static int add_changeset_property(struct overlay_changeset 
*ovcs,
 {
struct property *new_prop = NULL, *prop;
int ret = 0;
+   bool check_for_non_overlay_node = false;
 
if (!of_prop_cmp(overlay_prop->name, "name") ||
!of_prop_cmp(overlay_prop->name, "phandle") ||
@@ -322,13 +328,43 @@ static int add_changeset_property(struct 
overlay_changeset *ovcs,
if (!new_prop)
return -ENOMEM;
 
-   if (!prop)
+   if (!prop) {
+
+   check_for_non_overlay_node = true;
ret = of_changeset_add_property(&ovcs->cset, target->np,
new_prop);
-   else
+
+   } else if (!of_prop_cmp(prop->name, "#address-cells")) {
+
+   if (prop->length != 4 || new_prop->length != 4 ||
+   *(u32 *)prop->value != *(u32 *)new_prop->value) {
+   pr_err("ERROR: changing value of %pOF/#address-cells is 
not allowed\n",
+  target->np);
+   ret = -EINVAL;
+   }
+
+   } else if (!of_prop_cmp(prop->name, "#size-cells")) {
+
+   if (prop->length != 4 || new_prop->length != 4 ||
+   *(u32 *)prop->value != *(u32 *)new_prop->value) {
+   pr_err("ERROR: changing value of %pOF/#size-cells is 
not allowed\n",
+  target->np);
+   ret = -EINVAL;
+   }
+
+   } else {
+
+   check_for_non_overlay_node = true;
ret = of_changeset_update_property(&ovcs->cset, target->np,
   new_prop);
 
+   }
+
+   if (check_for_non_overlay_node &&
+   !of_node_check_flag(target->np, OF_OVERLAY))
+   pr_err("WARNING: %s(), memory leak will occur if overlay 
removed.  Property: %pOF/%s\n",
+  __func__, target->np, new_prop->name);
+
if (ret) {
kfree(new_prop->name);
kfree(new_prop->value);
-- 
Frank Rowand 



[PATCH v4 08/18] of: overlay: reorder fields in struct fragment

2018-10-15 Thread frowand . list
From: Frank Rowand 

Order the fields of struct fragment in the same order as
struct of_overlay_notify_data.  The order in struct fragment is
not significant.  If both structs are ordered the same then when
examining the data in a debugger or dump the human involved does
not have to remember which context they are examining.

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 7fcf4a812d06..272a0d1a5e18 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -49,8 +49,8 @@ struct target {
  * @overlay:   pointer to the __overlay__ node
  */
 struct fragment {
-   struct device_node *target;
struct device_node *overlay;
+   struct device_node *target;
 };
 
 /**
-- 
Frank Rowand 



[PATCH v4 07/18] of: dynamic: change type of of_{at,de}tach_node() to void

2018-10-15 Thread frowand . list
From: Frank Rowand 

of_attach_node() and of_detach_node() always return zero, so
their return value is meaningless.  Change their type to void
and fix all callers to ignore return value.

Signed-off-by: Frank Rowand 
---
 arch/powerpc/platforms/pseries/dlpar.c| 13 ++---
 arch/powerpc/platforms/pseries/reconfig.c |  6 +-
 drivers/of/dynamic.c  |  9 ++---
 include/linux/of.h|  4 ++--
 4 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
b/arch/powerpc/platforms/pseries/dlpar.c
index e3010b14aea5..0027eea94a8b 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -244,15 +244,9 @@ struct device_node *dlpar_configure_connector(__be32 
drc_index,
 
 int dlpar_attach_node(struct device_node *dn, struct device_node *parent)
 {
-   int rc;
-
dn->parent = parent;
 
-   rc = of_attach_node(dn);
-   if (rc) {
-   printk(KERN_ERR "Failed to add device node %pOF\n", dn);
-   return rc;
-   }
+   of_attach_node(dn);
 
return 0;
 }
@@ -260,7 +254,6 @@ int dlpar_attach_node(struct device_node *dn, struct 
device_node *parent)
 int dlpar_detach_node(struct device_node *dn)
 {
struct device_node *child;
-   int rc;
 
child = of_get_next_child(dn, NULL);
while (child) {
@@ -268,9 +261,7 @@ int dlpar_detach_node(struct device_node *dn)
child = of_get_next_child(dn, child);
}
 
-   rc = of_detach_node(dn);
-   if (rc)
-   return rc;
+   of_detach_node(dn);
 
of_node_put(dn);
 
diff --git a/arch/powerpc/platforms/pseries/reconfig.c 
b/arch/powerpc/platforms/pseries/reconfig.c
index 0e0208117e77..0b72098da454 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -47,11 +47,7 @@ static int pSeries_reconfig_add_node(const char *path, 
struct property *proplist
goto out_err;
}
 
-   err = of_attach_node(np);
-   if (err) {
-   printk(KERN_ERR "Failed to add device node %s\n", path);
-   goto out_err;
-   }
+   of_attach_node(np);
 
of_node_put(np->parent);
 
diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index fe8816cca99b..a94f727ec3da 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -224,7 +224,7 @@ static void __of_attach_node(struct device_node *np)
 /**
  * of_attach_node() - Plug a device node into the tree and global list.
  */
-int of_attach_node(struct device_node *np)
+void of_attach_node(struct device_node *np)
 {
struct of_reconfig_data rd;
unsigned long flags;
@@ -241,8 +241,6 @@ int of_attach_node(struct device_node *np)
mutex_unlock(_mutex);
 
of_reconfig_notify(OF_RECONFIG_ATTACH_NODE, );
-
-   return 0;
 }
 
 void __of_detach_node(struct device_node *np)
@@ -273,11 +271,10 @@ void __of_detach_node(struct device_node *np)
 /**
  * of_detach_node() - "Unplug" a node from the device tree.
  */
-int of_detach_node(struct device_node *np)
+void of_detach_node(struct device_node *np)
 {
struct of_reconfig_data rd;
unsigned long flags;
-   int rc = 0;
 
memset(, 0, sizeof(rd));
rd.dn = np;
@@ -291,8 +288,6 @@ int of_detach_node(struct device_node *np)
mutex_unlock(_mutex);
 
of_reconfig_notify(OF_RECONFIG_DETACH_NODE, );
-
-   return rc;
 }
 EXPORT_SYMBOL_GPL(of_detach_node);
 
diff --git a/include/linux/of.h b/include/linux/of.h
index aa1dafaec6ae..72c593455019 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -406,8 +406,8 @@ extern int of_phandle_iterator_args(struct 
of_phandle_iterator *it,
 #define OF_RECONFIG_REMOVE_PROPERTY0x0004
 #define OF_RECONFIG_UPDATE_PROPERTY0x0005
 
-extern int of_attach_node(struct device_node *);
-extern int of_detach_node(struct device_node *);
+extern void of_attach_node(struct device_node *np);
+extern void of_detach_node(struct device_node *np);
 
 #define of_match_ptr(_ptr) (_ptr)
 
-- 
Frank Rowand 



[PATCH v4 06/18] of: overlay: do not duplicate properties from overlay for new nodes

2018-10-15 Thread frowand . list
From: Frank Rowand 

When allocating a new node, add_changeset_node() was duplicating the
properties from the respective node in the overlay instead of
allocating a node with no properties.

When this patch is applied the errors reported by the devicetree
unittest from patch "of: overlay: add tests to validate kfrees from
overlay removal" will no longer occur.  These error messages are of
the form:

   "OF: ERROR: ..."

and the unittest results will change from:

   ### dt-test ### end of unittest - 203 passed, 7 failed

to

   ### dt-test ### end of unittest - 210 passed, 0 failed

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 94740f4ee34c..7fcf4a812d06 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -393,7 +393,7 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
break;
 
if (!tchild) {
-   tchild = __of_node_dup(node, node_kbasename);
+   tchild = __of_node_dup(NULL, node_kbasename);
if (!tchild)
return -ENOMEM;
 
-- 
Frank Rowand 



[PATCH v4 05/18] of: overlay: use prop add changeset entry for property in new nodes

2018-10-15 Thread frowand . list
From: Frank Rowand 

The changeset entry 'update property' was used for new properties in
an overlay instead of 'add property'.

The decision of whether to use 'update property' was based on whether
the property already exists in the subtree where the node is being
spliced into.  At the top level of creating a changeset describing the
overlay, the target node is in the live devicetree, so checking whether
the property exists in the target node returns the correct result.
As soon as the changeset creation algorithm recurses into a new node,
the target is no longer in the live devicetree, but is instead in the
detached overlay tree, thus all properties are incorrectly found to
already exist in the target.

This fix will expose another devicetree bug that will be fixed
in the following patch in the series.

When this patch is applied the errors reported by the devicetree
unittest will change, and the unittest results will change from:

   ### dt-test ### end of unittest - 210 passed, 0 failed

to

   ### dt-test ### end of unittest - 203 passed, 7 failed

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 112 ++-
 1 file changed, 74 insertions(+), 38 deletions(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 32cfee68f2e3..94740f4ee34c 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -24,6 +24,26 @@
 #include "of_private.h"
 
 /**
+ * struct target - info about current target node as recursing through overlay
+ * @np:node where current level of overlay will be 
applied
+ * @in_livetree:   @np is a node in the live devicetree
+ *
+ * Used in the algorithm to create the portion of a changeset that describes
+ * an overlay fragment, which is a devicetree subtree.  Initially @np is a node
+ * in the live devicetree where the overlay subtree is targeted to be grafted
+ * into.  When recursing to the next level of the overlay subtree, the target
+ * also recurses to the next level of the live devicetree, as long as overlay
+ * subtree node also exists in the live devicetree.  When a node in the overlay
+ * subtree does not exist at the same level in the live devicetree, target->np
+ * points to a newly allocated node, and all subsequent targets in the subtree
+ * will be newly allocated nodes.
+ */
+struct target {
+   struct device_node *np;
+   bool in_livetree;
+};
+
+/**
  * struct fragment - info about fragment nodes in overlay expanded device tree
  * @target:target of the overlay operation
  * @overlay:   pointer to the __overlay__ node
@@ -72,8 +92,7 @@ static int devicetree_corrupt(void)
 }
 
 static int build_changeset_next_level(struct overlay_changeset *ovcs,
-   struct device_node *target_node,
-   const struct device_node *overlay_node);
+   struct target *target, const struct device_node *overlay_node);
 
 /*
  * of_resolve_phandles() finds the largest phandle in the live tree.
@@ -257,14 +276,17 @@ static struct property *dup_and_fixup_symbol_prop(
 /**
  * add_changeset_property() - add @overlay_prop to overlay changeset
  * @ovcs:  overlay changeset
- * @target_node:   where to place @overlay_prop in live tree
+ * @target:where @overlay_prop will be placed
  * @overlay_prop:  property to add or update, from overlay tree
  * @is_symbols_prop:   1 if @overlay_prop is from node "/__symbols__"
  *
- * If @overlay_prop does not already exist in @target_node, add changeset entry
- * to add @overlay_prop in @target_node, else add changeset entry to update
+ * If @overlay_prop does not already exist in live devicetree, add changeset
+ * entry to add @overlay_prop in @target, else add changeset entry to update
  * value of @overlay_prop.
  *
+ * @target may be either in the live devicetree or in a new subtree that
+ * is contained in the changeset.
+ *
  * Some special properties are not updated (no error returned).
  *
  * Update of property in symbols node is not allowed.
@@ -273,20 +295,22 @@ static struct property *dup_and_fixup_symbol_prop(
  * invalid @overlay.
  */
 static int add_changeset_property(struct overlay_changeset *ovcs,
-   struct device_node *target_node,
-   struct property *overlay_prop,
+   struct target *target, struct property *overlay_prop,
bool is_symbols_prop)
 {
struct property *new_prop = NULL, *prop;
int ret = 0;
 
-   prop = of_find_property(target_node, overlay_prop->name, NULL);
-
if (!of_prop_cmp(overlay_prop->name, "name") ||
!of_prop_cmp(overlay_prop->name, "phandle") ||
!of_prop_cmp(overlay_prop->name, "linux,phandle"))
return 0;
 
+   if (target->in_livetree)
+   prop = of_find_property(target->np, overlay_prop->name, NULL);
+   else
+   prop = NULL;
+
if (is_symbols_prop) {
if (prop)
  

[PATCH v4 04/18] powerpc/pseries: add of_node_put() in dlpar_detach_node()

2018-10-15 Thread frowand . list
From: Frank Rowand 

"of: overlay: add missing of_node_get() in __of_attach_node_sysfs"
added a missing of_node_get() to __of_attach_node_sysfs().  This
results in a refcount imbalance for nodes attached with
dlpar_attach_node().  The calling sequence from dlpar_attach_node()
to __of_attach_node_sysfs() is:

   dlpar_attach_node()
  of_attach_node()
 __of_attach_node_sysfs()

Signed-off-by: Frank Rowand 
---

* UNTESTED.  I need people with the affected PowerPC systems
*(systems that dynamically allocate and deallocate
*devicetree nodes) to test this patch.

 arch/powerpc/platforms/pseries/dlpar.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
b/arch/powerpc/platforms/pseries/dlpar.c
index a0b20c03f078..e3010b14aea5 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -272,6 +272,8 @@ int dlpar_detach_node(struct device_node *dn)
if (rc)
return rc;
 
+   of_node_put(dn);
+
return 0;
 }
 
-- 
Frank Rowand 



[PATCH v4 03/18] of: overlay: add missing of_node_get() in __of_attach_node_sysfs

2018-10-15 Thread frowand . list
From: Frank Rowand 

There is a matching of_node_put() in __of_detach_node_sysfs()

Remove misleading comment from function header comment for
of_detach_node().

This patch may result in memory leaks from code that calls
the dynamic node add and delete functions directly instead of
using changesets.

Signed-off-by: Frank Rowand 
---

This patch should result in powerpc systems that dynamically
allocate a node, then later deallocate the node to have a
memory leak when the node is deallocated.

The next patch in the series will fix the leak.

 drivers/of/dynamic.c | 3 ---
 drivers/of/kobj.c| 4 +++-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index 24c97b7a050f..fe8816cca99b 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -272,9 +272,6 @@ void __of_detach_node(struct device_node *np)
 
 /**
  * of_detach_node() - "Unplug" a node from the device tree.
- *
- * The caller must hold a reference to the node.  The memory associated with
- * the node is not freed until its refcount goes to zero.
  */
 int of_detach_node(struct device_node *np)
 {
diff --git a/drivers/of/kobj.c b/drivers/of/kobj.c
index 7a0a18980b98..c72eef988041 100644
--- a/drivers/of/kobj.c
+++ b/drivers/of/kobj.c
@@ -133,6 +133,9 @@ int __of_attach_node_sysfs(struct device_node *np)
}
if (!name)
return -ENOMEM;
+
+   of_node_get(np);
+
rc = kobject_add(>kobj, parent, "%s", name);
kfree(name);
if (rc)
@@ -159,6 +162,5 @@ void __of_detach_node_sysfs(struct device_node *np)
kobject_del(>kobj);
}
 
-   /* finally remove the kobj_init ref */
of_node_put(np);
 }
-- 
Frank Rowand 



[PATCH v4 02/18] of: overlay: add missing of_node_put() after add new node to changeset

2018-10-15 Thread frowand . list
From: Frank Rowand 

The refcount of a newly added overlay node decrements to one
(instead of zero) when the overlay changeset is destroyed.  This
change will cause the final decrement to be zero.

After applying this patch, new validation warnings will be
reported from the devicetree unittest during boot due to
a pre-existing devicetree bug.  The warnings will be similar to:

  OF: ERROR: memory leak of_node_release() overlay node 
/testcase-data/overlay-node/test-bus/test-unittest4 before free overlay 
changeset

This pre-existing devicetree bug will also trigger a WARN_ONCE() from
refcount_sub_and_test_checked() when an overlay changeset is
destroyed without having first been applied.  This scenario occurs
when an error in the overlay is detected during the overlay changeset
creation:

  WARNING: CPU: 0 PID: 1 at lib/refcount.c:187 
refcount_sub_and_test_checked+0xa8/0xbc
  refcount_t: underflow; use-after-free.

  (unwind_backtrace) from (show_stack+0x10/0x14)
  (show_stack) from (dump_stack+0x6c/0x8c)
  (dump_stack) from (__warn+0xdc/0x104)
  (__warn) from (warn_slowpath_fmt+0x44/0x6c)
  (warn_slowpath_fmt) from (refcount_sub_and_test_checked+0xa8/0xbc)
  (refcount_sub_and_test_checked) from (kobject_put+0x24/0x208)
  (kobject_put) from (of_changeset_destroy+0x2c/0xb4)
  (of_changeset_destroy) from (free_overlay_changeset+0x1c/0x9c)
  (free_overlay_changeset) from (of_overlay_remove+0x284/0x2cc)
  (of_overlay_remove) from 
(of_unittest_apply_revert_overlay_check.constprop.4+0xf8/0x1e8)
  (of_unittest_apply_revert_overlay_check.constprop.4) from 
(of_unittest_overlay+0x960/0xed8)
  (of_unittest_overlay) from (of_unittest+0x1cc4/0x2138)
  (of_unittest) from (do_one_initcall+0x4c/0x28c)
  (do_one_initcall) from (kernel_init_freeable+0x29c/0x378)
  (kernel_init_freeable) from (kernel_init+0x8/0x110)
  (kernel_init) from (ret_from_fork+0x14/0x2c)

Signed-off-by: Frank Rowand 
---
 drivers/of/overlay.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index 1176cb4b6e4e..32cfee68f2e3 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -379,7 +379,9 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
if (ret)
return ret;
 
-   return build_changeset_next_level(ovcs, tchild, node);
+   ret = build_changeset_next_level(ovcs, tchild, node);
+   of_node_put(tchild);
+   return ret;
}
 
if (node->phandle && tchild->phandle)
-- 
Frank Rowand 



[PATCH v4 01/18] of: overlay: add tests to validate kfrees from overlay removal

2018-10-15 Thread frowand . list
From: Frank Rowand 

Add checks:
  - attempted kfree due to refcount reaching zero before overlay
is removed
  - properties linked to an overlay node when the node is removed
  - node refcount > one during node removal in a changeset destroy,
if the node was created by the changeset

After applying this patch, several validation warnings will be
reported from the devicetree unittest during boot due to
pre-existing devicetree bugs. The warnings will be similar to:

  OF: ERROR: of_node_release() overlay node 
/testcase-data/overlay-node/test-bus/test-unittest11/test-unittest111 contains 
unexpected properties
  OF: ERROR: memory leak - destroy cset entry: attach overlay node 
/testcase-data-2/substation@100/hvac-medium-2 expected refcount 1 instead of 2. 
 of_node_get() / of_node_put() are unbalanced for this node.

Signed-off-by: Frank Rowand 
---
Changes since v3:
  - Add expected value of refcount for destroy cset entry error.  Also
explain the cause of the error.

 drivers/of/dynamic.c | 29 +
 drivers/of/overlay.c |  1 +
 include/linux/of.h   | 15 ++-
 3 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index f4f8ed9b5454..24c97b7a050f 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -330,6 +330,25 @@ void of_node_release(struct kobject *kobj)
if (!of_node_check_flag(node, OF_DYNAMIC))
return;
 
+   if (of_node_check_flag(node, OF_OVERLAY)) {
+
+   if (!of_node_check_flag(node, OF_OVERLAY_FREE_CSET)) {
+   /* premature refcount of zero, do not free memory */
+   pr_err("ERROR: memory leak %s() overlay node %pOF 
before free overlay changeset\n",
+  __func__, node);
+   return;
+   }
+
+   /*
+* If node->properties non-empty then properties were added
+* to this node either by different overlay that has not
+* yet been removed, or by a non-overlay mechanism.
+*/
+   if (node->properties)
+   pr_err("ERROR: %s() overlay node %pOF contains 
unexpected properties\n",
+  __func__, node);
+   }
+
property_list_free(node->properties);
property_list_free(node->deadprops);
 
@@ -434,6 +453,16 @@ struct device_node *__of_node_dup(const struct device_node 
*np,
 
 static void __of_changeset_entry_destroy(struct of_changeset_entry *ce)
 {
+   if (ce->action == OF_RECONFIG_ATTACH_NODE &&
+   of_node_check_flag(ce->np, OF_OVERLAY)) {
+   if (kref_read(>np->kobj.kref) > 1) {
+   pr_err("ERROR: memory leak - destroy cset entry: attach 
overlay node %pOF expected refcount 1 instead of %d.  of_node_get() / 
of_node_put() are unbalanced for this node.\n",
+  ce->np, kref_read(>np->kobj.kref));
+   } else {
+   of_node_set_flag(ce->np, OF_OVERLAY_FREE_CSET);
+   }
+   }
+
of_node_put(ce->np);
list_del(>node);
kfree(ce);
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
index eda57ef12fd0..1176cb4b6e4e 100644
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -373,6 +373,7 @@ static int add_changeset_node(struct overlay_changeset 
*ovcs,
return -ENOMEM;
 
tchild->parent = target_node;
+   of_node_set_flag(tchild, OF_OVERLAY);
 
ret = of_changeset_attach_node(>cset, tchild);
if (ret)
diff --git a/include/linux/of.h b/include/linux/of.h
index 4d25e4f952d9..aa1dafaec6ae 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -138,11 +138,16 @@ static inline void of_node_put(struct device_node *node) 
{ }
 extern struct device_node *of_stdout;
 extern raw_spinlock_t devtree_lock;
 
-/* flag descriptions (need to be visible even when !CONFIG_OF) */
-#define OF_DYNAMIC 1 /* node and properties were allocated via kmalloc */
-#define OF_DETACHED2 /* node has been detached from the device tree */
-#define OF_POPULATED   3 /* device already created for the node */
-#define OF_POPULATED_BUS   4 /* of_platform_populate recursed to children 
of this node */
+/*
+ * struct device_node flag descriptions
+ * (need to be visible even when !CONFIG_OF)
+ */
+#define OF_DYNAMIC 1 /* (and properties) allocated via kmalloc */
+#define OF_DETACHED2 /* detached from the device tree */
+#define OF_POPULATED   3 /* device already created */
+#define OF_POPULATED_BUS   4 /* platform bus created for children */
+#define OF_OVERLAY 5 /* allocated for an overlay */
+#define OF_OVERLAY_FREE_CSET   6 /* in overlay cset being freed */
 
 #define OF_BAD_ADDR((u64)-1)
 
-- 
Frank Rowand 



[PATCH v4 00/18] of: overlay: validation checks, subsequent fixes

2018-10-15 Thread frowand . list
From: Frank Rowand 

Add checks to (1) overlay apply process and (2) memory freeing
triggered by overlay release.  The checks are intended to detect
possible memory leaks and invalid overlays.

The checks revealed bugs in existing code.  Fixed the bugs.

While fixing bugs, noted other issues, which are fixed in
separate patches.

*  Powerpc folks: I was not able to test the patches that
*  directly impact Powerpc systems that use dynamic
*  devicetree.  Please review that code carefully and
*  test.  The specific patches are: 03/18, 04/18, 07/18

FPGA folks:

  I made the validation checks that should result in an
  invalid live devicetree report "ERROR" and cause the overlay apply
  to fail.

  I made the memory leak validation tests report "WARNING" and allow
  the overlay apply to complete successfully.  Please let me know
  if you encounter the warnings.  There are at least two paths
  forward to deal with the cases that trigger the warning: (1) change
  the warning to an error and fail the overlay apply, or (2) find a
  way to detect the potential memory leaks and free the memory
  appropriately.

ALL people:

  The validations do _not_ address another major concern I have with
  releasing overlays, which is use after free errors.

Changes since v3:
  - 01/18: Add expected value of refcount for destroy cset entry error.  Also
explain the cause of the error.

  - 09/18: for errors of an overlay changing the value of #size-cells or
#address-cells, return -EINVAL so that overlay apply will fail
  - 09/18: for errors of an overlay changing the value of #size-cells or
#address-cells, make the message more direct.
Old message:
  OF: overlay: ERROR: overlay and/or live tree #size-cells invalid in node 
/soc/base_fpga_region
New message:
  OF: overlay: ERROR: changing value of /soc/base_fpga_region/#size-cells 
not allowed

  - 13/18: Update patch comment header to state that this patch modifies the
previous patch to not return immediately on fragment error and
explain this is not a performance issue.
  - 13/18: remove redundant "overlay" from two error messages.  "OF: overlay:"
is already present in pr_fmt()

Changes since v2:

  - 13/18: Use continue to reduce indentation in find_dup_cset_node_entry()
and find_dup_cset_prop()

Changes since v1:

  - move patch 16/16 to 17/18
  - move patch 15/16 to 18/18
  - new patch 15/18
  - new patch 16/18

  - 05/18: add_changeset_node() header comment: incorrect comment for @target

  - 18/18: add same fix for of_parse_phandle_with_args()
  - 18/18: add same fix for of_parse_phandle_with_args_map()

Frank Rowand (18):
  of: overlay: add tests to validate kfrees from overlay removal
  of: overlay: add missing of_node_put() after add new node to changeset
  of: overlay: add missing of_node_get() in __of_attach_node_sysfs
  powerpc/pseries: add of_node_put() in dlpar_detach_node()
  of: overlay: use prop add changeset entry for property in new nodes
  of: overlay: do not duplicate properties from overlay for new nodes
  of: dynamic: change type of of_{at,de}tach_node() to void
  of: overlay: reorder fields in struct fragment
  of: overlay: validate overlay properties #address-cells and
#size-cells
  of: overlay: make all pr_debug() and pr_err() messages unique
  of: overlay: test case of two fragments adding same node
  of: overlay: check prevents multiple fragments add or delete same node
  of: overlay: check prevents multiple fragments touching same property
  of: unittest: remove unused of_unittest_apply_overlay() argument
  of: overlay: set node fields from properties when add new overlay node
  of: unittest: allow base devicetree to have symbol metadata
  of: unittest: find overlays[] entry by name instead of index
  of: unittest: initialize args before calling of_*parse_*()

 arch/powerpc/platforms/pseries/dlpar.c |  15 +-
 arch/powerpc/platforms/pseries/reconfig.c  |   6 +-
 drivers/of/dynamic.c   |  68 +++--
 drivers/of/kobj.c  |   4 +-
 drivers/of/overlay.c   | 302 -
 drivers/of/unittest-data/Makefile  |   2 +
 .../of/unittest-data/overlay_bad_add_dup_node.dts  |  28 ++
 .../of/unittest-data/overlay_bad_add_dup_prop.dts  |  24 ++
 drivers/of/unittest-data/overlay_base.dts  |   1 +
 drivers/of/unittest.c  |  96 +--
 include/linux/of.h |  19 +-
 11 files changed, 443 insertions(+), 122 deletions(-)
 create mode 100644 drivers/of/unittest-data/overlay_bad_add_dup_node.dts
 create mode 100644 drivers/of/unittest-data/overlay_bad_add_dup_prop.dts

-- 
Frank Rowand 



[PATCH kernel] powerpc/powernv/ioda: Reduce a number of hooks in pnv_phb

2018-10-15 Thread Alexey Kardashevskiy
fixup_phb() is never used, this removes it.

pick_m64_pe() and reserve_m64_pe() are always defined for all powernv
PHBs: they are initialized by pnv_ioda_parse_m64_window() which is
called unconditionally from pnv_pci_init_ioda_phb() which initializes
all known PHB types on powernv so we can open code them.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/pci.h  | 4 
 arch/powerpc/platforms/powernv/pci-ioda.c | 9 +++--
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index 8b37b28..2131373 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -115,11 +115,7 @@ struct pnv_phb {
 unsigned int hwirq, unsigned int virq,
 unsigned int is_64, struct msi_msg *msg);
void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
-   void (*fixup_phb)(struct pci_controller *hose);
int (*init_m64)(struct pnv_phb *phb);
-   void (*reserve_m64_pe)(struct pci_bus *bus,
-  unsigned long *pe_bitmap, bool all);
-   struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 78b61f0..15a4556 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -518,8 +518,6 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb 
*phb)
phb->init_m64 = pnv_ioda1_init_m64;
else
phb->init_m64 = pnv_ioda2_init_m64;
-   phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe;
-   phb->pick_m64_pe = pnv_ioda_pick_m64_pe;
 }
 
 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -1161,8 +1159,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct 
pci_bus *bus, bool all)
pe = >ioda.pe_array[phb->ioda.root_pe_idx];
 
/* Check if PE is determined by M64 */
-   if (!pe && phb->pick_m64_pe)
-   pe = phb->pick_m64_pe(bus, all);
+   if (!pe)
+   pe = pnv_ioda_pick_m64_pe(bus, all);
 
/* The PE number isn't pinned by M64 */
if (!pe)
@@ -3395,8 +3393,7 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, 
unsigned long type)
return;
 
/* Reserve PEs according to used M64 resources */
-   if (phb->reserve_m64_pe)
-   phb->reserve_m64_pe(bus, NULL, all);
+   pnv_ioda_reserve_m64_pe(bus, NULL, all);
 
/*
 * Assign PE. We might run here because of partial hotplug.
-- 
2.11.0



[PATCH kernel] powerpc/powernv/ioda1: Remove dead code for a single device PE

2018-10-15 Thread Alexey Kardashevskiy
At the moment PNV_IODA_PE_DEV is only used for NPU PEs which are not
present on IODA1 machines (i.e. POWER7) so let's remove a piece of
dead code.

Signed-off-by: Alexey Kardashevskiy 
---

We might actually want to get rid of the entire IODA1 there.
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index cde7102..78b61f0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2367,15 +2367,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb 
*phb,
pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
iommu_init_table(tbl, phb->hose->node);
 
-   if (pe->flags & PNV_IODA_PE_DEV) {
-   /*
-* Setting table base here only for carrying iommu_group
-* further down to let iommu_add_device() do the job.
-* pnv_pci_ioda_dma_dev_setup will override it later anyway.
-*/
-   set_iommu_table_base(>pdev->dev, tbl);
-   iommu_add_device(>pdev->dev);
-   } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+   if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
 
return;
-- 
2.11.0



Re: [PATCH kernel v2] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-10-15 Thread Alexey Kardashevskiy



On 16/10/2018 13:19, Alistair Popple wrote:
>> reset_ntl() does what npu2_dev_procedure_reset() does plus more stuff,
>> there nothing really in npu2_dev_procedure_reset() which reset_ntl()
>> does not do already from the hardware standpoint. And it did stop HMIs
>> for me though.
>>
>> but ok, what will be sufficient then if not reset_ntl()?
> 
> Argh, yes you are correct. Specifically both npu2_dev_procedure_reset() and
> reset_ntl() contain:
> 
>   /* NTL Reset */
>   val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
>   val |= PPC_BIT(8) | PPC_BIT(9);
>   npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);
> 
> Which should fence the brick. However from what I recall there was more to
> reliably preventing HMIs than merely fencing the brick. It involved a sequence
> of fencing and flushing the cache with dcbf instructions at the right time 
> which
> is why we also have the FLR. Unfortunately I don't know the precise details,
> perhaps if we send enough coffee Balbir's way he might be able remind us?


He suggested and ack'ed that skiboot patch, I can repeat beers^wcoffee
but it won't change much ;)



> 
> - Alistair
> 
>>
>>
>>>
>>> - Alistair
>>>



>
> - Alistair
>
> On Monday, 15 October 2018 6:17:51 PM AEDT Alexey Kardashevskiy wrote:
>> Ping?
>>
>>
>> On 02/10/2018 13:20, Alexey Kardashevskiy wrote:
>>> The skiboot firmware has a hot reset handler which fences the NVIDIA 
>>> V100
>>> GPU RAM on Witherspoons and makes accesses no-op instead of throwing 
>>> HMIs:
>>> https://github.com/open-power/skiboot/commit/fca2b2b839a67
>>>
>>> Now we are going to pass V100 via VFIO which most certainly involves
>>> KVM guests which are often terminated without getting a chance to 
>>> offline
>>> GPU RAM so we end up with a running machine with misconfigured memory.
>>> Accessing this memory produces hardware management interrupts (HMI)
>>> which bring the host down.
>>>
>>> To suppress HMIs, this wires up this hot reset hook to 
>>> vfio_pci_disable()
>>> via pci_disable_device() which switches NPU2 to a safe mode and prevents
>>> HMIs.
>>>
>>> Signed-off-by: Alexey Kardashevskiy 
>>> ---
>>> Changes:
>>> v2:
>>> * updated the commit log
>>> ---
>>>  arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
>>>  1 file changed, 10 insertions(+)
>>>
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
>>> b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index cde7102..e37b9cc 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -3688,6 +3688,15 @@ static void pnv_pci_release_device(struct 
>>> pci_dev *pdev)
>>> pnv_ioda_release_pe(pe);
>>>  }
>>>  
>>> +static void pnv_npu_disable_device(struct pci_dev *pdev)
>>> +{
>>> +   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
>>> +   struct eeh_pe *eehpe = edev ? edev->pe : NULL;
>>> +
>>> +   if (eehpe && eeh_ops && eeh_ops->reset)
>>> +   eeh_ops->reset(eehpe, EEH_RESET_HOT);
>>> +}
>>> +
>>>  static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
>>>  {
>>> struct pnv_phb *phb = hose->private_data;
>>> @@ -3732,6 +3741,7 @@ static const struct pci_controller_ops 
>>> pnv_npu_ioda_controller_ops = {
>>> .reset_secondary_bus= pnv_pci_reset_secondary_bus,
>>> .dma_set_mask   = pnv_npu_dma_set_mask,
>>> .shutdown   = pnv_pci_ioda_shutdown,
>>> +   .disable_device = pnv_npu_disable_device,
>>>  };
>>>  
>>>  static const struct pci_controller_ops 
>>> pnv_npu_ocapi_ioda_controller_ops = {
>>>
>>
>>
>
>


>>>
>>>
>>
>>
> 
> 

-- 
Alexey


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Stephen Rothwell
Hi all,

On Tue, 16 Oct 2018 13:02:16 +1100 Stephen Rothwell  
wrote:
>
> Reverting fe3d2a45e8079fdd7d4da1ff07f4b40bc3cb499f (and the following 2
> commits) produces a kernel that boots.

Instead of that, I applied this patch on top of linux-next and it boots
and produces a stack trace ...

From: Stephen Rothwell 
Date: Tue, 16 Oct 2018 13:07:01 +1100
Subject: [PATCH] mm/memblock.c: use dump_stack() instead of WARN_ON_ONCE for
 the alignment checks

Using WARN_ON_ONCE too early causes the PowerPC kernel to fail.

Signed-off-by: Stephen Rothwell 
---
 mm/memblock.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 5fefc70253ee..f2ef3915a356 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1298,8 +1298,10 @@ static phys_addr_t __init 
memblock_alloc_range_nid(phys_addr_t size,
 {
phys_addr_t found;
 
-   if (WARN_ON_ONCE(!align))
+   if (!align) {
+   dump_stack();
align = SMP_CACHE_BYTES;
+   }
 
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
@@ -1423,8 +1425,10 @@ static void * __init memblock_alloc_internal(
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, nid);
 
-   if (WARN_ON_ONCE(!align))
+   if (!align) {
+   dump_stack();
align = SMP_CACHE_BYTES;
+   }
 
if (max_addr > memblock.current_limit)
max_addr = memblock.current_limit;
-- 
2.18.0

So, patch "memblock: stop using implicit alignment to SMP_CACHE_BYTES"
should *not* remove the 0 -> SMP_CACHE_BYTES update from mm/memblock.c
and just add the dump_stack().
-- 
Cheers,
Stephen Rothwell


pgpkHnZy7qcTr.pgp
Description: OpenPGP digital signature


Re: [PATCH kernel v2] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-10-15 Thread Alistair Popple
> reset_ntl() does what npu2_dev_procedure_reset() does plus more stuff,
> there is nothing really in npu2_dev_procedure_reset() which reset_ntl()
> does not do already from the hardware standpoint. And it did stop HMIs
> for me though.
> 
> but ok, what will be sufficient then if not reset_ntl()?

Argh, yes you are correct. Specifically both npu2_dev_procedure_reset() and
reset_ntl() contain:

/* NTL Reset */
val = npu2_read(ndev->npu, NPU2_NTL_MISC_CFG1(ndev));
val |= PPC_BIT(8) | PPC_BIT(9);
npu2_write(ndev->npu, NPU2_NTL_MISC_CFG1(ndev), val);

Which should fence the brick. However from what I recall there was more to
reliably preventing HMIs than merely fencing the brick. It involved a sequence
of fencing and flushing the cache with dcbf instructions at the right time which
is why we also have the FLR. Unfortunately I don't know the precise details,
perhaps if we send enough coffee Balbir's way he might be able to remind us?

- Alistair

> 
> 
> > 
> > - Alistair
> > 
> >>
> >>
> >>
> >>>
> >>> - Alistair
> >>>
> >>> On Monday, 15 October 2018 6:17:51 PM AEDT Alexey Kardashevskiy wrote:
>  Ping?
> 
> 
>  On 02/10/2018 13:20, Alexey Kardashevskiy wrote:
> > The skiboot firmware has a hot reset handler which fences the NVIDIA 
> > V100
> > GPU RAM on Witherspoons and makes accesses no-op instead of throwing 
> > HMIs:
> > https://github.com/open-power/skiboot/commit/fca2b2b839a67
> >
> > Now we are going to pass V100 via VFIO which most certainly involves
> > KVM guests which are often terminated without getting a chance to 
> > offline
> > GPU RAM so we end up with a running machine with misconfigured memory.
> > Accessing this memory produces hardware management interrupts (HMI)
> > which bring the host down.
> >
> > To suppress HMIs, this wires up this hot reset hook to 
> > vfio_pci_disable()
> > via pci_disable_device() which switches NPU2 to a safe mode and prevents
> > HMIs.
> >
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> > Changes:
> > v2:
> > * updated the commit log
> > ---
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
> >  1 file changed, 10 insertions(+)
> >
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> > b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index cde7102..e37b9cc 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -3688,6 +3688,15 @@ static void pnv_pci_release_device(struct 
> > pci_dev *pdev)
> > pnv_ioda_release_pe(pe);
> >  }
> >  
> > +static void pnv_npu_disable_device(struct pci_dev *pdev)
> > +{
> > +   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
> > +   struct eeh_pe *eehpe = edev ? edev->pe : NULL;
> > +
> > +   if (eehpe && eeh_ops && eeh_ops->reset)
> > +   eeh_ops->reset(eehpe, EEH_RESET_HOT);
> > +}
> > +
> >  static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
> >  {
> > struct pnv_phb *phb = hose->private_data;
> > @@ -3732,6 +3741,7 @@ static const struct pci_controller_ops 
> > pnv_npu_ioda_controller_ops = {
> > .reset_secondary_bus= pnv_pci_reset_secondary_bus,
> > .dma_set_mask   = pnv_npu_dma_set_mask,
> > .shutdown   = pnv_pci_ioda_shutdown,
> > +   .disable_device = pnv_npu_disable_device,
> >  };
> >  
> >  static const struct pci_controller_ops 
> > pnv_npu_ocapi_ioda_controller_ops = {
> >
> 
> 
> >>>
> >>>
> >>
> >>
> > 
> > 
> 
> 




Re: [PATCH kernel v2] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-10-15 Thread Alexey Kardashevskiy



On 16/10/2018 12:44, Alistair Popple wrote:
> Hi Alexey,
> 
> On Tuesday, 16 October 2018 12:37:49 PM AEDT Alexey Kardashevskiy wrote:
>>
>> On 16/10/2018 11:38, Alistair Popple wrote:
>>> Hi Alexey,
>>>
>>> Looking at the skiboot side I think we only fence the NVLink bricks as part 
>>> of a
>>> PCIe function level reset (FLR) rather than a PCI Hot or Fundamental reset 
>>> which
>>> I believe is what the code here does. So to fence the bricks you would need 
>>> to
>>> do either a FLR on the given link or alter Skiboot to fence a given link as 
>>> part
>>> of a hot reset.
>>
>> The code here calls OPAL to execute this code:
>>
>> https://github.com/open-power/skiboot/commit/fca2b2b839a673a1e52fc6b19ee6d33b2dfbc003
>>
>> This resets all links on an NPU which is fine for now as we pass GPUs in
>> groups only. Or I missed something?
> 
> From what I can see in
> https://github.com/open-power/skiboot/commit/2947eaa14e771e572d4e84bf003318c590c1c7d4
> we only fence the bricks in npu2_dev_procedure_reset()
> (https://github.com/open-power/skiboot/blob/master/hw/npu2-hw-procedures.c#L937)
> which itself is only called from the FLR path and not the code path you point
> out above.
> 
> The code path above only resets the NTL which I don't believe is sufficient to
> prevent HMIs.

reset_ntl() does what npu2_dev_procedure_reset() does plus more stuff,
there is nothing really in npu2_dev_procedure_reset() which reset_ntl()
does not do already from the hardware standpoint. And it did stop HMIs
for me though.

but ok, what will be sufficient then if not reset_ntl()?



> 
> - Alistair
> 
>>
>>
>>
>>>
>>> - Alistair
>>>
>>> On Monday, 15 October 2018 6:17:51 PM AEDT Alexey Kardashevskiy wrote:
 Ping?


 On 02/10/2018 13:20, Alexey Kardashevskiy wrote:
> The skiboot firmware has a hot reset handler which fences the NVIDIA V100
> GPU RAM on Witherspoons and makes accesses no-op instead of throwing HMIs:
> https://github.com/open-power/skiboot/commit/fca2b2b839a67
>
> Now we are going to pass V100 via VFIO which most certainly involves
> KVM guests which are often terminated without getting a chance to offline
> GPU RAM so we end up with a running machine with misconfigured memory.
> Accessing this memory produces hardware management interrupts (HMI)
> which bring the host down.
>
> To suppress HMIs, this wires up this hot reset hook to vfio_pci_disable()
> via pci_disable_device() which switches NPU2 to a safe mode and prevents
> HMIs.
>
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v2:
> * updated the commit log
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
>  1 file changed, 10 insertions(+)
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index cde7102..e37b9cc 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -3688,6 +3688,15 @@ static void pnv_pci_release_device(struct pci_dev 
> *pdev)
>   pnv_ioda_release_pe(pe);
>  }
>  
> +static void pnv_npu_disable_device(struct pci_dev *pdev)
> +{
> + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
> + struct eeh_pe *eehpe = edev ? edev->pe : NULL;
> +
> + if (eehpe && eeh_ops && eeh_ops->reset)
> + eeh_ops->reset(eehpe, EEH_RESET_HOT);
> +}
> +
>  static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
>  {
>   struct pnv_phb *phb = hose->private_data;
> @@ -3732,6 +3741,7 @@ static const struct pci_controller_ops 
> pnv_npu_ioda_controller_ops = {
>   .reset_secondary_bus= pnv_pci_reset_secondary_bus,
>   .dma_set_mask   = pnv_npu_dma_set_mask,
>   .shutdown   = pnv_pci_ioda_shutdown,
> + .disable_device = pnv_npu_disable_device,
>  };
>  
>  static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops 
> = {
>


>>>
>>>
>>
>>
> 
> 

-- 
Alexey


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Stephen Rothwell
Hi all,

On Mon, 15 Oct 2018 15:22:13 -0700 Guenter Roeck  wrote:
>
> On Tue, Oct 16, 2018 at 07:33:59AM +1100, Stephen Rothwell wrote:
> > 
> > On Mon, 15 Oct 2018 12:39:14 -0700 Guenter Roeck  
> > wrote:  
> > >
> > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:  
> > > > Hi all,
> > > > 
> > > > Changes since 20181012:
> > > > 
> > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> > > > 
> > > 
> > > Bisect log:
> > >   
> 
> Trying again. Not very useful since some of the steps fail with compile 
> errors.
> The problem does seem to be related to the bootmem changes, though.

OK, I managed to do the bisect:

fe3d2a45e8079fdd7d4da1ff07f4b40bc3cb499f is the first bad commit
commit fe3d2a45e8079fdd7d4da1ff07f4b40bc3cb499f
Author: Mike Rapoport 
Date:   Sat Oct 13 14:05:51 2018 +1100

memblock: stop using implicit alignment to SMP_CACHE_BYTES

When memblock allocation APIs are called with align = 0, the alignment
is implicitly set to SMP_CACHE_BYTES.

Implicit alignment is done deep in the memblock allocator and it can
come as a surprise.  Not that such an alignment would be wrong even
when used incorrectly but it is better to be explicit for the sake of
clarity and the principle of the least surprise.

Replace all such uses of memblock APIs with the 'align' parameter
explicitly set to SMP_CACHE_BYTES and stop implicit alignment assignment
in the memblock internal allocation functions.

For the case when memblock APIs are used via helper functions, e.g.  like
iommu_arena_new_node() in Alpha, the helper functions were detected with
Coccinelle's help and then manually examined and updated where
appropriate.

The direct memblock APIs users were updated using the semantic patch below:

@@
expression size, min_addr, max_addr, nid;
@@
(
|
- memblock_alloc_try_nid_raw(size, 0, min_addr, max_addr, nid)
+ memblock_alloc_try_nid_raw(size, SMP_CACHE_BYTES, min_addr, max_addr,
nid)
|
- memblock_alloc_try_nid_nopanic(size, 0, min_addr, max_addr, nid)
+ memblock_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, min_addr, max_addr,
nid)
|
- memblock_alloc_try_nid(size, 0, min_addr, max_addr, nid)
+ memblock_alloc_try_nid(size, SMP_CACHE_BYTES, min_addr, max_addr, nid)
|
- memblock_alloc(size, 0)
+ memblock_alloc(size, SMP_CACHE_BYTES)
|
- memblock_alloc_raw(size, 0)
+ memblock_alloc_raw(size, SMP_CACHE_BYTES)
|
- memblock_alloc_from(size, 0, min_addr)
+ memblock_alloc_from(size, SMP_CACHE_BYTES, min_addr)
|
- memblock_alloc_nopanic(size, 0)
+ memblock_alloc_nopanic(size, SMP_CACHE_BYTES)
|
- memblock_alloc_low(size, 0)
+ memblock_alloc_low(size, SMP_CACHE_BYTES)
|
- memblock_alloc_low_nopanic(size, 0)
+ memblock_alloc_low_nopanic(size, SMP_CACHE_BYTES)
|
- memblock_alloc_from_nopanic(size, 0, min_addr)
+ memblock_alloc_from_nopanic(size, SMP_CACHE_BYTES, min_addr)
|
- memblock_alloc_node(size, 0, nid)
+ memblock_alloc_node(size, SMP_CACHE_BYTES, nid)
)

[mho...@suse.com: changelog update]
Link: 
http://lkml.kernel.org/r/1538687224-17535-1-git-send-email-r...@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport 
Suggested-by: Michal Hocko 
Acked-by: Paul Burton [MIPS]
Acked-by: Michael Ellerman  [powerpc]
Acked-by: Michal Hocko 
Cc: Catalin Marinas 
Cc: Chris Zankel 
Cc: Geert Uytterhoeven 
Cc: Guan Xuetao 
Cc: Ingo Molnar 
Cc: Matt Turner 
Cc: Michal Simek 
Cc: Richard Weinberger 
Cc: Russell King 
Cc: Thomas Gleixner 
Cc: Tony Luck 
Signed-off-by: Andrew Morton 
Signed-off-by: Stephen Rothwell 

:04 04 9fdad987cc69ffad8de2d0d621facc66b096aac1 
c9959a9199e01f282d1d291a6280b203a8561e7a M  arch
:04 04 de303e4a0ad2b500de3fca2f65c8943c12c0b077 
1581aed934cfb96b2706df9bfa7746edfadffea6 M  drivers
:04 04 f9179d8189f08e3575031a76181a64eedd148db5 
293bc953dcfc2b718f5c7cdd58722284f4393dec M  include
:04 04 3019b5f917a20420537ac5cf4dc90b3c7f1aa56f 
1b88f0791341d17abd259d7a8d2e0cc55147d8f6 M  init
:04 04 8819d52ce4e6463fc95a2e841baeeddbc9fb8c52 
dc167f38bd8c4a80e0ac84ecc981e95b98703393 M  kernel
:04 04 30c64583e66fc20181d7ce2b6ced9d7e060e1042 
4e57b31864cef4e921bbd73150f63637a819e3c4 M  lib
:04 04 1b341ab0dd034f0fef37c234a771419924e0ecc9 
9c489ba60978950733bf678191833fdb9689bdab M  mm

# bad: [ca0591d03a2d373e0019ad357fbbee69c8272381] Add linux-next specific files 
for 20181015
# good: [3a27203102ebfa67bd0bced05b1def499bb59db2] Merge tag 
'libnvdimm-fixes-4.19-rc8' of 
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
gi

Re: [PATCH kernel v2] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-10-15 Thread Alistair Popple
Hi Alexey,

On Tuesday, 16 October 2018 12:37:49 PM AEDT Alexey Kardashevskiy wrote:
> 
> On 16/10/2018 11:38, Alistair Popple wrote:
> > Hi Alexey,
> > 
> > Looking at the skiboot side I think we only fence the NVLink bricks as part 
> > of a
> > PCIe function level reset (FLR) rather than a PCI Hot or Fundamental reset 
> > which
> > I believe is what the code here does. So to fence the bricks you would need 
> > to
> > do either a FLR on the given link or alter Skiboot to fence a given link as 
> > part
> > of a hot reset.
> 
> The code here calls OPAL to execute this code:
> 
> https://github.com/open-power/skiboot/commit/fca2b2b839a673a1e52fc6b19ee6d33b2dfbc003
> 
> This resets all links on an NPU which is fine for now as we pass GPUs in
> groups only. Or I missed something?

From what I can see in
https://github.com/open-power/skiboot/commit/2947eaa14e771e572d4e84bf003318c590c1c7d4
we only fence the bricks in npu2_dev_procedure_reset()
(https://github.com/open-power/skiboot/blob/master/hw/npu2-hw-procedures.c#L937)
which itself is only called from the FLR path and not the code path you point
out above.

The code path above only resets the NTL which I don't believe is sufficient to
prevent HMIs.

- Alistair

>
> 
> 
> > 
> > - Alistair
> > 
> > On Monday, 15 October 2018 6:17:51 PM AEDT Alexey Kardashevskiy wrote:
> >> Ping?
> >>
> >>
> >> On 02/10/2018 13:20, Alexey Kardashevskiy wrote:
> >>> The skiboot firmware has a hot reset handler which fences the NVIDIA V100
> >>> GPU RAM on Witherspoons and makes accesses no-op instead of throwing HMIs:
> >>> https://github.com/open-power/skiboot/commit/fca2b2b839a67
> >>>
> >>> Now we are going to pass V100 via VFIO which most certainly involves
> >>> KVM guests which are often terminated without getting a chance to offline
> >>> GPU RAM so we end up with a running machine with misconfigured memory.
> >>> Accessing this memory produces hardware management interrupts (HMI)
> >>> which bring the host down.
> >>>
> >>> To suppress HMIs, this wires up this hot reset hook to vfio_pci_disable()
> >>> via pci_disable_device() which switches NPU2 to a safe mode and prevents
> >>> HMIs.
> >>>
> >>> Signed-off-by: Alexey Kardashevskiy 
> >>> ---
> >>> Changes:
> >>> v2:
> >>> * updated the commit log
> >>> ---
> >>>  arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
> >>>  1 file changed, 10 insertions(+)
> >>>
> >>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> >>> b/arch/powerpc/platforms/powernv/pci-ioda.c
> >>> index cde7102..e37b9cc 100644
> >>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> >>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> >>> @@ -3688,6 +3688,15 @@ static void pnv_pci_release_device(struct pci_dev 
> >>> *pdev)
> >>>   pnv_ioda_release_pe(pe);
> >>>  }
> >>>  
> >>> +static void pnv_npu_disable_device(struct pci_dev *pdev)
> >>> +{
> >>> + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
> >>> + struct eeh_pe *eehpe = edev ? edev->pe : NULL;
> >>> +
> >>> + if (eehpe && eeh_ops && eeh_ops->reset)
> >>> + eeh_ops->reset(eehpe, EEH_RESET_HOT);
> >>> +}
> >>> +
> >>>  static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
> >>>  {
> >>>   struct pnv_phb *phb = hose->private_data;
> >>> @@ -3732,6 +3741,7 @@ static const struct pci_controller_ops 
> >>> pnv_npu_ioda_controller_ops = {
> >>>   .reset_secondary_bus= pnv_pci_reset_secondary_bus,
> >>>   .dma_set_mask   = pnv_npu_dma_set_mask,
> >>>   .shutdown   = pnv_pci_ioda_shutdown,
> >>> + .disable_device = pnv_npu_disable_device,
> >>>  };
> >>>  
> >>>  static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops 
> >>> = {
> >>>
> >>
> >>
> > 
> > 
> 
> 




Re: [PATCH kernel v2] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-10-15 Thread Alexey Kardashevskiy



On 16/10/2018 11:38, Alistair Popple wrote:
> Hi Alexey,
> 
> Looking at the skiboot side I think we only fence the NVLink bricks as part 
> of a
> PCIe function level reset (FLR) rather than a PCI Hot or Fundamental reset 
> which
> I believe is what the code here does. So to fence the bricks you would need to
> do either a FLR on the given link or alter Skiboot to fence a given link as 
> part
> of a hot reset.

The code here calls OPAL to execute this code:

https://github.com/open-power/skiboot/commit/fca2b2b839a673a1e52fc6b19ee6d33b2dfbc003

This resets all links on an NPU which is fine for now as we pass GPUs in
groups only. Or I missed something?



> 
> - Alistair
> 
> On Monday, 15 October 2018 6:17:51 PM AEDT Alexey Kardashevskiy wrote:
>> Ping?
>>
>>
>> On 02/10/2018 13:20, Alexey Kardashevskiy wrote:
>>> The skiboot firmware has a hot reset handler which fences the NVIDIA V100
>>> GPU RAM on Witherspoons and makes accesses no-op instead of throwing HMIs:
>>> https://github.com/open-power/skiboot/commit/fca2b2b839a67
>>>
>>> Now we are going to pass V100 via VFIO which most certainly involves
>>> KVM guests which are often terminated without getting a chance to offline
>>> GPU RAM so we end up with a running machine with misconfigured memory.
>>> Accessing this memory produces hardware management interrupts (HMI)
>>> which bring the host down.
>>>
>>> To suppress HMIs, this wires up this hot reset hook to vfio_pci_disable()
>>> via pci_disable_device() which switches NPU2 to a safe mode and prevents
>>> HMIs.
>>>
>>> Signed-off-by: Alexey Kardashevskiy 
>>> ---
>>> Changes:
>>> v2:
>>> * updated the commit log
>>> ---
>>>  arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
>>>  1 file changed, 10 insertions(+)
>>>
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
>>> b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index cde7102..e37b9cc 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -3688,6 +3688,15 @@ static void pnv_pci_release_device(struct pci_dev 
>>> *pdev)
>>> pnv_ioda_release_pe(pe);
>>>  }
>>>  
>>> +static void pnv_npu_disable_device(struct pci_dev *pdev)
>>> +{
>>> +   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
>>> +   struct eeh_pe *eehpe = edev ? edev->pe : NULL;
>>> +
>>> +   if (eehpe && eeh_ops && eeh_ops->reset)
>>> +   eeh_ops->reset(eehpe, EEH_RESET_HOT);
>>> +}
>>> +
>>>  static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
>>>  {
>>> struct pnv_phb *phb = hose->private_data;
>>> @@ -3732,6 +3741,7 @@ static const struct pci_controller_ops 
>>> pnv_npu_ioda_controller_ops = {
>>> .reset_secondary_bus= pnv_pci_reset_secondary_bus,
>>> .dma_set_mask   = pnv_npu_dma_set_mask,
>>> .shutdown   = pnv_pci_ioda_shutdown,
>>> +   .disable_device = pnv_npu_disable_device,
>>>  };
>>>  
>>>  static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = 
>>> {
>>>
>>
>>
> 
> 

-- 
Alexey


Re: linux-next: Tree for Oct 15

2018-10-15 Thread David Miller
From: Rob Herring 
Date: Mon, 15 Oct 2018 20:00:24 -0500

> David, Can you revert commit 0b9871a3a8cc. I'll have to find another approach.

Ok.


Re: [PATCH] powerpc/traps: restore recoverability of machine_check interrupts

2018-10-15 Thread Nicholas Piggin
On Mon, 15 Oct 2018 09:40:50 +0200
Christophe LEROY  wrote:

> Cc: sta...@vger.kernel.org 
> 
> Le 13/10/2018 à 11:16, Christophe Leroy a écrit :
> > commit b96672dd840f ("powerpc: Machine check interrupt is a non-
> > maskable interrupt") added a call to nmi_enter() at the beginning of
> > machine check restart exception handler. Due to that, in_interrupt()
> > always returns true regardless of the state before entering the
> > exception, and die() panics even when the system was not already in
> > interrupt.
> > 
> > This patch calls nmi_exit() before calling die() in order to restore
> > the interrupt state we had before calling nmi_enter()
> > 
> > Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable 
> > interrupt")
> > Signed-off-by: Christophe Leroy 

Reviewed-by: Nicholas Piggin 

This looks good to me and probably the simplest fix.

powernv will need a fix on top of this, to remove the die and let it
just fall through unrecovered to the traps.c code. pseries seems to
be okay. But this patch looks good.

Thanks,
Nick


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Rob Herring
+davem

On Mon, Oct 15, 2018 at 3:35 PM Rob Herring  wrote:
>
> On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell  
> wrote:
> >
> > Hi Guenter,
> >
> > [Just cc'ing the PPC and devicetree folks]
> >
> > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  wrote:
> > >
> > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > >
> > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> > >
> > > Same here. Interestingly, this only affects little endian pseries
> > > boots; big endian works fine. I'll try to bisect later.
> > >
> > > > All ppc qemu tests (including big endian pseries) also generate a warning.
> > >
> > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > .memblock_alloc_range_nid+0x20/0x68
> > > Modules linked in:
> > > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > (4.19.0-rc7-next-20181015)
> > > MSR:  80021000   CR: 24000422  XER: 2000
> > > IRQMASK: 1
> > > GPR00: c0f99490 c1217d00 c121a500 00c0
> > > GPR04:    
> > > GPR08:  00c0 0018 00b7
> > > GPR12: 0040 c0fe7840  
> > > GPR16:    
> > > GPR20:    
> > > GPR24:    
> > > GPR28: c304 c1262088 00c0 c0fea500
> > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > Call Trace:
> > > [c1217d00] [c2a0] 0xc2a0 (unreliable)
> > > [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> > > [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> > > [c1217e70] [c0f78bf0] .early_init_devtree+0x288/0x320
> > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > [c1217f90] [c528] start_here_multiplatform+0x68/0x80
> > >
> > >
> > > sparc images crash, starting with next-20181009. Bisect with
> > > next-201810112 points to the merge of devicetree/for-next, though
> > > devicetree/for-next itself does not have the problem (bisect log
> > > attached below). The crash is in devicetree code.
> > >
> > > Crash logs:
> > > https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> > > https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio
>
> The sparc crash appears to be related to changes I made. Looking into it.

The problem is a combination of commit 0b9871a3a8cc ("sparc: Convert
to using %pOFn instead of device_node.name") and commit 6d0a70a284be
("vsprintf: print OF node name using full_name"). The Sparc functions
in prom_*.c can't use %pOFn to ultimately construct full_name as %pOFn
is derived from full_name. Reverting the former commit gets QEMU
booting again.

David, Can you revert commit 0b9871a3a8cc. I'll have to find another approach.

Rob


Re: [PATCH v06 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-15 Thread Michael Ellerman
Michael Bringmann  writes:
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 2b796da..9c76345 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -541,6 +549,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
>   return rc;
>  }
>
> +static int dlpar_memory_readd_multiple(void)
> +{
> + struct drmem_lmb *lmb;
> + int rc;
> +
> + pr_info("Attempting to update multiple LMBs\n");
> +
> + for_each_drmem_lmb(lmb) {
> + if (drmem_lmb_update(lmb)) {
> + rc = dlpar_memory_readd_helper(lmb);
> + drmem_remove_lmb_update(lmb);
> + }
> + }
> +
> + return rc;
> +}

This leaves rc potentially uninitialised.

What should the result be in that case, -EINVAL ?

cheers


Re: [PATCH kernel v2] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-10-15 Thread Alistair Popple
Hi Alexey,

Looking at the skiboot side I think we only fence the NVLink bricks as part of a
PCIe function level reset (FLR) rather than a PCI Hot or Fundamental reset which
I believe is what the code here does. So to fence the bricks you would need to
do either a FLR on the given link or alter Skiboot to fence a given link as part
of a hot reset.

- Alistair

On Monday, 15 October 2018 6:17:51 PM AEDT Alexey Kardashevskiy wrote:
> Ping?
> 
> 
> On 02/10/2018 13:20, Alexey Kardashevskiy wrote:
> > The skiboot firmware has a hot reset handler which fences the NVIDIA V100
> > GPU RAM on Witherspoons and makes accesses no-op instead of throwing HMIs:
> > https://github.com/open-power/skiboot/commit/fca2b2b839a67
> > 
> > Now we are going to pass V100 via VFIO which most certainly involves
> > KVM guests which are often terminated without getting a chance to offline
> > GPU RAM so we end up with a running machine with misconfigured memory.
> > Accessing this memory produces hardware management interrupts (HMI)
> > which bring the host down.
> > 
> > To suppress HMIs, this wires up this hot reset hook to vfio_pci_disable()
> > via pci_disable_device() which switches NPU2 to a safe mode and prevents
> > HMIs.
> > 
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> > Changes:
> > v2:
> > * updated the commit log
> > ---
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
> >  1 file changed, 10 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> > b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index cde7102..e37b9cc 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -3688,6 +3688,15 @@ static void pnv_pci_release_device(struct pci_dev 
> > *pdev)
> > pnv_ioda_release_pe(pe);
> >  }
> >  
> > +static void pnv_npu_disable_device(struct pci_dev *pdev)
> > +{
> > +   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
> > +   struct eeh_pe *eehpe = edev ? edev->pe : NULL;
> > +
> > +   if (eehpe && eeh_ops && eeh_ops->reset)
> > +   eeh_ops->reset(eehpe, EEH_RESET_HOT);
> > +}
> > +
> >  static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
> >  {
> > struct pnv_phb *phb = hose->private_data;
> > @@ -3732,6 +3741,7 @@ static const struct pci_controller_ops 
> > pnv_npu_ioda_controller_ops = {
> > .reset_secondary_bus= pnv_pci_reset_secondary_bus,
> > .dma_set_mask   = pnv_npu_dma_set_mask,
> > .shutdown   = pnv_pci_ioda_shutdown,
> > +   .disable_device = pnv_npu_disable_device,
> >  };
> >  
> >  static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = 
> > {
> > 
> 
> 




Re: linux-next: qemu boot failures with today's linux-next

2018-10-15 Thread Stephen Rothwell
Hi Michael,

On Mon, 15 Oct 2018 23:45:10 +1100 Michael Ellerman  wrote:
>
> > Preparing to boot Linux version 4.19.0-rc7 (sfr@ash) (gcc version 8.2.0 
> > (Debian 8.2.0-4)) #2 SMP Mon Oct 15 18:53:28 AEDT 2018  
>   ^^
>   I assume that's wrong, this is actually 
> linux-next you're booting?

Yes, it was just before -rc8 came out and I suppress the extra version
information to save rebuilding things that depend on the version.

> > Booting Linux via __start() @ 0x0040 ...
> 
> If you hit Ctrl-a-c you should get the qemu prompt. Then you can run
> 'info registers' to print the regs and maybe see where it's stuck.
> 
> And/or build with EARLY_DEBUG_LPAR to get early console output.

That gave one more line:

[0.00] printk: bootconsole [udbg0] enabled

-- 
Cheers,
Stephen Rothwell


pgpBIffIp2p5Y.pgp
Description: OpenPGP digital signature


Re: [PATCH 2/4] mm: speed up mremap by 500x on large regions (v2)

2018-10-15 Thread Joel Fernandes
On Mon, Oct 15, 2018 at 02:42:09AM -0700, Christoph Hellwig wrote:
> On Fri, Oct 12, 2018 at 06:31:58PM -0700, Joel Fernandes (Google) wrote:
> > Android needs to mremap large regions of memory during memory management
> > related operations.
> 
> Just curious: why?

In Android we have a requirement of moving a large (up to a GB now, but may
grow bigger in future) memory range from one location to another. This move
operation has to happen when the application threads are paused for this
operation. Therefore, an inefficient move like it is now (for example 250ms
on arm64) will cause response time issues for applications, which is not
acceptable. Huge pages cannot be used in such memory ranges to avoid this
inefficiency as (when the application threads are running) our fault handlers
are designed to process 4KB pages at a time, to keep response times low. So
using huge pages in this context can, again, cause response time issues.

Also, the mremap syscall waiting for quarter of a second for a large mremap
is quite weird and we ought to improve it where possible.

> > +   if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
> > +   || old_end - old_addr < PMD_SIZE)
> 
> The || goes on the first line.

Ok, fixed.

> > +   } else if (extent == PMD_SIZE && 
> > IS_ENABLED(CONFIG_HAVE_MOVE_PMD)) {
> 
> Overly long line.

Ok, fixed. Preview of updated patch is below.

thanks,

 - Joel

--8<---
From: "Joel Fernandes (Google)" 
Subject: [PATCH 2/4] mm: speed up mremap by 500x on large regions (v3)

Android needs to mremap large regions of memory during memory management
related operations. The mremap system call can be really slow if THP is
not enabled. The bottleneck is move_page_tables, which is copying each
pte at a time, and can be really slow across a large map. Turning on THP
may not be a viable option, and is not for us. This patch speeds up the
performance for non-THP system by copying at the PMD level when possible.

The speed up is three orders of magnitude. On a 1GB mremap, the mremap
completion time drops from 160-250 milliseconds to 380-400 microseconds.

Before:
Total mremap time for 1GB data: 242321014 nanoseconds.
Total mremap time for 1GB data: 196842467 nanoseconds.
Total mremap time for 1GB data: 167051162 nanoseconds.

After:
Total mremap time for 1GB data: 385781 nanoseconds.
Total mremap time for 1GB data: 388959 nanoseconds.
Total mremap time for 1GB data: 402813 nanoseconds.

In case THP is enabled, the optimization is mostly skipped except in
certain situations. I also flush the tlb every time we do this
optimization since I couldn't find a way to determine if the low-level
PTEs are dirty. It is seen that the cost of doing so is not much
compared to the improvement, on both x86-64 and arm64.

Cc: minc...@kernel.org
Cc: pan...@google.com
Cc: hu...@google.com
Cc: lokeshgi...@google.com
Cc: dan...@google.com
Cc: mho...@kernel.org
Cc: kir...@shutemov.name
Cc: a...@linux-foundation.org
Cc: kernel-t...@android.com
Signed-off-by: Joel Fernandes (Google) 
---
 arch/Kconfig |  5 
 mm/mremap.c  | 66 
 2 files changed, 71 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 6801123932a5..9724fe39884f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -518,6 +518,11 @@ config HAVE_IRQ_TIME_ACCOUNTING
  Archs need to ensure they use a high enough resolution clock to
  support irq time accounting and then call 
enable_sched_clock_irqtime().
 
+config HAVE_MOVE_PMD
+   bool
+   help
+ Archs that select this are able to move page tables at the PMD level.
+
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
bool
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 9e68a02a52b1..a8dd98a59975 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -191,6 +191,54 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t 
*old_pmd,
drop_rmap_locks(vma);
 }
 
+static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
+{
+   spinlock_t *old_ptl, *new_ptl;
+   struct mm_struct *mm = vma->vm_mm;
+
+   if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK) ||
+   old_end - old_addr < PMD_SIZE)
+   return false;
+
+   /*
+* The destination pmd shouldn't be established, free_pgtables()
+* should have released it.
+*/
+   if (WARN_ON(!pmd_none(*new_pmd)))
+   return false;
+
+   /*
+* We don't have to worry about the ordering of src and dst
+* ptlocks because exclusive mmap_sem prevents deadlock.
+*/
+   old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+   if (old_ptl) {
+   pmd_t pmd;
+
+   new_ptl = pmd_lockptr(mm, new_pmd);
+   if (new_ptl != old_ptl)
+   spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+  

Re: linux-next: Tree for Oct 15

2018-10-15 Thread Rob Herring
On Mon, Oct 15, 2018 at 5:34 PM Guenter Roeck  wrote:
>
> On Mon, Oct 15, 2018 at 05:13:08PM -0500, Rob Herring wrote:
> > On Mon, Oct 15, 2018 at 5:10 PM Guenter Roeck  wrote:
> > >
> > > On Mon, Oct 15, 2018 at 04:48:27PM -0500, Rob Herring wrote:
> > > > On Mon, Oct 15, 2018 at 4:18 PM Guenter Roeck  
> > > > wrote:
> > > > >
> > > > > On Mon, Oct 15, 2018 at 03:35:12PM -0500, Rob Herring wrote:
> > > > > > On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell 
> > > > > >  wrote:
> > > > > > >
> > > > > > > Hi Guenter,
> > > > > > >
> > > > > > > [Just cc'ing the PPC and devicetree folks]
> > > > > > >
> > > > > > > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck 
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell 
> > > > > > > > wrote:
> > > > > > > > >
> > > > > > > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed 
> > > > > > > > > today.
> > > > > > > >
> > > > > > > > Same here. Interestingly, this only affects little endian 
> > > > > > > > pseries
> > > > > > > > boots; big endian works fine. I'll try to bisect later.
> > > > > > > >
> > > > > > > > All ppc qemu tests (including big endian pseries) also generate 
> > > > > > > > a warning.
> > > > > > > >
> > > > > > > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > > > > > > .memblock_alloc_range_nid+0x20/0x68
> > > > > > > > Modules linked in:
> > > > > > > > CPU: 0 PID: 0 Comm: swapper Not tainted 
> > > > > > > > 4.19.0-rc7-next-20181015 #1
> > > > > > > > NIP:  c0f99198 LR: c0f99490 CTR: 
> > > > > > > > c0bb8364
> > > > > > > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > > > > > > (4.19.0-rc7-next-20181015)
> > > > > > > > MSR:  80021000   CR: 24000422  XER: 2000
> > > > > > > > IRQMASK: 1
> > > > > > > > GPR00: c0f99490 c1217d00 c121a500 
> > > > > > > > 00c0
> > > > > > > > GPR04:    
> > > > > > > > 
> > > > > > > > GPR08:  00c0 0018 
> > > > > > > > 00b7
> > > > > > > > GPR12: 0040 c0fe7840  
> > > > > > > > 
> > > > > > > > GPR16:    
> > > > > > > > 
> > > > > > > > GPR20:    
> > > > > > > > 
> > > > > > > > GPR24:    
> > > > > > > > 
> > > > > > > > GPR28: c304 c1262088 00c0 
> > > > > > > > c0fea500
> > > > > > > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > > > > > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > > > > > Call Trace:
> > > > > > > > [c1217d00] [c2a0] 0xc2a0 
> > > > > > > > (unreliable)
> > > > > > > > [c1217d80] [c0f99490] 
> > > > > > > > .memblock_alloc_base+0x18/0x48
> > > > > > > > [c1217df0] [c0f7a274] 
> > > > > > > > .allocate_paca_ptrs+0x3c/0x74
> > > > > > > > [c1217e70] [c0f78bf0] 
> > > > > > > > .early_init_devtree+0x288/0x320
> > > > > > > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > > > > > > [c1217f90] [c528] 
> > > > > > 

Re: linux-next: Tree for Oct 15

2018-10-15 Thread Guenter Roeck
On Mon, Oct 15, 2018 at 05:13:08PM -0500, Rob Herring wrote:
> On Mon, Oct 15, 2018 at 5:10 PM Guenter Roeck  wrote:
> >
> > On Mon, Oct 15, 2018 at 04:48:27PM -0500, Rob Herring wrote:
> > > On Mon, Oct 15, 2018 at 4:18 PM Guenter Roeck  wrote:
> > > >
> > > > On Mon, Oct 15, 2018 at 03:35:12PM -0500, Rob Herring wrote:
> > > > > On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell 
> > > > >  wrote:
> > > > > >
> > > > > > Hi Guenter,
> > > > > >
> > > > > > [Just cc'ing the PPC and devicetree folks]
> > > > > >
> > > > > > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > > > > > >
> > > > > > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed 
> > > > > > > > today.
> > > > > > >
> > > > > > > Same here. Interestingly, this only affects little endian pseries
> > > > > > > boots; big endian works fine. I'll try to bisect later.
> > > > > > >
> > > > > > > All ppc qemu tests (including big endian pseries) also generate a 
> > > > > > > warning.
> > > > > > >
> > > > > > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > > > > > .memblock_alloc_range_nid+0x20/0x68
> > > > > > > Modules linked in:
> > > > > > > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 
> > > > > > > #1
> > > > > > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > > > > > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > > > > > (4.19.0-rc7-next-20181015)
> > > > > > > MSR:  80021000   CR: 24000422  XER: 2000
> > > > > > > IRQMASK: 1
> > > > > > > GPR00: c0f99490 c1217d00 c121a500 
> > > > > > > 00c0
> > > > > > > GPR04:    
> > > > > > > 
> > > > > > > GPR08:  00c0 0018 
> > > > > > > 00b7
> > > > > > > GPR12: 0040 c0fe7840  
> > > > > > > 
> > > > > > > GPR16:    
> > > > > > > 
> > > > > > > GPR20:    
> > > > > > > 
> > > > > > > GPR24:    
> > > > > > > 
> > > > > > > GPR28: c304 c1262088 00c0 
> > > > > > > c0fea500
> > > > > > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > > > > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > > > > Call Trace:
> > > > > > > [c1217d00] [c2a0] 0xc2a0 
> > > > > > > (unreliable)
> > > > > > > [c1217d80] [c0f99490] 
> > > > > > > .memblock_alloc_base+0x18/0x48
> > > > > > > [c1217df0] [c0f7a274] 
> > > > > > > .allocate_paca_ptrs+0x3c/0x74
> > > > > > > [c1217e70] [c0f78bf0] 
> > > > > > > .early_init_devtree+0x288/0x320
> > > > > > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > > > > > [c1217f90] [c528] 
> > > > > > > start_here_multiplatform+0x68/0x80
> > > > > > >
> > > > > > >
> > > > > > > sparc images crash, starting with next-20181009. Bisect with
> > > > > > > next-201810112 points to the merge of devicetree/for-next, though
> > > > > > > devicetree/for-next itself does not have the problem (bisect log
> > > > > > > attached below). The crash is in devicetree code.
> > > > > > > >

Re: linux-next: Tree for Oct 15

2018-10-15 Thread Guenter Roeck
On Mon, Oct 15, 2018 at 05:13:08PM -0500, Rob Herring wrote:
> On Mon, Oct 15, 2018 at 5:10 PM Guenter Roeck  wrote:
> >
> > On Mon, Oct 15, 2018 at 04:48:27PM -0500, Rob Herring wrote:
> > > On Mon, Oct 15, 2018 at 4:18 PM Guenter Roeck  wrote:
> > > >
> > > > On Mon, Oct 15, 2018 at 03:35:12PM -0500, Rob Herring wrote:
> > > > > On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell 
> > > > >  wrote:
> > > > > >
> > > > > > Hi Guenter,
> > > > > >
> > > > > > [Just cc'ing the PPC and devicetree folks]
> > > > > >
> > > > > > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > > > > > >
> > > > > > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed 
> > > > > > > > today.
> > > > > > >
> > > > > > > Same here. Interestingly, this only affects little endian pseries
> > > > > > > boots; big endian works fine. I'll try to bisect later.
> > > > > > >
> > > > > > > All ppc qemu tests (including big endian pseries) also generate a 
> > > > > > > warning.
> > > > > > >
> > > > > > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > > > > > .memblock_alloc_range_nid+0x20/0x68
> > > > > > > Modules linked in:
> > > > > > > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 
> > > > > > > #1
> > > > > > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > > > > > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > > > > > (4.19.0-rc7-next-20181015)
> > > > > > > MSR:  80021000   CR: 24000422  XER: 2000
> > > > > > > IRQMASK: 1
> > > > > > > GPR00: c0f99490 c1217d00 c121a500 
> > > > > > > 00c0
> > > > > > > GPR04:    
> > > > > > > 
> > > > > > > GPR08:  00c0 0018 
> > > > > > > 00b7
> > > > > > > GPR12: 0040 c0fe7840  
> > > > > > > 
> > > > > > > GPR16:    
> > > > > > > 
> > > > > > > GPR20:    
> > > > > > > 
> > > > > > > GPR24:    
> > > > > > > 
> > > > > > > GPR28: c304 c1262088 00c0 
> > > > > > > c0fea500
> > > > > > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > > > > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > > > > Call Trace:
> > > > > > > [c1217d00] [c2a0] 0xc2a0 
> > > > > > > (unreliable)
> > > > > > > [c1217d80] [c0f99490] 
> > > > > > > .memblock_alloc_base+0x18/0x48
> > > > > > > [c1217df0] [c0f7a274] 
> > > > > > > .allocate_paca_ptrs+0x3c/0x74
> > > > > > > [c1217e70] [c0f78bf0] 
> > > > > > > .early_init_devtree+0x288/0x320
> > > > > > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > > > > > [c1217f90] [c528] 
> > > > > > > start_here_multiplatform+0x68/0x80
> > > > > > >
> > > > > > >
> > > > > > > sparc images crash, starting with next-20181009. Bisect with
> > > > > > > next-201810112 points to the merge of devicetree/for-next, though
> > > > > > > devicetree/for-next itself does not have the problem (bisect log
> > > > > > > attached below). The crash is in devicetree code.
> > > > > > >
> > > > > > > Crash logs:
> > > > > > > https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> > > > > > > https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio
> > > > >
> > > > > The sparc crash appears to be related to changes I made. Looking into 
> > > > > it.
> > > > >
> > > >
> > > > Let me know if you need me to test anything or do some debugging.
> > >
> > > Well, I'm not having any luck getting sparc qemu to work. Here's what
> > > I'm trying with a sparc32_defconfig kernel:
> > >
> > > $ qemu-system-sparc -kernel .build-sparc/vmlinux -M SS-4 -nographic -m
> > > 256 -no-reboot
> > > rom: requested regions overlap (rom phdr #0: .build-sparc/vmlinux.
> > > free=0x057a, addr=0x)
> > > qemu-system-sparc: rom check and register reset failed
> > >
> >
> > What is your qemu version ?
> 
> 2.11 from ubuntu 18.04:
> QEMU emulator version 2.11.1(Debian 1:2.11+dfsg-1ubuntu7.5)
> 

This works for me:

qemu-system-sparc -M SS-4 -kernel arch/sparc/boot/zImage -no-reboot \
-drive file=hda.sqf,if=scsi,format=raw \
-append 'root=/dev/sda rw init=/sbin/init.sh panic=1 console=ttyS0' \
-nographic -monitor none

arch/sparc/boot/image works as well.

This is with qemu 2.5 (from Ubuntu 16.04). I'll try with 2.11 tonight.
My current private version is based on qemu 3.0.

Guenter


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Guenter Roeck
On Tue, Oct 16, 2018 at 07:33:59AM +1100, Stephen Rothwell wrote:
> Hi Guenter,
> 
> [Again, just cc'ing the PPC folks]
> 
> On Mon, 15 Oct 2018 12:39:14 -0700 Guenter Roeck  wrote:
> >
> > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > Hi all,
> > > 
> > > Changes since 20181012:
> > > 
> > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> > >   
> > 
> > Bisect log:
> > 

Trying again. Not very useful since some of the steps fail with compile errors.
The problem does seem to be related to the bootmem changes, though.

I might try again tonight if I find the time.

Guenter

# bad: [ca0591d03a2d373e0019ad357fbbee69c8272381] Add linux-next specific files 
for 20181015
# good: [0238df646e6224016a45505d2c111a24669ebe21] Linux 4.19-rc7
git bisect start 'HEAD' 'v4.19-rc7'
# good: [2fc8fb4c02a0f7b9d5e5b4de80cbcef7c808068b] Merge remote-tracking branch 
'spi-nor/spi-nor/next'
git bisect good 2fc8fb4c02a0f7b9d5e5b4de80cbcef7c808068b
# good: [a54eefdf4d208f6904da9e836ff32d7dde0c9516] Merge remote-tracking branch 
'tip/auto-latest'
git bisect good a54eefdf4d208f6904da9e836ff32d7dde0c9516
# good: [d71e0d25be750d02a3d04500aeb151bb94465811] Merge remote-tracking branch 
'staging/staging-next'
git bisect good d71e0d25be750d02a3d04500aeb151bb94465811
# good: [7961c8ea9d81f927a78e30bb7c194310ed6b7c1d] Merge remote-tracking branch 
'pinctrl/for-next'
git bisect good 7961c8ea9d81f927a78e30bb7c194310ed6b7c1d
# good: [c3d392e6aae57d54fdc683f7432c3e248602bebb] Merge remote-tracking branch 
'xarray/xarray'
git bisect good c3d392e6aae57d54fdc683f7432c3e248602bebb
# good: [880c1034475c873963d6250eb95ebbbf5604a281] userfaultfd: selftest: 
cleanup help messages
git bisect good 880c1034475c873963d6250eb95ebbbf5604a281
# good: [9f1fa0ab60f7b09d335bbaf33db9116241059708] reiserfs: propagate errors 
from fill_with_dentries() properly
git bisect good 9f1fa0ab60f7b09d335bbaf33db9116241059708
# good: [596046ffd571f32fa3d3e7ffdf7861b71a258552] memblock: replace 
alloc_bootmem_low with memblock_alloc_low (2)
git bisect good 596046ffd571f32fa3d3e7ffdf7861b71a258552
# bad: [ef07e25e5bb02b420cb66004420cea3e0d65d107] dma-direct: fix up for the 
removal of linux/bootmem.h
git bisect bad ef07e25e5bb02b420cb66004420cea3e0d65d107
# good: [ddaa897c9ab76969a74d67a65b6616895f349644] memblock: replace 
alloc_bootmem with memblock_alloc
git bisect good ddaa897c9ab76969a74d67a65b6616895f349644
# good: [3b79243c2ef23d829a2f01f8c9526f17b80a7a32] memblock: rename 
free_all_bootmem to memblock_free_all
git bisect good 3b79243c2ef23d829a2f01f8c9526f17b80a7a32
# good: [d5fa9634892df2bc6bab6101f18df6ba5a2490c5] mm: remove nobootmem
git bisect good d5fa9634892df2bc6bab6101f18df6ba5a2490c5
# bad: [75fd637c22bc9bb5c959b7f93c2c5e5f0495992c] mm: remove 
include/linux/bootmem.h
git bisect bad 75fd637c22bc9bb5c959b7f93c2c5e5f0495992c
# good: [979961b3058df1a6d24ab423dd6fa6f20982f591] memblock: replace 
BOOTMEM_ALLOC_* with MEMBLOCK variants
git bisect good 979961b3058df1a6d24ab423dd6fa6f20982f591
# first bad commit: [75fd637c22bc9bb5c959b7f93c2c5e5f0495992c] mm: remove 
include/linux/bootmem.h


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Andrew Morton
On Tue, 16 Oct 2018 07:24:39 +1100 Stephen Rothwell  
wrote:

> On Tue, 16 Oct 2018 07:12:40 +1100 Stephen Rothwell  
> wrote:
> >
> > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  wrote:
> > >
> > > All ppc qemu tests (including big endian pseries) also generate a warning.
> > > 
> > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > .memblock_alloc_range_nid+0x20/0x68
> 
> That is:
> 
> static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
> phys_addr_t align, phys_addr_t start,
> phys_addr_t end, int nid,
> enum memblock_flags flags)
> {
>if (WARN_ON_ONCE(!align))
> align = SMP_CACHE_BYTES;
> 
> Looks like patch
> 
>   "memblock: stop using implicit alignment to SMP_CACHE_BYTES"
> 
> missed some places ...

To be expected, I guess.  I'm pretty relaxed about this ;) Let's do
another sweep in a week or so, after which we'll have a couple of
months to mop up any leftovers.



Re: linux-next: Tree for Oct 15

2018-10-15 Thread Rob Herring
On Mon, Oct 15, 2018 at 5:10 PM Guenter Roeck  wrote:
>
> On Mon, Oct 15, 2018 at 04:48:27PM -0500, Rob Herring wrote:
> > On Mon, Oct 15, 2018 at 4:18 PM Guenter Roeck  wrote:
> > >
> > > On Mon, Oct 15, 2018 at 03:35:12PM -0500, Rob Herring wrote:
> > > > On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell 
> > > >  wrote:
> > > > >
> > > > > Hi Guenter,
> > > > >
> > > > > [Just cc'ing the PPC and devicetree folks]
> > > > >
> > > > > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  
> > > > > wrote:
> > > > > >
> > > > > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > > > > >
> > > > > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed 
> > > > > > > today.
> > > > > >
> > > > > > Same here. Interestingly, this only affects little endian pseries
> > > > > > boots; big endian works fine. I'll try to bisect later.
> > > > > >
> > > > > > All ppc qemu tests (including big endian pseries) also generate a 
> > > > > > warning.
> > > > > >
> > > > > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > > > > .memblock_alloc_range_nid+0x20/0x68
> > > > > > Modules linked in:
> > > > > > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> > > > > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > > > > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > > > > (4.19.0-rc7-next-20181015)
> > > > > > MSR:  80021000   CR: 24000422  XER: 2000
> > > > > > IRQMASK: 1
> > > > > > GPR00: c0f99490 c1217d00 c121a500 
> > > > > > 00c0
> > > > > > GPR04:    
> > > > > > 
> > > > > > GPR08:  00c0 0018 
> > > > > > 00b7
> > > > > > GPR12: 0040 c0fe7840  
> > > > > > 
> > > > > > GPR16:    
> > > > > > 
> > > > > > GPR20:    
> > > > > > 
> > > > > > GPR24:    
> > > > > > 
> > > > > > GPR28: c304 c1262088 00c0 
> > > > > > c0fea500
> > > > > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > > > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > > > Call Trace:
> > > > > > [c1217d00] [c2a0] 0xc2a0 
> > > > > > (unreliable)
> > > > > > [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > > > [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> > > > > > [c1217e70] [c0f78bf0] 
> > > > > > .early_init_devtree+0x288/0x320
> > > > > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > > > > [c1217f90] [c528] 
> > > > > > start_here_multiplatform+0x68/0x80
> > > > > >
> > > > > >
> > > > > > sparc images crash, starting with next-20181009. Bisect with
> > > > > > next-201810112 points to the merge of devicetree/for-next, though
> > > > > > devicetree/for-next itself does not have the problem (bisect log
> > > > > > attached below). The crash is in devicetree code.
> > > > > >
> > > > > > Crash logs:
> > > > > > https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> > > > > > https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio
> > > >
> > > > The sparc crash appears to be related to changes I made. Looking into 
> > > > it.
> > > >
> > >
> > > Let me know if you need me to test anything or do some debugging.
> >
> > Well, I'm not having any luck getting sparc qemu to work. Here's what
> > I'm trying with a sparc32_defconfig kernel:
> >
> > $ qemu-system-sparc -kernel .build-sparc/vmlinux -M SS-4 -nographic -m
> > 256 -no-reboot
> > rom: requested regions overlap (rom phdr #0: .build-sparc/vmlinux.
> > free=0x057a, addr=0x)
> > qemu-system-sparc: rom check and register reset failed
> >
>
> What is your qemu version ?

2.11 from ubuntu 18.04:
QEMU emulator version 2.11.1(Debian 1:2.11+dfsg-1ubuntu7.5)

Rob


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Guenter Roeck
On Mon, Oct 15, 2018 at 04:48:27PM -0500, Rob Herring wrote:
> On Mon, Oct 15, 2018 at 4:18 PM Guenter Roeck  wrote:
> >
> > On Mon, Oct 15, 2018 at 03:35:12PM -0500, Rob Herring wrote:
> > > On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell  
> > > wrote:
> > > >
> > > > Hi Guenter,
> > > >
> > > > [Just cc'ing the PPC and devicetree folks]
> > > >
> > > > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  
> > > > wrote:
> > > > >
> > > > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > > > >
> > > > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> > > > >
> > > > > Same here. Interestingly, this only affects little endian pseries
> > > > > boots; big endian works fine. I'll try to bisect later.
> > > > >
> > > > > All ppc qemu tests (including big endian pseries) also generate a 
> > > > > warning.
> > > > >
> > > > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > > > .memblock_alloc_range_nid+0x20/0x68
> > > > > Modules linked in:
> > > > > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> > > > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > > > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > > > (4.19.0-rc7-next-20181015)
> > > > > MSR:  80021000   CR: 24000422  XER: 2000
> > > > > IRQMASK: 1
> > > > > GPR00: c0f99490 c1217d00 c121a500 
> > > > > 00c0
> > > > > GPR04:    
> > > > > 
> > > > > GPR08:  00c0 0018 
> > > > > 00b7
> > > > > GPR12: 0040 c0fe7840  
> > > > > 
> > > > > GPR16:    
> > > > > 
> > > > > GPR20:    
> > > > > 
> > > > > GPR24:    
> > > > > 
> > > > > GPR28: c304 c1262088 00c0 
> > > > > c0fea500
> > > > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > > Call Trace:
> > > > > [c1217d00] [c2a0] 0xc2a0 (unreliable)
> > > > > [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > > [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> > > > > [c1217e70] [c0f78bf0] .early_init_devtree+0x288/0x320
> > > > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > > > [c1217f90] [c528] 
> > > > > start_here_multiplatform+0x68/0x80
> > > > >
> > > > >
> > > > > sparc images crash, starting with next-20181009. Bisect with
> > > > > next-201810112 points to the merge of devicetree/for-next, though
> > > > > devicetree/for-next itself does not have the problem (bisect log
> > > > > attached below). The crash is in devicetree code.
> > > > >
> > > > > Crash logs:
> > > > > https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> > > > > https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio
> > >
> > > The sparc crash appears to be related to changes I made. Looking into it.
> > >
> >
> > Let me know if you need me to test anything or do some debugging.
> 
> Well, I'm not having any luck getting sparc qemu to work. Here's what
> I'm trying with a sparc32_defconfig kernel:
> 
> $ qemu-system-sparc -kernel .build-sparc/vmlinux -M SS-4 -nographic -m
> 256 -no-reboot
> rom: requested regions overlap (rom phdr #0: .build-sparc/vmlinux.
> free=0x057a, addr=0x)
> qemu-system-sparc: rom check and register reset failed
> 

What is your qemu version ?

Guenter

> Using zImage or image file didn't work any better.
> 
> Then I tried sticking the kernel in a disk image, but that didn't get
> much farther.
> 
> Rob


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Rob Herring
On Mon, Oct 15, 2018 at 4:18 PM Guenter Roeck  wrote:
>
> On Mon, Oct 15, 2018 at 03:35:12PM -0500, Rob Herring wrote:
> > On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell  
> > wrote:
> > >
> > > Hi Guenter,
> > >
> > > [Just cc'ing the PPC and devicetree folks]
> > >
> > > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  
> > > wrote:
> > > >
> > > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > > >
> > > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> > > >
> > > > Same here. Interestingly, this only affects little endian pseries
> > > > boots; big endian works fine. I'll try to bisect later.
> > > >
> > > > All ppc qemu tests (including big endian pseries) also generate a 
> > > > warning.
> > > >
> > > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > > .memblock_alloc_range_nid+0x20/0x68
> > > > Modules linked in:
> > > > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> > > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > > (4.19.0-rc7-next-20181015)
> > > > MSR:  80021000   CR: 24000422  XER: 2000
> > > > IRQMASK: 1
> > > > GPR00: c0f99490 c1217d00 c121a500 
> > > > 00c0
> > > > GPR04:    
> > > > 
> > > > GPR08:  00c0 0018 
> > > > 00b7
> > > > GPR12: 0040 c0fe7840  
> > > > 
> > > > GPR16:    
> > > > 
> > > > GPR20:    
> > > > 
> > > > GPR24:    
> > > > 
> > > > GPR28: c304 c1262088 00c0 
> > > > c0fea500
> > > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > Call Trace:
> > > > [c1217d00] [c2a0] 0xc2a0 (unreliable)
> > > > [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> > > > [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> > > > [c1217e70] [c0f78bf0] .early_init_devtree+0x288/0x320
> > > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > > [c1217f90] [c528] start_here_multiplatform+0x68/0x80
> > > >
> > > >
> > > > sparc images crash, starting with next-20181009. Bisect with
> > > > next-201810112 points to the merge of devicetree/for-next, though
> > > > devicetree/for-next itself does not have the problem (bisect log
> > > > attached below). The crash is in devicetree code.
> > > >
> > > > Crash logs:
> > > > https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> > > > https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio
> >
> > The sparc crash appears to be related to changes I made. Looking into it.
> >
>
> Let me know if you need me to test anything or do some debugging.

Well, I'm not having any luck getting sparc qemu to work. Here's what
I'm trying with a sparc32_defconfig kernel:

$ qemu-system-sparc -kernel .build-sparc/vmlinux -M SS-4 -nographic -m
256 -no-reboot
rom: requested regions overlap (rom phdr #0: .build-sparc/vmlinux.
free=0x057a, addr=0x)
qemu-system-sparc: rom check and register reset failed

Using zImage or image file didn't work any better.

Then I tried sticking the kernel in a disk image, but that didn't get
much farther.

Rob


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Guenter Roeck
On Tue, Oct 16, 2018 at 07:33:59AM +1100, Stephen Rothwell wrote:
> Hi Guenter,
> 
> [Again, just cc'ing the PPC folks]
> 
> On Mon, 15 Oct 2018 12:39:14 -0700 Guenter Roeck  wrote:
> >
> > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > Hi all,
> > > 
> > > Changes since 20181012:
> > > 
> > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> > >   
> > 
> > Bisect log:
> > 
> > # bad: [774ea0551a2966c8fc29a6f675c3e28c5c6fa586] Add linux-next specific 
> > files for 20181012
> > # good: [0238df646e6224016a45505d2c111a24669ebe21] Linux 4.19-rc7
> > git bisect start 'HEAD' 'v4.19-rc7'
> > # good: [dfbf78faefa3c26d94208398e62bf25ea798e7f2] Merge remote-tracking 
> > branch 'spi-nor/spi-nor/next'
> > git bisect good dfbf78faefa3c26d94208398e62bf25ea798e7f2
> > # good: [3f296bb430327676912966c56d2f078f74e6b4ab] Merge remote-tracking 
> > branch 'tip/auto-latest'
> > git bisect good 3f296bb430327676912966c56d2f078f74e6b4ab
> > # good: [056ff0c45d1780f7bac1b54bd4160647efc500ad] Merge remote-tracking 
> > branch 'staging/staging-next'
> > git bisect good 056ff0c45d1780f7bac1b54bd4160647efc500ad
> > # good: [d7946b50c21a7d88af6c8e88d976ba3dfca651cc] Merge remote-tracking 
> > branch 'pinctrl/for-next'
> > git bisect good d7946b50c21a7d88af6c8e88d976ba3dfca651cc
> > # good: [72b5ca3121d5352fbb8fe3e1abaa86748205c0cb] Merge remote-tracking 
> > branch 'xarray/xarray'
> > git bisect good 72b5ca3121d5352fbb8fe3e1abaa86748205c0cb
> > # good: [e3895cf23a25da6dea2c8e986d4f6c24fafe5448] hugetlb: introduce 
> > generic version of prepare_hugepage_range
> > git bisect good e3895cf23a25da6dea2c8e986d4f6c24fafe5448
> > # good: [627f8833ac26e66d4b50676a0251499474bb4ee4] reiserfs: propagate 
> > errors from fill_with_dentries() properly
> > git bisect good 627f8833ac26e66d4b50676a0251499474bb4ee4
> > # good: [e38910adf47ba1d0b5a5a573cc26bde1ec533147] memblock: replace 
> > alloc_bootmem_pages_node with memblock_alloc_node
> > git bisect good e38910adf47ba1d0b5a5a573cc26bde1ec533147
> > # bad: [f89bdd2c52666d9da4bf4ef3a97a7188586ba0fb] dma-direct: fix up for 
> > the removal of linux/bootmem.h
> > git bisect bad f89bdd2c52666d9da4bf4ef3a97a7188586ba0fb
> > # good: [3108d998dfc36eb7f6b7f2917fc561258f742094] mm: nobootmem: remove 
> > bootmem allocation APIs
> > git bisect good 3108d998dfc36eb7f6b7f2917fc561258f742094
> > # good: [1f94dacb1d0ed0d1068b89ad867a198d3eca7bf2] memblock: rename 
> > __free_pages_bootmem to memblock_free_pages
> > git bisect good 1f94dacb1d0ed0d1068b89ad867a198d3eca7bf2
> > # good: [c3954ade0c1b499ae587f3edb813876216212836] memblock: replace 
> > BOOTMEM_ALLOC_* with MEMBLOCK variants
> > git bisect good c3954ade0c1b499ae587f3edb813876216212836
> > # bad: [cde1c7f7e92aef241f1c1a09a4d1f1f06fd565b6] mm: remove 
> > include/linux/bootmem.h
> > git bisect bad cde1c7f7e92aef241f1c1a09a4d1f1f06fd565b6
> > # first bad commit: [cde1c7f7e92aef241f1c1a09a4d1f1f06fd565b6] mm: remove 
> > include/linux/bootmem.h
> > 
> > Reverting this patch together with its fix-up "powerpc: fix up for removal 
> > of
> > linux/bootmem.h" fixes the problem. This also fixes the traceback seen with 
> > all
> > other ppc64 images.
> > 
> > Guenter
> 
> Thanks for this ... though a strange result as those patches were in
> next-20181012 as well, so I wonder what else changed.
> 
Quite simple - the bisect is wrong. For some reason I started with
next-20181012 (which was fine), not 20181015 (which is broken).
Repeating it now. Sorry for the confusion.

Guenter


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Guenter Roeck
On Mon, Oct 15, 2018 at 03:35:12PM -0500, Rob Herring wrote:
> On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell  
> wrote:
> >
> > Hi Guenter,
> >
> > [Just cc'ing the PPC and devicetree folks]
> >
> > On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  wrote:
> > >
> > > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > > >
> > > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> > >
> > > Same here. Interestingly, this only affects little endian pseries
> > > boots; big endian works fine. I'll try to bisect later.
> > >
> > > All ppc qemu tests (including big endian pseries) also generate a warning.
> > >
> > > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > > .memblock_alloc_range_nid+0x20/0x68
> > > Modules linked in:
> > > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > > REGS: c1217a78 TRAP: 0700   Not tainted  
> > > (4.19.0-rc7-next-20181015)
> > > MSR:  80021000   CR: 24000422  XER: 2000
> > > IRQMASK: 1
> > > GPR00: c0f99490 c1217d00 c121a500 00c0
> > > GPR04:    
> > > GPR08:  00c0 0018 00b7
> > > GPR12: 0040 c0fe7840  
> > > GPR16:    
> > > GPR20:    
> > > GPR24:    
> > > GPR28: c304 c1262088 00c0 c0fea500
> > > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > > Call Trace:
> > > [c1217d00] [c2a0] 0xc2a0 (unreliable)
> > > [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> > > [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> > > [c1217e70] [c0f78bf0] .early_init_devtree+0x288/0x320
> > > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > > [c1217f90] [c528] start_here_multiplatform+0x68/0x80
> > >
> > >
> > > sparc images crash, starting with next-20181009. Bisect with
> > > next-201810112 points to the merge of devicetree/for-next, though
> > > devicetree/for-next itself does not have the problem (bisect log
> > > attached below). The crash is in devicetree code.
> > >
> > > Crash logs:
> > > https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> > > https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio
> 
> The sparc crash appears to be related to changes I made. Looking into it.
> 

Let me know if you need me to test anything or do some debugging.

Thanks,
Guenter


Re: [PATCH 4/8] pci: consolidate PCI config entry in drivers/pci

2018-10-15 Thread Bjorn Helgaas
s/^pci: /PCI: / in subject

On Sat, Oct 13, 2018 at 05:10:12PM +0200, Christoph Hellwig wrote:
> There is no good reason to duplicate the PCI menu in every architecture.
> Instead provide a selectable HAS_PCI symbol that indicates availability
> of PCI support and then handle the rest in drivers/pci.
> 
> Note that for powerpc we now select HAS_PCI globally instead of the
> convoluted mess of conditional or non-conditional support per board,
> similar to what we do e.g. on x86.  For alpha PCI is selected for the
> non-jensen configs as it was the default before, and a lot of code does
> not compile without PCI enabled.  On other architectures with limited
> PCI support that wasn't as complicated I've left the selection as-is.

Thanks for doing this.  It's a great cleanup.  I know you have a few
things you're cleaning up, but add my:

Acked-by: Bjorn Helgaas 

when you do that.


Re: [PATCH 1/8] aha152x: rename the PCMCIA define

2018-10-15 Thread Bjorn Helgaas
On Sat, Oct 13, 2018 at 05:10:09PM +0200, Christoph Hellwig wrote:
> We plan to enable building the pcmcia core and drivers, and the
> non-prefixed PCMCIA name clashes with some arch headers.

In the followup PCMCIA patch, you capitalized "PCMCIA core".


Re: [PATCH v3 00/18] of: overlay: validation checks, subsequent fixes

2018-10-15 Thread Alan Tull
On Mon, Oct 15, 2018 at 3:24 PM Frank Rowand  wrote:
>
> On 10/15/18 12:21, Alan Tull wrote:
> > On Sun, Oct 14, 2018 at 7:26 PM  wrote:
> >>
> >> From: Frank Rowand 
> >>
> >> Add checks to (1) overlay apply process and (2) memory freeing
> >> triggered by overlay release.  The checks are intended to detect
> >> possible memory leaks and invalid overlays.
> >>
> >> The checks revealed bugs in existing code.  Fixed the bugs.
> >>
> >> While fixing bugs, noted other issues, which are fixed in
> >> separate patches.
> >>
> >> *  Powerpc folks: I was not able to test the patches that
> >> *  directly impact Powerpc systems that use dynamic
> >> *  devicetree.  Please review that code carefully and
> >> *  test.  The specific patches are: 03/16, 04/16, 07/16
> >>
> >> FPGA folks:
> >>
> >>   I made the validation checks that should result in an
> >>   invalid live devicetree report "ERROR" and cause the overlay apply
> >>   to fail.
> >>
> >>   I made the memory leak validation tests report "WARNING" and allow
> >>   the overlay apply to complete successfully.  Please let me know
> >>   if you encounter the warnings.  There are at least two paths
> >>   forward to deal with the cases that trigger the warning: (1) change
> >>   the warning to an error and fail the overlay apply, or (2) find a
> >>   way to detect the potential memory leaks and free the memory
> >>   appropriately.
> >
> > I reran my FPGA testing.  The strings are fixed, no longer NULL.  I
>
> Thanks for the further testing!
>
>
> > have functionality back, my test passes now.  I'm seeing the intended
> > warnings about any properties added to existing nodes.  That includes
> > warnings about added symbols.  Below is a simplified part to show some
> > of what I'm seeing.
> >
> > By the way my testing is all using Pantelis' DT overlay configfs interface.
> >
> > root@arria10:~# ./apply-static-region.sh
> >
> > Applying dtbo: socfpga_arria10_socdk_sdmmc_ghrd_ovl_ext_cfg.dtb
> >
> > [ 1821.088640] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property:
> > /soc/base_fpga_region/ranges
> > [ 1821.103307] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property:
> > /soc/base_fpga_region/external-fpga-config
> > [ 1821.117359] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property:
> > /soc/base_fpga_region/clocks
> > [ 1821.130130] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property:
> > /soc/base_fpga_region/clock-names
> > [ 1821.143449] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property: /__symbols__/clk_0
> > [ 1821.155357] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property: /__symbols__/ILC
> > [ 1821.167074] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property:
> > /__symbols__/freeze_controller_0
> > [ 1821.180171] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property:
> > /__symbols__/sysid_qsys_0
> > [ 1821.192662] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property: /__symbols__/led_pio
> > [ 1821.204720] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property: /__symbols__/button_pio
> > [ 1821.217034] OF: overlay: WARNING: add_changeset_property(), memory
> > leak will occur if overlay removed.  Property: /__symbols__/dipsw_pio
> > [ 1821.231977] of-fpga-region soc:base_fpga_region:fpga_pr_region0:
> > FPGA Region probed
> > [ 1821.240144] altera_freeze_br ff200450.freeze_controller: fpga
> > bridge [freeze] registered
> >
> > root@arria10:~# ./apply-static-region.sh root@arria10:~# rmdir
> > /sys/kernel/config/device-tree/overlays/1-socfpga_arria10_socdk_sdmmc_ghrd_ovl_ext_cfg.dtb
> >
>
>
> > [ 1823.805564] OF: ERROR: memory leak - destroy cset entry: attach
> > overlay node /soc/base_fpga_region/clk_0 with refcount 2
>
> That is indicating that an unbalanced of_node_get() / of_node_put()
> exists for that node.  I'll have to update that message to be more
> explicit about that.

Yes, that sounds good

>
> -Frank
>
> >
> > Alan
> >
> >
> > Alan
> >
> >
> >
> >>
> >> ALL people:
> >>
> >>   The validations do _not_ address another major concern I have with
> >>   releasing overlays, which is use after free errors.
> >>
> >> Changes since v2:
> >>
> >>   - 13/18: Use continue to reduce indentation in find_dup_cset_node_entry()
> >> and find_dup_cset_prop()
> >>
> >> Changes since v1:
> >>
> >>   - move patch 16/16 to 17/18
> >>   - move patch 15/16 to 18/18
> >>   - new patch 15/18
> >>   - new patch 16/18
> >>
> >>   - 05/18: add_changeset_node() header comment: incorrect comment for 
> >> @target
> >>
> >>   - 

Re: linux-next: Tree for Oct 15

2018-10-15 Thread Rob Herring
On Mon, Oct 15, 2018 at 3:12 PM Stephen Rothwell  wrote:
>
> Hi Guenter,
>
> [Just cc'ing the PPC and devicetree folks]
>
> On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  wrote:
> >
> > On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > >
> > > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> >
> > Same here. Interestingly, this only affects little endian pseries
> > boots; big endian works fine. I'll try to bisect later.
> >
> > > All ppc qemu tests (including big endian pseries) also generate a warning.
> >
> > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > .memblock_alloc_range_nid+0x20/0x68
> > Modules linked in:
> > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> > > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > REGS: c1217a78 TRAP: 0700   Not tainted  (4.19.0-rc7-next-20181015)
> > MSR:  80021000   CR: 24000422  XER: 2000
> > IRQMASK: 1
> > GPR00: c0f99490 c1217d00 c121a500 00c0
> > GPR04:    
> > GPR08:  00c0 0018 00b7
> > GPR12: 0040 c0fe7840  
> > GPR16:    
> > GPR20:    
> > GPR24:    
> > GPR28: c304 c1262088 00c0 c0fea500
> > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > Call Trace:
> > [c1217d00] [c2a0] 0xc2a0 (unreliable)
> > [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> > [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> > [c1217e70] [c0f78bf0] .early_init_devtree+0x288/0x320
> > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > [c1217f90] [c528] start_here_multiplatform+0x68/0x80
> >
> >
> > sparc images crash, starting with next-20181009. Bisect with
> > > next-20181012 points to the merge of devicetree/for-next, though
> > devicetree/for-next itself does not have the problem (bisect log
> > attached below). The crash is in devicetree code.
> >
> > Crash logs:
> > https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> > https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio

The sparc crash appears to be related to changes I made. Looking into it.

Rob


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Stephen Rothwell
Hi Guenter,

[Again, just cc'ing the PPC folks]

On Mon, 15 Oct 2018 12:39:14 -0700 Guenter Roeck  wrote:
>
> On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > Hi all,
> > 
> > Changes since 20181012:
> > 
> > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> >   
> 
> Bisect log:
> 
> # bad: [774ea0551a2966c8fc29a6f675c3e28c5c6fa586] Add linux-next specific 
> files for 20181012
> # good: [0238df646e6224016a45505d2c111a24669ebe21] Linux 4.19-rc7
> git bisect start 'HEAD' 'v4.19-rc7'
> # good: [dfbf78faefa3c26d94208398e62bf25ea798e7f2] Merge remote-tracking 
> branch 'spi-nor/spi-nor/next'
> git bisect good dfbf78faefa3c26d94208398e62bf25ea798e7f2
> # good: [3f296bb430327676912966c56d2f078f74e6b4ab] Merge remote-tracking 
> branch 'tip/auto-latest'
> git bisect good 3f296bb430327676912966c56d2f078f74e6b4ab
> # good: [056ff0c45d1780f7bac1b54bd4160647efc500ad] Merge remote-tracking 
> branch 'staging/staging-next'
> git bisect good 056ff0c45d1780f7bac1b54bd4160647efc500ad
> # good: [d7946b50c21a7d88af6c8e88d976ba3dfca651cc] Merge remote-tracking 
> branch 'pinctrl/for-next'
> git bisect good d7946b50c21a7d88af6c8e88d976ba3dfca651cc
> # good: [72b5ca3121d5352fbb8fe3e1abaa86748205c0cb] Merge remote-tracking 
> branch 'xarray/xarray'
> git bisect good 72b5ca3121d5352fbb8fe3e1abaa86748205c0cb
> # good: [e3895cf23a25da6dea2c8e986d4f6c24fafe5448] hugetlb: introduce generic 
> version of prepare_hugepage_range
> git bisect good e3895cf23a25da6dea2c8e986d4f6c24fafe5448
> # good: [627f8833ac26e66d4b50676a0251499474bb4ee4] reiserfs: propagate errors 
> from fill_with_dentries() properly
> git bisect good 627f8833ac26e66d4b50676a0251499474bb4ee4
> # good: [e38910adf47ba1d0b5a5a573cc26bde1ec533147] memblock: replace 
> alloc_bootmem_pages_node with memblock_alloc_node
> git bisect good e38910adf47ba1d0b5a5a573cc26bde1ec533147
> # bad: [f89bdd2c52666d9da4bf4ef3a97a7188586ba0fb] dma-direct: fix up for the 
> removal of linux/bootmem.h
> git bisect bad f89bdd2c52666d9da4bf4ef3a97a7188586ba0fb
> # good: [3108d998dfc36eb7f6b7f2917fc561258f742094] mm: nobootmem: remove 
> bootmem allocation APIs
> git bisect good 3108d998dfc36eb7f6b7f2917fc561258f742094
> # good: [1f94dacb1d0ed0d1068b89ad867a198d3eca7bf2] memblock: rename 
> __free_pages_bootmem to memblock_free_pages
> git bisect good 1f94dacb1d0ed0d1068b89ad867a198d3eca7bf2
> # good: [c3954ade0c1b499ae587f3edb813876216212836] memblock: replace 
> BOOTMEM_ALLOC_* with MEMBLOCK variants
> git bisect good c3954ade0c1b499ae587f3edb813876216212836
> # bad: [cde1c7f7e92aef241f1c1a09a4d1f1f06fd565b6] mm: remove 
> include/linux/bootmem.h
> git bisect bad cde1c7f7e92aef241f1c1a09a4d1f1f06fd565b6
> # first bad commit: [cde1c7f7e92aef241f1c1a09a4d1f1f06fd565b6] mm: remove 
> include/linux/bootmem.h
> 
> Reverting this patch together with its fix-up "powerpc: fix up for removal of
> linux/bootmem.h" fixes the problem. This also fixes the traceback seen with 
> all
> other ppc64 images.
> 
> Guenter

Thanks for this ... though a strange result as those patches were in
next-20181012 as well, so I wonder what else changed.

-- 
Cheers,
Stephen Rothwell


pgpv8nh5rylnu.pgp
Description: OpenPGP digital signature


Re: linux-next: Tree for Oct 15

2018-10-15 Thread Stephen Rothwell
Hi all,

On Tue, 16 Oct 2018 07:12:40 +1100 Stephen Rothwell  
wrote:
>
> On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  wrote:
> >
> > > All ppc qemu tests (including big endian pseries) also generate a warning.
> > 
> > WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> > .memblock_alloc_range_nid+0x20/0x68

That is:

static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
enum memblock_flags flags)
{
   if (WARN_ON_ONCE(!align))
align = SMP_CACHE_BYTES;

Looks like patch

  "memblock: stop using implicit alignment to SMP_CACHE_BYTES"

missed some places ...

> > Modules linked in:
> > CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> > NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> > REGS: c1217a78 TRAP: 0700   Not tainted  (4.19.0-rc7-next-20181015)
> > MSR:  80021000   CR: 24000422  XER: 2000
> > IRQMASK: 1 
> > GPR00: c0f99490 c1217d00 c121a500 00c0 
> > GPR04:     
> > GPR08:  00c0 0018 00b7 
> > GPR12: 0040 c0fe7840   
> > GPR16:     
> > GPR20:     
> > GPR24:     
> > GPR28: c304 c1262088 00c0 c0fea500 
> > NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> > LR [c0f99490] .memblock_alloc_base+0x18/0x48
> > Call Trace:
> > [c1217d00] [c2a0] 0xc2a0 (unreliable)
> > [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> > [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> > [c1217e70] [c0f78bf0] .early_init_devtree+0x288/0x320
> > [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> > [c1217f90] [c528] start_here_multiplatform+0x68/0x80

-- 
Cheers,
Stephen Rothwell


pgpMKAN1qdMMz.pgp
Description: OpenPGP digital signature


Re: [PATCH v3 00/18] of: overlay: validation checks, subsequent fixes

2018-10-15 Thread Frank Rowand
On 10/15/18 12:21, Alan Tull wrote:
> On Sun, Oct 14, 2018 at 7:26 PM  wrote:
>>
>> From: Frank Rowand 
>>
>> Add checks to (1) overlay apply process and (2) memory freeing
>> triggered by overlay release.  The checks are intended to detect
>> possible memory leaks and invalid overlays.
>>
>> The checks revealed bugs in existing code.  Fixed the bugs.
>>
>> While fixing bugs, noted other issues, which are fixed in
>> separate patches.
>>
>> *  Powerpc folks: I was not able to test the patches that
>> *  directly impact Powerpc systems that use dynamic
>> *  devicetree.  Please review that code carefully and
>> *  test.  The specific patches are: 03/16, 04/16, 07/16
>>
>> FPGA folks:
>>
>>   I made the validation checks that should result in an
>>   invalid live devicetree report "ERROR" and cause the overlay apply
>>   to fail.
>>
>>   I made the memory leak validation tests report "WARNING" and allow
>>   the overlay apply to complete successfully.  Please let me know
>>   if you encounter the warnings.  There are at least two paths
>>   forward to deal with the cases that trigger the warning: (1) change
>>   the warning to an error and fail the overlay apply, or (2) find a
>>   way to detect the potential memory leaks and free the memory
>>   appropriately.
> 
> I reran my FPGA testing.  The strings are fixed, no longer NULL.  I

Thanks for the further testing!


> have functionality back, my test passes now.  I'm seeing the intended
> warnings about any properties added to existing nodes.  That includes
> warnings about added symbols.  Below is a simplified part to show some
> of what I'm seeing.
> 
> By the way my testing is all using Pantelis' DT overlay configfs interface.
> 
> root@arria10:~# ./apply-static-region.sh
> 
> Applying dtbo: socfpga_arria10_socdk_sdmmc_ghrd_ovl_ext_cfg.dtb
> 
> [ 1821.088640] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property:
> /soc/base_fpga_region/ranges
> [ 1821.103307] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property:
> /soc/base_fpga_region/external-fpga-config
> [ 1821.117359] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property:
> /soc/base_fpga_region/clocks
> [ 1821.130130] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property:
> /soc/base_fpga_region/clock-names
> [ 1821.143449] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property: /__symbols__/clk_0
> [ 1821.155357] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property: /__symbols__/ILC
> [ 1821.167074] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property:
> /__symbols__/freeze_controller_0
> [ 1821.180171] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property:
> /__symbols__/sysid_qsys_0
> [ 1821.192662] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property: /__symbols__/led_pio
> [ 1821.204720] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property: /__symbols__/button_pio
> [ 1821.217034] OF: overlay: WARNING: add_changeset_property(), memory
> leak will occur if overlay removed.  Property: /__symbols__/dipsw_pio
> [ 1821.231977] of-fpga-region soc:base_fpga_region:fpga_pr_region0:
> FPGA Region probed
> [ 1821.240144] altera_freeze_br ff200450.freeze_controller: fpga
> bridge [freeze] registered
> 
> root@arria10:~# ./apply-static-region.sh root@arria10:~# rmdir
> /sys/kernel/config/device-tree/overlays/1-socfpga_arria10_socdk_sdmmc_ghrd_ovl_ext_cfg.dtb
> 


> [ 1823.805564] OF: ERROR: memory leak - destroy cset entry: attach
> overlay node /soc/base_fpga_region/clk_0 with refcount 2

That is indicating that an unbalanced of_node_get() / of_node_put()
exists for that node.  I'll have to update that message to be more
explicit about that.

-Frank

> 
> Alan
> 
> 
> Alan
> 
> 
> 
>>
>> ALL people:
>>
>>   The validations do _not_ address another major concern I have with
>>   releasing overlays, which is use after free errors.
>>
>> Changes since v2:
>>
>>   - 13/18: Use continue to reduce indentation in find_dup_cset_node_entry()
>> and find_dup_cset_prop()
>>
>> Changes since v1:
>>
>>   - move patch 16/16 to 17/18
>>   - move patch 15/16 to 18/18
>>   - new patch 15/18
>>   - new patch 16/18
>>
>>   - 05/18: add_changeset_node() header comment: incorrect comment for @target
>>
>>   - 18/18: add same fix for of_parse_phandle_with_args()
>>   - 18/18: add same fix for of_parse_phandle_with_args_map()
>>
>> Frank Rowand (18):
>>   of: overlay: add tests to validate kfrees from overlay removal
>>   of: overlay: add missing of_node_put() after add new node to changeset
>>   of: overlay: add missing 

Re: [PATCH v3 09/18] of: overlay: validate overlay properties #address-cells and #size-cells

2018-10-15 Thread Frank Rowand
On 10/15/18 12:01, Alan Tull wrote:
> On Sun, Oct 14, 2018 at 7:26 PM  wrote:
>>
>> From: Frank Rowand 
>>
>> If overlay properties #address-cells or #size-cells are already in
>> the live devicetree for any given node, then the values in the
>> overlay must match the values in the live tree.
>>
>> If the properties are already in the live tree then there is no
>> need to create a changeset entry to add them since they must
>> have the same value.  This reduces the memory used by the
>> changeset and eliminates a possible memory leak.  This is
>> verified by 12 fewer warnings during the devicetree unittest,
>> as the possible memory leak warnings about #address-cells and
>> #size-cells no longer occur.
>>
>> Signed-off-by: Frank Rowand 
>> ---
>>  drivers/of/overlay.c | 38 +++---
>>  1 file changed, 35 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
>> index 272a0d1a5e18..ee66651db553 100644
>> --- a/drivers/of/overlay.c
>> +++ b/drivers/of/overlay.c
>> @@ -287,7 +287,12 @@ static struct property *dup_and_fixup_symbol_prop(
>>   * @target may be either in the live devicetree or in a new subtree that
>>   * is contained in the changeset.
>>   *
>> - * Some special properties are not updated (no error returned).
>> + * Some special properties are not added or updated (no error returned):
>> + * "name", "phandle", "linux,phandle".
>> + *
>> + * Properties "#address-cells" and "#size-cells" are not updated if they
>> + * are already in the live tree, but if present in the live tree, the values
>> + * in the overlay must match the values in the live tree.
>>   *
>>   * Update of property in symbols node is not allowed.
>>   *
>> @@ -300,6 +305,7 @@ static int add_changeset_property(struct 
>> overlay_changeset *ovcs,
>>  {
>> struct property *new_prop = NULL, *prop;
>> int ret = 0;
>> +   bool check_for_non_overlay_node = false;
>>
>> if (!of_prop_cmp(overlay_prop->name, "name") ||
>> !of_prop_cmp(overlay_prop->name, "phandle") ||
>> @@ -322,13 +328,39 @@ static int add_changeset_property(struct 
>> overlay_changeset *ovcs,
>> if (!new_prop)
>> return -ENOMEM;
>>
>> -   if (!prop)
>> +   if (!prop) {
>> +
>> +   check_for_non_overlay_node = true;
>> ret = of_changeset_add_property(&ovcs->cset, target->np,
>> new_prop);
>> -   else
>> +
>> +   } else if (!of_prop_cmp(prop->name, "#address-cells")) {
>> +
> 
> Hi Frank,
> 
> If we get these ERROR messages, I suggest that this function should
> return an error so the overlay will be rejected.
> 
>> +   if (prop->length != 4 || new_prop->length != 4 ||
>> +   *(u32 *)prop->value != *(u32 *)new_prop->value)
> 
> *(u32 *)prop->value != *(u32 *)new_prop->value) {
> 
>> +   pr_err("ERROR: overlay and/or live tree 
>> #address-cells invalid in node %pOF\n",
>> +  target->np);
> 
>ret = -EINVAL;
> }
> 
> Otherwise there is an ERROR message, but it continues trying to apply
> the invalid overlay anyway and I get an oops.  By adding the ret =
> -EINVAL, the overlay gets rejected and the oops is avoided.

Yes, that sounds good.


>> +
>> +   } else if (!of_prop_cmp(prop->name, "#size-cells")) {
>> +
>> +   if (prop->length != 4 || new_prop->length != 4 ||
>> +   *(u32 *)prop->value != *(u32 *)new_prop->value)
>> +   pr_err("ERROR: overlay and/or live tree #size-cells 
>> invalid in node %pOF\n",
>> +  target->np);
> 
> Add the ret = -EINVAL here also.  This gives me the following (if my
> overlay changes #address-cells):

Yes.


> [   21.167551] OF: overlay: ERROR: overlay and/or live tree
> #address-cells invalid in node /soc/base_fpga_region
> [   21.177442] OF: overlay: add_changeset_property ret=-22
> [   21.182656] create_overlay: Failed to create overlay (err=-22)
> 
> Also, I wonder if the ERROR message could be more direct.  Currently
> it says the #address-cells property is invalid but that doesn't say
> anything about why it's invalid.  How about something like:
> 
>  OF: overlay: ERROR: changing #address-cells not allowed 
> (/soc/base_fpga_region)

That sounds like a more useful message, maybe a slight change
s/changing #address-cells/changing value of #address-cells/


> The 'OF: overlay' part still makes it clear it's overlay related.  The
> rest of it makes it clear *why* it's invalid.  This ERROR will be a
> surprise for people who have been using overlays, so that could be
> helpful to light the way a bit.
> 
> Alan
> 
>> +
>> +   } else {
>> +
>> +   check_for_non_overlay_node = true;
>> ret = of_changeset_update_property(&ovcs->cset, target->np,
>>new_prop);
>>
>> +   

Re: linux-next: Tree for Oct 15

2018-10-15 Thread Stephen Rothwell
Hi Guenter,

[Just cc'ing the PPC and devicetree folks]

On Mon, 15 Oct 2018 11:26:37 -0700 Guenter Roeck  wrote:
>
> On Mon, Oct 15, 2018 at 07:25:46PM +1100, Stephen Rothwell wrote:
> > 
> > My qemu boots of a powerpc pseries_le_defconfig kernel failed today.
> 
> Same here. Interestingly, this only affects little endian pseries
> boots; big endian works fine. I'll try to bisect later.
> 
> All ppc qemu tests (including big endian pseries) also generate a warning.
> 
> WARNING: CPU: 0 PID: 0 at mm/memblock.c:1301 
> .memblock_alloc_range_nid+0x20/0x68
> Modules linked in:
> CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7-next-20181015 #1
> NIP:  c0f99198 LR: c0f99490 CTR: c0bb8364
> REGS: c1217a78 TRAP: 0700   Not tainted  (4.19.0-rc7-next-20181015)
> MSR:  80021000   CR: 24000422  XER: 2000
> IRQMASK: 1 
> GPR00: c0f99490 c1217d00 c121a500 00c0 
> GPR04:     
> GPR08:  00c0 0018 00b7 
> GPR12: 0040 c0fe7840   
> GPR16:     
> GPR20:     
> GPR24:     
> GPR28: c304 c1262088 00c0 c0fea500 
> NIP [c0f99198] .memblock_alloc_range_nid+0x20/0x68
> LR [c0f99490] .memblock_alloc_base+0x18/0x48
> Call Trace:
> [c1217d00] [c2a0] 0xc2a0 (unreliable)
> [c1217d80] [c0f99490] .memblock_alloc_base+0x18/0x48
> [c1217df0] [c0f7a274] .allocate_paca_ptrs+0x3c/0x74
> [c1217e70] [c0f78bf0] .early_init_devtree+0x288/0x320
> [c1217f10] [c0f79b6c] .early_setup+0x80/0x130
> [c1217f90] [c528] start_here_multiplatform+0x68/0x80
> 
> 
> sparc images crash, starting with next-20181009. Bisect with
> next-20181012 points to the merge of devicetree/for-next, though
> devicetree/for-next itself does not have the problem (bisect log
> attached below). The crash is in devicetree code.
> 
> Crash logs:
> https://kerneltests.org/builders/qemu-sparc64-next/builds/981/steps/qemubuildcommand_1/logs/stdio
> https://kerneltests.org/builders/qemu-sparc-next/builds/975/steps/qemubuildcommand_1/logs/stdio
> 
> Guenter
> 
> ---
> # bad: [774ea0551a2966c8fc29a6f675c3e28c5c6fa586] Add linux-next specific 
> files for 20181012
> # good: [0238df646e6224016a45505d2c111a24669ebe21] Linux 4.19-rc7
> git bisect start 'HEAD' 'v4.19-rc7'
> # good: [dfbf78faefa3c26d94208398e62bf25ea798e7f2] Merge remote-tracking 
> branch 'spi-nor/spi-nor/next'
> git bisect good dfbf78faefa3c26d94208398e62bf25ea798e7f2
> # bad: [3f296bb430327676912966c56d2f078f74e6b4ab] Merge remote-tracking 
> branch 'tip/auto-latest'
> git bisect bad 3f296bb430327676912966c56d2f078f74e6b4ab
> # good: [efad9cbc89fbef3c4b3905e1c01a8191eae4c772] Merge remote-tracking 
> branch 'sound/for-next'
> git bisect good efad9cbc89fbef3c4b3905e1c01a8191eae4c772
> # good: [7d12a265b24001fbff1ff260c2f6bd802224a7c0] Merge remote-tracking 
> branch 'iommu/next'
> git bisect good 7d12a265b24001fbff1ff260c2f6bd802224a7c0
> # good: [4fc72c0ef3c1e792caf06d25ef68c7c871730e31] Merge branch 'ras/core'
> git bisect good 4fc72c0ef3c1e792caf06d25ef68c7c871730e31
> # good: [d74865bd3996c7a6f3e8ce6e626c1fe474e39494] Merge branch 'x86/mm'
> git bisect good d74865bd3996c7a6f3e8ce6e626c1fe474e39494
> # bad: [1b1ab6a98adab8a0436024b369305a978e365a13] Merge remote-tracking 
> branch 'mailbox/mailbox-for-next'
> git bisect bad 1b1ab6a98adab8a0436024b369305a978e365a13
> # good: [389d0a8a7af8ff8bb6301382333c7e8f748d7cd6] Merge branch 
> 'dt/cpu-type-rework' into dt/next
> git bisect good 389d0a8a7af8ff8bb6301382333c7e8f748d7cd6
> # good: [4355151de47c2b4bc72c026ee743bd9ed7f71ba3] Merge branch 'all-dtbs' 
> into dt/next
> git bisect good 4355151de47c2b4bc72c026ee743bd9ed7f71ba3
> # good: [60d744213fd9433b10b23afafb694a44c8e96cb8] Merge remote-tracking 
> branch 'vfio/next'
> git bisect good 60d744213fd9433b10b23afafb694a44c8e96cb8
> # good: [9f0a0a381c5db56e7922dbeea6831f27db58372f] mailbox: mediatek: Add 
> check for possible failure of kzalloc
> git bisect good 9f0a0a381c5db56e7922dbeea6831f27db58372f
> # good: [157b4129ded8ba756ef17c058192e734889673e4] dt-bindings: arm: fsl: 
> Move DCFG and SCFG bindings to their own docs
> git bisect good 157b4129ded8ba756ef17c058192e734889673e4
> # bad: [bed61948ea6c57bc73fb3ded9421c1bdd8cbe4d9] Merge remote-track

[PATCH v06 2/5] powerpc/drmem: Add internal_flags feature

2018-10-15 Thread Michael Bringmann
powerpc/drmem: Add internal_flags field to each LMB to allow
marking of kernel software-specific operations that need not
be exported to other users.  For instance, if information about
selected LMBs needs to be maintained for subsequent passes
through the system, it can be encoded into the LMB array itself
without requiring the allocation and maintenance of additional
data structures.

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Add another initialization of 'lmb->internal_flags' to
 init_drmem_v2_lmbs.
---
 arch/powerpc/include/asm/drmem.h |   18 ++
 arch/powerpc/mm/drmem.c  |3 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index cfe8598..dbb3e6c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -17,6 +17,7 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
+   u32 internal_flags;
 };

 struct drmem_lmb_info {
@@ -94,6 +95,23 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
return lmb->flags & DRMEM_LMB_RESERVED;
 }

+#define DRMEM_LMBINT_UPDATE 0x0001
+
+static inline void drmem_mark_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags |= DRMEM_LMBINT_UPDATE;
+}
+
+static inline void drmem_remove_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags &= ~DRMEM_LMBINT_UPDATE;
+}
+
+static inline bool drmem_lmb_update(struct drmem_lmb *lmb)
+{
+   return lmb->internal_flags & DRMEM_LMBINT_UPDATE;
+}
+
 u64 drmem_lmb_memory_max(void);
 void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index ded9dbf..f199fe5 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -207,6 +207,7 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,

lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
+   lmb->internal_flags = 0;

*prop = p;
 }
@@ -265,6 +266,7 @@ static void __walk_drmem_v2_lmbs(const __be32 *prop, const 
__be32 *usm,

lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
+   lmb.internal_flags = 0;

func(&lmb, &usm);
}
@@ -441,6 +443,7 @@ static void init_drmem_v2_lmbs(const __be32 *prop,

lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+   lmb->internal_flags = 0;
}
}
 }



[PATCH v06 5/5] migration/memory: Support 'ibm,dynamic-memory-v2'

2018-10-15 Thread Michael Bringmann
migration/memory: This patch adds recognition for changes to the
associativity of memory blocks described by 'ibm,dynamic-memory-v2'.
If the associativity of an LMB has changed, it should be readded to
the system in order to update local and general kernel data structures.
This patch builds upon previous enhancements that scan the device-tree
"ibm,dynamic-memory" properties using the base LMB array, and a copy
derived from the updated properties.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index dc2aa34..8c08eb2 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -1184,7 +1184,8 @@ static int pseries_memory_notifier(struct notifier_block 
*nb,
err = pseries_remove_mem_node(rd->dn);
break;
case OF_RECONFIG_UPDATE_PROPERTY:
-   if (!strcmp(rd->prop->name, "ibm,dynamic-memory")) {
+   if (!strcmp(rd->prop->name, "ibm,dynamic-memory") ||
+   !strcmp(rd->prop->name, "ibm,dynamic-memory-v2")) {
struct drmem_lmb_info *dinfo =
drmem_lmbs_init(rd->prop);
if (!dinfo)



[PATCH v06 4/5] migration/memory: Evaluate LMB assoc changes

2018-10-15 Thread Michael Bringmann
migration/memory: This patch adds code that recognizes changes to
the associativity of memory blocks described by the device-tree
properties in order to drive equivalent 'hotplug' operations to
update local and general kernel data structures to reflect those
changes.  These differences may include:

* Evaluate 'ibm,dynamic-memory' properties when processing the
  updated device-tree properties of the system during Post Migration
  events (migration_store).  The new functionality looks for changes
  to the aa_index values for each drc_index/LMB to identify any memory
  blocks that should be readded.

* In an LPAR migration scenario, the "ibm,associativity-lookup-arrays"
  property may change.  In the event that a row of the array differs,
  locate all assigned memory blocks with that 'aa_index' and 're-add'
  them to the system memory block data structures.  In the process of
  the 're-add', the system routines will update the corresponding entry
  for the memory in the LMB structures and any other relevant kernel
  data structures.

A number of previous extensions made to the DRMEM code for scanning
device-tree properties and creating LMB arrays are used here to
ensure that the resulting code is simpler and more usable:

* Use new paired list iterator for the DRMEM LMB info arrays to find
  differences in old and new versions of properties.
* Use new iterator for copies of the DRMEM info arrays to evaluate
  completely new structures.
* Combine common code for parsing and evaluating memory description
  properties based on the DRMEM LMB array model to greatly simplify
  extension from the older property 'ibm,dynamic-memory' to the new
  property model of 'ibm,dynamic-memory-v2'.

For support, add a new pseries hotplug action for DLPAR operations,
PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.  It is a variant of the READD
operation which performs the action upon multiple instances of the
resource at one time.  The operation is to be triggered by device-tree
analysis of updates by RTAS events analyzed by 'migration_store' during
post-migration processing.  It will be used for memory updates,
initially.

Signed-off-by: Michael Bringmann 
---
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
Changes in v04:
  -- Move dlpar_memory_readd_multiple() function definition and use
 into previous patch along with action constant definition.
  -- Correct spacing in patch
Changes in v03:
  -- Modify the code that parses the memory affinity attributes to
 mark relevant DRMEM LMB array entries using the internal_flags
 mechanism instead of generate unique hotplug actions for each
 memory block to be readded.  The change is intended to both
 simplify the code, and to require fewer resources on systems
 with huge amounts of memory.
  -- Save up notice about any all LMB entries until the end of the
 'migration_store' operation at which point a single action is
 queued to scan the entire DRMEM array.
  -- Add READD_MULTIPLE function for memory that scans the DRMEM
 array to identify multiple entries that were marked previously.
 The corresponding memory blocks are to be readded to the system
 to update relevant data structures outside of the powerpc-
 specific code.
  -- Change dlpar_memory_pmt_changes_action to directly queue worker
 to pseries work queue.
---
 arch/powerpc/include/asm/topology.h |7 +
 arch/powerpc/mm/numa.c  |6 -
 arch/powerpc/platforms/pseries/hotplug-memory.c |  207 +++
 arch/powerpc/platforms/pseries/mobility.c   |3 
 arch/powerpc/platforms/pseries/pseries.h|8 +
 5 files changed, 186 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index a4a718d..fbe03df 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -135,5 +135,12 @@ static inline void shared_proc_topology_init(void) {}
 #endif
 #endif

+
+struct assoc_arrays {
+   u32 n_arrays;
+   u32 array_sz;
+   const __be32 *arrays;
+};
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 693ae1c..f1e7287 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -368,12 +368,6 @@ static unsigned long read_n_cells(int n, const __be32 
**buf)
return result;
 }

-struct assoc_arrays {
-   u32 n_arrays;
-   u32 array_sz;
-   const __be32 *arrays;
-};
-
 /*
  * Retrieve and validate the list of associativity arrays for drconf
  * memory from the ibm,associativity-lookup-arrays property of the
diff --git 

[PATCH v06 3/5] migration/memory: Add hotplug READD_MULTIPLE

2018-10-15 Thread Michael Bringmann
migration/memory: This patch adds a new pseries hotplug action
for CPU and memory operations, PSERIES_HP_ELOG_ACTION_READD_MULTIPLE.
This is a variant of the READD operation which performs the action
upon multiple instances of the resource at one time.  The operation
is to be triggered by device-tree analysis of updates by RTAS events
analyzed by 'migration_store' during post-migration processing.  It
will be used for memory updates, initially.

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Provide dlpar_memory_readd_helper routine to compress some common code
Changes in v04:
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 previous patch.
  -- Pull in implementation of dlpar_memory_readd_multiple() to go
 with operation flag.
---
 arch/powerpc/include/asm/rtas.h |1 +
 arch/powerpc/platforms/pseries/hotplug-memory.c |   44 ---
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 0183e95..cc00451 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -333,6 +333,7 @@ struct pseries_hp_errorlog {
 #define PSERIES_HP_ELOG_ACTION_ADD 1
 #define PSERIES_HP_ELOG_ACTION_REMOVE  2
 #define PSERIES_HP_ELOG_ACTION_READD   3
+#define PSERIES_HP_ELOG_ACTION_READD_MULTIPLE  4

 #define PSERIES_HP_ELOG_ID_DRC_NAME1
 #define PSERIES_HP_ELOG_ID_DRC_INDEX   2
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2b796da..9c76345 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -507,6 +507,19 @@ static int dlpar_memory_remove_by_index(u32 drc_index)
return rc;
 }

+static int dlpar_memory_readd_helper(struct drmem_lmb *lmb)
+{
+   int rc;
+
+   rc = dlpar_remove_lmb(lmb);
+   if (!rc) {
+   rc = dlpar_add_lmb(lmb);
+   if (rc)
+   dlpar_release_drc(lmb->drc_index);
+   }
+   return rc;
+}
+
 static int dlpar_memory_readd_by_index(u32 drc_index)
 {
struct drmem_lmb *lmb;
@@ -519,12 +532,7 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
for_each_drmem_lmb(lmb) {
if (lmb->drc_index == drc_index) {
lmb_found = 1;
-   rc = dlpar_remove_lmb(lmb);
-   if (!rc) {
-   rc = dlpar_add_lmb(lmb);
-   if (rc)
-   dlpar_release_drc(lmb->drc_index);
-   }
+   rc = dlpar_memory_readd_helper(lmb);
break;
}
}
@@ -541,6 +549,23 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
return rc;
 }

+static int dlpar_memory_readd_multiple(void)
+{
+   struct drmem_lmb *lmb;
+   int rc;
+
+   pr_info("Attempting to update multiple LMBs\n");
+
+   for_each_drmem_lmb(lmb) {
+   if (drmem_lmb_update(lmb)) {
+   rc = dlpar_memory_readd_helper(lmb);
+   drmem_remove_lmb_update(lmb);
+   }
+   }
+
+   return rc;
+}
+
 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
struct drmem_lmb *lmb, *start_lmb, *end_lmb;
@@ -641,6 +666,10 @@ static int dlpar_memory_readd_by_index(u32 drc_index)
 {
return -EOPNOTSUPP;
 }
+static int dlpar_memory_readd_multiple(void)
+{
+   return -EOPNOTSUPP;
+}

 static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 {
@@ -918,6 +947,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
drc_index = hp_elog->_drc_u.drc_index;
rc = dlpar_memory_readd_by_index(drc_index);
break;
+   case PSERIES_HP_ELOG_ACTION_READD_MULTIPLE:
+   rc = dlpar_memory_readd_multiple();
+   break;
default:
pr_err("Invalid action (%d) specified\n", hp_elog->action);
rc = -EINVAL;



Fwd: [PATCH v06 2/5] powerpc/drmem: Add internal_flags feature

2018-10-15 Thread Michael Bringmann
powerpc/drmem: Add internal_flags field to each LMB to allow
marking of kernel software-specific operations that need not
be exported to other users.  For instance, if information about
selected LMBs needs to be maintained for subsequent passes
through the system, it can be encoded into the LMB array itself
without requiring the allocation and maintenance of additional
data structures.

Signed-off-by: Michael Bringmann 
---
Changes in v04:
  -- Add another initialization of 'lmb->internal_flags' to
 init_drmem_v2_lmbs.
---
 arch/powerpc/include/asm/drmem.h |   18 ++
 arch/powerpc/mm/drmem.c  |3 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index cfe8598..dbb3e6c 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -17,6 +17,7 @@ struct drmem_lmb {
u32 drc_index;
u32 aa_index;
u32 flags;
+   u32 internal_flags;
 };

 struct drmem_lmb_info {
@@ -94,6 +95,23 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
return lmb->flags & DRMEM_LMB_RESERVED;
 }

+#define DRMEM_LMBINT_UPDATE0x0001
+
+static inline void drmem_mark_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags |= DRMEM_LMBINT_UPDATE;
+}
+
+static inline void drmem_remove_lmb_update(struct drmem_lmb *lmb)
+{
+   lmb->internal_flags &= ~DRMEM_LMBINT_UPDATE;
+}
+
+static inline bool drmem_lmb_update(struct drmem_lmb *lmb)
+{
+   return lmb->internal_flags & DRMEM_LMBINT_UPDATE;
+}
+
 u64 drmem_lmb_memory_max(void);
 void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index ded9dbf..f199fe5 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -207,6 +207,7 @@ static void read_drconf_v1_cell(struct drmem_lmb *lmb,

lmb->aa_index = of_read_number(p++, 1);
lmb->flags = of_read_number(p++, 1);
+   lmb->internal_flags = 0;

*prop = p;
 }
@@ -265,6 +266,7 @@ static void __walk_drmem_v2_lmbs(const __be32 *prop, const 
__be32 *usm,

lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
+   lmb.internal_flags = 0;

func(, );
}
@@ -441,6 +443,7 @@ static void init_drmem_v2_lmbs(const __be32 *prop,

lmb->aa_index = dr_cell.aa_index;
lmb->flags = dr_cell.flags;
+   lmb->internal_flags = 0;
}
}
 }



[PATCH v06 1/5] powerpc/drmem: Export 'dynamic-memory' loader

2018-10-15 Thread Michael Bringmann
powerpc/drmem: Export many of the functions of DRMEM to parse
"ibm,dynamic-memory" and "ibm,dynamic-memory-v2" during hotplug
operations and for Post Migration events.

Also modify the DRMEM initialization code to allow it to,

* Be called after system initialization
* Provide a separate user copy of the LMB array that it produces
* Free the user copy upon request

In addition, a couple of changes were made to make the creation
of additional copies of the LMB array more useful including,

* Add iterator function to work through a pair of drmem_info arrays
  with a callback function to apply specific tests.
* Modify DRMEM code to replace usages of dt_root_addr_cells, and
  dt_mem_next_cell, as these are only available at first boot.

Signed-off-by: Michael Bringmann 
---
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
---
 arch/powerpc/include/asm/drmem.h |   13 +
 arch/powerpc/mm/drmem.c  |   96 ++
 2 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 7c1d8e7..cfe8598 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -35,6 +35,11 @@ struct drmem_lmb_info {
_info->lmbs[0],   \
_info->lmbs[drmem_info->n_lmbs - 1])

+#define for_each_dinfo_lmb(dinfo, lmb) \
+   for_each_drmem_lmb_in_range((lmb),  \
+   >lmbs[0],\
+   >lmbs[dinfo->n_lmbs - 1])
+
 /*
  * The of_drconf_cell_v1 struct defines the layout of the LMB data
  * specified in the ibm,dynamic-memory device tree property.
@@ -94,6 +99,14 @@ void __init walk_drmem_lmbs(struct device_node *dn,
void (*func)(struct drmem_lmb *, const __be32 **));
 int drmem_update_dt(void);

+struct drmem_lmb_info *drmem_lmbs_init(struct property *prop);
+void drmem_lmbs_free(struct drmem_lmb_info *dinfo);
+int walk_drmem_lmbs_pairs(struct drmem_lmb_info *dinfo_oth,
+ int (*func)(struct drmem_lmb *cnt,
+   struct drmem_lmb *oth,
+   void *data),
+ void *data);
+
 #ifdef CONFIG_PPC_PSERIES
 void __init walk_drmem_lmbs_early(unsigned long node,
void (*func)(struct drmem_lmb *, const __be32 **));
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 3f18036..ded9dbf 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -20,6 +20,7 @@

 static struct drmem_lmb_info __drmem_info;
 struct drmem_lmb_info *drmem_info = &__drmem_info;
+static int n_root_addr_cells;

 u64 drmem_lmb_memory_max(void)
 {
@@ -193,12 +194,13 @@ int drmem_update_dt(void)
return rc;
 }

-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
   const __be32 **prop)
 {
const __be32 *p = *prop;

-   lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, );
+   lmb->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
lmb->drc_index = of_read_number(p++, 1);

p++; /* skip reserved field */
@@ -209,7 +211,7 @@ static void __init read_drconf_v1_cell(struct drmem_lmb 
*lmb,
*prop = p;
 }

-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct drmem_lmb lmb;
@@ -225,13 +227,14 @@ static void __init __walk_drmem_v1_lmbs(const __be32 
*prop, const __be32 *usm,
}
 }

-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
   const __be32 **prop)
 {
const __be32 *p = *prop;

dr_cell->seq_lmbs = of_read_number(p++, 1);
-   dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, );
+   dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+   p += n_root_addr_cells;
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
@@ -239,7 +242,7 @@ static void __init read_drconf_v2_cell(struct 
of_drconf_cell_v2 *dr_cell,
*prop = p;
 }

-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
+static void __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
void (*func)(struct drmem_lmb *, const __be32 **))
 {
struct of_drconf_cell_v2 dr_cell;
@@ -275,6 +278,9 @@ void __init walk_drmem_lmbs_early(unsigned long node,
const __be32 *prop, *usm;
int len;

+   if 

[PATCH v06 0/5] powerpc/migration: Affinity fix for memory

2018-10-15 Thread Michael Bringmann
The migration of LPARs across Power systems affects many attributes
including that of the associativity of memory blocks.  The patches
in this set execute when a system is coming up fresh upon a migration
target.  They are intended to,

* Recognize changes to the associativity of memory recorded in
  internal data structures when compared to the latest copies in
  the device tree (e.g. ibm,dynamic-memory, ibm,dynamic-memory-v2).
* Recognize changes to the associativity mapping (e.g. ibm,
  associativity-lookup-arrays), locate all assigned memory blocks
  corresponding to each changed row, and readd all such blocks.
* Generate calls to other code layers to reset the data structures
  related to associativity of memory.
* Re-register the 'changed' entities into the target system.
  Re-registration of memory blocks mostly entails acting as if they
  have been newly hot-added into the target system.

This code builds upon features introduced in a previous patch set
that updates CPUs for affinity changes that may occur during LPM.

Signed-off-by: Michael Bringmann 

Michael Bringmann (5):
  powerpc/drmem: Export 'dynamic-memory' loader
  powerpc/drmem: Add internal_flags feature
  migration/memory: Add hotplug flags READD_MULTIPLE
  migration/memory: Evaluate LMB assoc changes
  migration/memory: Support 'ibm,dynamic-memory-v2'
---
Changes in v06:
  -- Rebase to powerpc next branch to account for recent code changes.
  -- Fix prototype problem when CONFIG_MEMORY_HOTPLUG not defined.
Changes in v05:
  -- Add walk_drmem_lmbs_pairs to replace macro for_each_pair_lmb
  -- Use walk_drmem_lmbs_pairs and callback instead of local loop
  -- Provide dlpar_memory_readd_helper routine to compress some common code
  -- Move common structure from numa.c + hotplug-memory.c to header file.
  -- Clarify some comments.
Changes in v04:
  -- Move dlpar_memory_readd_multiple() to patch with new ACTION
 constant.
  -- Move init of 'lmb->internal_flags' in init_drmem_v2_lmbs to
 patch with other references to flag.
  -- Correct spacing in one of the patches
Changes in v03:
  -- Change operation to tag changed LMBs in DRMEM array instead of
 queuing a potentially huge number of structures.
  -- Added another hotplug queue event for CPU/memory operations
  -- Added internal_flags feature to DRMEM
  -- Improve the patch description language for the patch set.
  -- Revise patch set to queue worker for memory association
 updates directly to pseries worker queue.



Re: [PATCH v3 13/18] of: overlay: check prevents multiple fragments touching same property

2018-10-15 Thread Frank Rowand
On 10/14/18 20:21, Frank Rowand wrote:
> On 10/14/18 18:55, Joe Perches wrote:
>> On Sun, 2018-10-14 at 18:52 -0700, Frank Rowand wrote:
>>> On 10/14/18 18:06, Joe Perches wrote:
 On Sun, 2018-10-14 at 17:24 -0700, frowand.l...@gmail.com wrote:
> From: Frank Rowand 
>
> Add test case of two fragments updating the same property.  After
> adding the test case, the system hangs at end of boot, after
> slub stack dumps from kfree() in crypto modprobe code.
>> []
 I think this is worse performance than before.

 This now walks all entries when before it would
 return -EINVAL directly when it found a match.
>>>
>>> Yes, it is worse performance, but that is OK.
>>>
>>> This is a check that is done when a devicetree overlay is applied.
>>> If an error occurs then that means that the overlay was incorrectly
>>> specified.  The file drivers/of/unittest-data/overlay_bad_add_dup_prop.dts
>>> in this patch provides an example of how a bad overlay can be created.
>>>
>>> Once an error was detected, the check could return immediately, or it
>>> could continue to give a complete list of detected errors.  I chose to
>>> give the complete list of detected errors.
>>
>> Swell.  Please describe that in the commit message.
> 
> If a version 4 of the series is created I will update the commit
> message.  As a stand alone item I do not think it is worth a
> new version.

And there will be a version 4, so I will update the commit message.

-Frank


Re: [PATCH v3 00/18] of: overlay: validation checks, subsequent fixes

2018-10-15 Thread Alan Tull
On Sun, Oct 14, 2018 at 7:26 PM  wrote:
>
> From: Frank Rowand 
>
> Add checks to (1) overlay apply process and (2) memory freeing
> triggered by overlay release.  The checks are intended to detect
> possible memory leaks and invalid overlays.
>
> The checks revealed bugs in existing code.  Fixed the bugs.
>
> While fixing bugs, noted other issues, which are fixed in
> separate patches.
>
> *  Powerpc folks: I was not able to test the patches that
> *  directly impact Powerpc systems that use dynamic
> *  devicetree.  Please review that code carefully and
> *  test.  The specific patches are: 03/16, 04/16, 07/16
>
> FPGA folks:
>
>   I made the validation checks that should result in an
>   invalid live devicetree report "ERROR" and cause the overlay apply
>   to fail.
>
>   I made the memory leak validation tests report "WARNING" and allow
>   the overlay apply to complete successfully.  Please let me know
>   if you encounter the warnings.  There are at least two paths
>   forward to deal with the cases that trigger the warning: (1) change
>   the warning to an error and fail the overlay apply, or (2) find a
>   way to detect the potential memory leaks and free the memory
>   appropriately.

I reran my FPGA testing.  The strings are fixed, no longer NULL.  I
have functionality back, my test passes now.  I'm seeing the intended
warnings about any properties added to existing nodes.  That includes
warnings about added symbols.  Below is a simplified part to show some
of what I'm seeing.

By the way my testing is all using Pantelis' DT overlay configfs interface.

root@arria10:~# ./apply-static-region.sh

Applying dtbo: socfpga_arria10_socdk_sdmmc_ghrd_ovl_ext_cfg.dtb

[ 1821.088640] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property:
/soc/base_fpga_region/ranges
[ 1821.103307] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property:
/soc/base_fpga_region/external-fpga-config
[ 1821.117359] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property:
/soc/base_fpga_region/clocks
[ 1821.130130] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property:
/soc/base_fpga_region/clock-names
[ 1821.143449] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property: /__symbols__/clk_0
[ 1821.155357] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property: /__symbols__/ILC
[ 1821.167074] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property:
/__symbols__/freeze_controller_0
[ 1821.180171] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property:
/__symbols__/sysid_qsys_0
[ 1821.192662] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property: /__symbols__/led_pio
[ 1821.204720] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property: /__symbols__/button_pio
[ 1821.217034] OF: overlay: WARNING: add_changeset_property(), memory
leak will occur if overlay removed.  Property: /__symbols__/dipsw_pio
[ 1821.231977] of-fpga-region soc:base_fpga_region:fpga_pr_region0:
FPGA Region probed
[ 1821.240144] altera_freeze_br ff200450.freeze_controller: fpga
bridge [freeze] registered

root@arria10:~# ./apply-static-region.sh root@arria10:~# rmdir
/sys/kernel/config/device-tree/overlays/1-socfpga_arria10_socdk_sdmmc_ghrd_ovl_ext_cfg.dtb

[ 1823.805564] OF: ERROR: memory leak - destroy cset entry: attach
overlay node /soc/base_fpga_region/clk_0 with refcount 2

Alan


Alan



>
> ALL people:
>
>   The validations do _not_ address another major concern I have with
>   releasing overlays, which is use after free errors.
>
> Changes since v2:
>
>   - 13/18: Use continue to reduce indentation in find_dup_cset_node_entry()
> and find_dup_cset_prop()
>
> Changes since v1:
>
>   - move patch 16/16 to 17/18
>   - move patch 15/16 to 18/18
>   - new patch 15/18
>   - new patch 16/18
>
>   - 05/18: add_changeset_node() header comment: incorrect comment for @target
>
>   - 18/18: add same fix for of_parse_phandle_with_args()
>   - 18/18: add same fix for of_parse_phandle_with_args_map()
>
> Frank Rowand (18):
>   of: overlay: add tests to validate kfrees from overlay removal
>   of: overlay: add missing of_node_put() after add new node to changeset
>   of: overlay: add missing of_node_get() in __of_attach_node_sysfs
>   powerpc/pseries: add of_node_put() in dlpar_detach_node()
>   of: overlay: use prop add changeset entry for property in new nodes
>   of: overlay: do not duplicate properties from overlay for new nodes
>   of: dynamic: change type of of_{at,de}tach_node() to void
>   of: overlay: reorder fields in struct fragment
>   of: overlay: validate overlay properties 

Re: [PATCH v3 09/18] of: overlay: validate overlay properties #address-cells and #size-cells

2018-10-15 Thread Alan Tull
On Sun, Oct 14, 2018 at 7:26 PM  wrote:
>
> From: Frank Rowand 
>
> If overlay properties #address-cells or #size-cells are already in
> the live devicetree for any given node, then the values in the
> overlay must match the values in the live tree.
>
> If the properties are already in the live tree then there is no
> need to create a changeset entry to add them since they must
> have the same value.  This reduces the memory used by the
> changeset and eliminates a possible memory leak.  This is
> verified by 12 fewer warnings during the devicetree unittest,
> as the possible memory leak warnings about #address-cells and
> #size-cells no longer occur.
>
> Signed-off-by: Frank Rowand 
> ---
>  drivers/of/overlay.c | 38 +++---
>  1 file changed, 35 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c
> index 272a0d1a5e18..ee66651db553 100644
> --- a/drivers/of/overlay.c
> +++ b/drivers/of/overlay.c
> @@ -287,7 +287,12 @@ static struct property *dup_and_fixup_symbol_prop(
>   * @target may be either in the live devicetree or in a new subtree that
>   * is contained in the changeset.
>   *
> - * Some special properties are not updated (no error returned).
> + * Some special properties are not added or updated (no error returned):
> + * "name", "phandle", "linux,phandle".
> + *
> + * Properties "#address-cells" and "#size-cells" are not updated if they
> + * are already in the live tree, but if present in the live tree, the values
> + * in the overlay must match the values in the live tree.
>   *
>   * Update of property in symbols node is not allowed.
>   *
> @@ -300,6 +305,7 @@ static int add_changeset_property(struct 
> overlay_changeset *ovcs,
>  {
> struct property *new_prop = NULL, *prop;
> int ret = 0;
> +   bool check_for_non_overlay_node = false;
>
> if (!of_prop_cmp(overlay_prop->name, "name") ||
> !of_prop_cmp(overlay_prop->name, "phandle") ||
> @@ -322,13 +328,39 @@ static int add_changeset_property(struct 
> overlay_changeset *ovcs,
> if (!new_prop)
> return -ENOMEM;
>
> -   if (!prop)
> +   if (!prop) {
> +
> +   check_for_non_overlay_node = true;
> ret = of_changeset_add_property(>cset, target->np,
> new_prop);
> -   else
> +
> +   } else if (!of_prop_cmp(prop->name, "#address-cells")) {
> +

Hi Frank,

If we get these ERROR messages, I suggest that this function should
return an error so the overlay will be rejected.

> +   if (prop->length != 4 || new_prop->length != 4 ||
> +   *(u32 *)prop->value != *(u32 *)new_prop->value)

*(u32 *)prop->value != *(u32 *)new_prop->value) {

> +   pr_err("ERROR: overlay and/or live tree 
> #address-cells invalid in node %pOF\n",
> +  target->np);

   ret = -EINVAL;
}

Otherwise there is an ERROR message, but it continues trying to apply
the invalid overlay anyway and I get an oops.  By adding the ret =
-EINVAL, the overlay gets rejected and the oops is avoided.

> +
> +   } else if (!of_prop_cmp(prop->name, "#size-cells")) {
> +
> +   if (prop->length != 4 || new_prop->length != 4 ||
> +   *(u32 *)prop->value != *(u32 *)new_prop->value)
> +   pr_err("ERROR: overlay and/or live tree #size-cells 
> invalid in node %pOF\n",
> +  target->np);

Add the ret = -EINVAL here also.  This gives me the following (if my
overlay changes #address-cells):

[   21.167551] OF: overlay: ERROR: overlay and/or live tree
#address-cells invalid in node /soc/base_fpga_region
[   21.177442] OF: overlay: add_changeset_property ret=-22
[   21.182656] create_overlay: Failed to create overlay (err=-22)

Also, I wonder if the ERROR message could be more direct.  Currently
it says the #address-cells property is invalid but that doesn't say
anything about why it's invalid.  How about something like:

 OF: overlay: ERROR: changing #address-cells not allowed (/soc/base_fpga_region)

The 'OF: overlay' part still makes it clear it's overlay related.  The
rest of it makes it clear *why* it's invalid.  This ERROR will be a
surprise for people who have been using overlays, so that could be
helpful to light the way a bit.

Alan

> +
> +   } else {
> +
> +   check_for_non_overlay_node = true;
> ret = of_changeset_update_property(>cset, target->np,
>new_prop);
>
> +   }
> +
> +   if (check_for_non_overlay_node &&
> +   !of_node_check_flag(target->np, OF_OVERLAY))
> +   pr_err("WARNING: %s(), memory leak will occur if overlay 
> removed.  Property: %pOF/%s\n",
> +  __func__, target->np, new_prop->name);
> +
> if (ret) {
> kfree(new_prop->name);
> 

[PATCH RFC] powerpc/ftrace: Handle large kernel configs

2018-10-15 Thread Naveen N. Rao
Currently, we expect to be able to reach ftrace_caller() from all
ftrace-enabled functions through a single relative branch. With large
kernel configs, we see functions farther than 32MB of ftrace_caller()
causing ftrace_init() to bail.

One way to solve this is by adding additional trampolines around .text,
.init.text and any other sections with profiled functions.  However,
such trampolines only help if a section does not exceed 64MB.  With
allyesconfig, .text section alone can grow upwards of 100MB, which will
then require us to insert trampolines in the middle of .text... somehow.

In such configurations, gcc/ld emits two types of trampolines for mcount():
1. A long_branch, which has a single branch to mcount() for functions that
   are one hop away from mcount():
c19e8544 <00031b56.long_branch._mcount>:
c19e8544:   4a 69 3f ac b   c007c4f0 
<._mcount>

2. A plt_branch, for functions that are farther away from mcount():
c51f33f8 <0008ba04.plt_branch._mcount>:
c51f33f8:   3d 82 ff a4 addis   r12,r2,-92
c51f33fc:   e9 8c 04 20 ld  r12,1056(r12)
c51f3400:   7d 89 03 a6 mtctr   r12
c51f3404:   4e 80 04 20 bctr

We can reuse those trampolines for ftrace if we can have those
trampolines go to ftrace_caller() instead. On powerpc, we don't support
!CONFIG_DYNAMIC_FTRACE anymore. As such, we can simply patch mcount() to
branch to ftrace_caller() (or to ftrace_regs_caller() on
-mprofile-kernel) allowing us to use those gcc-generated trampolines for
ftrace.

We note down all the existing gcc-generated trampolines during
ftrace_init() and patch branches to those if ftrace_caller() is not
reachable.

Signed-off-by: Naveen N. Rao 
---
The one aspect I am not entirely sure about is if the plt_branch is fine 
for -mprofile-kernel as it depends on r2 being properly setup. If it 
isn't, we will have to setup separate trampolines just for 
-mprofile-kernel.

- Naveen


 arch/powerpc/kernel/trace/ftrace.c | 131 -
 1 file changed, 129 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/trace/ftrace.c 
b/arch/powerpc/kernel/trace/ftrace.c
index 4bfbb54dee51..5fcc05866a23 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -30,6 +30,10 @@
 
 
 #ifdef CONFIG_DYNAMIC_FTRACE
+
+#defineNUM_FTRACE_TRAMPS   8
+static unsigned long ftrace_cc_tramps[NUM_FTRACE_TRAMPS];
+
 static unsigned int
 ftrace_call_replace(unsigned long ip, unsigned long addr, int link)
 {
@@ -270,6 +274,52 @@ __ftrace_make_nop(struct module *mod,
 #endif /* PPC64 */
 #endif /* CONFIG_MODULES */
 
+static void add_ftrace_cc_tramp(unsigned long tramp)
+{
+   int i;
+
+   for (i = 0; i < NUM_FTRACE_TRAMPS; i++)
+   if (!ftrace_cc_tramps[i]) {
+   ftrace_cc_tramps[i] = tramp;
+   return;
+   } else if (ftrace_cc_tramps[i] == tramp)
+   return;
+
+   WARN(1, "No ftrace cc tramp slots available");
+}
+
+static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr)
+{
+   unsigned long tramp, ip = rec->ip;
+   unsigned int op;
+
+   /* read where this goes */
+   if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
+   pr_err("Fetching opcode failed.\n");
+   return -EFAULT;
+   }
+
+   /* Make sure that this is still a 24bit jump */
+   if (!is_bl_op(op)) {
+   pr_err("Not expected bl: opcode is %x\n", op);
+   return -EINVAL;
+   }
+
+   /* lets find where the pointer goes */
+   tramp = find_bl_target(ip, op);
+
+   pr_devel("ip:%lx jumps to %lx", ip, tramp);
+
+   add_ftrace_cc_tramp(tramp);
+
+   if (patch_instruction((unsigned int *)ip, PPC_INST_NOP)) {
+   pr_err("Patching NOP failed.\n");
+   return -EPERM;
+   }
+
+   return 0;
+}
+
 int ftrace_make_nop(struct module *mod,
struct dyn_ftrace *rec, unsigned long addr)
 {
@@ -286,7 +336,8 @@ int ftrace_make_nop(struct module *mod,
old = ftrace_call_replace(ip, addr, 1);
new = PPC_INST_NOP;
return ftrace_modify_code(ip, old, new);
-   }
+   } else if (core_kernel_text(ip))
+   return __ftrace_make_nop_kernel(rec, addr);
 
 #ifdef CONFIG_MODULES
/*
@@ -456,6 +507,40 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long 
addr)
 #endif /* CONFIG_PPC64 */
 #endif /* CONFIG_MODULES */
 
+static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long 
addr)
+{
+   unsigned int op, i;
+   void *ip = (void *)rec->ip;
+
+   /* read where this goes */
+   if (probe_kernel_read(&op, ip, sizeof(op)))
+   return -EFAULT;
+
+   if (op != PPC_INST_NOP) {
+   

Re: [PATCH 1/7] dmaengine: fsldma: Replace DMA_IN/OUT by FSL_DMA_IN/OUT

2018-10-15 Thread Vinod
On 11-10-18, 17:46, Peng Ma wrote:
> From: Wen He 
> 
> This patch implement a standard macro call functions is
> used to NXP dma drivers.
> 
> Signed-off-by: Wen He 

Please read Documentation/process/submitting-patches.rst; we expect each
patch you send to carry your own Signed-off-by as well as the Signed-off-by
of the original authors.

Also as Rob pointed out this patch series should be v8 or so as this is
following the previous work submitted to the list

Please also indicate the changes from the last revision, which helps in the review.


-- 
~Vinod


RE: [PATCH v4 5/6] arm64: dts: add QorIQ LX2160A SoC support

2018-10-15 Thread Stephen Boyd
Quoting Vabhav Sharma (2018-10-14 19:58:15)
> > > +
> > > +   pmu {
> > > +   compatible = "arm,cortex-a72-pmu";
> > > +   interrupts = ;
> > > +   };
> > > +
> > > +   psci {
> > > +   compatible = "arm,psci-0.2";
> > > +   method = "smc";
> > > +   };
> > > +
> > > +   memory@8000 {
> > > +   // DRAM space - 1, size : 2 GB DRAM
> > > +   device_type = "memory";
> > > +   reg = <0x 0x8000 0 0x8000>;
> > > +   };
> > > +
> > > +   ddr1: memory-controller@108 {
> > > +   compatible = "fsl,qoriq-memory-controller";
> > > +   reg = <0x0 0x108 0x0 0x1000>;
> > > +   interrupts = ;
> > > +   little-endian;
> > > +   };
> > > +
> > > +   ddr2: memory-controller@109 {
> > > +   compatible = "fsl,qoriq-memory-controller";
> > > +   reg = <0x0 0x109 0x0 0x1000>;
> > > +   interrupts = ;
> > > +   little-endian;
> > > +   };
> > > +
> > > +   sysclk: sysclk {
> > 
> > Name the node a bit generic like clock-xxx.
> There is only one clock-unit, Bootloader(U-boot) require sysclk node during 
> device tree fix-up as different platform support varied platform frequency as 
> per reset configuration word used.
> Referred other ARM based platform with one clock using name as x: x

Please add a comment above this node with this information. Newcomers
reading this DTS file won't have any idea why this node is specially
named and a comment will help tremendously here.



Re: Stack protector crash in pnv_smp_cpu_kill_self()

2018-10-15 Thread Christophe LEROY
Looks like a lack of initialisation of the canary for the non-boot CPUs 
on SMP, you applied this morning the patch I sent you for that.


Is the patch in ?

Christophe

Le 15/10/2018 à 15:26, Michael Ellerman a écrit :

Hi all,

Spotted this today, haven't had time to debug it further, just FYI in
case anyone else sees it.

   Running tests in cpufreq
   
   selftests: cpufreq: main.sh
   pid 9727's current affinity mask: 

   pid 9727's new affinity mask: 1
   Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: 
pnv_smp_cpu_kill_self+0x2a0/0x2b0
   
   CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.19.0-rc3-gcc-7.3.1-00168-g4ffe713b7587 #94

   Call Trace:
   [c7a1fb00] [c0ae7b4c] dump_stack+0xb0/0xf4 (unreliable)
   [c7a1fb40] [c00e59cc] panic+0x144/0x328
   [c7a1fbe0] [c00e544c] __stack_chk_fail+0x2c/0x30
   [c7a1fc40] [c009eca0] pnv_smp_cpu_kill_self+0x2a0/0x2b0
   [c7a1fe10] [c00475f8] cpu_die+0x48/0x70
   [c7a1fe30] [c0020620] arch_cpu_idle_dead+0x20/0x40
   [c7a1fe50] [c012da94] do_idle+0x274/0x390
   [c7a1fec0] [c012de08] cpu_startup_entry+0x38/0x50
   [c7a1fef0] [c0047334] start_secondary+0x5e4/0x600
   [c7a1ff90] [c000ac70] start_secondary_prolog+0x10/0x14
   Rebooting in 10 seconds..
   [39378.502863506,5] OPAL: Reboot request



cheers



Stack protector crash in pnv_smp_cpu_kill_self()

2018-10-15 Thread Michael Ellerman
Hi all,

Spotted this today, haven't had time to debug it further, just FYI in
case anyone else sees it.

  Running tests in cpufreq
  
  selftests: cpufreq: main.sh
  pid 9727's current affinity mask: 
  pid 9727's new affinity mask: 1
  Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: 
pnv_smp_cpu_kill_self+0x2a0/0x2b0
  
  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 
4.19.0-rc3-gcc-7.3.1-00168-g4ffe713b7587 #94
  Call Trace:
  [c7a1fb00] [c0ae7b4c] dump_stack+0xb0/0xf4 (unreliable)
  [c7a1fb40] [c00e59cc] panic+0x144/0x328
  [c7a1fbe0] [c00e544c] __stack_chk_fail+0x2c/0x30
  [c7a1fc40] [c009eca0] pnv_smp_cpu_kill_self+0x2a0/0x2b0
  [c7a1fe10] [c00475f8] cpu_die+0x48/0x70
  [c7a1fe30] [c0020620] arch_cpu_idle_dead+0x20/0x40
  [c7a1fe50] [c012da94] do_idle+0x274/0x390
  [c7a1fec0] [c012de08] cpu_startup_entry+0x38/0x50
  [c7a1fef0] [c0047334] start_secondary+0x5e4/0x600
  [c7a1ff90] [c000ac70] start_secondary_prolog+0x10/0x14
  Rebooting in 10 seconds..
  [39378.502863506,5] OPAL: Reboot request



cheers


Re: [PATCH] powerpc/book3s64: fix dump_linuxpagetables "present" flag

2018-10-15 Thread Aneesh Kumar K.V

On 10/15/18 12:07 PM, Christophe Leroy wrote:

Since commit bd0dbb73e013 ("powerpc/mm/books3s: Add new pte bit to
mark pte temporarily invalid."), _PAGE_PRESENT doesn't mean exactly
that a page is present. A page is also considered present when
_PAGE_INVALID is set.

This patch changes the meaning of "present" and adds a status "valid"
associated to the _PAGE_PRESENT flag.



Reviewed-by: Aneesh Kumar K.V 


Fixes: bd0dbb73e013 ("powerpc/mm/books3s: Add new pte bit to mark pte temporarily 
invalid.")
Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/dump_linuxpagetables-book3s64.c | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c 
b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c
index a637e612b205..ed6fcf78256e 100644
--- a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c
+++ b/arch/powerpc/mm/dump_linuxpagetables-book3s64.c
@@ -38,8 +38,13 @@ static const struct flag_info flag_array[] = {
}, {
.mask   = _PAGE_PRESENT,
.val= _PAGE_PRESENT,
-   .set= "present",
-   .clear  = "   ",
+   .set= "valid",
+   .clear  = " ",
+   }, {
+   .mask   = _PAGE_PRESENT | _PAGE_INVALID,
+   .val= 0,
+   .set= "   ",
+   .clear  = "present",
}, {
.mask   = H_PAGE_HASHPTE,
.val= H_PAGE_HASHPTE,





Re: linux-next: qemu boot failures with today's linux-next

2018-10-15 Thread Michael Ellerman
Stephen Rothwell  writes:
> Hi all,
>
> Today's linux-next tree does not boot in qemu (with kvm or without).
> Here is the (non-kvm) boot log (the kvm one is basically the same):
>
> --
...
>   Welcome to Open Firmware
>
>   Copyright (c) 2004, 2017 IBM Corporation All rights reserved.
>   This program and the accompanying materials are made available
>   under the terms of the BSD License available at
>   http://www.opensource.org/licenses/bsd-license.php
>
> Booting from memory...
> OF stdout device is: /vdevice/vty@7100
> Preparing to boot Linux version 4.19.0-rc7 (sfr@ash) (gcc version 8.2.0 
> (Debian 8.2.0-4)) #2 SMP Mon Oct 15 18:53:28 AEDT 2018
  ^^
  I assume that's wrong, this is actually 
linux-next you're booting?


> Detected machine type: 0101
> command line: 
> Max number of cores passed to firmware: 2048 (NR_CPUS = 2048)
> Calling ibm,client-architecture-support... done
> memory layout at init:
>   memory_limit :  (16 MB aligned)
>   alloc_bottom : 0164
>   alloc_top: 3000
>   alloc_top_hi : 8000
>   rmo_top  : 3000
>   ram_top  : 8000
> instantiating rtas at 0x2fff... done
> prom_hold_cpus: skipped
> copying OF device tree...
> Building dt strings...
> Building dt structure...
> Device tree strings 0x0185 -> 0x01850a02
> Device tree struct  0x0186 -> 0x0187
> Quiescing Open Firmware ...
> Booting Linux via __start() @ 0x0040 ...
> --
>
> I have no idea what may have caused this.

If you hit Ctrl-a c you should get the qemu prompt. Then you can run
'info registers' to print the regs and maybe see where it's stuck.

And/or build with EARLY_DEBUG_LPAR to get early console output.

cheers


Re: [PATCH 1/3] powerpc: Split user/kernel definitions of struct pt_regs

2018-10-15 Thread Madhavan Srinivasan




On Monday 15 October 2018 04:38 PM, Michael Ellerman wrote:

Madhavan Srinivasan  writes:


On Saturday 13 October 2018 04:26 PM, Michael Ellerman wrote:

We use a shared definition for struct pt_regs in uapi/asm/ptrace.h.
That means the layout of the structure is ABI, ie. we can't change it.

That would be fine if it was only used to describe the user-visible
register state of a process, but it's also the struct we use in the
kernel to describe the registers saved in an interrupt frame.

We'd like more flexibility in the content (and possibly layout) of the
kernel version of the struct, but currently that's not possible.

So split the definition into a user-visible definition which remains
unchanged, and a kernel internal one.

At the moment they're still identical, and we check that at build
time. That's because we have code (in ptrace etc.) that assumes that
they are the same. We will fix that code in future patches, and then
we can break the strict symmetry between the two structs.

Nice and awesome. But just trying to understand: what will
*regs point to in the "struct sigcontext"?

Yeah that's a bit fishy.

It should always point to a user_pt_regs.

So in the kernel we want:

   struct sigcontext {
...
struct user_pt_regs __user *regs;

And in userspace we want:

   struct sigcontext {
...
struct pt_regs  __user *regs;


I think it's not actually broken at the moment, because it's just a
pointer, and we don't do anything based on the sizeof() the type.


Yes, this clarifies it. But perf/perf_regs.c still needs changes,
because perf supports dumping user-space regs and interrupt regs.
Once again, we don't use any sizeof(), but we need to handle the
user_pt_regs changes.

I will have a look at that in the morning.

Thanks for clarification.
Maddy



But still we should fix it.

I guess I'll do this:

diff --git a/arch/powerpc/include/uapi/asm/sigcontext.h 
b/arch/powerpc/include/uapi/asm/sigcontext.h
index 2fbe485acdb4..630aeda56d59 100644
--- a/arch/powerpc/include/uapi/asm/sigcontext.h
+++ b/arch/powerpc/include/uapi/asm/sigcontext.h
@@ -22,7 +22,11 @@ struct sigcontext {
  #endif
unsigned long   handler;
unsigned long   oldmask;
-   struct pt_regs  __user *regs;
+#ifdef __KERNEL__
+   struct user_pt_regs __user *regs;
+#else
+   struct pt_regs  *regs;
+#endif
  #ifdef __powerpc64__
elf_gregset_t   gp_regs;
elf_fpregset_t  fp_regs;


Thanks for the review.

cheers





Re: 32-bit poweroc compile failure in v4.19-rc7-166-g7ec21823634d

2018-10-15 Thread Meelis Roos

I tried to test the fix to 32-bit powerpc boot hang but found that current git 
does not compile on 32bit powerpc at all for me:


That's GCC 8 I think?


Yes, gcc version 8.2.0 (Debian 8.2.0-7).


I have seen that but couldn't work out what the hell GCC is thinking.

I don't think there's an actual bug, the size of the copy is bounded by
the count parameter, which *doesn't* come from user space, it's hard
coded.

I suspect the logic is just too convoluted for GCC.

We should fix it somehow, but I haven't worked out what's the best option.

cheers


--
Meelis Roos 


Re: [PATCH V4 00/15] x86/KVM/Hyper-v: Add HV ept tlb range flush hypercall support in KVM

2018-10-15 Thread Paolo Bonzini
On 13/10/2018 16:53, lantianyu1...@gmail.com wrote:
> From: Lan Tianyu 
> 
> For nested memory virtualization, Hyper-v doesn't set write-protect
> L1 hypervisor EPT page directory and page table node to track changes 
> while it relies on guest to tell it changes via HvFlushGuestAddressLlist
> hypercall. HvFlushGuestAddressLlist hypercall provides a way to flush
> EPT page table with ranges which are specified by L1 hypervisor.
> 
> If L1 hypervisor uses INVEPT or HvFlushGuestAddressSpace hypercall to
> flush EPT tlb, Hyper-V will invalidate associated EPT shadow page table
> and sync L1's EPT table when next EPT page fault is triggered.
> HvFlushGuestAddressLlist hypercall helps to avoid such redundant EPT
> page fault and synchronization of shadow page table.

So I just told you that the first part is well understood but I must
retract that; after carefully reviewing the whole series, I think one of
us is actually very confused.

I am not afraid to say it can be me, but my understanding is that you're
passing L1 GPAs to the hypercall and instead the spec says it expects L2
GPAs.  (Consider that, because KVM's shadow paging code is shared
between nested EPT and !EPT cases, every time you see gpa/gfn in the
code it is for L1, while nested EPT GPAs are really what the code calls
gva.)

What's going on?

Paolo


Re: [PATCH V4 2/15] KVM/MMU: Add tlb flush with range helper function

2018-10-15 Thread Paolo Bonzini
On 14/10/2018 10:16, Thomas Gleixner wrote:
>>> +static inline bool kvm_available_flush_tlb_with_range(void)
>>> +{
>>> +   return kvm_x86_ops->tlb_remote_flush_with_range;
>>> +}
>> Seems that kvm_available_flush_tlb_with_range() is not used in this patch…
> What's wrong with that? 
> 
> It provides the implementation and later patches make use of it. It's a
> sensible way to split patches into small, self contained entities.

That's true; on the other hand, I do indeed have a concern with this patch:
this series is not bisectable at all, because all the new code is dead
until the very last patch.  Uses of the new feature should come _after_
the implementation.

I don't have any big problem with what Liran pointed out (and I can live
with the unused static functions that would warn with -Wunused, too),
but the above should be fixed in v5, basically by moving patches 12-15
at the beginning of the series.

Paolo


Re: [PATCH 1/3] powerpc: Split user/kernel definitions of struct pt_regs

2018-10-15 Thread Michael Ellerman
Nicholas Piggin  writes:
> On Sat, 13 Oct 2018 21:56:44 +1100
> Michael Ellerman  wrote:
>
>> We use a shared definition for struct pt_regs in uapi/asm/ptrace.h.
>> That means the layout of the structure is ABI, ie. we can't change it.
>> 
>> That would be fine if it was only used to describe the user-visible
>> register state of a process, but it's also the struct we use in the
>> kernel to describe the registers saved in an interrupt frame.
>> 
>> We'd like more flexibility in the content (and possibly layout) of the
>> kernel version of the struct, but currently that's not possible.
>> 
>> So split the definition into a user-visible definition which remains
>> unchanged, and a kernel internal one.
>> 
>> At the moment they're still identical, and we check that at build
>> time. That's because we have code (in ptrace etc.) that assumes that
>> they are the same. We will fix that code in future patches, and then
>> we can break the strict symmetry between the two structs.
>> 
>> Signed-off-by: Michael Ellerman 
>
> Yeah this looks much better than my int_frame thing, thanks.

Thanks. More bug prone too :)

But hopefully it will pay off in the long run.

cheers


[PATCH 2/2] powerpc/aout: Fix struct user definition to use user_pt_regs

2018-10-15 Thread Michael Ellerman
I'm pretty sure this is dead code, it's only used by the a.out core
dump code, and we don't support a.out. We should remove it.

But while it's in the tree it should be using the ABI version of
pt_regs which is called user_pt_regs in the kernel, because the whole
struct is written to the core dump and so its size shouldn't change.

Note this isn't a uapi header so we don't need an ifdef.

Fixes: 002af9391bfb ("powerpc: Split user/kernel definitions of struct pt_regs")
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/user.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/user.h b/arch/powerpc/include/asm/user.h
index 5c0e082eae7b..99443b8594e7 100644
--- a/arch/powerpc/include/asm/user.h
+++ b/arch/powerpc/include/asm/user.h
@@ -31,7 +31,7 @@
  * to write an integer number of pages.
  */
 struct user {
-   struct pt_regs  regs;   /* entire machine state */
+   struct user_pt_regs regs;   /* entire machine state */
size_t  u_tsize;/* text size (pages) */
size_t  u_dsize;/* data size (pages) */
size_t  u_ssize;/* stack size (pages) */
-- 
2.17.1



[PATCH 1/2] powerpc/uapi: Fix sigcontext definition to use user_pt_regs

2018-10-15 Thread Michael Ellerman
My recent patch to split pt_regs between user and kernel missed
the usage in struct sigcontext.

Because this is a user visible struct it should be using the user
visible definition, which when we're building for the kernel is called
struct user_pt_regs.

As far as I can see this hasn't actually caused a bug (yet), because
we don't use the sizeof() the sigcontext->regs anywhere. But we should
still fix it to avoid confusion and future bugs.

Fixes: 002af9391bfb ("powerpc: Split user/kernel definitions of struct pt_regs")
Reported-by: Madhavan Srinivasan 
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/uapi/asm/sigcontext.h | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/uapi/asm/sigcontext.h 
b/arch/powerpc/include/uapi/asm/sigcontext.h
index 2fbe485acdb4..630aeda56d59 100644
--- a/arch/powerpc/include/uapi/asm/sigcontext.h
+++ b/arch/powerpc/include/uapi/asm/sigcontext.h
@@ -22,7 +22,11 @@ struct sigcontext {
 #endif
unsigned long   handler;
unsigned long   oldmask;
-   struct pt_regs  __user *regs;
+#ifdef __KERNEL__
+   struct user_pt_regs __user *regs;
+#else
+   struct pt_regs  *regs;
+#endif
 #ifdef __powerpc64__
elf_gregset_t   gp_regs;
elf_fpregset_t  fp_regs;
-- 
2.17.1



Re: [PATCH 1/3] powerpc: Split user/kernel definitions of struct pt_regs

2018-10-15 Thread Michael Ellerman
Madhavan Srinivasan  writes:

> On Saturday 13 October 2018 04:26 PM, Michael Ellerman wrote:
>> We use a shared definition for struct pt_regs in uapi/asm/ptrace.h.
>> That means the layout of the structure is ABI, ie. we can't change it.
>>
>> That would be fine if it was only used to describe the user-visible
>> register state of a process, but it's also the struct we use in the
>> kernel to describe the registers saved in an interrupt frame.
>>
>> We'd like more flexibility in the content (and possibly layout) of the
>> kernel version of the struct, but currently that's not possible.
>>
>> So split the definition into a user-visible definition which remains
>> unchanged, and a kernel internal one.
>>
>> At the moment they're still identical, and we check that at build
>> time. That's because we have code (in ptrace etc.) that assumes that
>> they are the same. We will fix that code in future patches, and then
>> we can break the strict symmetry between the two structs.
>
> Nice and awesome. But just trying to understand: what will
> *regs point to in the "struct sigcontext"?

Yeah that's a bit fishy.

It should always point to a user_pt_regs.

So in the kernel we want:

  struct sigcontext {
...
struct user_pt_regs __user *regs;

And in userspace we want:

  struct sigcontext {
...
struct pt_regs  __user *regs;


I think it's not actually broken at the moment, because it's just a
pointer, and we don't do anything based on the sizeof() the type.

But still we should fix it.

I guess I'll do this:

diff --git a/arch/powerpc/include/uapi/asm/sigcontext.h 
b/arch/powerpc/include/uapi/asm/sigcontext.h
index 2fbe485acdb4..630aeda56d59 100644
--- a/arch/powerpc/include/uapi/asm/sigcontext.h
+++ b/arch/powerpc/include/uapi/asm/sigcontext.h
@@ -22,7 +22,11 @@ struct sigcontext {
 #endif
unsigned long   handler;
unsigned long   oldmask;
-   struct pt_regs  __user *regs;
+#ifdef __KERNEL__
+   struct user_pt_regs __user *regs;
+#else
+   struct pt_regs  *regs;
+#endif
 #ifdef __powerpc64__
elf_gregset_t   gp_regs;
elf_fpregset_t  fp_regs;


Thanks for the review.

cheers


Re: 32-bit poweroc compile failure in v4.19-rc7-166-g7ec21823634d

2018-10-15 Thread Michael Ellerman
Meelis Roos  writes:
> I tried to test the fix to 32-bit powerpc boot hang but found that current 
> git does not compile on 32bit powerpc at all for me:

That's GCC 8 I think?

I have seen that but couldn't work out what the hell GCC is thinking.

I don't think there's an actual bug, the size of the copy is bounded by
the count parameter, which *doesn't* come from user space, it's hard
coded.

I suspect the logic is just too convoluted for GCC.

We should fix it somehow, but I haven't worked out what's the best option.

cheers


>CC  arch/powerpc/kernel/ptrace.o
> In file included from ./include/linux/bitmap.h:9,
>   from ./include/linux/cpumask.h:12,
>   from ./include/linux/rcupdate.h:44,
>   from ./include/linux/rculist.h:11,
>   from ./include/linux/pid.h:5,
>   from ./include/linux/sched.h:14,
>   from arch/powerpc/kernel/ptrace.c:19:
> In function ‘memcpy’,
>  inlined from ‘user_regset_copyin’ at ./include/linux/regset.h:295:4,
>  inlined from ‘vr_set’ at arch/powerpc/kernel/ptrace.c:619:9:
> ./include/linux/string.h:345:9: error: ‘__builtin_memcpy’ offset [-527, -529] 
> is out of the bounds [0, 16] of object ‘vrsave’ with type ‘union ’ 
> [-Werror=array-bounds]
>return __builtin_memcpy(p, q, size);
>   ^~~~
> arch/powerpc/kernel/ptrace.c: In function ‘vr_set’:
> arch/powerpc/kernel/ptrace.c:614:5: note: ‘vrsave’ declared here
> } vrsave;
>   ^~
> In file included from ./include/linux/bitmap.h:9,
>   from ./include/linux/cpumask.h:12,
>   from ./include/linux/rcupdate.h:44,
>   from ./include/linux/rculist.h:11,
>   from ./include/linux/pid.h:5,
>   from ./include/linux/sched.h:14,
>   from arch/powerpc/kernel/ptrace.c:19:
> In function ‘memcpy’,
>  inlined from ‘user_regset_copyout’ at ./include/linux/regset.h:270:4,
>  inlined from ‘vr_get’ at arch/powerpc/kernel/ptrace.c:572:9:
> ./include/linux/string.h:345:9: error: ‘__builtin_memcpy’ offset [-527, -529] 
> is out of the bounds [0, 16] of object ‘vrsave’ with type ‘union ’ 
> [-Werror=array-bounds]
>return __builtin_memcpy(p, q, size);
>   ^~~~
> arch/powerpc/kernel/ptrace.c: In function ‘vr_get’:
> arch/powerpc/kernel/ptrace.c:567:5: note: ‘vrsave’ declared here
> } vrsave;
>   ^~
> cc1: all warnings being treated as errors
> make[1]: *** [scripts/Makefile.build:306: arch/powerpc/kernel/ptrace.o] Error 
> 1
> make: *** [Makefile:1052: arch/powerpc/kernel] Error 2
>
> -- 
> Meelis Roos 


Re: [PATCH 2/4] mm: speed up mremap by 500x on large regions (v2)

2018-10-15 Thread Christoph Hellwig
On Fri, Oct 12, 2018 at 06:31:58PM -0700, Joel Fernandes (Google) wrote:
> Android needs to mremap large regions of memory during memory management
> related operations.

Just curious: why?

> + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
> + || old_end - old_addr < PMD_SIZE)

The || goes on the first line.

> + } else if (extent == PMD_SIZE && 
> IS_ENABLED(CONFIG_HAVE_MOVE_PMD)) {

Overly long line.


Re: [PATCH v2 2/2] mm: speed up mremap by 500x on large regions

2018-10-15 Thread Martin Schwidefsky
On Mon, 15 Oct 2018 09:10:53 +0200
Christian Borntraeger  wrote:

> On 10/12/2018 03:37 AM, Joel Fernandes (Google) wrote:
> > Android needs to mremap large regions of memory during memory management
> > related operations. The mremap system call can be really slow if THP is
> > not enabled. The bottleneck is move_page_tables, which is copying each
> > pte at a time, and can be really slow across a large map. Turning on THP
> > may not be a viable option, and is not for us. This patch speeds up the
> > performance for non-THP system by copying at the PMD level when possible.
> > 
> > The speed up is three orders of magnitude. On a 1GB mremap, the mremap
> > completion time drops from 160-250 milliseconds to 380-400 microseconds.
> > 
> > Before:
> > Total mremap time for 1GB data: 242321014 nanoseconds.
> > Total mremap time for 1GB data: 196842467 nanoseconds.
> > Total mremap time for 1GB data: 167051162 nanoseconds.
> > 
> > After:
> > Total mremap time for 1GB data: 385781 nanoseconds.
> > Total mremap time for 1GB data: 388959 nanoseconds.
> > Total mremap time for 1GB data: 402813 nanoseconds.
> > 
> > In case THP is enabled, the optimization is skipped. I also flush the
> > tlb every time we do this optimization since I couldn't find a way to
> > determine if the low-level PTEs are dirty. It is seen that the cost of
> > doing so is not much compared to the improvement, on both x86-64 and arm64.
> > 
> > Cc: minc...@kernel.org
> > Cc: pan...@google.com
> > Cc: hu...@google.com
> > Cc: lokeshgi...@google.com
> > Cc: dan...@google.com
> > Cc: mho...@kernel.org
> > Cc: kir...@shutemov.name
> > Cc: a...@linux-foundation.org
> > Signed-off-by: Joel Fernandes (Google) 
> > ---
> >  mm/mremap.c | 62 +
> >  1 file changed, 62 insertions(+)
> > 
> > diff --git a/mm/mremap.c b/mm/mremap.c
> > index 9e68a02a52b1..d82c485822ef 100644
> > --- a/mm/mremap.c
> > +++ b/mm/mremap.c
> > @@ -191,6 +191,54 @@ static void move_ptes(struct vm_area_struct *vma, 
> > pmd_t *old_pmd,
> > drop_rmap_locks(vma);
> >  }
> >  
> > +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long 
> > old_addr,
> > + unsigned long new_addr, unsigned long old_end,
> > + pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
> > +{
> > +   spinlock_t *old_ptl, *new_ptl;
> > +   struct mm_struct *mm = vma->vm_mm;
> > +
> > +   if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
> > +   || old_end - old_addr < PMD_SIZE)
> > +   return false;
> > +
> > +   /*
> > +* The destination pmd shouldn't be established, free_pgtables()
> > +* should have release it.
> > +*/
> > +   if (WARN_ON(!pmd_none(*new_pmd)))
> > +   return false;
> > +
> > +   /*
> > +* We don't have to worry about the ordering of src and dst
> > +* ptlocks because exclusive mmap_sem prevents deadlock.
> > +*/
> > +   old_ptl = pmd_lock(vma->vm_mm, old_pmd);
> > +   if (old_ptl) {
> > +   pmd_t pmd;
> > +
> > +   new_ptl = pmd_lockptr(mm, new_pmd);
> > +   if (new_ptl != old_ptl)
> > +   spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
> > +
> > +   /* Clear the pmd */
> > +   pmd = *old_pmd;
> > +   pmd_clear(old_pmd);  
> 
> Adding Martin Schwidefsky.
> Is this mapping maybe still in use on other CPUs? If yes, I think for
> s390 we need to flush here as well (in other word we might need to introduce
> pmd_clear_flush). On s390 you have to use instructions like CRDTE,IPTE or IDTE
> to modify page table entries that are still in use. Otherwise you can get a 
> delayed access exception which is - in contrast to page faults - not 
> recoverable.

Just clearing an active pmd would be broken for s390. We need the equivalent
of the ptep_get_and_clear() function for pmds. For s390 this function would
look like this:

static inline pte_t pmdp_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pmd_t *pmdp)
{
return pmdp_xchg_lazy(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_INVALID));
}

Just like pmdp_huge_get_and_clear() in fact.

> 
> 
> 
> > +
> > +   VM_BUG_ON(!pmd_none(*new_pmd));
> > +
> > +   /* Set the new pmd */
> > +   set_pmd_at(mm, new_addr, new_pmd, pmd);
> > +   if (new_ptl != old_ptl)
> > +   spin_unlock(new_ptl);
> > +   spin_unlock(old_ptl);
> > +
> > +   *need_flush = true;
> > +   return true;
> > +   }
> > +   return false;
> > +}
> > +

So the idea is to move the pmd entry to the new location, dragging
the whole pte table to a new location with a different address.
I wonder if that is safe in regard to get_user_pages_fast().

> >  unsigned long move_page_tables(struct vm_area_struct *vma,
> > unsigned long old_addr, struct vm_area_struct *new_vma,
> > unsigned long new_addr, unsigned long len,
> > @@ -239,7 +287,21 @@ unsigned 

[PATCH kernel v2] KVM: PPC: Optimize clearing TCEs for sparse tables

2018-10-15 Thread Alexey Kardashevskiy
The powernv platform maintains 2 TCE tables for VFIO - a hardware TCE
table and a table with userspace addresses. These tables are radix trees,
we allocate indirect levels when they are written to. Since
the memory allocation is problematic in real mode, we have 2 accessors
to the entries:
- for virtual mode: it allocates the memory and it is always expected
to return non-NULL;
- for real mode: it does not allocate and can return NULL.

Also, DMA windows can span to up to 55 bits of the address space and since
we never have this much RAM, such windows are sparse. However currently
the SPAPR TCE IOMMU driver walks through all TCEs to unpin DMA memory.

Since we maintain a userspace addresses table for VFIO which is a mirror
of the hardware table, we can use it to know which parts of the DMA
window have not been mapped and skip these, which is what this patch does.

The bare metal systems do not have this problem as they use a bypass mode
of a PHB which maps RAM directly.

This helps a lot with sparse DMA windows, reducing the shutdown time from
about 3 minutes per 1 billion TCEs to a few seconds for 32GB sparse guest.
Just skipping the last level seems to be good enough.

As non-allocating accessor is used now in virtual mode as well, rename it
from IOMMU_TABLE_USERSPACE_ENTRY_RM (real mode) to _RO (read only).

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v2:
* instead of adding the level size to @entry, now we align the entry
to the beginning of next chunk of the level, ie
"entry += tbl->it_level_size - 1" became "entry |= tbl->it_level_size - 1"
---
 arch/powerpc/include/asm/iommu.h|  2 +-
 arch/powerpc/kvm/book3s_64_vio.c|  5 ++---
 arch/powerpc/kvm/book3s_64_vio_hv.c |  6 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c | 23 +--
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 3d4b88c..35db0cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -126,7 +126,7 @@ struct iommu_table {
int it_nid;
 };
 
-#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
+#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
((tbl)->it_ops->useraddrptr((tbl), (entry), false))
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
((tbl)->it_ops->useraddrptr((tbl), (entry), true))
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c0c64d1..62a8d03 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -410,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
 {
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
if (!pua)
-   /* it_userspace allocation might be delayed */
-   return H_TOO_HARD;
+   return H_SUCCESS;
 
mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
if (!mem)
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index ec99363..2206bc7 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -214,7 +214,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct 
iommu_table *tbl,
 
if (!ret && ((*direction == DMA_FROM_DEVICE) ||
(*direction == DMA_BIDIRECTIONAL))) {
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
/*
 * kvmppc_rm_tce_iommu_do_map() updates the UA cache after
 * calling this so we still get here a valid UA.
@@ -240,7 +240,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
 {
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
if (!pua)
/* it_userspace allocation might be delayed */
@@ -304,7 +304,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, 
struct iommu_table *tbl,
 {
long ret;
unsigned long hpa = 0;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
struct mm_iommu_table_group_mem_t *mem;
 
if (!pua)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 96721b1..b30926e 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -444,7 +444,7 @@ static void tce_iommu_unuse_page_v2(struct tce_container 
*container,
struct mm_iommu_table_group_mem_t *mem = NULL;
  

[PATCH kernel 3/3] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] [10de:1db1] subdriver

2018-10-15 Thread Alexey Kardashevskiy
POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
pluggable PCIe devices but implement PCIe links for config space and MMIO.
In addition to that the GPUs are interconnected to each other and also
have direct links to the P9 CPU. The links are NVLink2 and provide direct
access to the system RAM for GPUs via NPU (an NVLink2 "proxy" on P9 chip).
These systems also support ATS (address translation services) which is
a part of the NVLink2 protocol. Such GPUs also share on-board RAM
(16GB in tested config) to the system via the same NVLink2 so a CPU has
cache-coherent access to a GPU RAM.

This exports GPU RAM to the userspace as a new PCI region. This
preregisters the new memory as device memory as it might be used for DMA.
This inserts pfns from the fault handler as the GPU memory is not onlined
until the NVIDIA driver has loaded and trained the links, so doing this
earlier produces low level errors which we fence in the firmware; this
does not hurt the host system but is still better avoided.

This exports ATSD (Address Translation Shootdown) register of NPU which
allows the guest to invalidate TLB. The register conveniently occupies
a single 64k page. Since NPU maps the GPU memory, it has a "tgt" property
(which is an abbreviated host system bus address). This exports the "tgt"
as a capability so the guest can program it into the GPU so the GPU can
know how to route DMA traffic.

For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
know LPID (a logical partition ID or a KVM guest hardware ID in other
words) and PID (a memory context ID of a userspace process, not to be
confused with a linux pid). This assigns a GPU to LPID in the NPU and
this is why this adds a listener for KVM on an IOMMU group. A PID comes
via NVLink from a GPU and NPU uses a PID wildcard to pass it through.

This requires coherent memory and ATSD to be available on the host as
the GPU vendor only supports configurations with both features enabled
and other configurations are known not to work. Because of this and
because of the ways the features are advertised to the host system
(which is a device tree with very platform specific properties),
this requires enabled POWERNV platform.

This hardcodes the NVLink2 support for specific vendor and device IDs
as there is no reliable way of knowing about coherent memory and ATS
support. The GPU has a unique vendor PCIe capability 0x23 but it was
confirmed that it does not provide required information (and it is still
undisclosed what it actually does).

Signed-off-by: Alexey Kardashevskiy 
---
 drivers/vfio/pci/Makefile   |   1 +
 drivers/vfio/pci/vfio_pci_private.h |   2 +
 include/uapi/linux/vfio.h   |  18 ++
 drivers/vfio/pci/vfio_pci.c |  37 +++-
 drivers/vfio/pci/vfio_pci_nvlink2.c | 409 
 drivers/vfio/pci/Kconfig|   4 +
 6 files changed, 469 insertions(+), 2 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 76d8ec0..9662c06 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,5 +1,6 @@
 
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
+vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 93c1738..7639241 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -163,4 +163,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device 
*vdev)
return -ENODEV;
 }
 #endif
+extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev);
+extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev);
 #endif /* VFIO_PCI_PRIVATE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index f378b98..9e9a8d3 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -303,6 +303,12 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2)
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG  (3)
 
+/* NVIDIA GPU NVlink2 RAM */
+#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1)
+
+/* IBM NPU NVlink2 ATSD */
+#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD   (1)
+
 /*
  * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped
  * which allows direct access to non-MSIX registers which happened to be within
@@ -313,6 +319,18 @@ struct vfio_region_info_cap_type {
  */
 #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3
 
+/*
+ * Capability with compressed real address (aka SSA - small system address)
+ * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing.
+ */
+#define VFIO_REGION_INFO_CAP_NPU2  4
+
+struct vfio_region_info_cap_npu2 {
+   struct vfio_info_cap_header header;
+   __u64 tgt;
+   /* size 

  1   2   >