Re: [PATCH v2 2/7] powerpc/prom: Introduce early_reserve_mem_old()

2020-09-15 Thread Christophe Leroy

Cédric Le Goater wrote:


and condition its call with IS_ENABLED(CONFIG_PPC32). This fixes a
compile error with W=1.

arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’:
arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not used [-Werror=unused-but-set-variable]

  __be64 *reserve_map;
  ^~~
cc1: all warnings being treated as errors

Cc: Christophe Leroy 


@csgroup.eu instead of @c-s.fr please


Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kernel/prom.c | 37 -
 1 file changed, 20 insertions(+), 17 deletions(-)


That's a lot of changes for a tiny warning.

You could make it easy by just replacing the #ifdef by:

if (!IS_ENABLED(CONFIG_PPC32))
return;
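
For illustration, the resulting function would look roughly like this (my own sketch, not the submitted patch; the reserve_map assignment also has to move below the early return, as noted in the review of v1):

static void __init early_reserve_mem(void)
{
	__be64 *reserve_map;

	/* Look for the new "reserved-regions" property in the DT */
	early_reserve_mem_dt();

	/* ... initrd reservation, unchanged ... */

	if (!IS_ENABLED(CONFIG_PPC32))
		return;

	reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
				 fdt_off_mem_rsvmap(initial_boot_params));

	/*
	 * Handle the case where we might be booting from an old kexec
	 * image that setup the mem_rsvmap as pairs of 32-bit values
	 */
	if (be64_to_cpup(reserve_map) > 0xffffffffull) {
		/* ... 32-bit reserve map walk, unchanged ... */
	}
}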



diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d8a2fb87ba0c..c958b67cf1a5 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -620,27 +620,14 @@ static void __init early_reserve_mem_dt(void)
}
 }

-static void __init early_reserve_mem(void)
+static void __init early_reserve_mem_old(void)


Why _old? Do you mean ppc32 is old? Modern ADSL boxes, like for instance the famous French Freebox, have a powerpc32 microcontroller.
Possibly you could name it _ppc32, but I don't think that's the right way; see above.


Christophe


 {
__be64 *reserve_map;

reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
fdt_off_mem_rsvmap(initial_boot_params));

-   /* Look for the new "reserved-regions" property in the DT */
-   early_reserve_mem_dt();
-
-#ifdef CONFIG_BLK_DEV_INITRD
-   /* Then reserve the initrd, if any */
-   if (initrd_start && (initrd_end > initrd_start)) {
-   memblock_reserve(ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
-   ALIGN(initrd_end, PAGE_SIZE) -
-   ALIGN_DOWN(initrd_start, PAGE_SIZE));
-   }
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-#ifdef CONFIG_PPC32
-   /*
+   /*
 * Handle the case where we might be booting from an old kexec
 * image that setup the mem_rsvmap as pairs of 32-bit values
 */
@@ -658,9 +645,25 @@ static void __init early_reserve_mem(void)
DBG("reserving: %x -> %x\n", base_32, size_32);
memblock_reserve(base_32, size_32);
}
-   return;
}
-#endif
+}
+
+static void __init early_reserve_mem(void)
+{
+   /* Look for the new "reserved-regions" property in the DT */
+   early_reserve_mem_dt();
+
+#ifdef CONFIG_BLK_DEV_INITRD
+   /* Then reserve the initrd, if any */
+   if (initrd_start && (initrd_end > initrd_start)) {
+   memblock_reserve(ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
+   ALIGN(initrd_end, PAGE_SIZE) -
+   ALIGN_DOWN(initrd_start, PAGE_SIZE));
+   }
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+   if (IS_ENABLED(CONFIG_PPC32))
+   early_reserve_mem_old();
 }

 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
--
2.25.4





Re: [PATCH] powerpc/powermac: Fix low_sleep_handler with KUAP and KUEP

2020-09-11 Thread Christophe Leroy




On 11/09/2020 at 01:56, Michael Ellerman wrote:

Christophe Leroy  writes:

low_sleep_handler() has a hardcoded restore of segment registers
that doesn't take KUAP and KUEP into account.

Use head_32's load_segment_registers() routine instead.

Signed-off-by: Christophe Leroy 
Fixes: a68c31fc01ef ("powerpc/32s: Implement Kernel Userspace Access Protection")
Fixes: 31ed2b13c48d ("powerpc/32s: Implement Kernel Userspace Execution Prevention.")
Cc: sta...@vger.kernel.org
---
  arch/powerpc/platforms/powermac/sleep.S | 9 +
  1 file changed, 1 insertion(+), 8 deletions(-)


Doesn't build? pmac32_defconfig, gcc 9.3.0:

ld: arch/powerpc/platforms/powermac/sleep.o: in function `core99_wake_up':
(.text+0x25c): undefined reference to `load_segment_registers'

Missing _GLOBAL() presumably?


Oops .. :(

v2 sent out.

Thanks
Christophe


[PATCH v2] powerpc/powermac: Fix low_sleep_handler with KUAP and KUEP

2020-09-11 Thread Christophe Leroy
low_sleep_handler() has a hardcoded restore of segment registers
that doesn't take KUAP and KUEP into account.

Use head_32's load_segment_registers() routine instead.

Signed-off-by: Christophe Leroy 
Fixes: a68c31fc01ef ("powerpc/32s: Implement Kernel Userspace Access Protection")
Fixes: 31ed2b13c48d ("powerpc/32s: Implement Kernel Userspace Execution Prevention.")
Cc: sta...@vger.kernel.org
---
v2: Added missing _GLOBAL() to load_segment_registers in head_32.S
---
 arch/powerpc/kernel/head_32.S   | 2 +-
 arch/powerpc/platforms/powermac/sleep.S | 9 +
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 5624db0e09a1..0f60743474a2 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -1002,7 +1002,7 @@ BEGIN_MMU_FTR_SECTION
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS)
blr
 
-load_segment_registers:
+_GLOBAL(load_segment_registers)
li  r0, NUM_USER_SEGMENTS /* load up user segment register values */
mtctr   r0  /* for context 0 */
li  r3, 0   /* Kp = 0, Ks = 0, VSID = 0 */
diff --git a/arch/powerpc/platforms/powermac/sleep.S b/arch/powerpc/platforms/powermac/sleep.S
index f9a680fdd9c4..51bfdfe85058 100644
--- a/arch/powerpc/platforms/powermac/sleep.S
+++ b/arch/powerpc/platforms/powermac/sleep.S
@@ -294,14 +294,7 @@ grackle_wake_up:
 * we do any r1 memory access as we are not sure they
 * are in a sane state above the first 256Mb region
 */
-   li  r0,16   /* load up segment register values */
-   mtctr   r0  /* for context 0 */
-   lis r3,0x2000   /* Ku = 1, VSID = 0 */
-   li  r4,0
-3: mtsrin  r3,r4
-   addir3,r3,0x111 /* increment VSID */
-   addis   r4,r4,0x1000/* address of next segment */
-   bdnz3b
+   bl  load_segment_registers
sync
isync
 
-- 
2.25.0



Re: [PATCH] powerpc/traps: fix recoverability of machine check handling on book3s/32

2020-09-11 Thread Christophe Leroy

Hello,

On 11/09/2020 at 11:15, Michal Suchánek wrote:

Hello,

does this logic apply to "Unrecoverable System Reset" as well?


I don't know; I don't think I have any way to generate a System Reset 
on my board to check it.


Christophe



Thanks

Michal

On Tue, Jan 22, 2019 at 02:11:24PM +, Christophe Leroy wrote:

Looks like book3s/32 doesn't set RI on machine check, so
checking RI before calling die() will always be fatal,
although this is not an issue in most cases.

Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt")
Fixes: daf00ae71dad ("powerpc/traps: restore recoverability of machine_check interrupts")
Signed-off-by: Christophe Leroy 
Cc: sta...@vger.kernel.org
---
  arch/powerpc/kernel/traps.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 64936b60d521..c740f8bfccc9 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -763,15 +763,15 @@ void machine_check_exception(struct pt_regs *regs)
if (check_io_access(regs))
goto bail;
  
-	/* Must die if the interrupt is not recoverable */

-   if (!(regs->msr & MSR_RI))
-   nmi_panic(regs, "Unrecoverable Machine check");
-
if (!nested)
nmi_exit();
  
  	die("Machine check", regs, SIGBUS);
  
+	/* Must die if the interrupt is not recoverable */

+   if (!(regs->msr & MSR_RI))
+   nmi_panic(regs, "Unrecoverable Machine check");
+
return;
  
  bail:

--
2.13.3



Re: [PATCH 3/7] powerpc/sstep: Fix W=1 compile warning

2020-09-11 Thread Christophe Leroy




On 11/09/2020 at 07:59, Cédric Le Goater wrote:

On 9/11/20 7:38 AM, Christophe Leroy wrote:



On 10/09/2020 at 23:02, Cédric Le Goater wrote:

../arch/powerpc/lib/sstep.c: In function ‘mlsd_8lsd_ea’:
../arch/powerpc/lib/sstep.c:225:3: error: suggest braces around empty body in an ‘if’ statement [-Werror=empty-body]
     ; /* Invalid form. Should already be checked for by caller! */
     ^


A small sentence explaining how this is fixed would be welcome, so that you 
don't need to read the code to know what the commit does to fix the warning. 
Also the subject should be more explicit.





Cc: Jordan Niethe 
Signed-off-by: Cédric Le Goater 
---
   arch/powerpc/lib/sstep.c | 3 ++-
   1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index caee8cc77e19..14572af16e55 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -221,8 +221,9 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
   ; /* Leave ea as is */
   else if (prefix_r && !ra)
   ea += regs->nip;
-    else if (prefix_r && ra)
+    else if (prefix_r && ra) {
   ; /* Invalid form. Should already be checked for by caller! */
+    }


You can't do that. Now checkpatch will complain that you don't have braces on 
all legs of the if/else dance.


Should we fix checkpatch ?


Why not, but then fix 
https://www.kernel.org/doc/html/latest/process/coding-style.html#placing-braces-and-spaces 
first :)
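
For reference, checkpatch's rule is that once one branch of an if/else chain needs braces, every branch gets them, so the hunk above would have to become something like this (a sketch, not a proposed patch):

	if (!prefix_r && !ra) {
		; /* Leave ea as is */
	} else if (prefix_r && !ra) {
		ea += regs->nip;
	} else if (prefix_r && ra) {
		; /* Invalid form. Should already be checked for by caller! */
	}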


Christophe


Re: [PATCH 7/7] powerpc/32: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 23:02, Cédric Le Goater wrote:

CC  arch/powerpc/kernel/traps.o
../arch/powerpc/kernel/traps.c:1663:6: error: no previous prototype for ‘stack_overflow_exception’ [-Werror=missing-prototypes]
  void stack_overflow_exception(struct pt_regs *regs)
   ^~~~



A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code to know what the commit does to fix the 
warning. Also the subject should be more explicit.




Cc: Christophe Leroy 


Reviewed-by: Christophe Leroy 



Fixes: 3978eb78517c ("powerpc/32: Add early stack overflow detection with VMAP stack.")
Signed-off-by: Cédric Le Goater 
---
  arch/powerpc/include/asm/asm-prototypes.h | 1 +


Note that asm-prototypes.h is not the right place for such a prototype, 
but that's probably for another cleanup patch. See discussion at 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/1463534212-4879-2-git-send-email-...@axtens.net/


Christophe



  1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index de14b1a34d56..4957119604c7 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -67,6 +67,7 @@ void single_step_exception(struct pt_regs *regs);
  void program_check_exception(struct pt_regs *regs);
  void alignment_exception(struct pt_regs *regs);
  void StackOverflow(struct pt_regs *regs);
+void stack_overflow_exception(struct pt_regs *regs);
  void kernel_fp_unavailable_exception(struct pt_regs *regs);
  void altivec_unavailable_exception(struct pt_regs *regs);
  void vsx_unavailable_exception(struct pt_regs *regs);



Re: [PATCH 6/7] powerpc/perf: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 23:02, Cédric Le Goater wrote:

CC  arch/powerpc/perf/imc-pmu.o
../arch/powerpc/perf/imc-pmu.c: In function ‘trace_imc_event_init’:
../arch/powerpc/perf/imc-pmu.c:1429:22: error: variable ‘target’ set but not used [-Werror=unused-but-set-variable]
   struct task_struct *target;
   ^~


A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code to know what the commit does to fix the 
warning. Also the subject should be more explicit.




Cc: Anju T Sudhakar 
Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/perf/imc-pmu.c | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 62d0b54086f8..9ed4fcccf8a9 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1426,8 +1426,6 @@ static void trace_imc_event_del(struct perf_event *event, int flags)
  
  static int trace_imc_event_init(struct perf_event *event)

  {
-   struct task_struct *target;
-
if (event->attr.type != event->pmu->type)
return -ENOENT;
  
@@ -1458,7 +1456,6 @@ static int trace_imc_event_init(struct perf_event *event)

	mutex_unlock(&imc_global_refc.lock);
  
  	event->hw.idx = -1;

-   target = event->hw.target;
  
  	event->pmu->task_ctx_nr = perf_hw_context;

event->destroy = reset_global_refc;



Re: [PATCH 5/7] powerpc/powernv/pci: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 23:02, Cédric Le Goater wrote:

   CC  arch/powerpc/platforms/powernv/pci-ioda.o
../arch/powerpc/platforms/powernv/pci-ioda.c: In function ‘pnv_ioda_configure_pe’:
../arch/powerpc/platforms/powernv/pci-ioda.c:897:18: error: variable ‘parent’ set but not used [-Werror=unused-but-set-variable]
   struct pci_dev *parent;
   ^~



A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code to know what the commit does to fix the 
warning. Also the subject should be more explicit.




Cc: Oliver O'Halloran 
Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/platforms/powernv/pci-ioda.c | 8 
  1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 023a4f987bb2..2b4ceb5e6ce4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -894,7 +894,6 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
  
  int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)

  {
-   struct pci_dev *parent;
uint8_t bcomp, dcomp, fcomp;
long rc, rid_end, rid;
  
@@ -904,7 +903,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
  
  		dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;

fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
-   parent = pe->pbus->self;
if (pe->flags & PNV_IODA_PE_BUS_ALL)
			count = resource_size(&pe->pbus->busn_res);
else
@@ -925,12 +923,6 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
}
rid_end = pe->rid + (count << 8);
} else {
-#ifdef CONFIG_PCI_IOV
-   if (pe->flags & PNV_IODA_PE_VF)
-   parent = pe->parent_dev;
-   else
-#endif /* CONFIG_PCI_IOV */
-   parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;



Re: [PATCH 4/7] powerpc/xive: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 23:02, Cédric Le Goater wrote:

CC  arch/powerpc/sysdev/xive/common.o
../arch/powerpc/sysdev/xive/common.c:1568:6: error: no previous prototype for ‘xive_debug_show_cpu’ [-Werror=missing-prototypes]
  void xive_debug_show_cpu(struct seq_file *m, int cpu)
   ^~~
../arch/powerpc/sysdev/xive/common.c:1602:6: error: no previous prototype for ‘xive_debug_show_irq’ [-Werror=missing-prototypes]
  void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)
   ^~~



A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code to know what the commit does to fix the 
warning. Also the subject should be more explicit.


There are two ways of fixing it:
- Add the missing prototype
- Make it static

You chose the second alternative; this needs to be said in the commit log.
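
For comparison, the first alternative would just be a pair of declarations in a header included by common.c, something like this (hypothetical placement, for illustration only):

/* e.g. in arch/powerpc/sysdev/xive/xive-internal.h */
void xive_debug_show_cpu(struct seq_file *m, int cpu);
void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d);

Making them static is the better fix here, since presumably nothing outside common.c uses them.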



Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/sysdev/xive/common.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index f591be9f01f4..a80440af491a 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1565,7 +1565,7 @@ static int __init xive_off(char *arg)
  }
  __setup("xive=off", xive_off);
  
-void xive_debug_show_cpu(struct seq_file *m, int cpu)

+static void xive_debug_show_cpu(struct seq_file *m, int cpu)
  {
struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
  
@@ -1599,7 +1599,7 @@ void xive_debug_show_cpu(struct seq_file *m, int cpu)

seq_puts(m, "\n");
  }
  
-void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)

+static void xive_debug_show_irq(struct seq_file *m, u32 hw_irq, struct irq_data *d)
  {
struct irq_chip *chip = irq_data_get_irq_chip(d);
int rc;



Re: [PATCH 3/7] powerpc/sstep: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 23:02, Cédric Le Goater wrote:

../arch/powerpc/lib/sstep.c: In function ‘mlsd_8lsd_ea’:
../arch/powerpc/lib/sstep.c:225:3: error: suggest braces around empty body in an ‘if’ statement [-Werror=empty-body]
; /* Invalid form. Should already be checked for by caller! */
^


A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code to know what the commit does to fix the 
warning. Also the subject should be more explicit.






Cc: Jordan Niethe 
Signed-off-by: Cédric Le Goater 
---
  arch/powerpc/lib/sstep.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index caee8cc77e19..14572af16e55 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -221,8 +221,9 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
; /* Leave ea as is */
else if (prefix_r && !ra)
ea += regs->nip;
-   else if (prefix_r && ra)
+   else if (prefix_r && ra) {
; /* Invalid form. Should already be checked for by caller! */
+   }


You can't do that. Now checkpatch will complain that you don't have 
braces on all legs of the if/else dance.


I think the last 'else if' should simply be removed entirely, as it does 
nothing. Possibly just leave the comment, something like:


	/* (prefix_r && ra) is Invalid form. Should already be checked for by 
caller! */


And if (prefix_r && ra) is not possible, then the previous if should 
just be 'if (prefix_r)'.
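
Putting both suggestions together, the tail of mlsd_8lsd_ea() would reduce to something like this sketch (assuming the invalid form really cannot reach this point):

	if (prefix_r)
		ea += regs->nip;
	/* (prefix_r && ra) is an invalid form, already checked for by the caller */

	return ea;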


Christophe


  
  	return ea;

  }



Re: [PATCH 2/7] powerpc/prom: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 23:02, Cédric Le Goater wrote:

arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’:
arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not used [-Werror=unused-but-set-variable]
   __be64 *reserve_map;
   ^~~
cc1: all warnings being treated as errors


A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code to know what the commit does to fix the 
warning. Also the subject should be more explicit.





Cc: Christophe Leroy 
Signed-off-by: Cédric Le Goater 
---
  arch/powerpc/kernel/prom.c | 51 +++---
  1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d8a2fb87ba0c..4bae9ebc7d0b 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -622,11 +622,6 @@ static void __init early_reserve_mem_dt(void)
  
  static void __init early_reserve_mem(void)

  {
-   __be64 *reserve_map;
-
-   reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
-   fdt_off_mem_rsvmap(initial_boot_params));
-
/* Look for the new "reserved-regions" property in the DT */
early_reserve_mem_dt();
  
@@ -639,28 +634,34 @@ static void __init early_reserve_mem(void)

}
  #endif /* CONFIG_BLK_DEV_INITRD */
  
-#ifdef CONFIG_PPC32


Instead of such a big change, you could simply do the following, in 
addition to moving the reserve_map allocation after it.


if (!IS_ENABLED(CONFIG_PPC32))
return;


-   /*
-* Handle the case where we might be booting from an old kexec
-* image that setup the mem_rsvmap as pairs of 32-bit values
-*/
-   if (be64_to_cpup(reserve_map) > 0xffffffffull) {
-   u32 base_32, size_32;
-   __be32 *reserve_map_32 = (__be32 *)reserve_map;
-
-   DBG("Found old 32-bit reserve map\n");
-
-   while (1) {
-   base_32 = be32_to_cpup(reserve_map_32++);
-   size_32 = be32_to_cpup(reserve_map_32++);
-   if (size_32 == 0)
-   break;
-   DBG("reserving: %x -> %x\n", base_32, size_32);
-   memblock_reserve(base_32, size_32);
+   if (IS_ENABLED(CONFIG_PPC32)) {
+   __be64 *reserve_map;
+
+   reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
+fdt_off_mem_rsvmap(initial_boot_params));
+
+   /*
+* Handle the case where we might be booting from an
+* old kexec image that setup the mem_rsvmap as pairs
+* of 32-bit values
+*/
+   if (be64_to_cpup(reserve_map) > 0xffffffffull) {
+   u32 base_32, size_32;
+   __be32 *reserve_map_32 = (__be32 *)reserve_map;
+
+   DBG("Found old 32-bit reserve map\n");
+
+   while (1) {
+   base_32 = be32_to_cpup(reserve_map_32++);
+   size_32 = be32_to_cpup(reserve_map_32++);
+   if (size_32 == 0)
+   break;
+   DBG("reserving: %x -> %x\n", base_32, size_32);
+   memblock_reserve(base_32, size_32);
+   }
+   return;
}
-   return;
}
-#endif
  }
  
  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM





Christophe


Re: [PATCH 1/7] powerpc/sysfs: Fix W=1 compile warning

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 23:02, Cédric Le Goater wrote:

arch/powerpc/kernel/sysfs.c: In function ‘sysfs_create_dscr_default’:
arch/powerpc/kernel/sysfs.c:228:7: error: variable ‘err’ set but not used [-Werror=unused-but-set-variable]
int err = 0;
^~~
cc1: all warnings being treated as errors


A small sentence explaining how this is fixed would be welcome, so that 
you don't need to read the code to know what the commit does to fix the 
warning. Even the subject should be more explicit: rather than saying 
"Fix W=1 compile warning", I think it should say something like "remove 
unused err variable".





Cc: Madhavan Srinivasan 
Signed-off-by: Cédric Le Goater 


Reviewed-by: Christophe Leroy 


---
  arch/powerpc/kernel/sysfs.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 46b4ebc33db7..821a3dc4c924 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -225,14 +225,13 @@ static DEVICE_ATTR(dscr_default, 0600,
  static void sysfs_create_dscr_default(void)
  {
if (cpu_has_feature(CPU_FTR_DSCR)) {
-   int err = 0;
int cpu;
  
  		dscr_default = spr_default_dscr;

for_each_possible_cpu(cpu)
paca_ptrs[cpu]->dscr_default = dscr_default;
  
-		err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);

+   device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
}
  }
  #endif /* CONFIG_PPC64 */



[PATCH] powerpc/kasan: Fix CONFIG_KASAN_VMALLOC for 8xx

2020-09-10 Thread Christophe Leroy
Before the commit identified below, page table allocation was
performed after the allocation of the final shadow area for linear memory.
But that commit switched the order, leading to page tables being
already allocated at the time 8xx kasan_init_shadow_8M() is called.
Due to this, kasan_init_shadow_8M() doesn't map the needed
shadow entries because there are already page tables.

kasan_init_shadow_8M() installs huge PMD entries instead of page
tables. We could at that time free the page tables, but there is no
point in creating page tables that get freed before being used.

Only book3s/32 hash needs early allocation of page tables. For other
variants, we can keep the initial order and create remaining page
tables after the allocation of final shadow memory for linear mem.

Move back the allocation of shadow page tables for
CONFIG_KASAN_VMALLOC into kasan_init() after the loop which creates
final shadow memory for linear mem.

Fixes: 41ea93cf7ba4 ("powerpc/kasan: Fix shadow pages allocation failure")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/kasan/kasan_init_32.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index fb294046e00e..929716ea21e9 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -127,8 +127,7 @@ void __init kasan_mmu_init(void)
 {
int ret;
 
-   if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ||
-   IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+   if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
		ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
 
if (ret)
@@ -139,11 +138,11 @@ void __init kasan_mmu_init(void)
 void __init kasan_init(void)
 {
struct memblock_region *reg;
+   int ret;
 
for_each_memblock(memory, reg) {
phys_addr_t base = reg->base;
phys_addr_t top = min(base + reg->size, total_lowmem);
-   int ret;
 
if (base >= top)
continue;
@@ -153,6 +152,13 @@ void __init kasan_init(void)
panic("kasan: kasan_init_region() failed");
}
 
+   if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
+		ret = kasan_init_shadow_page_tables(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+   if (ret)
+   panic("kasan: kasan_init_shadow_page_tables() failed");
+   }
+
kasan_remap_early_shadow_ro();
 
clear_page(kasan_early_shadow_page);
-- 
2.25.0



Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-10 Thread Christophe Leroy




On 10/09/2020 at 10:04, David Laight wrote:

From: Linus Torvalds

Sent: 09 September 2020 22:34
On Wed, Sep 9, 2020 at 11:42 AM Segher Boessenkool
 wrote:


It will not work like this in GCC, no.  The LLVM people know about that.
I do not know why they insist on pushing this, being incompatible and
everything.


Umm. Since they'd be the ones supporting this, *gcc* would be the
incompatible one, not clang.


I had an 'interesting' idea.

Can you use a local asm register variable as an input and output to
an 'asm volatile goto' statement?

Well you can - but is it guaranteed to work :-)



With gcc at least it should work according to 
https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html


They even explicitly say: "The only supported use for this feature is 
to specify registers for input and output operands when calling Extended 
asm".
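
As a concrete illustration (my own sketch, not from the thread or the GCC docs): a local register variable can pin a value to a specific GPR and be used as an input operand of an asm goto statement. Note that output operands on asm goto are a later extension (GCC 11+ and recent clang); before that, only inputs are allowed:

static inline int is_zero_in_r3(unsigned long x)
{
	register unsigned long v asm("r3") = x;	/* force x into GPR r3 */

	asm volatile goto("cmpwi	%0, 0\n\t"
			  "beq	%l[zero]"
			  : /* no outputs: not allowed before GCC 11 */
			  : "r" (v)
			  : "cr0"
			  : zero);
	return 0;
zero:
	return 1;
}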


Christophe


Re: [RFC PATCH v2 2/3] mm: make pXd_addr_end() functions page-table entry aware

2020-09-09 Thread Christophe Leroy
On Tue, 2020-09-08 at 16:15 +0200, Alexander Gordeev wrote:
> On Tue, Sep 08, 2020 at 10:16:49AM +0200, Christophe Leroy wrote:
> > >Yes, and also two more sources :/
> > >   arch/powerpc/mm/kasan/8xx.c
> > >   arch/powerpc/mm/kasan/kasan_init_32.c
> > >
> > >But these two are not quite obvious wrt pgd_addr_end() used
> > >while traversing pmds. Could you please clarify a bit?
> > >
> > >
> > >diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
> > >index 2784224..89c5053 100644
> > >--- a/arch/powerpc/mm/kasan/8xx.c
> > >+++ b/arch/powerpc/mm/kasan/8xx.c
> > >@@ -15,8 +15,8 @@
> > >   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block 
> > > += SZ_8M) {
> > >   pte_basic_t *new;
> > >-  k_next = pgd_addr_end(k_cur, k_end);
> > >-  k_next = pgd_addr_end(k_next, k_end);
> > >+  k_next = pmd_addr_end(k_cur, k_end);
> > >+  k_next = pmd_addr_end(k_next, k_end);
> > 
> > No, I don't think so.
> > On powerpc32 we have only two levels, so pgd and pmd are more or
> > less the same.
> > But pmd_addr_end() as defined in include/asm-generic/pgtable-nopmd.h
> > is a no-op, so I don't think it will work.
> > 
> > It is likely that this function should iterate on pgd, then you get
> > pmd = pmd_offset(pud_offset(p4d_offset(pgd)));
> 
> It looks like the code iterates over single pmd table while using
> pgd_addr_end() only to skip all the middle levels and bail out
> from the loop.
> 
> I would be wary of switching from pmds to pgds, since we are
> trying to minimize impact (especially functional) and the
> rework does not seem that obvious.
> 

I've just tested the following change, it works and should fix the
oddity:

diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
index 2784224054f8..8e53ddf57b84 100644
--- a/arch/powerpc/mm/kasan/8xx.c
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -9,11 +9,12 @@
 static int __init
 kasan_init_shadow_8M(unsigned long k_start, unsigned long k_end, void *block)
 {
-   pmd_t *pmd = pmd_off_k(k_start);
+   pgd_t *pgd = pgd_offset_k(k_start);
unsigned long k_cur, k_next;
 
-   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block += SZ_8M) {
+   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pgd += 2, block += SZ_8M) {
pte_basic_t *new;
+		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd, k_cur), k_cur), k_cur);
 
k_next = pgd_addr_end(k_cur, k_end);
k_next = pgd_addr_end(k_next, k_end);
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index fb294046e00e..e5f524fa71a7 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -30,13 +30,12 @@ static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot)
 
 int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_end)
 {
-   pmd_t *pmd;
+   pgd_t *pgd = pgd_offset_k(k_start);
unsigned long k_cur, k_next;
 
-   pmd = pmd_off_k(k_start);
-
-   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
+   for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pgd++) {
pte_t *new;
+		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd, k_cur), k_cur), k_cur);
 
k_next = pgd_addr_end(k_cur, k_end);
if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
@@ -189,16 +188,18 @@ void __init kasan_early_init(void)
unsigned long addr = KASAN_SHADOW_START;
unsigned long end = KASAN_SHADOW_END;
unsigned long next;
-   pmd_t *pmd = pmd_off_k(addr);
+   pgd_t *pgd = pgd_offset_k(addr);
 
BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK);
 
kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
 
do {
+		pmd_t *pmd = pmd_offset(pud_offset(p4d_offset(pgd, addr), addr), addr);
+
next = pgd_addr_end(addr, end);
		pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
-   } while (pmd++, addr = next, addr != end);
+   } while (pgd++, addr = next, addr != end);
 
if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE))
kasan_early_hash_table();
---
Christophe



Re: [PATCH v1 4/5] powerpc/fault: Avoid heavy search_exception_tables() verification

2020-09-09 Thread Christophe Leroy




On 09/09/2020 at 08:04, Aneesh Kumar K.V wrote:

Christophe Leroy  writes:


search_exception_tables() is a heavy operation, we have to avoid it.
When KUAP is selected, we'll know the fault has been blocked by KUAP.
Otherwise, it behaves just as if the address was already in the TLBs
and no fault was generated.

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/fault.c | 20 +---
  1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 525e0c2b5406..edde169ba3a6 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -214,24 +214,14 @@ static bool bad_kernel_fault(struct pt_regs *regs, 
unsigned long error_code,
if (address >= TASK_SIZE)
return true;
  
-	if (!is_exec && (error_code & DSISR_PROTFAULT) &&

-   !search_exception_tables(regs->nip)) {
+   // Read/write fault blocked by KUAP is bad, it can never succeed.
+   if (bad_kuap_fault(regs, address, is_write)) {
		pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n",
-   address,
-   from_kuid(_user_ns, current_uid()));
-   }
-
-   // Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
-   if (!search_exception_tables(regs->nip))
-   return true;


We still need to keep this? Without that we detect the lack of
exception tables pretty late.


Is it a problem at all to detect the lack of exception tables late?
That case is very unlikely and will lead to failure anyway. So is it 
worth impacting the performance of the likely case, which will always have an 
exception table and where we expect the exception to run as fast as 
possible?


The other architectures I have looked at (arm64 and x86) only have the 
exception table search together with the down_read_trylock(&mm->mmap_sem).
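
For comparison, the pattern those architectures use looks roughly like this (a sketch, not the actual powerpc code):

	/*
	 * Only pay for the exception table search once the cheap
	 * mmap_sem trylock has already failed.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!is_user && !search_exception_tables(regs->nip))
			return bad_area_nosemaphore(regs, address);
		down_read(&mm->mmap_sem);
	}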


Christophe


Re: [RFC PATCH v2 3/3] mm: make generic pXd_addr_end() macros inline functions

2020-09-08 Thread Christophe Leroy




On 08/09/2020 at 17:48, Alexander Gordeev wrote:

On Tue, Sep 08, 2020 at 07:19:38AM +0200, Christophe Leroy wrote:

[...]


diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 67ebc22cf83d..d9e7d16c2263 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -656,31 +656,35 @@ static inline int arch_unmap_one(struct mm_struct *mm,
   */
  #ifndef pgd_addr_end
-#define pgd_addr_end(pgd, addr, end)   \
-({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
-   (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
+#define pgd_addr_end pgd_addr_end


I think that #define is pointless, usually there is no such #define
for the default case.


Default pgd_addr_end() gets overridden on s390 (arch/s390/include/asm/pgtable.h):

#define pgd_addr_end pgd_addr_end
static inline unsigned long pgd_addr_end(pgd_t pgd, unsigned long addr, unsigned long end)
{
return rste_addr_end_folded(pgd_val(pgd), addr, end);
}


Yes, there on s390 the #define is needed to hit the #ifndef pgd_addr_end 
that's in include/linux/pgtable.h.


But in include/linux/pgtable.h, there is no need for a #define 
pgd_addr_end pgd_addr_end, I think.





+static inline unsigned long pgd_addr_end(pgd_t pgd, unsigned long addr, unsigned long end)
+{  unsigned long __boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
+   return (__boundary - 1 < end - 1) ? __boundary : end;
+}



Christophe


Re: [PATCH] powerpc/64: Make VDSO32 track COMPAT on 64-bit

2020-09-08 Thread Christophe Leroy




On 08/09/2020 at 14:58, Michael Ellerman wrote:

When we added the VDSO32 kconfig symbol, which controls building of
the 32-bit VDSO, we made it depend on CPU_BIG_ENDIAN (for 64-bit).

That was because back then COMPAT was always enabled for 64-bit, so
depending on it would have left the 32-bit VDSO always enabled, which
we didn't want.

But since then we have made COMPAT selectable, and off by default for
ppc64le, so VDSO32 should really depend on that.

For most people this makes no difference, none of the defconfigs
change, it's only if someone is building ppc64le with COMPAT=y, they
will now also get VDSO32. If they've enabled COMPAT in order to run
32-bit binaries they presumably also want the 32-bit VDSO.

Signed-off-by: Michael Ellerman 



Reviewed-by: Christophe Leroy 

Michael, please note that christophe.le...@c-s.fr is a deprecated 
address that will one day not work anymore. Please use the new one 
whenever possible.


Christophe



---
  arch/powerpc/platforms/Kconfig.cputype | 7 +++
  1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 87737ec86d39..a80ad0ef436e 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -490,13 +490,12 @@ endmenu
  
  config VDSO32

def_bool y
-   depends on PPC32 || CPU_BIG_ENDIAN
+   depends on PPC32 || COMPAT
help
  This symbol controls whether we build the 32-bit VDSO. We obviously
  want to do that if we're building a 32-bit kernel. If we're building
- a 64-bit kernel then we only want a 32-bit VDSO if we're building for
- big endian. That is because the only little endian configuration we
- support is ppc64le which is 64-bit only.
+ a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling
+ COMPAT.
  
  choice

prompt "Endianness selection"



Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-08 Thread Christophe Leroy




On 08/09/2020 at 14:09, Christian Borntraeger wrote:



On 08.09.20 07:06, Christophe Leroy wrote:



On 07/09/2020 at 20:00, Gerald Schaefer wrote:

From: Alexander Gordeev 

Commit 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast
code") introduced a subtle but severe bug on s390 with gup_fast, due to
dynamic page table folding.

The question "What would it require for the generic code to work for s390"
has already been discussed here
https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
and ended with a promising approach here
https://lkml.kernel.org/r/20190419153307.4f2911b5@mschwideX1
which in the end unfortunately didn't quite work completely.

We tried to mimic static level folding by changing pgd_offset to always
calculate top level page table offset, and do nothing in folded pXd_offset.
What has been overlooked is that PxD_SIZE/MASK and thus pXd_addr_end do
not reflect this dynamic behaviour, and still act like static 5-level
page tables.



[...]



Fix this by introducing new pXd_addr_end_folded helpers, which take an
additional pXd entry value parameter, that can be used on s390
to determine the correct page table level and return corresponding
end / boundary. With that, the pointer iteration will always
happen in gup_pgd_range for s390. No change for other architectures
introduced.


Not sure pXd_addr_end_folded() is the most understandable name, although I 
don't have any alternative suggestion at the moment.
Maybe it could be something like pXd_addr_end_fixup(), as it will disappear in the 
next patch, or pXd_addr_end_gup()?

Also, if it happens to be acceptable to get patch 2 in stable, I think you 
should switch patch 1 and patch 2 to avoid the step through 
pXd_addr_end_folded()


Given that this fixes a data corruption issue, wouldn't it be best to go 
forward with this patch ASAP and then handle the other patches on top, 
with all the time that we need?


I have no strong opinion on this, but it feels rather tricky to have to 
change the generic part of GUP to use a new function and then revert that 
change in the following patch, just because you want the first patch in 
stable and not the second one.


Regardless, I was wondering, why do we need a reference to the pXd at 
all when calling pXd_addr_end() ?


Couldn't S390 retrieve the pXd by using the pXd_offset() dance with the 
passed addr ?


Christophe


Re: [PATCH v1 1/5] powerpc/mm: sanity_check_fault() should work for all, not only BOOK3S

2020-09-08 Thread Christophe Leroy




On 08/09/2020 at 10:43, Nicholas Piggin wrote:

Excerpts from Christophe Leroy's message of August 7, 2020 3:15 am:

The verification and message introduced by commit 374f3f5979f9
("powerpc/mm/hash: Handle user access of kernel address gracefully")
applies to all platforms, it should not be limited to BOOK3S.

Make the BOOK3S version of sanity_check_fault() the one for all,
and bail out earlier if not BOOK3S.

Fixes: 374f3f5979f9 ("powerpc/mm/hash: Handle user access of kernel address 
gracefully")
Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/fault.c | 8 +++-
  1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 925a7231abb3..2efa34d7e644 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -303,7 +303,6 @@ static inline void cmo_account_page_fault(void)
  static inline void cmo_account_page_fault(void) { }
  #endif /* CONFIG_PPC_SMLPAR */
  
-#ifdef CONFIG_PPC_BOOK3S

  static void sanity_check_fault(bool is_write, bool is_user,
   unsigned long error_code, unsigned long address)
  {
@@ -320,6 +319,9 @@ static void sanity_check_fault(bool is_write, bool is_user,
return;
}
  
+	if (!IS_ENABLED(CONFIG_PPC_BOOK3S))

+   return;


Seems okay. Why is address == -1 special though? I guess it's because
it may not be an exploit kernel reference but a buggy pointer underflow?
In that case -1 doesn't seem like it would catch very much. Would it be
better to test for high bit set for example ((long)address < 0) ?


See 
https://github.com/linuxppc/linux/commit/0f9aee0cb9da7db7d96f63cfa2dc5e4f1bffeb87#diff-f9658f412252f3bb3093e0a95b37f3ac


-1 is what mmap() returns on error; if the app uses that as a pointer, 
that's a programming error, not an exploit.


Er... if you test (long)address < 0, then the entire kernel falls into 
that range, as it usually goes from 0xc0000000 to 0xffffffff.


But we could skip the top page entirely; it is never mapped anyway.
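
A minimal sketch of that idea (mine, assuming bad_kernel_fault() is the right place for it):

	/*
	 * The top page is never mapped: a fault there is almost certainly
	 * pointer arithmetic on an mmap() error return ((void *)-1),
	 * not an exploit attempt.
	 */
	if (address >= (unsigned long)-PAGE_SIZE)
		return true;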



Anyway for your patch

Reviewed-by: Nicholas Piggin 


Thanks
Christophe



Re: [RFC PATCH 02/12] powerpc: remove arguments from interrupt handler functions

2020-09-08 Thread Christophe Leroy




On 08/09/2020 at 10:29, Christophe Leroy wrote:



On 08/09/2020 at 09:48, Nicholas Piggin wrote:

Excerpts from Christophe Leroy's message of September 7, 2020 9:34 pm:

On Mon, 2020-09-07 at 11:20 +0200, Christophe Leroy wrote:


On 05/09/2020 at 19:43, Nicholas Piggin wrote:

Make interrupt handlers all just take the pt_regs * argument and load
DAR/DSISR etc from that. Make those that return a value return long.


I like this; it will likely simplify the VMAP_STACK mess a bit.

Not sure it is that easy. My board is stuck after the start of init.


On the 8xx, on Instruction TLB Error exception, we do

andis.    r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */


On book3s/32, on ISI exception we do:
andis.    r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */


On 40x and bookE, on ISI exception we do:
li    r5,0    /* Pass zero as arg3 */


And regs->dsisr will just contain nothing

So it means we should at least write back r5 into regs->dsisr from 
there

? The performance impact should be minimal as we already write _DAR so
the cache line should already be in the cache.

A hacky 'stw r5, _DSISR(r1)' in handle_page_fault() does the trick, 
although we don't want to do it for both ISI and DSI at the end, so 
you'll have to do it in every head_xxx.S


To get your series to build and work, I did the following hacks:


Great, thanks for this.


diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index acfcc7d5779b..c11045d3113a 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -93,7 +93,9 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
  {
  nmi_exit();
+#ifdef CONFIG_PPC64
  this_cpu_set_ftrace_enabled(state->ftrace_enabled);
+#endif


This seems okay, not a hack.


  #ifdef CONFIG_PPC_BOOK3S_64
  /* Check we didn't change the pending interrupt mask. */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index f4d0af8e1136..66f7adbe1076 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -663,6 +663,7 @@ ppc_swapcontext:
   */
  .globl    handle_page_fault
  handle_page_fault:
+    stw    r5,_DSISR(r1)
  addi    r3,r1,STACK_FRAME_OVERHEAD
  #ifdef CONFIG_PPC_BOOK3S_32
  andis.  r0,r5,DSISR_DABRMATCH@h


Is this what you want to do for 32, or do you want to separate
ISI and DSI sides?



No I think we want to separate ISI and DSI sides.

And I think the specific filtering done in ISI could be done in 
do_page_fault() in C. OK, it would mean special handling for is_exec, 
but there are already several places where the behaviour differs based 
on is_exec.
The only thing we need to keep at ASM level is the DABR stuff for 
calling do_break() in handle_page_fault(), because it is used to decide 
whether full regs are saved or not. But it could be a test done earlier 
in the prolog and the result being kept in one of the CR bits.


Looking at it once more, I'm wondering whether we really need a full regs.

Before commit 
https://github.com/linuxppc/linux/commit/d300627c6a53693fb01479b59b0cdd293761b1fa#diff-f9658f412252f3bb3093e0a95b37f3ac 
do_break() was called from do_page_fault() without a full regs set.


Christophe


Re: [RFC PATCH 02/12] powerpc: remove arguments from interrupt handler functions

2020-09-08 Thread Christophe Leroy




On 08/09/2020 at 09:48, Nicholas Piggin wrote:

Excerpts from Christophe Leroy's message of September 7, 2020 9:34 pm:

On Mon, 2020-09-07 at 11:20 +0200, Christophe Leroy wrote:


On 05/09/2020 at 19:43, Nicholas Piggin wrote:

Make interrupt handlers all just take the pt_regs * argument and load
DAR/DSISR etc from that. Make those that return a value return long.


I like this; it will likely simplify the VMAP_STACK mess a bit.

Not sure it is that easy. My board is stuck after the start of init.


On the 8xx, on Instruction TLB Error exception, we do

andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */

On book3s/32, on ISI exception we do:
andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */

On 40x and bookE, on ISI exception we do:
li  r5,0/* Pass zero as arg3 */


And regs->dsisr will just contain nothing

So it means we should at least write back r5 into regs->dsisr from there
? The performance impact should be minimal as we already write _DAR so
the cache line should already be in the cache.

A hacky 'stw r5, _DSISR(r1)' in handle_page_fault() does the trick,
although we don't want to do it for both ISI and DSI at the end, so
you'll have to do it in every head_xxx.S


To get your series to build and work, I did the following hacks:


Great, thanks for this.


diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index acfcc7d5779b..c11045d3113a 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -93,7 +93,9 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
  {
nmi_exit();
  
+#ifdef CONFIG_PPC64

this_cpu_set_ftrace_enabled(state->ftrace_enabled);
+#endif


This seems okay, not a hack.


  #ifdef CONFIG_PPC_BOOK3S_64
/* Check we didn't change the pending interrupt mask. */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index f4d0af8e1136..66f7adbe1076 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -663,6 +663,7 @@ ppc_swapcontext:
   */
.globl  handle_page_fault
  handle_page_fault:
+   stw r5,_DSISR(r1)
addir3,r1,STACK_FRAME_OVERHEAD
  #ifdef CONFIG_PPC_BOOK3S_32
andis.  r0,r5,DSISR_DABRMATCH@h


Is this what you want to do for 32, or do you want to separate
ISI and DSI sides?



No I think we want to separate ISI and DSI sides.

And I think the specific filtering done in ISI could be done in 
do_page_fault() in C. OK, it would mean special handling for is_exec, 
but there are already several places where the behaviour differs based 
on is_exec.
The only thing we need to keep at ASM level is the DABR stuff for 
calling do_break() in handle_page_fault(), because it is used to decide 
whether full regs are saved or not. But it could be a test done earlier 
in the prolog and the result being kept in one of the CR bits.


That way we can avoid reloading dsisr and dar from the stack, especially 
for VMAP stack as they are saved quite early in the prolog.


Or we can take them out of the thread struct and save them on the stack 
a little later in the prolog, but then we have to keep the RI bit 
cleared a bit longer.


While we are playing with do_page_fault(), did you have a look at the 
series I sent 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/7baae4086cbb9ffb08c933b065ff7d29dbc03dd6.1596734104.git.christophe.le...@csgroup.eu/


Christophe


Re: [RFC PATCH v2 2/3] mm: make pXd_addr_end() functions page-table entry aware

2020-09-08 Thread Christophe Leroy




On 08/09/2020 at 09:46, Alexander Gordeev wrote:

On Tue, Sep 08, 2020 at 07:14:38AM +0200, Christophe Leroy wrote:

You forgot arch/powerpc/mm/book3s64/subpage_prot.c it seems.


Yes, and also two more sources :/
arch/powerpc/mm/kasan/8xx.c
arch/powerpc/mm/kasan/kasan_init_32.c

But these two are not quite obvious wrt pgd_addr_end() used
while traversing pmds. Could you please clarify a bit?


diff --git a/arch/powerpc/mm/kasan/8xx.c b/arch/powerpc/mm/kasan/8xx.c
index 2784224..89c5053 100644
--- a/arch/powerpc/mm/kasan/8xx.c
+++ b/arch/powerpc/mm/kasan/8xx.c
@@ -15,8 +15,8 @@
	for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd += 2, block += SZ_8M) {
pte_basic_t *new;
  
-		k_next = pgd_addr_end(k_cur, k_end);

-   k_next = pgd_addr_end(k_next, k_end);
+   k_next = pmd_addr_end(k_cur, k_end);
+   k_next = pmd_addr_end(k_next, k_end);


No, I don't think so.
On powerpc32 we have only two levels, so pgd and pmd are more or less 
the same.
But pmd_addr_end() as defined in include/asm-generic/pgtable-nopmd.h is 
a no-op, so I don't think it will work.


It is likely that this function should iterate on pgd, then you get pmd 
= pmd_offset(pud_offset(p4d_offset(pgd)));



if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
continue;
  
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c

index fb29404..3f7d6dc6 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -38,7 +38,7 @@ int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned long k_
for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) {
pte_t *new;
  
-		k_next = pgd_addr_end(k_cur, k_end);

+   k_next = pmd_addr_end(k_cur, k_end);


Same here, I guess: iterate on pgd, then get pmd = 
pmd_offset(pud_offset(p4d_offset(pgd)));



if ((void *)pmd_page_vaddr(*pmd) != kasan_early_shadow_pte)
continue;
  
@@ -196,7 +196,7 @@ void __init kasan_early_init(void)

kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
  
  	do {

-   next = pgd_addr_end(addr, end);
+   next = pmd_addr_end(addr, end);
	pmd_populate_kernel(&init_mm, pmd, kasan_early_shadow_pte);
} while (pmd++, addr = next, addr != end);
  



Christophe


Re: [RFC PATCH v2 0/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-07 Thread Christophe Leroy




On 07/09/2020 at 22:12, Mike Rapoport wrote:

On Mon, Sep 07, 2020 at 08:00:55PM +0200, Gerald Schaefer wrote:

This is v2 of an RFC previously discussed here:
https://lore.kernel.org/lkml/20200828140314.8556-1-gerald.schae...@linux.ibm.com/

Patch 1 is a fix for a regression in gup_fast on s390, after our conversion
to common gup_fast code. It will introduce special helper functions
pXd_addr_end_folded(), which have to be used in places where pagetable walk
is done w/o lock and with READ_ONCE, so currently only in gup_fast.

Patch 2 is an attempt to make that more generic, i.e. change pXd_addr_end()
themselves by adding an extra pXd value parameter. That was suggested by
Jason during v1 discussion, because he is already thinking of some other
places where he might want to switch to the READ_ONCE logic for pagetable
walks. In general, that would be the cleanest / safest solution, but there
is some impact on other architectures and common code, hence the new and
greatly enlarged recipient list.

Patch 3 is a "nice to have" add-on, which makes pXd_addr_end() inline
functions instead of #defines, so that we get some type checking for the
new pXd value parameter.

Not sure about Fixes/stable tags for the generic solution. Only patch 1
fixes a real bug on s390, and has Fixes/stable tags. Patches 2 + 3 might
still be nice to have in stable, to ease future backports, but I guess
"nice to have" does not really qualify for stable backports.


I also think that adding pXd parameter to pXd_addr_end() is a cleaner
way and with this patch 1 is not really required. I would even merge
patches 2 and 3 into a single patch and use only it as the fix.


Why not merge patches 2 and 3; but I would keep patch 1 separate, 
after the generic changes, so that we first do the generic changes 
and then the specific s390 use of them.


Christophe


Re: [RFC PATCH v2 3/3] mm: make generic pXd_addr_end() macros inline functions

2020-09-07 Thread Christophe Leroy




On 07/09/2020 at 20:00, Gerald Schaefer wrote:

From: Alexander Gordeev 

Since the pXd_addr_end() macros take a pXd page-table entry as a
parameter, it makes sense to check the entry type at compile time.
Even though most archs do not make use of page-table entries
in pXd_addr_end() calls, checking the type in traversal code
paths could help to avoid subtle bugs.

Signed-off-by: Alexander Gordeev 
Signed-off-by: Gerald Schaefer 
---
  include/linux/pgtable.h | 36 
  1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 67ebc22cf83d..d9e7d16c2263 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -656,31 +656,35 @@ static inline int arch_unmap_one(struct mm_struct *mm,
   */
  
  #ifndef pgd_addr_end

-#define pgd_addr_end(pgd, addr, end)   \
-({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
-   (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
+#define pgd_addr_end pgd_addr_end


I think that #define is pointless, usually there is no such #define for 
the default case.



+static inline unsigned long pgd_addr_end(pgd_t pgd, unsigned long addr, unsigned long end)
+{  unsigned long __boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
+   return (__boundary - 1 < end - 1) ? __boundary : end;
+}


Please use the standard layout, i.e. entry { and exit } alone on their 
own lines, and a blank line between the local var declarations and the rest.


Also remove the leading __ in front of var names, as it's not needed once 
these are not macros anymore.


f_name()
{
some_local_var;

do_something();
}


  #endif
  
  #ifndef p4d_addr_end

-#define p4d_addr_end(p4d, addr, end)   \
-({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK;  \
-   (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
+#define p4d_addr_end p4d_addr_end
+static inline unsigned long p4d_addr_end(p4d_t p4d, unsigned long addr, unsigned long end)
+{  unsigned long __boundary = (addr + P4D_SIZE) & P4D_MASK;
+   return (__boundary - 1 < end - 1) ? __boundary : end;
+}
  #endif
  
  #ifndef pud_addr_end

-#define pud_addr_end(pud, addr, end)   \
-({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK;  \
-   (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
+#define pud_addr_end pud_addr_end
+static inline unsigned long pud_addr_end(pud_t pud, unsigned long addr, unsigned long end)
+{  unsigned long __boundary = (addr + PUD_SIZE) & PUD_MASK;
+   return (__boundary - 1 < end - 1) ? __boundary : end;
+}
  #endif
  
  #ifndef pmd_addr_end

-#define pmd_addr_end(pmd, addr, end)   \
-({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK;  \
-   (__boundary - 1 < (end) - 1)? __boundary: (end); \
-})
+#define pmd_addr_end pmd_addr_end
+static inline unsigned long pmd_addr_end(pmd_t pmd, unsigned long addr, unsigned long end)
+{  unsigned long __boundary = (addr + PMD_SIZE) & PMD_MASK;
+   return (__boundary - 1 < end - 1) ? __boundary : end;
+}
  #endif
  
  /*




Re: [RFC PATCH v2 2/3] mm: make pXd_addr_end() functions page-table entry aware

2020-09-07 Thread Christophe Leroy




On 07/09/2020 at 20:00, Gerald Schaefer wrote:

From: Alexander Gordeev 

Unlike all other page-table abstractions, pXd_addr_end() do not take
into account the particular table entry in whose context the functions
are called. On architectures with dynamic page-table folding that
might lead to a lack of necessary information that is difficult to
obtain other than from the table entry itself. That already led to
a subtle memory corruption issue on s390.

By letting the pXd_addr_end() functions know about the page-table entry
we allow archs not only to make extra checks, but also optimizations.

As a result of this change the pXd_addr_end_folded() functions used
in gup_fast traversal code become unnecessary and get replaced with
universal pXd_addr_end() variants.

The arch-specific updates not only add dereferencing of page-table
entry pointers, but also small changes to the code flow to make those
dereferences possible, at least for x86 and powerpc. Also for arm64,
but in a way that should not have any impact.



[...]



Signed-off-by: Alexander Gordeev 
Signed-off-by: Gerald Schaefer 
---
  arch/arm/include/asm/pgtable-2level.h|  2 +-
  arch/arm/mm/idmap.c  |  6 ++--
  arch/arm/mm/mmu.c|  8 ++---
  arch/arm64/kernel/hibernate.c| 16 ++
  arch/arm64/kvm/mmu.c | 16 +-
  arch/arm64/mm/kasan_init.c   |  8 ++---
  arch/arm64/mm/mmu.c  | 25 +++
  arch/powerpc/mm/book3s64/radix_pgtable.c |  7 ++---
  arch/powerpc/mm/hugetlbpage.c|  6 ++--


You forgot arch/powerpc/mm/book3s64/subpage_prot.c it seems.


  arch/s390/include/asm/pgtable.h  |  8 ++---
  arch/s390/mm/page-states.c   |  8 ++---
  arch/s390/mm/pageattr.c  |  8 ++---
  arch/s390/mm/vmem.c  |  8 ++---
  arch/sparc/mm/hugetlbpage.c  |  6 ++--
  arch/um/kernel/tlb.c |  8 ++---
  arch/x86/mm/init_64.c| 15 -
  arch/x86/mm/kasan_init_64.c  | 16 +-
  include/asm-generic/pgtable-nop4d.h  |  2 +-
  include/asm-generic/pgtable-nopmd.h  |  2 +-
  include/asm-generic/pgtable-nopud.h  |  2 +-
  include/linux/pgtable.h  | 26 ---
  mm/gup.c |  8 ++---
  mm/ioremap.c |  8 ++---
  mm/kasan/init.c  | 17 +-
  mm/madvise.c |  4 +--
  mm/memory.c  | 40 
  mm/mlock.c   | 18 ---
  mm/mprotect.c|  8 ++---
  mm/pagewalk.c|  8 ++---
  mm/swapfile.c|  8 ++---
  mm/vmalloc.c | 16 +-
  31 files changed, 165 insertions(+), 173 deletions(-)


Christophe


Re: [RFC PATCH v2 1/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-07 Thread Christophe Leroy




On 07/09/2020 at 20:00, Gerald Schaefer wrote:

From: Alexander Gordeev 

Commit 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast
code") introduced a subtle but severe bug on s390 with gup_fast, due to
dynamic page table folding.

The question "What would it require for the generic code to work for s390"
has already been discussed here
https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
and ended with a promising approach here
https://lkml.kernel.org/r/20190419153307.4f2911b5@mschwideX1
which in the end unfortunately didn't quite work completely.

We tried to mimic static level folding by changing pgd_offset to always
calculate top level page table offset, and do nothing in folded pXd_offset.
What has been overlooked is that PxD_SIZE/MASK and thus pXd_addr_end do
not reflect this dynamic behaviour, and still act like static 5-level
page tables.



[...]



Fix this by introducing new pXd_addr_end_folded helpers, which take an
additional pXd entry value parameter, that can be used on s390
to determine the correct page table level and return corresponding
end / boundary. With that, the pointer iteration will always
happen in gup_pgd_range for s390. No change for other architectures
introduced.


I'm not sure pXd_addr_end_folded() is the most understandable name, 
although I don't have an alternative suggestion at the moment.
Maybe it could be something like pXd_addr_end_fixup(), as it will disappear 
in the next patch, or pXd_addr_end_gup()?


Also, if it happens to be acceptable to get patch 2 in stable, I think 
you should switch patch 1 and patch 2 to avoid the step through 
pXd_addr_end_folded()





Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
Cc:  # 5.2+
Reviewed-by: Gerald Schaefer 
Signed-off-by: Alexander Gordeev 
Signed-off-by: Gerald Schaefer 
---
  arch/s390/include/asm/pgtable.h | 42 +
  include/linux/pgtable.h | 16 +
  mm/gup.c|  8 +++
  3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7eb01a5459cd..027206e4959d 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -512,6 +512,48 @@ static inline bool mm_pmd_folded(struct mm_struct *mm)
  }
  #define mm_pmd_folded(mm) mm_pmd_folded(mm)
  
+/*

+ * With dynamic page table levels on s390, the static pXd_addr_end() functions
+ * will not return corresponding dynamic boundaries. This is no problem as long
+ * as only pXd pointers are passed down during page table walk, because
+ * pXd_offset() will simply return the given pointer for folded levels, and the
+ * pointer iteration over a range simply happens at the correct page table
+ * level.
+ * It is however a problem with gup_fast, or other places walking the page
+ * tables w/o locks using READ_ONCE(), and passing down the pXd values instead
+ * of pointers. In this case, the pointer given to pXd_offset() is a pointer to
+ * a stack variable, which cannot be used for pointer iteration at the correct
+ * level. Instead, the iteration then has to happen by going up to pgd level
+ * again. To allow this, provide pXd_addr_end_folded() functions with an
+ * additional pXd value parameter, which can be used on s390 to determine the
+ * folding level and return the corresponding boundary.
+ */
+static inline unsigned long rste_addr_end_folded(unsigned long rste, unsigned long addr, unsigned long end)


What does 'rste' stand for?

Isn't this line a bit long?


+{
+   unsigned long type = (rste & _REGION_ENTRY_TYPE_MASK) >> 2;
+   unsigned long size = 1UL << (_SEGMENT_SHIFT + type * 11);
+   unsigned long boundary = (addr + size) & ~(size - 1);
+
+   /*
+* FIXME The below check is for internal testing only, to be removed
+*/
+   VM_BUG_ON(type < (_REGION_ENTRY_TYPE_R3 >> 2));
+
+   return (boundary - 1) < (end - 1) ? boundary : end;
+}
+
+#define pgd_addr_end_folded pgd_addr_end_folded
+static inline unsigned long pgd_addr_end_folded(pgd_t pgd, unsigned long addr, unsigned long end)
+{
+   return rste_addr_end_folded(pgd_val(pgd), addr, end);
+}
+
+#define p4d_addr_end_folded p4d_addr_end_folded
+static inline unsigned long p4d_addr_end_folded(p4d_t p4d, unsigned long addr, unsigned long end)
+{
+   return rste_addr_end_folded(p4d_val(p4d), addr, end);
+}
+
  static inline int mm_has_pgste(struct mm_struct *mm)
  {
  #ifdef CONFIG_PGSTE
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e8cbc2e795d5..981c4c2a31fe 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -681,6 +681,22 @@ static inline int arch_unmap_one(struct mm_struct *mm,
  })
  #endif
  
+#ifndef pgd_addr_end_folded

+#define pgd_addr_end_folded(pgd, addr, end)	pgd_addr_end(addr, end)
+#endif
+
+#ifndef p4d_addr_end_folded
+#define p4d_addr_end_folded(p4d, addr, end)	p4d_addr_end(addr, end)

Re: [RFC PATCH v2 0/3] mm/gup: fix gup_fast with dynamic page table folding

2020-09-07 Thread Christophe Leroy




On 07/09/2020 at 20:00, Gerald Schaefer wrote:

This is v2 of an RFC previously discussed here:
https://lore.kernel.org/lkml/20200828140314.8556-1-gerald.schae...@linux.ibm.com/

Patch 1 is a fix for a regression in gup_fast on s390, after our conversion
to common gup_fast code. It will introduce special helper functions
pXd_addr_end_folded(), which have to be used in places where pagetable walk
is done w/o lock and with READ_ONCE, so currently only in gup_fast.

Patch 2 is an attempt to make that more generic, i.e. change pXd_addr_end()
themselves by adding an extra pXd value parameter. That was suggested by
Jason during v1 discussion, because he is already thinking of some other
places where he might want to switch to the READ_ONCE logic for pagetable
walks. In general, that would be the cleanest / safest solution, but there
is some impact on other architectures and common code, hence the new and
greatly enlarged recipient list.

Patch 3 is a "nice to have" add-on, which makes pXd_addr_end() inline
functions instead of #defines, so that we get some type checking for the
new pXd value parameter.

Not sure about Fixes/stable tags for the generic solution. Only patch 1
fixes a real bug on s390, and has Fixes/stable tags. Patches 2 + 3 might
still be nice to have in stable, to ease future backports, but I guess
"nice to have" does not really qualify for stable backports.


If one day you have to backport a fix that requires patch 2 and/or 3, 
just mark it "depends-on:" and the patches will go in stable at the 
relevant time.


Christophe


[PATCH 2/2] powerpc/32: Fix vmap stack - Properly set r1 before activating MMU

2020-09-07 Thread Christophe Leroy
We need r1 to be properly set before activating MMU, otherwise any new
exception taken while saving registers into the stack in exception
prologs will use the user stack, which is wrong and will even lockup
or crash when KUAP is selected.

Do that by switching the meaning of r11 and r1 until we have saved r1
to the stack: copy r1 into r11 and setup the new stack pointer in r1.
To avoid complicating and impacting all generic and specific prolog
code (and more), copy back r1 into r11 once r11 is saved onto
the stack.

We could get rid of copying r1 back and forth at the cost of
rewriting everything to use r1 instead of r11 all the way when
CONFIG_VMAP_STACK is set, but the effort is probably not worth it.

Fixes: 028474876f47 ("powerpc/32: prepare for CONFIG_VMAP_STACK")
Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_32.h | 43 +++
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 21effebb9277..cc36998c5541 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -39,15 +39,24 @@
 .endm
 
 .macro EXCEPTION_PROLOG_1 for_rtas=0
+#ifdef CONFIG_VMAP_STACK
+   mr  r11, r1
+   subi    r1, r1, INT_FRAME_SIZE  /* use r1 if kernel */
+   beq 1f
+   mfspr   r1,SPRN_SPRG_THREAD
+   lwz r1,TASK_STACK-THREAD(r1)
+   addi    r1, r1, THREAD_SIZE - INT_FRAME_SIZE
+#else
    subi    r11, r1, INT_FRAME_SIZE /* use r1 if kernel */
    beq 1f
    mfspr   r11,SPRN_SPRG_THREAD
    lwz r11,TASK_STACK-THREAD(r11)
    addi    r11, r11, THREAD_SIZE - INT_FRAME_SIZE
+#endif
 1:
tophys_novmstack r11, r11
 #ifdef CONFIG_VMAP_STACK
-   mtcrf   0x7f, r11
+   mtcrf   0x7f, r1
bt  32 - THREAD_ALIGN_SHIFT, stack_overflow
 #endif
 .endm
@@ -62,6 +71,15 @@
stw r10,_CCR(r11)   /* save registers */
 #endif
mfspr   r10, SPRN_SPRG_SCRATCH0
+#ifdef CONFIG_VMAP_STACK
+   stw r11,GPR1(r1)
+   stw r11,0(r1)
+   mr  r11, r1
+#else
+   stw r1,GPR1(r11)
+   stw r1,0(r11)
+   tovirt(r1, r11) /* set new kernel sp */
+#endif
stw r12,GPR12(r11)
stw r9,GPR9(r11)
stw r10,GPR10(r11)
@@ -89,9 +107,6 @@
mfspr   r12,SPRN_SRR0
mfspr   r9,SPRN_SRR1
 #endif
-   stw r1,GPR1(r11)
-   stw r1,0(r11)
-   tovirt_novmstack r1, r11/* set new kernel sp */
 #ifdef CONFIG_40x
rlwinm  r9,r9,0,14,12   /* clear MSR_WE (necessary?) */
 #else
@@ -309,19 +324,19 @@
 .macro vmap_stack_overflow_exception
 #ifdef CONFIG_VMAP_STACK
 #ifdef CONFIG_SMP
-   mfspr   r11, SPRN_SPRG_THREAD
-   lwz r11, TASK_CPU - THREAD(r11)
-   slwi    r11, r11, 3
-   addis   r11, r11, emergency_ctx@ha
+   mfspr   r1, SPRN_SPRG_THREAD
+   lwz r1, TASK_CPU - THREAD(r1)
+   slwi    r1, r1, 3
+   addis   r1, r1, emergency_ctx@ha
 #else
-   lis r11, emergency_ctx@ha
+   lis r1, emergency_ctx@ha
 #endif
-   lwz r11, emergency_ctx@l(r11)
-   cmpwi   cr1, r11, 0
+   lwz r1, emergency_ctx@l(r1)
+   cmpwi   cr1, r1, 0
bne cr1, 1f
-   lis r11, init_thread_union@ha
-   addi    r11, r11, init_thread_union@l
-1: addi    r11, r11, THREAD_SIZE - INT_FRAME_SIZE
+   lis r1, init_thread_union@ha
+   addi    r1, r1, init_thread_union@l
+1: addi    r1, r1, THREAD_SIZE - INT_FRAME_SIZE
EXCEPTION_PROLOG_2
SAVE_NVGPRS(r11)
    addi    r3, r1, STACK_FRAME_OVERHEAD
-- 
2.25.0



[PATCH 1/2] powerpc/32: Fix vmap stack - Do not activate MMU before reading task struct

2020-09-07 Thread Christophe Leroy
We need r1 to be properly set before activating MMU, so
reading task_struct->stack must be done with MMU off.

This means we need an additional register to play with MSR
bits while r11 now points to the stack. For that, move r10
back to CR (As is already done for hash MMU) and use r10.

We still don't have r1 correct yet when we activate MMU.
It is done in the following patch.

Fixes: 028474876f47 ("powerpc/32: prepare for CONFIG_VMAP_STACK")
Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_32.S |  6 --
 arch/powerpc/kernel/head_32.h | 31 ++-
 2 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index f3ab94d73936..d967266d62e8 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -274,14 +274,8 @@ __secondary_hold_acknowledge:
DO_KVM  0x200
 MachineCheck:
EXCEPTION_PROLOG_0
-#ifdef CONFIG_VMAP_STACK
-   li  r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */
-   mtmsr   r11
-   isync
-#endif
 #ifdef CONFIG_PPC_CHRP
mfspr   r11, SPRN_SPRG_THREAD
-   tovirt_vmstack r11, r11
lwz r11, RTAS_SP(r11)
cmpwi   cr1, r11, 0
bne cr1, 7f
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index 9abec6cd099c..21effebb9277 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -39,24 +39,13 @@
 .endm
 
 .macro EXCEPTION_PROLOG_1 for_rtas=0
-#ifdef CONFIG_VMAP_STACK
-   .ifeq   \for_rtas
-   li  r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */
-   mtmsr   r11
-   isync
-   .endif
    subi    r11, r1, INT_FRAME_SIZE /* use r1 if kernel */
-#else
-   tophys(r11,r1)  /* use tophys(r1) if kernel */
-   subi    r11, r11, INT_FRAME_SIZE    /* alloc exc. frame */
-#endif
    beq 1f
    mfspr   r11,SPRN_SPRG_THREAD
-   tovirt_vmstack r11, r11
    lwz r11,TASK_STACK-THREAD(r11)
    addi    r11, r11, THREAD_SIZE - INT_FRAME_SIZE
-   tophys_novmstack r11, r11
 1:
+   tophys_novmstack r11, r11
 #ifdef CONFIG_VMAP_STACK
mtcrf   0x7f, r11
bt  32 - THREAD_ALIGN_SHIFT, stack_overflow
@@ -64,12 +53,11 @@
 .endm
 
 .macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0
-#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S)
-BEGIN_MMU_FTR_SECTION
+#ifdef CONFIG_VMAP_STACK
    mtcr    r10
-FTR_SECTION_ELSE
-   stw r10, _CCR(r11)
-ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE)
+   li  r10, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */
+   mtmsr   r10
+   isync
 #else
stw r10,_CCR(r11)   /* save registers */
 #endif
@@ -77,11 +65,9 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE)
stw r12,GPR12(r11)
stw r9,GPR9(r11)
stw r10,GPR10(r11)
-#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S)
-BEGIN_MMU_FTR_SECTION
+#ifdef CONFIG_VMAP_STACK
    mfcr    r10
stw r10, _CCR(r11)
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
 #endif
mfspr   r12,SPRN_SPRG_SCRATCH1
stw r12,GPR11(r11)
@@ -97,11 +83,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
stw r10, _DSISR(r11)
.endif
lwz r9, SRR1(r12)
-#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S)
-BEGIN_MMU_FTR_SECTION
andi.   r10, r9, MSR_PR
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
-#endif
lwz r12, SRR0(r12)
 #else
mfspr   r12,SPRN_SRR0
@@ -328,7 +310,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
 #ifdef CONFIG_VMAP_STACK
 #ifdef CONFIG_SMP
mfspr   r11, SPRN_SPRG_THREAD
-   tovirt(r11, r11)
lwz r11, TASK_CPU - THREAD(r11)
    slwi    r11, r11, 3
addis   r11, r11, emergency_ctx@ha
-- 
2.25.0



Re: [RFC PATCH 02/12] powerpc: remove arguments from interrupt handler functions

2020-09-07 Thread Christophe Leroy
On Mon, 2020-09-07 at 11:20 +0200, Christophe Leroy wrote:
> 
> On 05/09/2020 at 19:43, Nicholas Piggin wrote:
> > Make interrupt handlers all just take the pt_regs * argument and load
> > DAR/DSISR etc from that. Make those that return a value return long.
> 
> I like this; it will likely simplify the VMAP_STACK mess a bit.
> 
> Not sure it is that easy. My board is stuck after the start of init.
> 
> 
> On the 8xx, on Instruction TLB Error exception, we do
> 
>   andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
> 
> On book3s/32, on ISI exception we do:
>   andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
> 
> On 40x and bookE, on ISI exception we do:
>   li  r5,0    /* Pass zero as arg3 */
> 
> 
> And regs->dsisr will just contain nothing
> 
> So it means we should at least write back r5 into regs->dsisr from there?
> The performance impact should be minimal as we already write _DAR so 
> the cache line should already be in the cache.
> 
> A hacky 'stw r5, _DSISR(r1)' in handle_page_fault() does the trick, 
> although we don't want to do it for both ISI and DSI at the end, so 
> you'll have to do it in every head_xxx.S

To get you series build and work, I did the following hacks:

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index acfcc7d5779b..c11045d3113a 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -93,7 +93,9 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter
 {
nmi_exit();
 
+#ifdef CONFIG_PPC64
this_cpu_set_ftrace_enabled(state->ftrace_enabled);
+#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
/* Check we didn't change the pending interrupt mask. */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index f4d0af8e1136..66f7adbe1076 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -663,6 +663,7 @@ ppc_swapcontext:
  */
.globl  handle_page_fault
 handle_page_fault:
+   stw r5,_DSISR(r1)
    addi    r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_BOOK3S_32
andis.  r0,r5,DSISR_DABRMATCH@h
---

Christophe



Re: [RFC PATCH 02/12] powerpc: remove arguments from interrupt handler functions

2020-09-07 Thread Christophe Leroy




On 05/09/2020 at 19:43, Nicholas Piggin wrote:

Make interrupt handlers all just take the pt_regs * argument and load
DAR/DSISR etc from that. Make those that return a value return long.


I like this; it will likely simplify the VMAP_STACK mess a bit.

Not sure it is that easy. My board is stuck after the start of init.


On the 8xx, on Instruction TLB Error exception, we do

andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */

On book3s/32, on ISI exception we do:
andis.  r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */

On 40x and bookE, on ISI exception we do:
	li  r5,0    /* Pass zero as arg3 */


And regs->dsisr will just contain nothing

So it means we should at least write back r5 into regs->dsisr from there?
The performance impact should be minimal as we already write _DAR so 
the cache line should already be in the cache.


A hacky 'stw r5, _DSISR(r1)' in handle_page_fault() does the trick, 
although we don't want to do it for both ISI and DSI at the end, so 
you'll have to do it in every head_xxx.S



While you are at it, it would probably also make sense to remove the 
address param of bad_page_fault(): there is no point in loading back 
regs->dar in handle_page_fault() and machine_check_8xx() and 
alignment_exception(); just read regs->dar in bad_page_fault().
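Something like this (a rough sketch of the suggested signature; the body
is illustrative only, not the actual kernel code):

void bad_page_fault(struct pt_regs *regs, int sig)
{
	unsigned long address = regs->dar;	/* was an explicit parameter */

	if (user_mode(regs)) {
		_exception(sig, regs, SEGV_MAPERR, address);
		return;
	}
	die("Kernel access of bad area", regs, sig);
}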


The case of do_break() should also be looked at.

Why change the return code from int to long?

Christophe



This is done to make the function signatures match more closely, which
will help with a future patch to add wrappers. Explicit arguments could
be re-added for performance in future but that would require more
complex wrapper macros.

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/include/asm/asm-prototypes.h |  4 ++--
  arch/powerpc/include/asm/bug.h|  4 ++--
  arch/powerpc/kernel/exceptions-64e.S  |  2 --
  arch/powerpc/kernel/exceptions-64s.S  | 14 ++
  arch/powerpc/mm/book3s64/hash_utils.c |  8 +---
  arch/powerpc/mm/book3s64/slb.c| 11 +++
  arch/powerpc/mm/fault.c   | 16 +---
  7 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index d714d83bbc7c..2fa0cf6c6011 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -111,8 +111,8 @@
  #ifndef __ASSEMBLY__
  
  struct pt_regs;

-extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
-extern int hash__do_page_fault(struct pt_regs *, unsigned long, unsigned long);
+extern long do_page_fault(struct pt_regs *);
+extern long hash__do_page_fault(struct pt_regs *);


no extern


  extern void bad_page_fault(struct pt_regs *, unsigned long, int);
  extern void _exception(int, struct pt_regs *, int, unsigned long);
  extern void _exception_pkey(struct pt_regs *, unsigned long, int);


Christophe


Re: [RFC PATCH 09/12] powerpc: move NMI entry/exit code into wrapper

2020-09-07 Thread Christophe Leroy




On 9/5/20 5:43 PM, Nicholas Piggin wrote:

This moves the common NMI entry and exit code into the interrupt handler
wrappers.

This changes the behaviour of soft-NMI (watchdog) and HMI interrupts, and
also MCE interrupts on 64e, by adding missing parts of the NMI entry to
them.

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/include/asm/interrupt.h | 26 +++
  arch/powerpc/kernel/mce.c| 12 -
  arch/powerpc/kernel/traps.c  | 38 +---
  arch/powerpc/kernel/watchdog.c   | 10 +++-
  4 files changed, 37 insertions(+), 49 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 83fe1d64cf23..69eb8a432984 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -31,6 +31,27 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs)
  }
  #endif /* CONFIG_PPC_BOOK3S_64 */
  
+struct interrupt_nmi_state {

+#ifdef CONFIG_PPC64
+   u8 ftrace_enabled;
+#endif
+};
+
+static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
+{
+   this_cpu_set_ftrace_enabled(0);
+
+   nmi_enter();
+}
+
+static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
+{
+   nmi_exit();
+
+   this_cpu_set_ftrace_enabled(state->ftrace_enabled);


PPC32 build:

In file included from arch/powerpc/kernel/irq.c:57:0:
./arch/powerpc/include/asm/interrupt.h: In function ‘interrupt_nmi_exit_prepare’:
./arch/powerpc/include/asm/interrupt.h:96:35: error: ‘struct interrupt_nmi_state’ has no member named ‘ftrace_enabled’

  this_cpu_set_ftrace_enabled(state->ftrace_enabled);
   ^


+}
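A sketch of a PPC32-safe variant, along the lines of the hack I posted in
the reply to patch 02/12 (illustrative only):

static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
{
	nmi_exit();

#ifdef CONFIG_PPC64
	this_cpu_set_ftrace_enabled(state->ftrace_enabled);
#endif
}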
+
+
  /**
   * DECLARE_INTERRUPT_HANDLER_RAW - Declare raw interrupt handler function
   * @func: Function name of the entry point
@@ -177,10 +198,15 @@ static __always_inline long ___##func(struct pt_regs *regs);  \
\
  __visible noinstr long func(struct pt_regs *regs) \
  { \
+   struct interrupt_nmi_state state;   \
long ret;   \
\
+   interrupt_nmi_enter_prepare(regs, );  \
+   \
ret = ___##func (regs); \
\
+   interrupt_nmi_exit_prepare(regs, );   \
+   \
return ret; \
  } \
\


Christophe


Re: [PATCH -next] powerpc/book3s64: fix link error with CONFIG_PPC_RADIX_MMU=n

2020-09-07 Thread Christophe Leroy




On 07/09/2020 at 03:51, Yang Yingliang wrote:


On 2020/9/6 14:50, Christophe Leroy wrote:



On 05/09/2020 at 13:25, Yang Yingliang wrote:

Fix link error when CONFIG_PPC_RADIX_MMU is disabled:
powerpc64-linux-gnu-ld: arch/powerpc/platforms/pseries/lpar.o:(.toc+0x0): undefined reference to `mmu_pid_bits'


Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  arch/powerpc/mm/book3s64/mmu_context.c | 4 


In your commit log, you are just mentioning 
arch/powerpc/platforms/pseries/lpar.o, which is right.


You shouldn't need to modify arch/powerpc/mm/book3s64/mmu_context.c at 
all, see below.



  arch/powerpc/platforms/pseries/lpar.c  | 2 ++
  2 files changed, 6 insertions(+)

diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c

index 0ba30b8b935b..a8e292cd88f0 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -152,6 +152,7 @@ void hash__setup_new_exec(void)
    static int radix__init_new_context(struct mm_struct *mm)
  {
+#ifdef CONFIG_PPC_RADIX_MMU


This shouldn't be required. radix__init_new_context() is only called 
when radix_enabled() returns true.
As it is a static function, when it is not called it gets optimised 
away, so you will never get an undefined reference to `mmu_pid_bits` 
there.
powerpc64-linux-gnu-ld: arch/powerpc/mm/book3s64/mmu_context.o:(.toc+0x0): undefined reference to `mmu_pid_bits'
powerpc64-linux-gnu-ld: arch/powerpc/mm/book3s64/mmu_context.o:(.toc+0x8): undefined reference to `mmu_base_pid'



mmu_context.c is always compiled, it uses mmu_pid_bits and mmu_base_pid.


Yes, mmu_context.c is always compiled, but as I explained, 
radix__init_new_context() is defined as 'static' so it is optimised out 
when radix_enabled() returns false because there is no caller in that case.


I just made the test with ppc64_defconfig + CONFIG_PPC_RADIX_MMU=n (GCC 8.1)

The only failure I got was on lpar.c, which I fixed by enclosing the 
entire radix_init_pseries() in an #ifdef.


Once this is fixed, the build is OK, without any modification to 
mmu_context.c


powerpc64-linux-objdump -x arch/powerpc/mm/book3s64/mmu_context.o shows 
only the following objects in the .toc:


RELOCATION RECORDS FOR [.toc]:
OFFSET   TYPE  VALUE
 R_PPC64_ADDR64kmalloc_caches
0008 R_PPC64_ADDR64vmemmap
0010 R_PPC64_ADDR64__pmd_frag_nr
0018 R_PPC64_ADDR64__pmd_frag_size_shift

mmu_pid_bits and mmu_base_pid are not part of the undefined objetcs:

 *UND*   vmemmap
 *UND*   .mm_iommu_init
 *UND*   __pmd_frag_nr
 *UND*   .ida_alloc_range
 *UND*   .slb_setup_new_exec
 *UND*   mmu_feature_keys
 *UND*   .memset
 *UND*   .memcpy
 *UND*   .slice_init_new_context_exec
 *UND*   ._mcount
 *UND*   .__free_pages
 *UND*   __pmd_frag_size_shift
 *UND*   .slice_setup_new_exec
 *UND*   .ida_free
 *UND*   .pte_frag_destroy
 *UND*   .kfree
 *UND*   .pkey_mm_init
 *UND*   .kmem_cache_alloc_trace
 *UND*   .__warn_printk
 *UND*   _mcount
 *UND*   kmalloc_caches

Christophe


Re: [RFC PATCH 12/12] powerpc/64s: power4 nap fixup in C

2020-09-06 Thread Christophe Leroy




On 07/09/2020 at 06:02, Nicholas Piggin wrote:

Excerpts from Christophe Leroy's message of September 6, 2020 5:32 pm:



On 05/09/2020 at 19:43, Nicholas Piggin wrote:

There is no need for this to be in asm, use the new interrupt entry wrapper.

Signed-off-by: Nicholas Piggin 
---
   arch/powerpc/include/asm/interrupt.h   | 14 
   arch/powerpc/include/asm/processor.h   |  1 +
   arch/powerpc/include/asm/thread_info.h |  6 
   arch/powerpc/kernel/exceptions-64s.S   | 45 --
   arch/powerpc/kernel/idle_book3s.S  |  4 +++
   5 files changed, 25 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ed0d633ab5aa..3da1dba91386 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -424,6 +424,7 @@ extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
   extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
   #ifdef CONFIG_PPC_970_NAP
   extern void power4_idle_nap(void);
+extern void power4_idle_nap_return(void);


Please please please, 'extern' keyword is pointless and deprecated for
function prototypes. Don't add new ones.

Also, put it outside the #ifdef, so that you can use IS_ENABLED()
instead of #ifdef when using it.


I just copy paste and forget to remove it. I expect someone will do a
"cleanup" patch to get rid of them in one go, I find a random assortment
of extern and not extern to be even uglier :(


If we don't want to make backporting fixes a huge headache, some 
transition period with a random assortment is the price to pay.


One day, when 'extern' has become the minority, we can get rid of the 
last few ones.


But if someone believes it is not such a problem for backporting, I can 
provide a cleanup patch now.


Christophe


Re: [RFC PATCH 12/12] powerpc/64s: power4 nap fixup in C

2020-09-06 Thread Christophe Leroy




On 05/09/2020 at 19:43, Nicholas Piggin wrote:

There is no need for this to be in asm, use the new interrupt entry wrapper.

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/include/asm/interrupt.h   | 14 
  arch/powerpc/include/asm/processor.h   |  1 +
  arch/powerpc/include/asm/thread_info.h |  6 
  arch/powerpc/kernel/exceptions-64s.S   | 45 --
  arch/powerpc/kernel/idle_book3s.S  |  4 +++
  5 files changed, 25 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 3ae3d2f93b61..acfcc7d5779b 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -8,6 +8,16 @@
  #include 
  #include 
  
+static inline void nap_adjust_return(struct pt_regs *regs)

+{
+#ifdef CONFIG_PPC_970_NAP


Avoid #ifdef, you can use IS_ENABLED(CONFIG_PPC_970_NAP) in the 'if' below


+   if (test_thread_local_flags(_TLF_NAPPING)) {
+   clear_thread_local_flags(_TLF_NAPPING);
+   regs->nip = (unsigned long)power4_idle_nap_return;
+   }
+#endif
+}
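i.e. something like this (sketch, assuming power4_idle_nap_return is
declared unconditionally so the reference always parses):

static inline void nap_adjust_return(struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_PPC_970_NAP) &&
	    test_thread_local_flags(_TLF_NAPPING)) {
		clear_thread_local_flags(_TLF_NAPPING);
		regs->nip = (unsigned long)power4_idle_nap_return;
	}
}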
+
  #ifdef CONFIG_PPC_BOOK3S_64
  static inline void interrupt_enter_prepare(struct pt_regs *regs)
  {
@@ -33,6 +43,8 @@ static inline void interrupt_async_enter_prepare(struct 
pt_regs *regs)
if (cpu_has_feature(CPU_FTR_CTRL) &&
!test_thread_local_flags(_TLF_RUNLATCH))
__ppc64_runlatch_on();
+
+   nap_adjust_return(regs);
  }
  
  #else /* CONFIG_PPC_BOOK3S_64 */

@@ -72,6 +84,8 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte
  
  	this_cpu_set_ftrace_enabled(0);
  
+	nap_adjust_return(regs);

+
nmi_enter();
  }
  
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h

index ed0d633ab5aa..3da1dba91386 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -424,6 +424,7 @@ extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
  extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
  #ifdef CONFIG_PPC_970_NAP
  extern void power4_idle_nap(void);
+extern void power4_idle_nap_return(void);


Please please please, 'extern' keyword is pointless and deprecated for 
function prototypes. Don't add new ones.


Also, put it outside the #ifdef, so that you can use IS_ENABLED() 
instead of #ifdef when using it.



  #endif
  
  extern unsigned long cpuidle_disable;

diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index ca6c97025704..9b15f7edb0cb 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -156,6 +156,12 @@ void arch_setup_new_exec(void);
  
  #ifndef __ASSEMBLY__
  
+static inline void clear_thread_local_flags(unsigned int flags)

+{
+   struct thread_info *ti = current_thread_info();
+   ti->local_flags &= ~flags;
+}
+
  static inline bool test_thread_local_flags(unsigned int flags)
  {
struct thread_info *ti = current_thread_info();
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 227bad3a586d..1db6b3438c88 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -692,25 +692,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
ld  r1,GPR1(r1)
  .endm
  
-/*

- * When the idle code in power4_idle puts the CPU into NAP mode,
- * it has to do so in a loop, and relies on the external interrupt
- * and decrementer interrupt entry code to get it out of the loop.
- * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags
- * to signal that it is in the loop and needs help to get out.
- */
-#ifdef CONFIG_PPC_970_NAP
-#define FINISH_NAP \
-BEGIN_FTR_SECTION  \
-   ld  r11, PACA_THREAD_INFO(r13); \
-   ld  r9,TI_LOCAL_FLAGS(r11); \
-   andi.   r10,r9,_TLF_NAPPING;\
-   bnel    power4_fixup_nap;   \
-END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
-#else
-#define FINISH_NAP
-#endif
-
  /*
   * There are a few constraints to be concerned with.
  * - Real mode exceptions code/data must be located at their physical location.
@@ -1250,7 +1231,6 @@ EXC_COMMON_BEGIN(machine_check_common)
 */
GEN_COMMON machine_check
  
-	FINISH_NAP

/* Enable MSR_RI when finished with PACA_EXMC */
li  r10,MSR_RI
mtmsrd  r10,1
@@ -1572,7 +1552,6 @@ EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
  EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
  EXC_COMMON_BEGIN(hardware_interrupt_common)
GEN_COMMON hardware_interrupt
-   FINISH_NAP
    addi    r3,r1,STACK_FRAME_OVERHEAD
bl  do_IRQ
b   interrupt_return
@@ -1757,7 +1736,6 @@ EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
  EXC_VIRT_END(decrementer, 0x4900, 0x80)
  EXC_COMMON_BEGIN(decrementer_common)

Re: [PATCH -next] powerpc/eeh: fix compile warning with CONFIG_PROC_FS=n

2020-09-06 Thread Christophe Leroy




On 05/09/2020 at 13:17, Yang Yingliang wrote:

Fix the compile warning:

arch/powerpc/kernel/eeh.c:1639:12: error: 'proc_eeh_show' defined but not used [-Werror=unused-function]
  static int proc_eeh_show(struct seq_file *m, void *v)

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  arch/powerpc/kernel/eeh.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 94682382fc8c..420c3c25c6e7 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1636,6 +1636,7 @@ int eeh_pe_inject_err(struct eeh_pe *pe, int type, int 
func,
  }
  EXPORT_SYMBOL_GPL(eeh_pe_inject_err);
  
+#ifdef CONFIG_PROC_FS


I don't like this way of fixing the issue, because you'll get 
unbalanced source code: proc_eeh_show() is apparently referenced all the 
time in eeh_init_proc(), but because proc_create_single() is a NULL 
macro when CONFIG_PROC_FS is not selected, you get the 'unused function' 
error.


I think the right fix should be to rewrite proc_create_single() as a 
static inline function that calls proc_create_single_data() when 
CONFIG_PROC_FS is selected and just returns NULL otherwise.
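Something like this sketch of the idea, built on proc_create_single_data()
(not the actual include/linux/proc_fs.h code):

#ifdef CONFIG_PROC_FS
#define proc_create_single(name, mode, parent, show) \
	proc_create_single_data(name, mode, parent, show, NULL)
#else
static inline struct proc_dir_entry *proc_create_single(const char *name,
		umode_t mode, struct proc_dir_entry *parent,
		int (*show)(struct seq_file *, void *))
{
	/* 'show' is still evaluated at the call site, so proc_eeh_show()
	 * no longer looks unused when CONFIG_PROC_FS is not selected */
	return NULL;
}
#endif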



  static int proc_eeh_show(struct seq_file *m, void *v)
  {
if (!eeh_enabled()) {
@@ -1662,6 +1663,7 @@ static int proc_eeh_show(struct seq_file *m, void *v)
  
  	return 0;

  }
+#endif
  
  #ifdef CONFIG_DEBUG_FS

  static int eeh_enable_dbgfs_set(void *data, u64 val)



Christophe


Re: [PATCH -next] powerpc/book3s64: fix link error with CONFIG_PPC_RADIX_MMU=n

2020-09-06 Thread Christophe Leroy




On 05/09/2020 at 13:25, Yang Yingliang wrote:

Fix link error when CONFIG_PPC_RADIX_MMU is disabled:
powerpc64-linux-gnu-ld: arch/powerpc/platforms/pseries/lpar.o:(.toc+0x0): undefined reference to `mmu_pid_bits'

Reported-by: Hulk Robot 
Signed-off-by: Yang Yingliang 
---
  arch/powerpc/mm/book3s64/mmu_context.c | 4 


In your commit log, you are just mentioning 
arch/powerpc/platforms/pseries/lpar.o, which is right.


You shouldn't need to modify arch/powerpc/mm/book3s64/mmu_context.c at 
all, see below.



  arch/powerpc/platforms/pseries/lpar.c  | 2 ++
  2 files changed, 6 insertions(+)

diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index 0ba30b8b935b..a8e292cd88f0 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -152,6 +152,7 @@ void hash__setup_new_exec(void)
  
  static int radix__init_new_context(struct mm_struct *mm)

  {
+#ifdef CONFIG_PPC_RADIX_MMU


This shouldn't be required. radix__init_new_context() is only called 
when radix_enabled() returns true.
As it is a static function, when it is not called it gets optimised 
away, so you will never get an undefined reference to `mmu_pid_bits` there.
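To illustrate with a simplified sketch: with CONFIG_PPC_RADIX_MMU=n,
radix_enabled() is compile-time false, so GCC drops the whole static
function and none of its symbol references reach the linker:

static int radix__init_new_context(struct mm_struct *mm)
{
	/* references mmu_pid_bits, but is never emitted when unused */
	return mmu_pid_bits;
}

int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
	if (radix_enabled())	/* constant-folds to false */
		return radix__init_new_context(mm);
	return 0;	/* hash path elided in this sketch */
}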



unsigned long rts_field;
int index, max_id;
  
@@ -177,6 +178,9 @@ static int radix__init_new_context(struct mm_struct *mm)

mm->context.hash_context = NULL;
  
  	return index;

+#else
+   return -ENOTSUPP;
+#endif
  }
  
  int init_new_context(struct task_struct *tsk, struct mm_struct *mm)

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index baf24eacd268..e454e218dbba 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1726,10 +1726,12 @@ void __init hpte_init_pseries(void)
  
  void radix_init_pseries(void)

  {
+#ifdef CONFIG_PPC_RADIX_MMU


This function is only called from arch/powerpc/mm/book3s64/radix_pgtable.c, 
which is only built when CONFIG_PPC_RADIX_MMU is selected.


So the entire function should be enclosed in the #ifdef.


pr_info("Using radix MMU under hypervisor\n");
  
  	pseries_lpar_register_process_table(__pa(process_tb),

0, PRTB_SIZE_SHIFT - 12);
+#endif
  }
  
  #ifdef CONFIG_PPC_SMLPAR
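i.e. something like this (sketch):

#ifdef CONFIG_PPC_RADIX_MMU
void radix_init_pseries(void)
{
	pr_info("Using radix MMU under hypervisor\n");

	pseries_lpar_register_process_table(__pa(process_tb),
					    0, PRTB_SIZE_SHIFT - 12);
}
#endif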




Christophe


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3

2020-09-05 Thread Christophe Leroy




On 04/09/2020 at 23:01, David Laight wrote:

From: Alexey Dobriyan

Sent: 04 September 2020 18:58

On Fri, Sep 04, 2020 at 08:00:24AM +0200, Ingo Molnar wrote:

* Christoph Hellwig  wrote:

this series removes the last set_fs() used to force a kernel address
space for the uaccess code in the kernel read/write/splice code, and then
stops implementing the address space overrides entirely for x86 and
powerpc.


Cool! For the x86 bits:

   Acked-by: Ingo Molnar 


set_fs() is older than some kernel hackers!

$ cd linux-0.11/
$ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3
./include/asm/segment.h:61:extern inline void set_fs(unsigned long val)
./include/asm/segment.h-62-{
./include/asm/segment.h-63- __asm__("mov %0,%%fs"::"a" ((unsigned short) val));
./include/asm/segment.h-64-}


What is this strange %fs register you are talking about?
Figure 2-4 only has CS, DS, SS and ES.



Intel added the FS and GS registers with the i386.

Christophe


[PATCH 3/3] powerpc/uaccess: Remove __put_user_asm() and __put_user_asm2()

2020-09-04 Thread Christophe Leroy
__put_user_asm() and __put_user_asm2() are not used anymore.

Remove them.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 41 --
 1 file changed, 5 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 96d1c144f92b..26781b044932 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -151,42 +151,6 @@ static inline int __access_ok(unsigned long addr, unsigned 
long size,
 
 extern long __put_user_bad(void);
 
-/*
- * We don't tell gcc that we are accessing memory, but this is OK
- * because we do not write to any memory gcc knows about, so there
- * are no aliasing issues.
- */
-#define __put_user_asm(x, addr, err, op)   \
-   __asm__ __volatile__(   \
-   "1: " op "%U2%X2 %1,%2  # put_user\n"   \
-   "2:\n"  \
-   ".section .fixup,\"ax\"\n"  \
-   "3: li %0,%3\n" \
-   "   b 2b\n" \
-   ".previous\n"   \
-   EX_TABLE(1b, 3b)\
-   : "=r" (err)\
-   : "r" (x), "m<>" (*addr), "i" (-EFAULT), "0" (err))
-
-#ifdef __powerpc64__
-#define __put_user_asm2(x, ptr, retval)\
- __put_user_asm(x, ptr, retval, "std")
-#else /* __powerpc64__ */
-#define __put_user_asm2(x, addr, err)  \
-   __asm__ __volatile__(   \
-   "1: stw%X2 %1,%2\n" \
-   "2: stw%X2 %L1,%L2\n"   \
-   "3:\n"  \
-   ".section .fixup,\"ax\"\n"  \
-   "4: li %0,%3\n" \
-   "   b 3b\n" \
-   ".previous\n"   \
-   EX_TABLE(1b, 4b)\
-   EX_TABLE(2b, 4b)\
-   : "=r" (err)\
-   : "r" (x), "m" (*addr), "i" (-EFAULT), "0" (err))
-#endif /* __powerpc64__ */
-
 #define __put_user_size_allowed(x, ptr, size, retval)  \
 do {   \
__label__ __pu_failed;  \
@@ -249,6 +213,11 @@ do {									\
 })
 
 
+/*
+ * We don't tell gcc that we are accessing memory, but this is OK
+ * because we do not write to any memory gcc knows about, so there
+ * are no aliasing issues.
+ */
 #define __put_user_asm_goto(x, addr, label, op)\
asm volatile goto(  \
"1: " op "%U1%X1 %0,%1  # put_user\n"   \
-- 
2.25.0



[PATCH 2/3] powerpc/uaccess: Switch __patch_instruction() to __put_user_asm_goto()

2020-09-04 Thread Christophe Leroy
__patch_instruction() is the only user of __put_user_asm() outside
of asm/uaccess.h

Switch to the new __put_user_asm_goto() to enable retirement of
__put_user_asm() in a later patch.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/lib/code-patching.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 8c3934ea6220..2333625b5e31 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -21,21 +21,18 @@
 static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst 
instr,
   struct ppc_inst *patch_addr)
 {
-   int err = 0;
-
-   if (!ppc_inst_prefixed(instr)) {
-   __put_user_asm(ppc_inst_val(instr), patch_addr, err, "stw");
-   } else {
-   __put_user_asm(ppc_inst_as_u64(instr), patch_addr, err, "std");
-   }
-
-   if (err)
-   return err;
+   if (!ppc_inst_prefixed(instr))
+   __put_user_asm_goto(ppc_inst_val(instr), patch_addr, failed, "stw");
+   else
+   __put_user_asm_goto(ppc_inst_as_u64(instr), patch_addr, failed, "std");
 
asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr),
"r" (exec_addr));
 
return 0;
+
+failed:
+   return -EFAULT;
 }
 
 int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
-- 
2.25.0



[PATCH 1/3] powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()

2020-09-04 Thread Christophe Leroy
__put_user_asm_goto() provides more flexibility to GCC and avoids using
a local variable to tell if the write succeeded or not.
GCC can then avoid implementing a cmp in the fast path.

See the difference for a small function like the PPC64 version of
save_general_regs() in arch/powerpc/kernel/signal_32.c:

Before the patch (unreachable nop removed):

0c10 <.save_general_regs>:
 c10:   39 20 00 2c li  r9,44
 c14:   39 40 00 00 li  r10,0
 c18:   7d 29 03 a6 mtctr   r9
 c1c:   38 c0 00 00 li  r6,0
 c20:   48 00 00 14 b   c34 <.save_general_regs+0x24>
 c30:   42 40 00 40 bdz c70 <.save_general_regs+0x60>
 c34:   28 2a 00 27 cmpldi  r10,39
 c38:   7c c8 33 78 mr  r8,r6
 c3c:   79 47 1f 24 rldicr  r7,r10,3,60
 c40:   39 20 00 01 li  r9,1
 c44:   41 82 00 0c beq c50 <.save_general_regs+0x40>
 c48:   7d 23 38 2a ldx r9,r3,r7
 c4c:   79 29 00 20 clrldi  r9,r9,32
 c50:   91 24 00 00 stw r9,0(r4)
 c54:   2c 28 00 00 cmpdi   r8,0
 c58:   39 4a 00 01 addi    r10,r10,1
 c5c:   38 84 00 04 addi    r4,r4,4
 c60:   41 82 ff d0 beq c30 <.save_general_regs+0x20>
 c64:   38 60 ff f2 li  r3,-14
 c68:   4e 80 00 20 blr
 c70:   38 60 00 00 li  r3,0
 c74:   4e 80 00 20 blr

 <.fixup>:
  cc:   39 00 ff f2 li  r8,-14
  d0:   48 00 00 00 b   d0 <.fixup+0xd0>
d0: R_PPC64_REL24   .text+0xc54

After the patch:

1490 <.save_general_regs>:
1490:   39 20 00 2c li  r9,44
1494:   39 40 00 00 li  r10,0
1498:   7d 29 03 a6 mtctr   r9
149c:   60 00 00 00 nop
14a0:   28 2a 00 27 cmpldi  r10,39
14a4:   79 48 1f 24 rldicr  r8,r10,3,60
14a8:   39 20 00 01 li  r9,1
14ac:   41 82 00 0c beq 14b8 <.save_general_regs+0x28>
14b0:   7d 23 40 2a ldx r9,r3,r8
14b4:   79 29 00 20 clrldi  r9,r9,32
14b8:   91 24 00 00 stw r9,0(r4)
14bc:   39 4a 00 01 addi    r10,r10,1
14c0:   38 84 00 04 addi    r4,r4,4
14c4:   42 00 ff dc bdnz    14a0 <.save_general_regs+0x10>
14c8:   38 60 00 00 li  r3,0
14cc:   4e 80 00 20 blr
14d0:   38 60 ff f2 li  r3,-14
    14d4:   4e 80 00 20 blr
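For illustration, a hypothetical caller of the goto form (not part of
this patch) shows where the fault branch goes; the fast path carries no
error flag and no compare:

static int put_word(u32 __user *p, u32 val)
{
	__put_user_asm_goto(val, p, failed, "stw");
	return 0;

failed:
	return -EFAULT;
}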

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index a5cfe867fbdc..96d1c144f92b 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -189,14 +189,14 @@ extern long __put_user_bad(void);
 
 #define __put_user_size_allowed(x, ptr, size, retval)  \
 do {   \
+   __label__ __pu_failed;  \
+   \
retval = 0; \
-   switch (size) { \
- case 1: __put_user_asm(x, ptr, retval, "stb"); break; \
- case 2: __put_user_asm(x, ptr, retval, "sth"); break; \
- case 4: __put_user_asm(x, ptr, retval, "stw"); break; \
- case 8: __put_user_asm2(x, ptr, retval); break;   \
- default: __put_user_bad();\
-   }   \
+   __put_user_size_goto(x, ptr, size, __pu_failed);\
+   break;  \
+   \
+__pu_failed:   \
+   retval = -EFAULT;   \
 } while (0)
 
 #define __put_user_size(x, ptr, size, retval)  \
-- 
2.25.0



[PATCH] powerpc/uaccess: Add pre-update addressing to __put_user_asm_goto()

2020-09-04 Thread Christophe Leroy
Enable pre-update addressing mode in __put_user_asm_goto()

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 7c2427f237e1..a5cfe867fbdc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -254,7 +254,7 @@ do {									\
"1: " op "%U1%X1 %0,%1  # put_user\n"   \
EX_TABLE(1b, %l2)   \
:   \
-   : "r" (x), "m" (*addr)  \
+   : "r" (x), "m<>" (*addr)\
:   \
: label)
 
-- 
2.25.0



Re: [PATCH 1/2] dma-mapping: introduce dma_get_seg_boundary_nr_pages()

2020-09-03 Thread Christophe Leroy




On 03/09/2020 at 18:12, Christoph Hellwig wrote:

On Thu, Sep 03, 2020 at 01:57:39PM +0300, Andy Shevchenko wrote:

+{
+   if (!dev)
+   return (U32_MAX >> page_shift) + 1;
+   return (dma_get_seg_boundary(dev) >> page_shift) + 1;


Can it be better to do something like
   unsigned long boundary = dev ? dma_get_seg_boundary(dev) : U32_MAX;

   return (boundary >> page_shift) + 1;

?


I don't really see what that would buy us.



I guess it would avoid the duplication of ">> page_shift) + 1".
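Something like this (sketch combining both branches):

static inline unsigned long dma_get_seg_boundary_nr_pages(struct device *dev,
		unsigned int page_shift)
{
	unsigned long boundary = dev ? dma_get_seg_boundary(dev) : U32_MAX;

	return (boundary >> page_shift) + 1;
}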

Christophe


Re: [PATCH 14/14] powerpc: remove address space overrides using set_fs()

2020-09-03 Thread Christophe Leroy




On 03/09/2020 at 17:56, Christoph Hellwig wrote:

On Thu, Sep 03, 2020 at 05:49:09PM +0200, Christoph Hellwig wrote:

On Thu, Sep 03, 2020 at 05:43:25PM +0200, Christophe Leroy wrote:



On 03/09/2020 at 16:22, Christoph Hellwig wrote:

Stop providing the possibility to override the address space using
set_fs() now that there is no need for that any more.

Signed-off-by: Christoph Hellwig 
---




   -static inline int __access_ok(unsigned long addr, unsigned long size,
-   mm_segment_t seg)
+static inline bool __access_ok(unsigned long addr, unsigned long size)
   {
-   if (addr > seg.seg)
-   return 0;
-   return (size == 0 || size - 1 <= seg.seg - addr);
+   if (addr >= TASK_SIZE_MAX)
+   return false;
+   return size == 0 || size <= TASK_SIZE_MAX - addr;
   }


You don't need to test size == 0 anymore. It used to be necessary because
of the 'size - 1', as size is unsigned.

Now you can directly do

return size <= TASK_SIZE_MAX - addr;

If size is 0, this will always be true (because you already know that addr
is not >= TASK_SIZE_MAX)


True.  What do you think of Linus' comment about always using the
ppc32 version on ppc64 as well with this?


I have nothing against it. That's only adding a subtract; all args are 
already in registers, so that will be in the noise for a modern CPU.




i.e. something like this folded in:

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 5363f7fc6dd06c..be070254e50943 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -11,26 +11,14 @@
  #ifdef __powerpc64__
  /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */
  #define TASK_SIZE_MAX TASK_SIZE_USER64
-
-/*
- * This check is sufficient because there is a large enough gap between user
- * addresses and the kernel addresses.
- */
-static inline bool __access_ok(unsigned long addr, unsigned long size)
-{
-   return addr < TASK_SIZE_MAX && size < TASK_SIZE_MAX;
-}
-
  #else
  #define TASK_SIZE_MAX TASK_SIZE
+#endif
  
  static inline bool __access_ok(unsigned long addr, unsigned long size)

  {
-   if (addr >= TASK_SIZE_MAX)
-   return false;
-   return size == 0 || size <= TASK_SIZE_MAX - addr;
+   return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr;
  }
-#endif /* __powerpc64__ */
  
  #define access_ok(addr, size)		\

(__chk_user_ptr(addr),  \




Christophe


Re: [PATCH 14/14] powerpc: remove address space overrides using set_fs()

2020-09-03 Thread Christophe Leroy




On 03/09/2020 at 16:22, Christoph Hellwig wrote:

Stop providing the possibility to override the address space using
set_fs() now that there is no need for that any more.

Signed-off-by: Christoph Hellwig 
---



  
-static inline int __access_ok(unsigned long addr, unsigned long size,

-   mm_segment_t seg)
+static inline bool __access_ok(unsigned long addr, unsigned long size)
  {
-   if (addr > seg.seg)
-   return 0;
-   return (size == 0 || size - 1 <= seg.seg - addr);
+   if (addr >= TASK_SIZE_MAX)
+   return false;
+   return size == 0 || size <= TASK_SIZE_MAX - addr;
  }


You don't need to test size == 0 anymore. It used to be necessary 
because of the 'size - 1', as size is unsigned.


Now you can directly do

return size <= TASK_SIZE_MAX - addr;

If size is 0, this will always be true (because you already know that 
addr is not >= TASK_SIZE_MAX)


Christophe


Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()

2020-09-03 Thread Christophe Leroy




On 03/09/2020 at 09:11, Christoph Hellwig wrote:


Except that we do not actually have such a patch.  For normal user
writes we only use ->write_iter if ->write is not present.  But what
shows up in the profile is that /dev/zero only has a read_iter op and
not a normal read.  I've added a patch below that implements a normal
read which might help a tad with this workload, but should not be part
of a regression.



With that patch below, throughput is 113.5MB/s (instead of 99.9MB/s).
So a 14% improvement. That's not bad.

Christophe



---
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index abd4ffdc8cdebc..1dc99ab158457a 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -726,6 +726,27 @@ static ssize_t read_iter_zero(struct kiocb *iocb, struct iov_iter *iter)
return written;
  }
  
+static ssize_t read_zero(struct file *file, char __user *buf,

+size_t count, loff_t *ppos)
+{
+   size_t cleared = 0;
+
+   while (count) {
+   size_t chunk = min_t(size_t, count, PAGE_SIZE);
+
+   if (clear_user(buf + cleared, chunk))
+   return cleared ? cleared : -EFAULT;
+   cleared += chunk;
+   count -= chunk;
+
+   if (signal_pending(current))
+   return cleared ? cleared : -ERESTARTSYS;
+   cond_resched();
+   }
+
+   return cleared;
+}
+
  static int mmap_zero(struct file *file, struct vm_area_struct *vma)
  {
  #ifndef CONFIG_MMU
@@ -921,6 +942,7 @@ static const struct file_operations zero_fops = {
.llseek = zero_lseek,
.write  = write_zero,
.read_iter  = read_iter_zero,
+   .read   = read_zero,
.write_iter = write_iter_zero,
.mmap   = mmap_zero,
.get_unmapped_area = get_unmapped_area_zero,



Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()

2020-09-03 Thread Christophe Leroy




On 03/09/2020 at 09:11, Christoph Hellwig wrote:

On Wed, Sep 02, 2020 at 11:02:22AM -0700, Linus Torvalds wrote:

I don't see why this change would make any difference.


Me neither, but while looking at a different project I did spot places
that actually do an access_ok with len 0, that's why I wanted him to
try.

That being said: Christophe, are these numbers stable?  Do you get
similar numbers with multiple runs?


Yes, the numbers are similar with multiple runs and multiple reboots.




And btw, why do the 32-bit and 64-bit checks even differ? It's not
like the extra (single) instruction should even matter. I think the
main reason is that the simpler 64-bit case could stay as a macro
(because it only uses "addr" and "size" once), but honestly, that
"simplification" doesn't help when you then need to have that #ifdef
for the 32-bit case and an inline function anyway.


I'll have to leave that to the powerpc folks.  The intent was to not
change the behavior (and I even fucked that up for the size == 0
case).


However, I suspect a bigger reason for the actual performance
degradation would be the patch that makes things use "write_iter()"
for writing, even when a simpler "write()" exists.


Except that we do not actually have such a patch.  For normal user
writes we only use ->write_iter if ->write is not present.  But what
shows up in the profile is that /dev/zero only has a read_iter op and
not a normal read.  I've added a patch below that implements a normal
read which might help a tad with this workload, but should not be part
of a regression.

Also Christophe:  can you bisect which patch starts this?  Is it really
this last patch in the series?


5.9-rc2: 91.5MB/s
Patch 1: 74.9MB/s
Patch 2: 97.9MB/s
Patch 3: 97.7MB/s
Patch 4 to 9: 97.9MB/s
Patch 10: 85.3MB/s
Patch 11: 75.4MB/s

See my other mail: when removing CONFIG_STACKPROTECTOR, I get a stable 
99.8MB/s throughput.


Christophe


Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()

2020-09-03 Thread Christophe Leroy




On 02/09/2020 at 20:02, Linus Torvalds wrote:

On Wed, Sep 2, 2020 at 8:17 AM Christophe Leroy
 wrote:



With this fix, I get

root@vgoippro:~# time dd if=/dev/zero of=/dev/null count=1M
536870912 bytes (512.0MB) copied, 6.776327 seconds, 75.6MB/s

That's still far from the 91.7MB/s I get with 5.9-rc2, but better than
the 65.8MB/s I got yesterday with your series. Still some way to go though.


I don't see why this change would make any difference.



Neither do I.

Looks like nowadays CONFIG_STACKPROTECTOR has become the default.
I rebuilt the kernel without it, and I now get a throughput of 99.8MB/s both 
without and with this series.


Looking at the generated code (GCC 10.1), a small change in a function 
seems to make large changes in the generated code when 
CONFIG_STACKPROTECTOR is set.


In addition to that, trivial functions which don't use the stack at all 
get a stack frame anyway when CONFIG_STACKPROTECTOR is set, although 
that's only -fstack-protector-strong. And there is no canary check.


Without CONFIG_STACKPROTECTOR:

c01572a0 :
c01572a0:   38 60 ff ff li  r3,-1
c01572a4:   38 80 ff e3 li  r4,-29
c01572a8:   4e 80 00 20 blr

With CONFIG_STACKPROTECTOR (regardless of CONFIG_STACKPROTECTOR_STRONG 
or not):


c0164e08 :
c0164e08:   94 21 ff f0 stwu    r1,-16(r1)
c0164e0c:   38 60 ff ff li  r3,-1
c0164e10:   38 80 ff e3 li  r4,-29
c0164e14:   38 21 00 10 addi    r1,r1,16
c0164e18:   4e 80 00 20 blr

Wondering why CONFIG_STACKPROTECTOR has become the default. It seems to 
imply a 10% performance loss even in the best case (91.7MB/s versus 
99.8MB/s).


Note that without CONFIG_STACKPROTECTOR_STRONG, I'm at 99.3MB/s, so 
that's really the _STRONG alternative that hurts.


Christophe


Re: [PATCH 1/2] powerpc/vdso64: link vdso64 with linker

2020-09-02 Thread Christophe Leroy




On 02/09/2020 at 19:41, Nick Desaulniers wrote:

On Wed, Sep 2, 2020 at 5:14 AM Michael Ellerman  wrote:


Nick Desaulniers  writes:

Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan 
sections")


I think I'll just revert that for v5.9 ?


SGTM; you'll probably still want these changes with some modifications
at some point; vdso32 did have at least one orphaned section, and will
be important for hermetic builds.  Seeing crashes in supported
versions of the tools ties our hands at the moment.



Setting the tool problem with binutils 2.26 aside, do you have a way to 
really link an elf32ppc object when building vdso32 for PPC64?


Christophe


Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker

2020-09-02 Thread Christophe Leroy

Hi,

On 02/09/2020 at 16:14, Segher Boessenkool wrote:

Hi!

On Wed, Sep 02, 2020 at 06:46:45AM +0000, Christophe Leroy wrote:

ld crashes:

   LD  arch/powerpc/kernel/vdso32/vdso32.so.dbg
/bin/sh: line 1: 23780 Segmentation fault  (core dumped)
ppc-linux-ld -EB -m elf32ppc -shared -soname linux-vdso32.so.1
--eh-frame-hdr --orphan-handling=warn -T
arch/powerpc/kernel/vdso32/vdso32.lds
arch/powerpc/kernel/vdso32/sigtramp.o
arch/powerpc/kernel/vdso32/gettimeofday.o
arch/powerpc/kernel/vdso32/datapage.o
arch/powerpc/kernel/vdso32/cacheflush.o
arch/powerpc/kernel/vdso32/note.o arch/powerpc/kernel/vdso32/getcpu.o -o
arch/powerpc/kernel/vdso32/vdso32.so.dbg
make[4]: *** [arch/powerpc/kernel/vdso32/vdso32.so.dbg] Error 139


[root@localhost linux-powerpc]# ppc-linux-ld --version
GNU ld (GNU Binutils) 2.26.20160125


[ Don't build as root :-P ]

Try with a newer ld?  If it still happens with current versions, please
open a bug report?  https://sourceware.org/bugzilla


Yes, it works with 2.30 and 2.34.
But the minimum for building the kernel is supposed to be 2.23.

Christophe


Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()

2020-09-02 Thread Christophe Leroy




On 02/09/2020 at 14:36, Christoph Hellwig wrote:

On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:

-   return 0;
-   return (size == 0 || size - 1 <= seg.seg - addr);
+   if (addr >= TASK_SIZE_MAX)
+   return false;
+   if (size == 0)
+   return false;


__access_ok() was returning true when size == 0 up to now. Any reason to
return false now?


No, this is accidental and broken.  Can you re-run your benchmark with
this fixed?



With this fix, I get

root@vgoippro:~# time dd if=/dev/zero of=/dev/null count=1M
1048576+0 records in
1048576+0 records out
536870912 bytes (512.0MB) copied, 6.776327 seconds, 75.6MB/s
real0m 6.78s
user0m 1.64s
sys 0m 5.13s

That's still far from the 91.7MB/s I get with 5.9-rc2, but better than 
the 65.8MB/s I got yesterday with your series. Still some way to go though.


Christophe


Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()

2020-09-02 Thread Christophe Leroy




On 02/09/2020 at 15:51, David Laight wrote:

From: Christophe Leroy

Sent: 02 September 2020 14:25
On 02/09/2020 at 15:13, David Laight wrote:

From: Christoph Hellwig

Sent: 02 September 2020 13:37

On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:

-   return 0;
-   return (size == 0 || size - 1 <= seg.seg - addr);
+   if (addr >= TASK_SIZE_MAX)
+   return false;
+   if (size == 0)
+   return false;


__access_ok() was returning true when size == 0 up to now. Any reason to
return false now?


No, this is accidental and broken.  Can you re-run your benchmark with
this fixed?


Is TASK_SIZE_MASK defined such that you can do:

return ((addr | size) < TASK_SIZE_MAX) || !size;


TASK_SIZE_MAX will usually be 0xc0000000.

With:
addr = 0x80000000;
size = 0x80000000;

I expect it to fail.

With the formula you propose it will succeed, won't it?


Hmmm... Was I getting confused about some comments for 64bit
about there being such a big hole between valid user and kernel
addresses that it was enough to check that 'size < TASK_SIZE_MAX'?

That would be true for 64bit x86 (and probably ppc (& arm??))
if TASK_SIZE_MAX were 0x4 << 60.
IIUC the highest user address is (much) less than 0x0 << 60
and the lowest kernel address (much) greater than 0xf << 60
on all these 64bit platforms.

Actually if doing access_ok() inside get_user() you don't
need to check the size at all.


You mean on 64 bit or on any platform ?

What about a word write to 0xbffffffe, won't it overwrite 0xc0000000 ?


You don't even need to in copy_to/from_user() provided
it always does a forwards copy.


Do you mean due to the gap ?
Is it guaranteed to be a gap ? Even on a 32 bit platform having TASK_SIZE set to 
0xc0000000 and PAGE_OFFSET set to the same ?



Christophe
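
To make the counter-example concrete, here is a minimal stand-alone C
sketch (an illustration only, not from the original thread, assuming the
usual 32-bit value TASK_SIZE_MAX = 0xc0000000) comparing the OR-based
shortcut with an overflow-safe check:

#include <stdbool.h>
#include <stdio.h>

#define TASK_SIZE_MAX 0xc0000000UL

/* The shortcut proposed above. */
static bool access_ok_ored(unsigned long addr, unsigned long size)
{
	return ((addr | size) < TASK_SIZE_MAX) || !size;
}

/* Overflow-safe variant: bounds the end of the range without wrapping. */
static bool access_ok_safe(unsigned long addr, unsigned long size)
{
	if (addr >= TASK_SIZE_MAX)
		return false;
	return size <= TASK_SIZE_MAX - addr;
}

int main(void)
{
	/* Both values are individually below TASK_SIZE_MAX, yet the
	 * range [addr, addr + size) ends in kernel space. */
	unsigned long addr = 0x80000000UL, size = 0x80000000UL;

	printf("ored: %d safe: %d\n",
	       access_ok_ored(addr, size), access_ok_safe(addr, size));
	return 0;	/* prints "ored: 1 safe: 0" */
}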


Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()

2020-09-02 Thread Christophe Leroy




Le 02/09/2020 à 15:13, David Laight a écrit :

From: Christoph Hellwig

Sent: 02 September 2020 13:37

On Wed, Sep 02, 2020 at 08:15:12AM +0200, Christophe Leroy wrote:

-   return 0;
-   return (size == 0 || size - 1 <= seg.seg - addr);
+   if (addr >= TASK_SIZE_MAX)
+   return false;
+   if (size == 0)
+   return false;


__access_ok() was returning true when size == 0 up to now. Any reason to
return false now ?


No, this is accidental and broken.  Can you re-run your benchmark with
this fixed?


Is TASK_SIZE_MASK defined such that you can do:

return ((addr | size) < TASK_SIZE_MAX) || !size;


TASK_SIZE_MAX will usually be 0xc0000000

With:
addr = 0x80000000;
size = 0x80000000;

I expect it to fail 

With the formula you propose it will succeed, won't it ?

Christophe


Re: [PATCH v4 04/13] mm/debug_vm_pgtables/hugevmap: Use the arch helper to identify huge vmap support.

2020-09-02 Thread Christophe Leroy




Le 02/09/2020 à 13:42, Aneesh Kumar K.V a écrit :

ppc64 supports huge vmap only with radix translation. Hence use arch helper
to determine the huge vmap support.

Signed-off-by: Aneesh Kumar K.V 
---
  mm/debug_vm_pgtable.c | 14 --
  1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 00649b47f6e0..4c73e63b4ceb 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -28,6 +28,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  
@@ -206,11 +207,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)

WARN_ON(!pmd_leaf(pmd));
  }
  
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

  static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t 
prot)
  {
pmd_t pmd;
  
-	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))

+   if (!arch_ioremap_pmd_supported())


What about moving ioremap_pmd_enabled() from mm/ioremap.c to some .h, 
and using it ?

As ioremap_pmd_enabled() is defined at all times, there is no need for an #ifdef


return;
  
  	pr_debug("Validating PMD huge\n");

@@ -224,6 +226,9 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned 
long pfn, pgprot_t prot)
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
  }
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t 
prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
  
  static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)

  {
@@ -320,11 +325,12 @@ static void __init pud_leaf_tests(unsigned long pfn, 
pgprot_t prot)
WARN_ON(!pud_leaf(pud));
  }
  
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

  static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t 
prot)
  {
pud_t pud;
  
-	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))

+   if (!arch_ioremap_pud_supported())


What about moving ioremap_pud_enabled() from mm/ioremap.c to some .h, 
and using it ?

As ioremap_pud_enabled() is defined at all times, there is no need for an #ifdef


return;
  
  	pr_debug("Validating PUD huge\n");

@@ -338,6 +344,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned 
long pfn, pgprot_t prot)
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
  }
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t 
prot) { }
+#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+
  #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
  static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
  static void __init pud_advanced_tests(struct mm_struct *mm,



Christophe
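
A rough sketch of that suggestion (the header location and the exported
prototypes are assumptions; today the helpers are statics in
mm/ioremap.c, so they would also need to lose the static there):

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
int ioremap_pmd_enabled(void);
int ioremap_pud_enabled(void);
#else
static inline int ioremap_pmd_enabled(void) { return 0; }
static inline int ioremap_pud_enabled(void) { return 0; }
#endif

pmd_huge_tests() and pud_huge_tests() could then each keep a single
definition starting with a plain "if (!ioremap_pmd_enabled()) return;",
with no #ifdef around the function.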


Re: [PATCH v4 02/13] powerpc/mm: Move setting pte specific flags to pfn_pte

2020-09-02 Thread Christophe Leroy




Le 02/09/2020 à 13:42, Aneesh Kumar K.V a écrit :

powerpc used to set the pte specific flags in set_pte_at(). This is
different from other architectures. To be consistent with other
architectures, update pfn_pte to set _PAGE_PTE on ppc64. Also, drop the now
unused pte_mkpte.

We add a VM_WARN_ON() to catch usage of set_pte_at()
without setting the _PAGE_PTE bit. We will remove that after a few releases.

With respect to huge pmd entries, pmd_mkhuge() takes care of adding the
_PAGE_PTE bit.

Signed-off-by: Aneesh Kumar K.V 


Reviewed-by: Christophe Leroy 

Small nit below.


---
  arch/powerpc/include/asm/book3s/64/pgtable.h | 15 +--
  arch/powerpc/include/asm/nohash/pgtable.h|  5 -
  arch/powerpc/mm/pgtable.c|  5 -
  3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 079211968987..2382fd516f6b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -823,6 +818,14 @@ static inline int pte_none(pte_t pte)
  static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, int percpu)
  {
+
+   VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE)));
+   /*
+* Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE
+* in all the callers.
+*/
+pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));


Wrong alignment, there is a leading space.


+
if (radix_enabled())
return radix__set_pte_at(mm, addr, ptep, pte, percpu);
return hash__set_pte_at(mm, addr, ptep, pte, percpu);


Christophe
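
For reference, the pfn_pte() side of the change (the hunk is not quoted
above) would look roughly like this on book3s/64; a sketch reconstructed
from the commit message rather than the actual patch text:

static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
{
	/* Set _PAGE_PTE at PTE creation time instead of in
	 * set_pte_at(), like other architectures do. */
	return __pte(((pte_basic_t)pfn << PAGE_SHIFT) |
		     pgprot_val(pgprot) | _PAGE_PTE);
}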


Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker

2020-09-02 Thread Christophe Leroy




On 9/1/20 10:25 PM, Nick Desaulniers wrote:

Rather than invoke the compiler as the driver, use the linker. That way
we can check --orphan-handling=warn support correctly, as cc-ldoption
was removed in
commit 055efab3120b ("kbuild: drop support for cc-ldoption").

Requires dropping the .got section.  I couldn't find how it was used in
the vdso32.

Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan 
sections")
Link: 
https://lore.kernel.org/lkml/cakwvodnn3wxydjomvnveyd_njwrku3fabwt_bs92duihhyw...@mail.gmail.com/
Signed-off-by: Nick Desaulniers 
---
Not sure removing .got is a good idea or not.  Otherwise I observe the
following link error:
powerpc-linux-gnu-ld: warning: orphan section `.got' from 
`arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.got'
powerpc-linux-gnu-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc-linux-gnu-ld: final link failed: bad value


Finally I spotted it I think:

make arch/powerpc/kernel/vdso32/ V=1

powerpc64-linux-ld  -EB -m elf64ppc -shared -soname linux-vdso32.so.1 
--eh-frame-hdr  --orphan-handling=warn -T 
arch/powerpc/kernel/vdso32/vdso32.lds 
arch/powerpc/kernel/vdso32/sigtramp.o 
arch/powerpc/kernel/vdso32/gettimeofday.o 
arch/powerpc/kernel/vdso32/datapage.o 
arch/powerpc/kernel/vdso32/cacheflush.o 
arch/powerpc/kernel/vdso32/note.o arch/powerpc/kernel/vdso32/getcpu.o -o 
arch/powerpc/kernel/vdso32/vdso32.so.dbg




If I do the same manually but with -m elf32ppc instead of -m elf64ppc, 
there is no failure.


Adding -m elf32ppc to ldflags-y also works, although I don't like too 
much having "-m elf64ppc -m elf32ppc" on the line.


Christophe
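
For reference, that variant would look roughly like this in
arch/powerpc/kernel/vdso32/Makefile (a sketch based on the ldflags-y
hunk of the patch quoted below, with the 32-bit emulation forced
explicitly):

ldflags-y := -m elf32ppc -shared -soname linux-vdso32.so.1 \
	$(call ld-option, --eh-frame-hdr) \
	$(call ld-option, --orphan-handling=warn) -T

The -m elf32ppc added here apparently takes precedence over the
-m elf64ppc coming from the common ld flags, which is what makes the
link succeed, at the cost of both options showing up on the command
line.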


Re: [PATCH] powerpc: Fix random segfault when freeing hugetlb range

2020-09-02 Thread Christophe Leroy




Le 02/09/2020 à 05:23, Aneesh Kumar K.V a écrit :

Christophe Leroy  writes:


The following random segfault is observed from time to time with
map_hugetlb selftest:

root@localhost:~# ./map_hugetlb 1 19
524288 kB hugepages
Mapping 1 Mbytes
Segmentation fault

[   31.219972] map_hugetlb[365]: segfault (11) at 117 nip 77974f8c lr 779a6834 
code 1 in ld-2.23.so[77966000+21000]
[   31.220192] map_hugetlb[365]: code: 9421ffc0 480318d1 93410028 90010044 
9361002c 93810030 93a10034 93c10038
[   31.220307] map_hugetlb[365]: code: 93e1003c 93210024 8123007c 81430038 
<80e90004> 814a0004 7f443a14 813a0004
[   31.221911] BUG: Bad rss-counter state mm:(ptrval) type:MM_FILEPAGES val:33
[   31.229362] BUG: Bad rss-counter state mm:(ptrval) type:MM_ANONPAGES val:5

This fault is due to hugetlb_free_pgd_range() freeing page tables
that are also used by regular pages.

As explained in the comment at the beginning of
hugetlb_free_pgd_range(), the verification done in free_pgd_range()
on floor and ceiling is not done here, which means
hugetlb_free_pte_range() can free outside the expected range.

As the verification cannot be done in hugetlb_free_pgd_range(), it
must be done in hugetlb_free_pte_range().



Reviewed-by: Aneesh Kumar K.V 


Fixes: b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as standard pages.")
Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/hugetlbpage.c | 18 --
  1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 26292544630f..e7ae2a2c4545 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -330,10 +330,24 @@ static void free_hugepd_range(struct mmu_gather *tlb, 
hugepd_t *hpdp, int pdshif
 get_hugepd_cache_index(pdshift - shift));
  }
  
-static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr)

+static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+  unsigned long addr, unsigned long end,
+  unsigned long floor, unsigned long ceiling)
  {
+   unsigned long start = addr;
pgtable_t token = pmd_pgtable(*pmd);
  
+	start &= PMD_MASK;

+   if (start < floor)
+   return;
+   if (ceiling) {
+   ceiling &= PMD_MASK;
+   if (!ceiling)
+   return;
+   }
+   if (end - 1 > ceiling - 1)
+   return;
+


We do repeat that for pud/pmd/pte hugetlb_free_range. Can we consolidate
that with comment explaining we are checking if the pgtable entry is
mapping outside the range?


I was thinking about refactoring that into a helper and adding all the 
necessary comments to explain what it does.


Will do that in a followup series if you are OK. This patch is a bug fix 
and also has to go through stable.


Christophe
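
For what it's worth, the consolidated helper could take a shape like
this (hypothetical name; the body is the floor/ceiling check from the
patch with the mask passed in, so the pte/pmd/pud variants could share
it):

/*
 * Return true when the page table page covering 'start' also maps
 * addresses outside [floor, ceiling), in which case it is shared
 * with other mappings and must not be freed.
 */
static bool range_is_outside_limits(unsigned long start, unsigned long end,
				    unsigned long floor, unsigned long ceiling,
				    unsigned long mask)
{
	start &= mask;
	if (start < floor)
		return true;
	if (ceiling) {
		ceiling &= mask;
		if (!ceiling)
			return true;
	}
	return end - 1 > ceiling - 1;
}

hugetlb_free_pte_range() would then reduce to a single early return:
if (range_is_outside_limits(addr, end, floor, ceiling, PMD_MASK)) return;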


Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker

2020-09-02 Thread Christophe Leroy




On 9/1/20 10:25 PM, Nick Desaulniers wrote:

Rather than invoke the compiler as the driver, use the linker. That way
we can check --orphan-handling=warn support correctly, as cc-ldoption
was removed in
commit 055efab3120b ("kbuild: drop support for cc-ldoption").

Requires dropping the .got section.  I couldn't find how it was used in
the vdso32.

Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan 
sections")
Link: 
https://lore.kernel.org/lkml/cakwvodnn3wxydjomvnveyd_njwrku3fabwt_bs92duihhyw...@mail.gmail.com/
Signed-off-by: Nick Desaulniers 
---
Not sure removing .got is a good idea or not.  Otherwise I observe the
following link error:
powerpc-linux-gnu-ld: warning: orphan section `.got' from 
`arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.got'
powerpc-linux-gnu-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc-linux-gnu-ld: final link failed: bad value

sigtramp.c doesn't mention anything from the GOT AFAICT, and doesn't
look like it contains relocations that do, so I'm not sure where
references to _GLOBAL_OFFSET_TABLE_ are coming from.


I'm getting the same but only when building for PPC64.
I don't get any reference to sigtramp.o though:

  CALLscripts/checksyscalls.sh
  CALLscripts/atomic/check-atomics.sh
  VDSO32A arch/powerpc/kernel/vdso32/sigtramp.o
  VDSO32A arch/powerpc/kernel/vdso32/gettimeofday.o
  VDSO32A arch/powerpc/kernel/vdso32/datapage.o
  VDSO32A arch/powerpc/kernel/vdso32/cacheflush.o
  VDSO32A arch/powerpc/kernel/vdso32/note.o
  VDSO32A arch/powerpc/kernel/vdso32/getcpu.o
  LD  arch/powerpc/kernel/vdso32/vdso32.so.dbg
powerpc64-linux-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc64-linux-ld: final link failed: Bad value

(GCC 8.1, Binutils 2.30)

So it seems that the got section is being created by the linker. Don't 
know why though.



With GCC 10.1, binutils 2.34 I get:

  LDS arch/powerpc/kernel/vdso32/vdso32.lds
  VDSO32A arch/powerpc/kernel/vdso32/sigtramp.o
  VDSO32A arch/powerpc/kernel/vdso32/gettimeofday.o
  VDSO32A arch/powerpc/kernel/vdso32/datapage.o
  VDSO32A arch/powerpc/kernel/vdso32/cacheflush.o
  VDSO32A arch/powerpc/kernel/vdso32/note.o
  VDSO32A arch/powerpc/kernel/vdso32/getcpu.o
  LD  arch/powerpc/kernel/vdso32/vdso32.so.dbg
powerpc64-linux-ld: warning: orphan section `.branch_lt' from 
`arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.branch_lt'

powerpc64-linux-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc64-linux-ld: final link failed: bad value

I can't see any .branch_lt section when objdumping sigtramp.o or any 
other .o


When I move sigtramp.o at the end of the definition of obj-vdso32 in 
Makefile, I then get:


powerpc64-linux-ld: warning: orphan section `.branch_lt' from 
`arch/powerpc/kernel/vdso32/gettimeofday.o' being placed in section 
`.branch_lt'

powerpc64-linux-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc64-linux-ld: final link failed: bad value


gettimeofday.o now being the first object in obj-vdso32


Christophe



  arch/powerpc/kernel/vdso32/Makefile | 7 +--
  arch/powerpc/kernel/vdso32/vdso32.lds.S | 3 ++-
  2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/vdso32/Makefile 
b/arch/powerpc/kernel/vdso32/Makefile
index 87ab1152d5ce..611a5951945a 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -27,6 +27,9 @@ UBSAN_SANITIZE := n
  ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
-Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
  asflags-y := -D__VDSO32__ -s
+ldflags-y := -shared -soname linux-vdso32.so.1 \
+   $(call ld-option, --eh-frame-hdr) \
+   $(call ld-option, --orphan-handling=warn) -T
  
  obj-y += vdso32_wrapper.o

  extra-y += vdso32.lds
@@ -49,8 +52,8 @@ $(obj-vdso32): %.o: %.S FORCE
$(call if_changed_dep,vdso32as)
  
  # actual build commands

-quiet_cmd_vdso32ld = VDSO32L $@
-  cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ $(call 
cc-ldoption, -Wl$(comma)--orphan-handling=warn) -Wl,-T$(filter %.lds,$^) 
$(filter %.o,$^)
+quiet_cmd_vdso32ld = LD  $@
+  cmd_vdso32ld = $(cmd_ld)
  quiet_cmd_vdso32as = VDSO32A $@
cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
  
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S

index 4c985467a668..0ccdebad18b8 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -61,7 +61,6 @@ SECTIONS
.fixup  : { *(.fixup) }
  
  	.dynamic	: { *(.dynamic) }		:text	:dynamic

-   .got: { *(.got) }   :text
.plt: { *(.plt) }
  
  	_end = .;

@@ -108,7 +107,9 @@ SECTIONS
.debug_varnames  0 : { *(.debug_varnames) }
  
  	/DISCARD/	: {

+   *(.got)
*(.note.GNU-stack)
+   *(.branch_lt)

Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker

2020-09-02 Thread Christophe Leroy




On 09/01/2020 10:25 PM, Nick Desaulniers wrote:

Rather than invoke the compiler as the driver, use the linker. That way
we can check --orphan-handling=warn support correctly, as cc-ldoption
was removed in
commit 055efab3120b ("kbuild: drop support for cc-ldoption").

Requires dropping the .got section.  I couldn't find how it was used in
the vdso32.


ld crashes:

  LD  arch/powerpc/kernel/vdso32/vdso32.so.dbg
/bin/sh: line 1: 23780 Segmentation fault  (core dumped) 
ppc-linux-ld -EB -m elf32ppc -shared -soname linux-vdso32.so.1 
--eh-frame-hdr --orphan-handling=warn -T 
arch/powerpc/kernel/vdso32/vdso32.lds 
arch/powerpc/kernel/vdso32/sigtramp.o 
arch/powerpc/kernel/vdso32/gettimeofday.o 
arch/powerpc/kernel/vdso32/datapage.o 
arch/powerpc/kernel/vdso32/cacheflush.o 
arch/powerpc/kernel/vdso32/note.o arch/powerpc/kernel/vdso32/getcpu.o -o 
arch/powerpc/kernel/vdso32/vdso32.so.dbg

make[4]: *** [arch/powerpc/kernel/vdso32/vdso32.so.dbg] Error 139


[root@localhost linux-powerpc]# ppc-linux-ld --version
GNU ld (GNU Binutils) 2.26.20160125


Christophe



Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan 
sections")
Link: 
https://lore.kernel.org/lkml/cakwvodnn3wxydjomvnveyd_njwrku3fabwt_bs92duihhyw...@mail.gmail.com/
Signed-off-by: Nick Desaulniers 
---
Not sure removing .got is a good idea or not.  Otherwise I observe the
following link error:
powerpc-linux-gnu-ld: warning: orphan section `.got' from 
`arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.got'
powerpc-linux-gnu-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc-linux-gnu-ld: final link failed: bad value

sigtramp.c doesn't mention anything from the GOT AFAICT, and doesn't
look like it contains relocations that do, so I'm not sure where
references to _GLOBAL_OFFSET_TABLE_ are coming from.

  arch/powerpc/kernel/vdso32/Makefile | 7 +--
  arch/powerpc/kernel/vdso32/vdso32.lds.S | 3 ++-
  2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/vdso32/Makefile 
b/arch/powerpc/kernel/vdso32/Makefile
index 87ab1152d5ce..611a5951945a 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -27,6 +27,9 @@ UBSAN_SANITIZE := n
  ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
-Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
  asflags-y := -D__VDSO32__ -s
+ldflags-y := -shared -soname linux-vdso32.so.1 \
+   $(call ld-option, --eh-frame-hdr) \
+   $(call ld-option, --orphan-handling=warn) -T
  
  obj-y += vdso32_wrapper.o

  extra-y += vdso32.lds
@@ -49,8 +52,8 @@ $(obj-vdso32): %.o: %.S FORCE
$(call if_changed_dep,vdso32as)
  
  # actual build commands

-quiet_cmd_vdso32ld = VDSO32L $@
-  cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ $(call 
cc-ldoption, -Wl$(comma)--orphan-handling=warn) -Wl,-T$(filter %.lds,$^) 
$(filter %.o,$^)
+quiet_cmd_vdso32ld = LD  $@
+  cmd_vdso32ld = $(cmd_ld)
  quiet_cmd_vdso32as = VDSO32A $@
cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
  
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S

index 4c985467a668..0ccdebad18b8 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -61,7 +61,6 @@ SECTIONS
.fixup  : { *(.fixup) }
  
  	.dynamic	: { *(.dynamic) }		:text	:dynamic

-   .got: { *(.got) }   :text
.plt: { *(.plt) }
  
  	_end = .;

@@ -108,7 +107,9 @@ SECTIONS
.debug_varnames  0 : { *(.debug_varnames) }
  
  	/DISCARD/	: {

+   *(.got)
*(.note.GNU-stack)
+   *(.branch_lt)
*(.data .data.* .gnu.linkonce.d.* .sdata*)
*(.bss .sbss .dynbss .dynsbss)
*(.glink .iplt .plt .rela*)



Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()

2020-09-02 Thread Christophe Leroy




Le 27/08/2020 à 17:00, Christoph Hellwig a écrit :

Stop providing the possibility to override the address space using
set_fs() now that there is no need for that any more.

Signed-off-by: Christoph Hellwig 
---
  arch/powerpc/Kconfig   |  1 -
  arch/powerpc/include/asm/processor.h   |  7 ---
  arch/powerpc/include/asm/thread_info.h |  5 +--
  arch/powerpc/include/asm/uaccess.h | 62 --
  arch/powerpc/kernel/signal.c   |  3 --
  arch/powerpc/lib/sstep.c   |  6 +--
  6 files changed, 22 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 7fe3531ad36a77..39727537d39701 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -8,62 +8,36 @@
  #include 
  #include 
  
-/*

- * The fs value determines whether argument validity checking should be
- * performed or not.  If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons, these macros are grossly misnamed.
- *
- * The fs/ds values are now the highest legal address in the "segment".
- * This simplifies the checking in the routines below.
- */
-
-#define MAKE_MM_SEG(s)  ((mm_segment_t) { (s) })
-
-#define KERNEL_DS  MAKE_MM_SEG(~0UL)
  #ifdef __powerpc64__
  /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */
-#define USER_DSMAKE_MM_SEG(TASK_SIZE_USER64 - 1)
-#else
-#define USER_DSMAKE_MM_SEG(TASK_SIZE - 1)
-#endif
-
-#define get_fs()   (current->thread.addr_limit)
+#define TASK_SIZE_MAX  TASK_SIZE_USER64
  
-static inline void set_fs(mm_segment_t fs)

+static inline bool __access_ok(unsigned long addr, unsigned long size)
  {
-   current->thread.addr_limit = fs;
-   /* On user-mode return check addr_limit (fs) is correct */
-   set_thread_flag(TIF_FSCHECK);
+   if (addr >= TASK_SIZE_MAX)
+   return false;
+   /*
+* This check is sufficient because there is a large enough gap between
+* user addresses and the kernel addresses.
+*/
+   return size <= TASK_SIZE_MAX;
  }
-
-#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
-#define user_addr_max()(get_fs().seg)
-
-#ifdef __powerpc64__
-/*
- * This check is sufficient because there is a large enough
- * gap between user addresses and the kernel addresses
- */
-#define __access_ok(addr, size, segment)   \
-   (((addr) <= (segment).seg) && ((size) <= (segment).seg))
-
  #else
+#define TASK_SIZE_MAX  TASK_SIZE
  
-static inline int __access_ok(unsigned long addr, unsigned long size,

-   mm_segment_t seg)
+static inline bool __access_ok(unsigned long addr, unsigned long size)
  {
-   if (addr > seg.seg)
-   return 0;
-   return (size == 0 || size - 1 <= seg.seg - addr);
+   if (addr >= TASK_SIZE_MAX)
+   return false;
+   if (size == 0)
+   return false;


__access_ok() was returning true when size == 0 up to now. Any reason to 
return false now ?



+   return size <= TASK_SIZE_MAX - addr;
  }
-
-#endif
+#endif /* __powerpc64__ */
  
  #define access_ok(addr, size)		\

(__chk_user_ptr(addr),  \
-__access_ok((__force unsigned long)(addr), (size), get_fs()))
+__access_ok((unsigned long)(addr), (size)))
  
  /*

   * These are the main single-value transfer routines.  They automatically


Christophe
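
As a side note, the 64-bit half of this hunk leans on the address space
layout instead of an end-of-range computation. A sketch of the
reasoning, with book3s/64 layout values given purely as an assumption
for illustration:

/*
 * user addresses:   0 .. TASK_SIZE_USER64 (e.g. 0x0000400000000000)
 * kernel addresses: 0xc000000000000000 and up
 *
 * With addr < TASK_SIZE_MAX and size <= TASK_SIZE_MAX, the last byte
 * touched is at most addr + size - 1 < 2 * TASK_SIZE_MAX, still far
 * below the first kernel address, so the range cannot reach kernel
 * space even though addr + size is never checked for overflow.
 */
static inline bool __access_ok_64(unsigned long addr, unsigned long size)
{
	return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX;
}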


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v2

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 19:25, Al Viro a écrit :

On Tue, Sep 01, 2020 at 07:13:00PM +0200, Christophe Leroy wrote:


 10.92%  dd   [kernel.kallsyms]  [k] iov_iter_zero


Interesting...  Could you get an instruction-level profile inside 
iov_iter_zero(),
along with the disassembly of that sucker?



As a comparison, hereunder is the perf annotate of the 5.9-rc2 without 
the series:


 Percent |	Source code & Disassembly of vmlinux for cpu-clock (2581 
samples)

-----------------------------------------------------------------------------
 :
 :
 :
 :  Disassembly of section .text:
 :
 :  c02cbb80 <iov_iter_zero>:
 :  iov_iter_zero():
3.22 :c02cbb80:   stwu r1,-80(r1)
3.25 :c02cbb84:   stw r30,72(r1)
0.00 :c02cbb88:   mr  r30,r4
2.91 :c02cbb8c:   stw r31,76(r1)
0.00 :c02cbb90:   mr  r31,r3
0.19 :c02cbb94:   stw r27,60(r1)
 :  iov_iter_type():
1.82 :c02cbb98:   lwz r10,0(r4)
0.54 :c02cbb9c:   rlwinm  r9,r10,0,0,30
 :  iov_iter_zero():
1.98 :c02cbba0:   cmpwi   r9,32
0.00 :c02cbba4:   lwz r9,624(r2)
0.35 :c02cbba8:   stw r9,28(r1)
0.00 :c02cbbac:   li  r9,0
0.00 :c02cbbb0:   beq c02cbd00 
2.67 :c02cbbb4:   lwz r9,8(r4)
1.98 :c02cbbb8:   cmplw   r9,r3
0.00 :c02cbbbc:   mr  r27,r9
0.00 :c02cbbc0:   bgt c02cbce8 
0.31 :c02cbbc4:   cmpwi   r9,0
0.00 :c02cbbc8:   beq c02cbcbc 
3.22 :c02cbbcc:   andi.   r8,r10,16
1.70 :c02cbbd0:   lwz r31,4(r30)
0.00 :c02cbbd4:   bne c02cbe10 
0.31 :c02cbbd8:   andi.   r8,r10,8
0.00 :c02cbbdc:   bne c02cbf64 
1.82 :c02cbbe0:   andi.   r10,r10,64
0.00 :c02cbbe4:   bne c02cc080 
0.27 :c02cbbe8:   stw r29,68(r1)
1.94 :c02cbbec:   stw r28,64(r1)
1.98 :c02cbbf0:   lwz r28,12(r30)
0.31 :c02cbbf4:   lwz r7,4(r28)
2.13 :c02cbbf8:   subf r29,r31,r7
1.78 :c02cbbfc:   cmplw   r29,r27
0.08 :c02cbc00:   bgt c02cbcf8 
   28.24 :c02cbc04:   cmpwi   r29,0
0.00 :c02cbc08:   beq c02cc08c 
2.01 :c02cbc0c:   lwz r3,0(r28)
3.10 :c02cbc10:   lwz r10,1208(r2)
0.00 :c02cbc14:   add r3,r3,r31
 :  __access_ok():
0.00 :c02cbc18:   cmplw   r3,r10
0.00 :c02cbc1c:   bgt c02cbc7c 
3.37 :c02cbc20:   subf r10,r3,r10
0.00 :c02cbc24:   addi r8,r29,-1
3.14 :c02cbc28:   cmplw   r8,r10
0.08 :c02cbc2c:   mflr r0
0.00 :c02cbc30:   stw r0,84(r1)
0.00 :c02cbc34:   bgt c02cbd40 
 :  clear_user():
0.00 :c02cbc38:   mr  r4,r29
2.40 :c02cbc3c:   bl  c001a428 <__arch_clear_user>
 :  iov_iter_zero():
1.55 :c02cbc40:   add r31,r31,r29
0.00 :c02cbc44:   cmpwi   r3,0
1.94 :c02cbc48:   subf r29,r29,r27
0.00 :c02cbc4c:   subf r31,r3,r31
0.00 :c02cbc50:   add r29,r29,r3
0.00 :c02cbc54:   beq c02cc0ac 
0.00 :c02cbc58:   lwz r9,8(r30)
0.00 :c02cbc5c:   subf r10,r27,r29
0.00 :c02cbc60:   lwz r0,84(r1)
0.00 :c02cbc64:   subf r27,r29,r27
0.00 :c02cbc68:   add r9,r10,r9
0.00 :c02cbc6c:   lwz r7,4(r28)
0.00 :c02cbc70:   lwz r10,12(r30)
0.00 :c02cbc74:   mtlr r0
0.00 :c02cbc78:   b   c02cbc84 
 :  __access_ok():
0.00 :c02cbc7c:   li  r27,0
0.00 :c02cbc80:   mr  r10,r28
 :  iov_iter_zero():
0.00 :c02cbc84:   cmplw   r31,r7
0.00 :c02cbc88:   bne c02cbc94 
0.93 :c02cbc8c:   addi r28,r28,8
0.00 :c02cbc90:   li  r31,0
1.28 :c02cbc94:   lwz r8,16(r30)
0.00 :c02cbc98:   subf r10,r10,r28
1.05 :c02cbc9c:   srawi   r10,r10,3
0.00 :c02cbca0:   stw r28,12(r30)
0.00 :c02cbca4:   subf r10,r10,r8
0.93 :c02cbca8:   stw r10,16(r30)
0.04 :c02cbcac:   lwz r28,64(r1)
0.00 :c02cbcb0:   lwz r29,68(r1)
1.05 :c02cbcb4:   stw r9,8(r30)
0.00 :  

Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v2

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 19:25, Al Viro a écrit :

On Tue, Sep 01, 2020 at 07:13:00PM +0200, Christophe Leroy wrote:


 10.92%  dd   [kernel.kallsyms]  [k] iov_iter_zero


Interesting...  Could you get an instruction-level profile inside 
iov_iter_zero(),
along with the disassembly of that sucker?



Output of perf annotate:


 Percent |	Source code & Disassembly of vmlinux for cpu-clock (3579 
samples)

-----------------------------------------------------------------------------
 :
 :
 :
 :  Disassembly of section .text:
 :
 :  c02cb3a4 <iov_iter_zero>:
 :  iov_iter_zero():
2.24 :c02cb3a4:   stwu r1,-80(r1)
0.31 :c02cb3a8:   stw r30,72(r1)
0.00 :c02cb3ac:   mr  r30,r4
0.11 :c02cb3b0:   stw r31,76(r1)
0.00 :c02cb3b4:   mr  r31,r3
1.06 :c02cb3b8:   stw r27,60(r1)
 :  iov_iter_type():
0.03 :c02cb3bc:   lwz r10,0(r4)
0.06 :c02cb3c0:   rlwinm  r9,r10,0,0,30
 :  iov_iter_zero():
0.03 :c02cb3c4:   cmpwi   r9,32
0.00 :c02cb3c8:   lwz r9,624(r2)
2.15 :c02cb3cc:   stw r9,28(r1)
0.00 :c02cb3d0:   li  r9,0
0.00 :c02cb3d4:   beq c02cb520 
0.14 :c02cb3d8:   lwz r9,8(r4)
0.08 :c02cb3dc:   cmplw   r9,r3
0.00 :c02cb3e0:   mr  r27,r9
0.03 :c02cb3e4:   bgt c02cb4fc 
1.34 :c02cb3e8:   cmpwi   r9,0
0.00 :c02cb3ec:   beq c02cb4d0 
0.11 :c02cb3f0:   andi.   r8,r10,16
0.17 :c02cb3f4:   lwz r31,4(r30)
1.79 :c02cb3f8:   bne c02cb61c 
0.00 :c02cb3fc:   andi.   r8,r10,8
0.06 :c02cb400:   bne c02cb770 
0.22 :c02cb404:   andi.   r10,r10,64
0.03 :c02cb408:   bne c02cb88c 
0.11 :c02cb40c:   stw r29,68(r1)
1.59 :c02cb410:   stw r28,64(r1)
0.03 :c02cb414:   lwz r28,12(r30)
0.00 :c02cb418:   lwz r7,4(r28)
1.87 :c02cb41c:   subf r29,r31,r7
0.28 :c02cb420:   cmplw   r29,r27
0.03 :c02cb424:   bgt c02cb50c 
0.03 :c02cb428:   cmpwi   r29,0
0.00 :c02cb42c:   beq c02cb898 
1.34 :c02cb430:   lwz r3,0(r28)
 :  __access_ok():
0.00 :c02cb434:   lis r10,-16384
 :  iov_iter_zero():
0.36 :c02cb438:   add r3,r3,r31
 :  __access_ok():
0.03 :c02cb43c:   cmplw   r3,r10
1.79 :c02cb440:   bge c02cb514 
   13.19 :c02cb444:   subf r10,r3,r10
 :  clear_user():
0.00 :c02cb448:   cmplw   r29,r10
4.41 :c02cb44c:   mflr r0
0.00 :c02cb450:   stw r0,84(r1)
0.00 :c02cb454:   bgt c02cb8c4 
0.00 :c02cb458:   mr  r4,r29
0.00 :c02cb45c:   bl  c001a41c <__arch_clear_user>
 :  iov_iter_zero():
0.70 :c02cb460:   add r31,r31,r29
0.00 :c02cb464:   cmpwi   r3,0
   17.13 :c02cb468:   subf r29,r29,r27
0.00 :c02cb46c:   subf r31,r3,r31
1.20 :c02cb470:   add r29,r29,r3
0.00 :c02cb474:   beq c02cb8b8 
0.00 :c02cb478:   lwz r9,8(r30)
0.00 :c02cb47c:   subf r10,r27,r29
0.00 :c02cb480:   lwz r0,84(r1)
0.00 :c02cb484:   subf r27,r29,r27
0.00 :c02cb488:   add r9,r10,r9
0.00 :c02cb48c:   lwz r7,4(r28)
0.00 :c02cb490:   lwz r10,12(r30)
0.00 :c02cb494:   mtlr r0
1.65 :c02cb498:   cmplw   r31,r7
   14.61 :c02cb49c:   bne c02cb4a8 
1.65 :c02cb4a0:   addi r28,r28,8
0.00 :c02cb4a4:   li  r31,0
   14.92 :c02cb4a8:   lwz r8,16(r30)
0.00 :c02cb4ac:   subf r10,r10,r28
1.12 :c02cb4b0:   srawi   r10,r10,3
0.56 :c02cb4b4:   stw r28,12(r30)
0.00 :c02cb4b8:   subf r10,r10,r8
1.23 :c02cb4bc:   stw r10,16(r30)
0.00 :c02cb4c0:   lwz r28,64(r1)
0.56 :c02cb4c4:   lwz r29,68(r1)
0.00 :c02cb4c8:   stw r9,8(r30)
2.12 :c02cb4cc:   stw r31,4(r30)
0.00 :c02cb4d0:   lwz r9,28(r1)
0.61 :c02cb4d4:   lwz r10,624(r2)
0.00 :c02cb4d8:   xor. r9,r9,r10
0.00 :c02cb4dc:   li  r10,0
0.00 :  

Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v2

2020-09-01 Thread Christophe Leroy

Hi Christoph,

Le 27/08/2020 à 17:00, Christoph Hellwig a écrit :

Hi all,

this series removes the last set_fs() used to force a kernel address
space for the uaccess code in the kernel read/write/splice code, and then
stops implementing the address space overrides entirely for x86 and
powerpc.

The file system part has been posted a few times, and the read/write side
has been pretty much unchanged.  For splice this series drops the
conversion of the seq_file and sysctl code to the iter ops, and thus loses
the splice support for them.  The reasons for that is that it caused a lot
of churn for not much use - splice for these small files really isn't much
of a win, even if existing userspace uses it.  All callers I found do the
proper fallback, but if this turns out to be an issue the conversion can
be resurrected.

Besides x86 and powerpc I plan to eventually convert all other
architectures, although this will be a slow process, starting with the
easier ones once the infrastructure is merged.  The process to convert
architectures is roughly:

  (1) ensure there is no set_fs(KERNEL_DS) left in arch specific code
  (2) implement __get_kernel_nofault and __put_kernel_nofault
  (3) remove the arch specific address limitation functionality

Changes since v1:
  - drop the patch to remove the non-iter ops for /dev/zero and
/dev/null as they caused a performance regression
  - don't enable user access in __get_kernel on powerpc
  - xfail the set_fs() based lkdtm tests

Diffstat:




I'm still sceptical about the results I get.

With 5.9-rc2:

root@vgoippro:~# time dd if=/dev/zero of=/dev/null count=1M
1048576+0 records in
1048576+0 records out
536870912 bytes (512.0MB) copied, 5.585880 seconds, 91.7MB/s
real0m 5.59s
user0m 1.40s
sys 0m 4.19s


With your series:

root@vgoippro:/tmp# time dd if=/dev/zero of=/dev/null count=1M
1048576+0 records in
1048576+0 records out
536870912 bytes (512.0MB) copied, 7.780540 seconds, 65.8MB/s
real0m 7.79s
user0m 2.12s
sys 0m 5.66s




Top of perf report of a standard perf record:

With 5.9-rc2:

20.31%  dd   [kernel.kallsyms]  [k] __arch_clear_user
 8.37%  dd   [kernel.kallsyms]  [k] transfer_to_syscall
 7.37%  dd   [kernel.kallsyms]  [k] __fsnotify_parent
 6.95%  dd   [kernel.kallsyms]  [k] iov_iter_zero
 5.72%  dd   [kernel.kallsyms]  [k] new_sync_read
 4.87%  dd   [kernel.kallsyms]  [k] vfs_write
 4.47%  dd   [kernel.kallsyms]  [k] vfs_read
 3.07%  dd   [kernel.kallsyms]  [k] ksys_write
 2.77%  dd   [kernel.kallsyms]  [k] ksys_read
 2.65%  dd   [kernel.kallsyms]  [k] __fget_light
 2.37%  dd   [kernel.kallsyms]  [k] __fdget_pos
 2.35%  dd   [kernel.kallsyms]  [k] memset
 1.53%  dd   [kernel.kallsyms]  [k] rw_verify_area
 1.52%  dd   [kernel.kallsyms]  [k] read_iter_zero

With your series:
19.60%  dd   [kernel.kallsyms]  [k] __arch_clear_user
10.92%  dd   [kernel.kallsyms]  [k] iov_iter_zero
 9.50%  dd   [kernel.kallsyms]  [k] vfs_write
 8.97%  dd   [kernel.kallsyms]  [k] __fsnotify_parent
 5.46%  dd   [kernel.kallsyms]  [k] transfer_to_syscall
 5.42%  dd   [kernel.kallsyms]  [k] vfs_read
 3.58%  dd   [kernel.kallsyms]  [k] ksys_read
 2.84%  dd   [kernel.kallsyms]  [k] read_iter_zero
 2.24%  dd   [kernel.kallsyms]  [k] ksys_write
 1.80%  dd   [kernel.kallsyms]  [k] __fget_light
 1.34%  dd   [kernel.kallsyms]  [k] __fdget_pos
 0.91%  dd   [kernel.kallsyms]  [k] memset
 0.91%  dd   [kernel.kallsyms]  [k] rw_verify_area

Christophe


Re: [PATCH] powerpc/mm: Remove DEBUG_VM_PGTABLE support on ppc64

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 10:15, Christophe Leroy a écrit :



Le 01/09/2020 à 10:12, Aneesh Kumar K.V a écrit :

On 9/1/20 1:40 PM, Christophe Leroy wrote:



Le 01/09/2020 à 10:02, Aneesh Kumar K.V a écrit :

The test is broken w.r.t page table update rules and results in a kernel
crash as below. Disable the support until we get the tests updated.


Signed-off-by: Aneesh Kumar K.V 


Any Fixes: tag ?


---
  arch/powerpc/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 65bed1fdeaad..787e829b6f25 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -116,7 +116,6 @@ config PPC
  #
  select ARCH_32BIT_OFF_T if PPC32
  select ARCH_HAS_DEBUG_VIRTUAL
-    select ARCH_HAS_DEBUG_VM_PGTABLE



You say you remove support for ppc64 but you are removing it for both 
PPC64 and PPC32. Should you replace the line by:


Does it work on PPC32 with DEBUG_VM enabled? I was expecting it will 
be broken there too.


I was wondering. I have just started a build to test that on my 8xx. 
I'll tell you before noon (Paris).




There are warnings but it boots OK. So up to you, but if you deactivate 
it for both PPC32 and PPC64, say so in the subject line.


[7.065399] debug_vm_pgtable: [debug_vm_pgtable ]: Validating 
architecture page table helpers

[7.075155] ------------[ cut here ]------------
[7.079590] WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/pgtable.c:185 
set_pte_at+0x20/0xf4
[7.087542] CPU: 0 PID: 1 Comm: swapper Not tainted 
5.9.0-rc2-s3k-dev-01348-g283e890ee4ad #3933

[7.096122] NIP:  c000f634 LR: c07440f8 CTR: 
[7.101124] REGS: c9023c50 TRAP: 0700   Not tainted 
(5.9.0-rc2-s3k-dev-01348-g283e890ee4ad)

[7.109445] MSR:  00029032   CR: 53000339  XER: 80006100
[7.116072]
[7.116072] GPR00: c07440f8 c9023d08 c60e4000 c6ba4000 1efac000 
c6ba8eb0 c9023da8 0001
[7.116072] GPR08: 0004 007346c9 c6ba8ebc  93000333 
 c000390c 
[7.116072] GPR16: c084 0ec9 01ec 00734000 06ba8000 
c6bb c05f43e8 1efac000
[7.116072] GPR24: fff0 c6b96d08 c6ba8eac c6ba4000 1efac000 
007346c9 c6ba8eb0 007346c9

[7.150796] NIP [c000f634] set_pte_at+0x20/0xf4
[7.155274] LR [c07440f8] pte_advanced_tests+0xec/0x2bc
[7.160401] Call Trace:
[7.162831] [c9023d08] [c080db94] 0xc080db94 (unreliable)
[7.168183] [c9023d28] [c07440f8] pte_advanced_tests+0xec/0x2bc
[7.174036] [c9023dd8] [c0744498] debug_vm_pgtable+0x1d0/0x668
[7.179827] [c9023e98] [c0734cd4] do_one_initcall+0x8c/0x1cc
[7.185405] [c9023ef8] [c0735008] kernel_init_freeable+0x178/0x1d0
[7.191511] [c9023f28] [c0003920] kernel_init+0x14/0x114
[7.196763] [c9023f38] [c000e184] ret_from_kernel_thread+0x14/0x1c
[7.202818] Instruction dump:
[7.205754] bba10014 7c0803a6 38210020 4e800020 7c0802a6 9421ffe0 
bfc10018 90010024
[7.213412] 83e6 8125 71270001 41820008 <0fe0> 73e90040 
41820080 73ea0001

[7.221249] ---[ end trace 95bbebcafa22d0f7 ]---
[7.226049] ------------[ cut here ]------------
[7.230438] WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/pgtable.c:185 
set_pte_at+0x20/0xf4
[7.238410] CPU: 0 PID: 1 Comm: swapper Tainted: GW 
5.9.0-rc2-s3k-dev-01348-g283e890ee4ad #3933

[7.248363] NIP:  c000f634 LR: c0744218 CTR: 
[7.253368] REGS: c9023c50 TRAP: 0700   Tainted: GW 
(5.9.0-rc2-s3k-dev-01348-g283e890ee4ad)

[7.263064] MSR:  00029032   CR: 53000335  XER: a0006100
[7.269690]
[7.269690] GPR00: c0744218 c9023d08 c60e4000 c6ba4000 1efac000 
c6ba8eb0 c9023da8 0001
[7.269690] GPR08:  007341c9  007341c9 93000333 
 c000390c 
[7.269690] GPR16: c084 0ec9 01ec 00734000 06ba8000 
c6bb c05f43e8 1efac000
[7.269690] GPR24: fff0 c6b96d08 c6ba8eac c6ba4000 1efac000 
007346c9 c6ba8eb0 007346c9

[7.304418] NIP [c000f634] set_pte_at+0x20/0xf4
[7.308892] LR [c0744218] pte_advanced_tests+0x20c/0x2bc
[7.314105] Call Trace:
[7.316535] [c9023d08] [c080db94] 0xc080db94 (unreliable)
[7.321888] [c9023d28] [c0744218] pte_advanced_tests+0x20c/0x2bc
[7.327826] [c9023dd8] [c0744498] debug_vm_pgtable+0x1d0/0x668
[7.333613] [c9023e98] [c0734cd4] do_one_initcall+0x8c/0x1cc
[7.339196] [c9023ef8] [c0735008] kernel_init_freeable+0x178/0x1d0
[7.345300] [c9023f28] [c0003920] kernel_init+0x14/0x114
[7.350551] [c9023f38] [c000e184] ret_from_kernel_thread+0x14/0x1c
[7.356609] Instruction dump:
[7.359545] bba10014 7c0803a6 38210020 4e800020 7c0802a6 9421ffe0 
bfc10018 90010024
[7.367203] 83e6 8125 71270001 41820008 <0fe0> 73e90040 
41820080 73ea0001

[7.375039] ---[ end trace 95bbebcafa22d0f8 ]---
[7.379783] [ cut here ]
[7.384228] WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/pgtable.c:276 
set_huge_pte_at+0x104/0x134
[7.392803] CPU: 0 PID: 1 Comm: swapper Tainted

Re: [PATCH] powerpc/mm: Remove DEBUG_VM_PGTABLE support on ppc64

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 10:12, Aneesh Kumar K.V a écrit :

On 9/1/20 1:40 PM, Christophe Leroy wrote:



Le 01/09/2020 à 10:02, Aneesh Kumar K.V a écrit :

The test is broken w.r.t page table update rules and results in a kernel
crash as below. Disable the support until we get the tests updated.


Signed-off-by: Aneesh Kumar K.V 


Any Fixes: tag ?


---
  arch/powerpc/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 65bed1fdeaad..787e829b6f25 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -116,7 +116,6 @@ config PPC
  #
  select ARCH_32BIT_OFF_T if PPC32
  select ARCH_HAS_DEBUG_VIRTUAL
-    select ARCH_HAS_DEBUG_VM_PGTABLE



You say you remove support for ppc64 but you are removing it for both 
PPC64 and PPC32. Should you replace the line by:


Does it work on PPC32 with DEBUG_VM enabled? I was expecting it will be 
broken there too.


I was wondering. I have just started a build to test that on my 8xx. 
I'll tell you before noon (Paris).


Christophe





 select ARCH_HAS_DEBUG_VM_PGTABLE if PPC32


  select ARCH_HAS_DEVMEM_IS_ALLOWED
  select ARCH_HAS_ELF_RANDOMIZE
  select ARCH_HAS_FORTIFY_SOURCE



What about 
Documentation/features/debug/debug-vm-pgtable/arch-support.txt ?




I am hoping we can enable the config once we resolve the test issues, 
maybe in the next merge window.


-aneesh





Re: [PATCH] powerpc/mm: Remove DEBUG_VM_PGTABLE support on ppc64

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 10:02, Aneesh Kumar K.V a écrit :

The test is broken w.r.t page table update rules and results in a kernel
crash as below. Disable the support until we get the tests updated.


Signed-off-by: Aneesh Kumar K.V 


Any Fixes: tag ?


---
  arch/powerpc/Kconfig | 1 -
  1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 65bed1fdeaad..787e829b6f25 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -116,7 +116,6 @@ config PPC
#
select ARCH_32BIT_OFF_T if PPC32
select ARCH_HAS_DEBUG_VIRTUAL
-   select ARCH_HAS_DEBUG_VM_PGTABLE



You say you remove support for ppc64 but you are removing it for both 
PPC64 and PPC32. Should you replace the line by:


select ARCH_HAS_DEBUG_VM_PGTABLE if PPC32


select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE



What about Documentation/features/debug/debug-vm-pgtable/arch-support.txt ?

Christophe


Re: [PATCH v2 00/13] mm/debug_vm_pgtable fixes

2020-09-01 Thread Christophe Leroy




Le 21/08/2020 à 10:51, Anshuman Khandual a écrit :


On 08/19/2020 06:30 PM, Aneesh Kumar K.V wrote:

This patch series includes fixes for debug_vm_pgtable test code so that
they follow page table updates rules correctly. The first two patches introduce
changes w.r.t ppc64. The patches are included in this series for completeness. 
We can
merge them via ppc64 tree if required.

Hugetlb test is disabled on ppc64 because that needs larger change to satisfy
page table update rules.





Changes proposed here will impact other enabled platforms as well.
Adding the following folks and mailing lists, and hoping to get a
broader review and test coverage. Please do include them in the
next iteration as well.

+ linux-arm-ker...@lists.infradead.org
+ linux-s...@vger.kernel.org
+ linux-snps-...@lists.infradead.org
+ x...@kernel.org
+ linux-a...@vger.kernel.org

+ Gerald Schaefer 
+ Christophe Leroy 


Please don't use the above address anymore. Only use the one below.


+ Christophe Leroy 
+ Vineet Gupta 
+ Mike Rapoport 
+ Qian Cai 



Thanks
Christophe


Re: [PATCH v3 12/13] mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 08:30, Aneesh Kumar K.V a écrit :




I actually wanted to add #ifdef BROKEN. That test is completely broken. 
In fact I would suggest to remove that test completely.





#ifdef will not be required here as there would be a stub definition
for hugetlb_advanced_tests() when CONFIG_PPC_BOOK3S_64 is enabled.


  spin_lock(&mm->page_table_lock);
  p4d_clear_tests(mm, p4dp);



But again, we should really try and avoid taking this path.



To be frank I am kind of frustrated with how this patch series is being 
looked at. We pushed a completely broken test to upstream and right now 
we have code in upstream that crashes when booted on ppc64. My attempt 
has been to make progress here and you definitely seem to not be in 
agreement with that.


At this point I am tempted to suggest we remove the DEBUG_VM_PGTABLE 
support on ppc64 because AFAIU it doesn't add any value.




Note that a bug has been filed at 
https://bugzilla.kernel.org/show_bug.cgi?id=209029


Christophe


Re: [PATCH v3 08/13] mm/debug_vm_pgtable/thp: Use page table depost/withdraw with THP

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 09:40, Aneesh Kumar K.V a écrit :

On 9/1/20 12:20 PM, Christophe Leroy wrote:



Le 01/09/2020 à 08:25, Aneesh Kumar K.V a écrit :

On 9/1/20 8:52 AM, Anshuman Khandual wrote:




There is a checkpatch.pl warning here.

WARNING: Possible unwrapped commit description (prefer a maximum 75 
chars per line)

#7:
Architectures like ppc64 use deposited page table while updating the 
huge pte


total: 0 errors, 1 warnings, 40 lines checked



I will ignore all these, because they are not really important IMHO.



When doing a git log in an 80 chars terminal window, having wrapping 
lines is not really convenient. It should be easy to avoid it.




We have been ignoring that for a long time, haven't we?

For example ppc64 checkpatch already had
--max-line-length=90


There was also a recent discussion about whether the 80 character limit is 
valid any more. But I do keep it restricted to 80 characters wherever it 
is easy/makes sense.




Here we are not talking about the code, but the commit log.

As far as I know, the discussions about 80 character lines, 90 lines in 
powerpc etc ... are about the code.


We still aim at keeping lines not longer than 75 chars in the commit log.

Christophe



[PATCH] powerpc/powermac: Fix low_sleep_handler with KUAP and KUEP

2020-09-01 Thread Christophe Leroy
low_sleep_handler() has a hardcoded restore of segment registers
that doesn't take KUAP and KUEP into account.

Use head_32's load_segment_registers() routine instead.

Signed-off-by: Christophe Leroy 
Fixes: a68c31fc01ef ("powerpc/32s: Implement Kernel Userspace Access 
Protection")
Fixes: 31ed2b13c48d ("powerpc/32s: Implement Kernel Userspace Execution 
Prevention.")
Cc: sta...@vger.kernel.org
---
 arch/powerpc/platforms/powermac/sleep.S | 9 +
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/sleep.S 
b/arch/powerpc/platforms/powermac/sleep.S
index f9a680fdd9c4..51bfdfe85058 100644
--- a/arch/powerpc/platforms/powermac/sleep.S
+++ b/arch/powerpc/platforms/powermac/sleep.S
@@ -294,14 +294,7 @@ grackle_wake_up:
 * we do any r1 memory access as we are not sure they
 * are in a sane state above the first 256Mb region
 */
-   li  r0,16   /* load up segment register values */
-   mtctr   r0  /* for context 0 */
-   lis r3,0x2000   /* Ku = 1, VSID = 0 */
-   li  r4,0
-3: mtsrin  r3,r4
-   addi r3,r3,0x111 /* increment VSID */
-   addis   r4,r4,0x1000 /* address of next segment */
-   bdnz 3b
+   bl  load_segment_registers
sync
isync
 
-- 
2.25.0



Re: [PATCH v11] Fixup for "powerpc/vdso: Provide __kernel_clock_gettime64() on vdso32"

2020-09-01 Thread Christophe Leroy



Le 01/09/2020 à 09:19, Michal Suchánek a écrit :

Hello,

can you add Fixes: ?


That's a commit which is still in powerpc/next-test.
My intention was to provide something that Michael can squash/fixup into 
the culprit commit.


Christophe




Thanks

Michal

On Tue, Sep 01, 2020 at 05:28:57AM +0000, Christophe Leroy wrote:

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/include/asm/vdso/gettimeofday.h | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 59a609a48b63..8da84722729b 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -186,6 +186,8 @@ int __c_kernel_clock_getres(clockid_t clock_id, struct 
__kernel_timespec *res,
  #else
  int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
 const struct vdso_data *vd);
+int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts,
+  const struct vdso_data *vd);
  int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
const struct vdso_data *vd);
  #endif
--
2.25.0



Re: [PATCH v3 08/13] mm/debug_vm_pgtable/thp: Use page table depost/withdraw with THP

2020-09-01 Thread Christophe Leroy




Le 01/09/2020 à 08:25, Aneesh Kumar K.V a écrit :

On 9/1/20 8:52 AM, Anshuman Khandual wrote:




There is a checkpatch.pl warning here.

WARNING: Possible unwrapped commit description (prefer a maximum 75 
chars per line)

#7:
Architectures like ppc64 use deposited page table while updating the 
huge pte


total: 0 errors, 1 warnings, 40 lines checked



I will ignore all these, because they are not really important IMHO.



When doing a git log in an 80 chars terminal window, having wrapping 
lines is not really convenient. It should be easy to avoid it.


Christophe


[PATCH v11] Fixup for "powerpc/vdso: Provide __kernel_clock_gettime64() on vdso32"

2020-08-31 Thread Christophe Leroy
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/vdso/gettimeofday.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h 
b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 59a609a48b63..8da84722729b 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -186,6 +186,8 @@ int __c_kernel_clock_getres(clockid_t clock_id, struct 
__kernel_timespec *res,
 #else
 int __c_kernel_clock_gettime(clockid_t clock, struct old_timespec32 *ts,
 const struct vdso_data *vd);
+int __c_kernel_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts,
+  const struct vdso_data *vd);
 int __c_kernel_clock_getres(clockid_t clock_id, struct old_timespec32 *res,
const struct vdso_data *vd);
 #endif
-- 
2.25.0



[PATCH 1/2] powerpc/8xx: Refactor calculation of number of entries per PTE in page tables

2020-08-31 Thread Christophe Leroy
On 8xx, the number of entries occupied by a PTE in the page tables
depends on the size of the page. At the time being, this calculation
is done in two places: in pte_update() and in set_huge_pte_at()

Refactor this calculation into a helper called
number_of_cells_per_pte(). For the time being, the val param is
unused. It will be used by the following patch.

Instead of opencoding is_hugepd(), use hugepd_ok() with a forward
declaration.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/32/pgtable.h | 18 --
 arch/powerpc/mm/pgtable.c|  6 --
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index b9e134d0f03a..80bbc21b87f0 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -227,6 +227,17 @@ static inline void pmd_clear(pmd_t *pmdp)
  */
 #ifdef CONFIG_PPC_8xx
 static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr);
+static int hugepd_ok(hugepd_t hpd);
+
+static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge)
+{
+   if (!huge)
+   return PAGE_SIZE / SZ_4K;
+   else if (hugepd_ok(*((hugepd_t *)pmd)))
+   return 1;
+   else
+   return SZ_512K / SZ_4K;
+}
 
 static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, 
pte_t *p,
 unsigned long clr, unsigned long set, int 
huge)
@@ -237,12 +248,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, 
unsigned long addr, p
int num, i;
pmd_t *pmd = pmd_off(mm, addr);
 
-   if (!huge)
-   num = PAGE_SIZE / SZ_4K;
-   else if ((pmd_val(*pmd) & _PMD_PAGE_MASK) != _PMD_PAGE_8M)
-   num = SZ_512K / SZ_4K;
-   else
-   num = 1;
+   num = number_of_cells_per_pte(pmd, new, huge);
 
for (i = 0; i < num; i++, entry++, new += SZ_4K)
*entry = new;
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9c0547d77af3..2dcad640b869 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -266,8 +266,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long 
addr, pte_t *ptep, pte_
pmd_t *pmd = pmd_off(mm, addr);
pte_basic_t val;
	pte_basic_t *entry = &ptep->pte;
-   int num = is_hugepd(*((hugepd_t *)pmd)) ? 1 : SZ_512K / SZ_4K;
-   int i;
+   int num, i;
 
/*
 * Make sure hardware valid bit is not set. We don't do
@@ -280,6 +279,9 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long 
addr, pte_t *ptep, pte_
pte = set_pte_filter(pte);
 
val = pte_val(pte);
+
+   num = number_of_cells_per_pte(pmd, val, 1);
+
for (i = 0; i < num; i++, entry++, val += SZ_4K)
*entry = val;
 }
-- 
2.25.0
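
To make the helper's cases concrete (derived from the code above; the
second line assumes a CONFIG_PPC_16K_PAGES build):

/*
 *   !huge, 4k kernel pages    -> PAGE_SIZE / SZ_4K = 1 cell
 *   !huge, 16k kernel pages   -> PAGE_SIZE / SZ_4K = 4 cells
 *   huge, hugepd-mapped (8M)  -> 1 cell
 *   huge, 512k page           -> SZ_512K / SZ_4K = 128 cells
 *
 * A "cell" is one 4k PTE slot; the callers fill every cell of the
 * page, advancing the value by SZ_4K each time so that the RPN stays
 * consistent whichever 4k slot the hardware walks.
 */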



[PATCH 2/2] powerpc/8xx: Support 16k hugepages with 4k pages

2020-08-31 Thread Christophe Leroy
The 8xx has 4 page sizes: 4k, 16k, 512k and 8M

4k and 16k can be selected at build time as standard page sizes,
and 512k and 8M are hugepages.

When 4k standard pages are selected, 16k pages are not available.

Allow 16k pages as hugepages when 4k pages are used.

To allow that, implement arch_make_huge_pte() which receives
the necessary arguments to allow setting the PTE in accordance
with the page size:
- 512 k pages must have _PAGE_HUGE and _PAGE_SPS. They are set
by pte_mkhuge(). arch_make_huge_pte() does nothing.
- 16 k pages must have only _PAGE_SPS. arch_make_huge_pte() clears
_PAGE_HUGE.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 14 ++
 arch/powerpc/include/asm/nohash/32/pgtable.h |  2 ++
 arch/powerpc/mm/hugetlbpage.c|  2 +-
 arch/powerpc/mm/nohash/tlb.c |  4 
 arch/powerpc/mm/ptdump/8xx.c |  5 +
 include/uapi/asm-generic/hugetlb_encode.h|  1 +
 include/uapi/linux/mman.h|  1 +
 7 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h 
b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
index e752a5807a59..39be9aea86db 100644
--- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -65,4 +65,18 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct 
*mm,
pte_update(mm, addr, ptep, clr, set, 1);
 }
 
+#ifdef CONFIG_PPC_4K_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+  struct page *page, int writable)
+{
+   size_t size = huge_page_size(hstate_vma(vma));
+
+   if (size == SZ_16K)
+   return __pte(pte_val(entry) & ~_PAGE_HUGE);
+   else
+   return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+#endif
+
 #endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 80bbc21b87f0..ee2243ba96cf 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -235,6 +235,8 @@ static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t 
val, int huge)
return PAGE_SIZE / SZ_4K;
else if (hugepd_ok(*((hugepd_t *)pmd)))
return 1;
+   else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE))
+   return SZ_16K / SZ_4K;
else
return SZ_512K / SZ_4K;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e7ae2a2c4545..36c3800769fb 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -180,7 +180,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long 
addr, unsigned long sz
if (!hpdp)
return NULL;
 
-   if (IS_ENABLED(CONFIG_PPC_8xx) && sz == SZ_512K)
+   if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
return pte_alloc_map(mm, (pmd_t *)hpdp, addr);
 
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 14514585db98..5872f69141d5 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -83,16 +83,12 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 };
 #elif defined(CONFIG_PPC_8xx)
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
-   /* we only manage 4k and 16k pages as normal pages */
-#ifdef CONFIG_PPC_4K_PAGES
[MMU_PAGE_4K] = {
.shift  = 12,
},
-#else
[MMU_PAGE_16K] = {
.shift  = 14,
},
-#endif
[MMU_PAGE_512K] = {
.shift  = 19,
},
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 8a797dcbf475..86da2a669680 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -11,8 +11,13 @@
 
 static const struct flag_info flag_array[] = {
{
+#ifdef CONFIG_PPC_16K_PAGES
.mask   = _PAGE_HUGE,
.val= _PAGE_HUGE,
+#else
+   .mask   = _PAGE_SPS,
+   .val= _PAGE_SPS,
+#endif
.set= "huge",
.clear  = "",
}, {
diff --git a/include/uapi/asm-generic/hugetlb_encode.h 
b/include/uapi/asm-generic/hugetlb_encode.h
index b0f8e87235bd..4f3d5aaa11f5 100644
--- a/include/uapi/asm-generic/hugetlb_encode.h
+++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -20,6 +20,7 @@
 #define HUGETLB_FLAG_ENCODE_SHIFT  26
 #define HUGETLB_FLAG_ENCODE_MASK   0x3f
 
+#define HUGETLB_FLAG_ENCODE_16KB   (14 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_64KB   (16 << HUGETLB_FLAG_ENCODE_SHIFT)
 #defin
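
A hedged user-space sketch of what the new size enables: mapping a 16k
huge page on a 4k-page 8xx kernel. MAP_HUGE_16KB is assumed to be the
mman.h counterpart of the HUGETLB_FLAG_ENCODE_16KB value added above
(the mman.h hunk is cut off in this quote):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_16KB
#define MAP_HUGE_16KB	(14 << 26)	/* log2(16k) << HUGETLB_FLAG_ENCODE_SHIFT */
#endif

int main(void)
{
	size_t len = 16 * 1024;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_16KB,
		       -1, 0);

	if (p == MAP_FAILED) {	/* needs 16k hugepages reserved by the kernel */
		perror("mmap");
		return 1;
	}
	munmap(p, len);
	return 0;
}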

[PATCH] selftests/vm: Fix display of page size in map_hugetlb

2020-08-31 Thread Christophe Leroy
The displayed size is in bytes while the text says it is in kB.

Shift it by 10 to really display kBytes.

Fixes: fa7b9a805c79 ("tools/selftest/vm: allow choosing mem size and page size 
in map_hugetlb")
Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
 tools/testing/selftests/vm/map_hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/map_hugetlb.c 
b/tools/testing/selftests/vm/map_hugetlb.c
index 6af951900aa3..312889edb84a 100644
--- a/tools/testing/selftests/vm/map_hugetlb.c
+++ b/tools/testing/selftests/vm/map_hugetlb.c
@@ -83,7 +83,7 @@ int main(int argc, char **argv)
}
 
if (shift)
-   printf("%u kB hugepages\n", 1 << shift);
+   printf("%u kB hugepages\n", 1 << (shift - 10));
else
printf("Default size hugepages\n");
printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
-- 
2.25.0
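
For example, with ./map_hugetlb 1 19 (512k pages, so shift = 19) the
old code printed 1 << 19 = 524288 as the "kB" value, which is the bogus
"524288 kB hugepages" visible in the report just below, while
1 << (19 - 10) = 512 kB is the actual page size.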



[PATCH] powerpc: Fix random segfault when freeing hugetlb range

2020-08-31 Thread Christophe Leroy
The following random segfault is observed from time to time with
map_hugetlb selftest:

root@localhost:~# ./map_hugetlb 1 19
524288 kB hugepages
Mapping 1 Mbytes
Segmentation fault

[   31.219972] map_hugetlb[365]: segfault (11) at 117 nip 77974f8c lr 779a6834 
code 1 in ld-2.23.so[77966000+21000]
[   31.220192] map_hugetlb[365]: code: 9421ffc0 480318d1 93410028 90010044 
9361002c 93810030 93a10034 93c10038
[   31.220307] map_hugetlb[365]: code: 93e1003c 93210024 8123007c 81430038 
<80e90004> 814a0004 7f443a14 813a0004
[   31.221911] BUG: Bad rss-counter state mm:(ptrval) type:MM_FILEPAGES val:33
[   31.229362] BUG: Bad rss-counter state mm:(ptrval) type:MM_ANONPAGES val:5

This fault is due to hugetlb_free_pgd_range() freeing page tables
that are also used by regular pages.

As explained in the comment at the beginning of
hugetlb_free_pgd_range(), the verification done in free_pgd_range()
on floor and ceiling is not done here, which means
hugetlb_free_pte_range() can free outside the expected range.

As the verification cannot be done in hugetlb_free_pgd_range(), it
must be done in hugetlb_free_pte_range().

Fixes: b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as standard pages.")
Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/hugetlbpage.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 26292544630f..e7ae2a2c4545 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -330,10 +330,24 @@ static void free_hugepd_range(struct mmu_gather *tlb, 
hugepd_t *hpdp, int pdshif
 get_hugepd_cache_index(pdshift - shift));
 }
 
-static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 
unsigned long addr)
+static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+  unsigned long addr, unsigned long end,
+  unsigned long floor, unsigned long ceiling)
 {
+   unsigned long start = addr;
pgtable_t token = pmd_pgtable(*pmd);
 
+   start &= PMD_MASK;
+   if (start < floor)
+   return;
+   if (ceiling) {
+   ceiling &= PMD_MASK;
+   if (!ceiling)
+   return;
+   }
+   if (end - 1 > ceiling - 1)
+   return;
+
pmd_clear(pmd);
pte_free_tlb(tlb, token, addr);
mm_dec_nr_ptes(tlb->mm);
@@ -363,7 +377,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, 
pud_t *pud,
 */
WARN_ON(!IS_ENABLED(CONFIG_PPC_8xx));
 
-   hugetlb_free_pte_range(tlb, pmd, addr);
+   hugetlb_free_pte_range(tlb, pmd, addr, end, floor, 
ceiling);
 
continue;
}
-- 
2.25.0
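
To make the floor/ceiling guard easier to follow, here is a userspace
sketch of the check the patch adds (PMD_SHIFT is an arbitrary value for
illustration; the real constant depends on the MMU): the PTE page is
freed only if the PMD-aligned span it covers lies entirely within
[floor, ceiling).

#include <stdio.h>

#define PMD_SHIFT 22			/* illustrative: 4 MB per PTE page */
#define PMD_MASK  (~((1UL << PMD_SHIFT) - 1))

/* Mirrors the guard added to hugetlb_free_pte_range() above. */
static int may_free_pte_page(unsigned long addr, unsigned long end,
			     unsigned long floor, unsigned long ceiling)
{
	unsigned long start = addr & PMD_MASK;

	if (start < floor)
		return 0;		/* the PTE page also maps below the range */
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return 0;
	}
	if (end - 1 > ceiling - 1)
		return 0;		/* the PTE page also maps above the range */
	return 1;
}

int main(void)
{
	/* Hugepages end mid PTE page: regular pages above share it, keep it. */
	printf("%d\n", may_free_pte_page(0x400000, 0x500000, 0x400000, 0x500000));
	/* Range covers the whole PTE page: safe to free. */
	printf("%d\n", may_free_pte_page(0x400000, 0x800000, 0x400000, 0x800000));
	return 0;
}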



Re: [PATCH] fsldma: fsl_ioread64*() do not need lower_32_bits()

2020-08-29 Thread Christophe Leroy




On 29/08/2020 at 17:05, Guenter Roeck wrote:

On Sat, Aug 29, 2020 at 02:45:38PM +0200, Luc Van Oostenryck wrote:

For ppc32, the functions fsl_ioread64() & fsl_ioread64be()
use lower_32_bits() as a fancy way to cast the pointer to u32
in order to do non-atomic 64-bit IO.

But the pointer is already 32-bit, so simply cast the pointer to u32.

This fixes a compile error introduced by
ef91bb196b0d ("kernel.h: Silence sparse warning in lower_32_bits")

Fixes: ef91bb196b0db1013ef8705367bc2d7944ef696b


checkpatch complains about this and prefers

Fixes: ef91bb196b0d ("kernel.h: Silence sparse warning in lower_32_bits")


Checkpatch also complains about spacing:

CHECK:SPACING: No space is necessary after a cast
#39: FILE: drivers/dma/fsldma.h:208:
+   u32 fsl_addr = (u32) addr;

CHECK:SPACING: No space is necessary after a cast
#48: FILE: drivers/dma/fsldma.h:222:
+   u32 fsl_addr = (u32) addr;

total: 0 errors, 0 warnings, 2 checks, 16 lines checked

Christophe



Otherwise

Tested-by: Guenter Roeck 


Reported-by: Guenter Roeck 
Cc: Li Yang 
Cc: Zhang Wei 
Cc: Dan Williams 
Cc: Vinod Koul 
Cc: Herbert Xu 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: dmaeng...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
Signed-off-by: Luc Van Oostenryck 
---
  drivers/dma/fsldma.h | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index 56f18ae99233..6f6fa7641fa2 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -205,7 +205,7 @@ struct fsldma_chan {
  #else
  static u64 fsl_ioread64(const u64 __iomem *addr)
  {
-   u32 fsl_addr = lower_32_bits(addr);
+   u32 fsl_addr = (u32) addr;
u64 fsl_addr_hi = (u64)in_le32((u32 *)(fsl_addr + 1)) << 32;
  
  	return fsl_addr_hi | in_le32((u32 *)fsl_addr);

@@ -219,7 +219,7 @@ static void fsl_iowrite64(u64 val, u64 __iomem *addr)
  
  static u64 fsl_ioread64be(const u64 __iomem *addr)

  {
-   u32 fsl_addr = lower_32_bits(addr);
+   u32 fsl_addr = (u32) addr;
u64 fsl_addr_hi = (u64)in_be32((u32 *)fsl_addr) << 32;
  
  	return fsl_addr_hi | in_be32((u32 *)(fsl_addr + 1));

--
2.28.0
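
For readers following along, a userspace sketch of the point under
discussion (simplified, with made-up register contents): on a 32-bit
build a pointer already fits in a u32, so the cast and lower_32_bits()
yield the same value, and the helper simply performs the 64-bit read as
two 32-bit accesses.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;
typedef uint64_t u64;

/* Simplified stand-in for the ppc32 helper: read a 64-bit value as two
 * 32-bit halves, low word first. */
static u64 read64_split(const volatile u32 *addr)
{
	u64 hi = (u64)addr[1] << 32;

	return hi | addr[0];
}

int main(void)
{
	u32 reg[2] = { 0xdeadbeef, 0x00c0ffee };	/* fake device register */

	printf("%016llx\n", (unsigned long long)read64_split(reg));
	return 0;
}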



[PATCH v2 5/5] powerpc/vdso: Declare vdso_patches[] as __initdata

2020-08-28 Thread Christophe Leroy
vdso_patches[] table is used only at init time.

Mark it __initdata.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 4ad042995ccc..dfa08a7b4e7c 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -76,7 +76,7 @@ struct vdso_patch_def
  * Currently, we only change sync_dicache to do nothing on processors
  * with a coherent icache
  */
-static struct vdso_patch_def vdso_patches[] = {
+static struct vdso_patch_def vdso_patches[] __initdata = {
{
CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE,
"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
-- 
2.25.0
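
As an aside, a kernel-module-style sketch of what __initdata buys (the
example itself is made up): data marked this way is placed in the
.init.data section and discarded together with init code once boot
completes, so a table consulted only by an initcall costs no memory
afterwards.

#include <linux/init.h>
#include <linux/printk.h>

/* Only read by the __init function below, so its memory is reclaimed
 * after boot. */
static const char *greetings[] __initdata = { "hello", "bonjour" };

static int __init greet_init(void)
{
	pr_info("%s\n", greetings[0]);
	return 0;
}
arch_initcall(greet_init);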



[PATCH v2 4/5] powerpc/vdso: Declare constant vars as __ro_after_init

2020-08-28 Thread Christophe Leroy
To avoid any risk of modification of vital VDSO variables,
declare them __ro_after_init.

vdso32_kbase and vdso64_kbase could be made 'const', but that would
have a high impact on all functions using them, as the compiler doesn't
expect the const property to be discarded.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index fb393266b9cb..4ad042995ccc 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -38,19 +38,19 @@
 #define VDSO_ALIGNMENT (1 << 16)
 
 extern char vdso32_start, vdso32_end;
-static unsigned int vdso32_pages;
-static void *vdso32_kbase = &vdso32_start;
-unsigned long vdso32_sigtramp;
-unsigned long vdso32_rt_sigtramp;
+static unsigned int vdso32_pages __ro_after_init;
+static void *vdso32_kbase __ro_after_init = &vdso32_start;
+unsigned long vdso32_sigtramp __ro_after_init;
+unsigned long vdso32_rt_sigtramp __ro_after_init;
 
 extern char vdso64_start, vdso64_end;
-static void *vdso64_kbase = &vdso64_start;
-static unsigned int vdso64_pages;
+static void *vdso64_kbase __ro_after_init = &vdso64_start;
+static unsigned int vdso64_pages __ro_after_init;
 #ifdef CONFIG_PPC64
-unsigned long vdso64_rt_sigtramp;
+unsigned long vdso64_rt_sigtramp __ro_after_init;
 #endif /* CONFIG_PPC64 */
 
-static int vdso_ready;
+static int vdso_ready __ro_after_init;
 
 /*
  * The vdso data page (aka. systemcfg for old ppc64 fans) is here.
-- 
2.25.0
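
For context, a kernel-style sketch of the __ro_after_init semantics (the
variable and initcall are made up): the data stays writable while init
code runs, then is remapped read-only when the kernel marks rodata, so a
stray later write faults instead of silently corrupting it.

#include <linux/cache.h>
#include <linux/init.h>

/* Writable during boot; read-only once the rodata remap has run. */
static unsigned long boot_setting __ro_after_init;

static int __init boot_setting_init(void)
{
	boot_setting = 42;	/* fine: init code runs before the remap */
	return 0;
}
core_initcall(boot_setting_init);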



[PATCH v2 3/5] powerpc/vdso: Initialise vdso32_kbase at compile time

2020-08-28 Thread Christophe Leroy
Initialise vdso32_kbase at compile time like vdso64_kbase.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 8f245e988a8a..fb393266b9cb 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -37,13 +37,12 @@
 /* The alignment of the vDSO */
 #define VDSO_ALIGNMENT (1 << 16)
 
+extern char vdso32_start, vdso32_end;
 static unsigned int vdso32_pages;
-static void *vdso32_kbase;
+static void *vdso32_kbase = &vdso32_start;
 unsigned long vdso32_sigtramp;
 unsigned long vdso32_rt_sigtramp;
 
-extern char vdso32_start, vdso32_end;
-
 extern char vdso64_start, vdso64_end;
static void *vdso64_kbase = &vdso64_start;
 static unsigned int vdso64_pages;
@@ -689,8 +688,6 @@ static int __init vdso_init(void)
 */
vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
 
-   vdso32_kbase = &vdso32_start;
-
/*
 * Calculate the size of the 32 bits vDSO
 */
-- 
2.25.0



[PATCH v2 1/5] powerpc/vdso: Remove DBG()

2020-08-28 Thread Christophe Leroy
DBG() is defined as void when DEBUG is not defined,
and DEBUG is explicitly undefined.

This means there is no way to get the messages printed
other than modifying the source code.

It was most likely useful in the first days of the VDSO, but
today the only 3 DBG() calls don't deserve special
handling.

Just remove them. If one day someone needs such messages back,
use a standard pr_debug() or equivalent.

Signed-off-by: Christophe Leroy 
---
This is a follow up series, applying on top of the series that
switches powerpc VDSO to _install_special_mapping(),
rebased on today's powerpc/next-test (dd419a93bd99)

v2 removes the modification to arch_setup_additional_pages() to
consider when is_32bit_task() returning true when CONFIG_VDSO32
not set, as this should never happen.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index e2568d9ecdff..3ef3fc546ac8 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -31,14 +31,6 @@
 #include 
 #include 
 
-#undef DEBUG
-
-#ifdef DEBUG
-#define DBG(fmt...) printk(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
 /* Max supported size for symbol names */
 #define MAX_SYMNAME	64
 
@@ -567,9 +559,6 @@ static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo 
*v32,
if (!match)
continue;
 
-   DBG("replacing %s with %s...\n", patch->gen_name,
-   patch->fix_name ? "NONE" : patch->fix_name);
-
/*
 * Patch the 32 bits and 64 bits symbols. Note that we do not
 * patch the "." symbol on 64 bits.
@@ -704,7 +693,6 @@ static int __init vdso_init(void)
 * Calculate the size of the 64 bits vDSO
 */
vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
-   DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);
 
vdso32_kbase = &vdso32_start;
 
@@ -712,7 +700,6 @@ static int __init vdso_init(void)
 * Calculate the size of the 32 bits vDSO
 */
vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
-   DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
 
/*
 * Setup the syscall map in the vDOS
-- 
2.25.0
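
For reference, the replacement pattern suggested above, as a fragment
(the format string comes from the removed call; everything else is
illustrative): pr_debug() compiles to nothing unless DEBUG or
CONFIG_DYNAMIC_DEBUG is set, and with dynamic debug each call site can
be toggled at runtime.

/* Inside vdso_init(), the removed DBG() would become: */
pr_debug("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);

/* Then, with CONFIG_DYNAMIC_DEBUG:
 *   echo 'file vdso.c +p' > /sys/kernel/debug/dynamic_debug/control
 */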



[PATCH v2 2/5] powerpc/vdso: Don't rely on vdso_pages being 0 for failure

2020-08-28 Thread Christophe Leroy
If VDSO initialisation failed, vdso_ready is not set.
Otherwise, vdso_pages is only 0 for a 32-bit task when
CONFIG_VDSO32 is not selected.

As arch_setup_additional_pages() now bails out directly in
that case, we don't need to set vdso_pages to 0.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 3ef3fc546ac8..8f245e988a8a 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -176,11 +176,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, 
int uses_interp)
 
current->mm->context.vdso_base = 0;
 
-   /* vDSO has a problem and was disabled, just don't "enable" it for the
-* process
-*/
-   if (vdso_pages == 0)
-   return 0;
/* Add a page to the vdso size for the data page */
vdso_pages ++;
 
@@ -710,14 +705,16 @@ static int __init vdso_init(void)
 * Initialize the vDSO images in memory, that is do necessary
 * fixups of vDSO symbols, locate trampolines, etc...
 */
-   if (vdso_setup())
-   goto setup_failed;
+   if (vdso_setup()) {
+   pr_err("vDSO setup failure, not enabled !\n");
+   return 0;
+   }
 
if (IS_ENABLED(CONFIG_VDSO32)) {
/* Make sure pages are in the correct state */
pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *), 
GFP_KERNEL);
if (!pagelist)
-   goto alloc_failed;
+   return 0;
 
pagelist[0] = virt_to_page(vdso_data);
 
@@ -730,7 +727,7 @@ static int __init vdso_init(void)
if (IS_ENABLED(CONFIG_PPC64)) {
pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), 
GFP_KERNEL);
if (!pagelist)
-   goto alloc_failed;
+   return 0;
 
pagelist[0] = virt_to_page(vdso_data);
 
@@ -743,14 +740,6 @@ static int __init vdso_init(void)
smp_wmb();
vdso_ready = 1;
 
-   return 0;
-
-setup_failed:
-   pr_err("vDSO setup failure, not enabled !\n");
-alloc_failed:
-   vdso32_pages = 0;
-   vdso64_pages = 0;
-
return 0;
 }
 arch_initcall(vdso_init);
-- 
2.25.0



Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization

2020-08-27 Thread Christophe Leroy




On 28/08/2020 at 07:40, Christophe Leroy wrote:



On 27/08/2020 at 15:19, Michael Ellerman wrote:

Christophe Leroy  writes:

On 08/26/2020 02:58 PM, Michael Ellerman wrote:

Christophe Leroy  writes:

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index daef14a284a3..bbb69832fd46 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -718,16 +710,14 @@ static int __init vdso_init(void)

...

-
-#ifdef CONFIG_VDSO32
   vdso32_kbase = &vdso32_start;
   /*
@@ -735,8 +725,6 @@ static int __init vdso_init(void)
    */
   vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
   DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, 
vdso32_pages);

-#endif


This didn't build for ppc64le:


/opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: 
arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to 
`vdso32_end'

/opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: 
arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to 
`vdso32_start'
    make[1]: *** [/scratch/michael/build/maint/Makefile:1166: 
vmlinux] Error 1

    make: *** [Makefile:185: __sub-make] Error 2

So I just put that ifdef back.



The problem is that is_32bit_task() can still return true even when
CONFIG_VDSO32 is not set.


Hmm, you're right. My config had CONFIG_COMPAT enabled.

But that seems like a bug, if someone enables COMPAT on ppc64le they are
almost certainly going to want VDSO32 as well.

So I think I'll do a lead up patch as below.


Ah yes, and with that then no need to consider the case where 
is_32bit_task() is true and CONFIG_VDSO32 is not selected.


I'll update my leading series accordingly.


I meant follow up series.

Christophe


Christophe



cheers

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype

index d4fd109f177e..cf2da1e401ef 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -501,13 +501,12 @@ endmenu
  config VDSO32
  def_bool y
-    depends on PPC32 || CPU_BIG_ENDIAN
+    depends on PPC32 || COMPAT
  help
    This symbol controls whether we build the 32-bit VDSO. We 
obviously
    want to do that if we're building a 32-bit kernel. If we're 
building
-  a 64-bit kernel then we only want a 32-bit VDSO if we're 
building for
-  big endian. That is because the only little endian 
configuration we

-  support is ppc64le which is 64-bit only.
+  a 64-bit kernel then we only want a 32-bit VDSO if we're also 
enabling

+  COMPAT.
  choice
  prompt "Endianness selection"



Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization

2020-08-27 Thread Christophe Leroy




On 27/08/2020 at 15:19, Michael Ellerman wrote:

Christophe Leroy  writes:

On 08/26/2020 02:58 PM, Michael Ellerman wrote:

Christophe Leroy  writes:

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index daef14a284a3..bbb69832fd46 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -718,16 +710,14 @@ static int __init vdso_init(void)

...
   
-

-#ifdef CONFIG_VDSO32
vdso32_kbase = &vdso32_start;
   
   	/*

@@ -735,8 +725,6 @@ static int __init vdso_init(void)
 */
vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
-#endif


This didn't build for ppc64le:


/opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld:
 arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to `vdso32_end'

/opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld:
 arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to `vdso32_start'
make[1]: *** [/scratch/michael/build/maint/Makefile:1166: vmlinux] Error 1
make: *** [Makefile:185: __sub-make] Error 2

So I just put that ifdef back.



The problem is that is_32bit_task() can still return true even when
CONFIG_VDSO32 is not set.


Hmm, you're right. My config had CONFIG_COMPAT enabled.

But that seems like a bug, if someone enables COMPAT on ppc64le they are
almost certainly going to want VDSO32 as well.

So I think I'll do a lead up patch as below.


Ah yes, and with that then no need to consider the case where 
is_32bit_task() is true and CONFIG_VDSO32 is not selected.


I'll update my leading series accordingly.

Christophe



cheers

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index d4fd109f177e..cf2da1e401ef 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -501,13 +501,12 @@ endmenu
  
  config VDSO32

def_bool y
-   depends on PPC32 || CPU_BIG_ENDIAN
+   depends on PPC32 || COMPAT
help
  This symbol controls whether we build the 32-bit VDSO. We obviously
  want to do that if we're building a 32-bit kernel. If we're building
- a 64-bit kernel then we only want a 32-bit VDSO if we're building for
- big endian. That is because the only little endian configuration we
- support is ppc64le which is 64-bit only.
+ a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling
+ COMPAT.
  
  choice

prompt "Endianness selection"



[PATCH v2] powerpc/32s: Disable VMAP stack which CONFIG_ADB_PMU

2020-08-27 Thread Christophe Leroy
low_sleep_handler() can't restore the context from virtual
stack because the stack can hardly be accessed with MMU OFF.

For now, disable VMAP stack when CONFIG_ADB_PMU is selected.

Reported-by: Giuseppe Sacco 
Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK")
Signed-off-by: Christophe Leroy 
---
v2: Argh, went too quick. CONFIG_ADB_PMU ==> ADB_PMU
---
 arch/powerpc/platforms/Kconfig.cputype | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 87737ec86d39..1dc9d3c81872 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -36,7 +36,7 @@ config PPC_BOOK3S_6xx
select PPC_HAVE_PMU_SUPPORT
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
-   select HAVE_ARCH_VMAP_STACK
+   select HAVE_ARCH_VMAP_STACK if !ADB_PMU
 
 config PPC_BOOK3S_601
bool "PowerPC 601"
-- 
2.25.0



Re: kernel since 5.6 do not boot anymore on Apple PowerBook

2020-08-27 Thread Christophe Leroy




On 27/08/2020 at 16:37, Giuseppe Sacco wrote:

On Thu 27/08/2020 at 12.39 +0200, Christophe Leroy wrote:

Hi,

Le 27/08/2020 à 10:28, Giuseppe Sacco a écrit :

[...]

Sorry, I made a mistake. The real problem is further down in the same
function, where it calls low_sleep_handler(). This is where the problem
probably lies.


Great, you spotted the problem.

I see what it is, it is in low_sleep_handler() in
arch/powerpc/platforms/powermac/sleep.S

All critical registers are saved on the stack. At restore, they are
restored BEFORE re-enabling the MMU (because they are needed for that). But
when we have VMAP_STACK, the stack can hardly be accessed without the
MMU enabled. tophys() doesn't work for virtual stack addresses.

Therefore, low_sleep_handler() has to be reworked to use an area
in the linear memory instead of the stack.


I am sorry, but I don't know how to fix it. Should I open a bug for
tracking this problem?


Yes please, at https://github.com/linuxppc/issues/issues

In the meantime, I have sent a patch to disable CONFIG_VMAP_STACK when 
CONFIG_ADB_PMU is selected until this is fixed.


Have you tried without CONFIG_ADB_PMU ? Or does it make no sense ?

Christophe


[PATCH] powerpc/32s: Disable VMAP stack which CONFIG_ADB_PMU

2020-08-27 Thread Christophe Leroy
low_sleep_handler() can't restore the context from virtual
stack because the stack can hardly be accessed with MMU OFF.

For now, disable VMAP stack when CONFIG_ADB_PMU is selected.

Reported-by: Giuseppe Sacco 
Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/platforms/Kconfig.cputype | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 87737ec86d39..c12768242c17 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -36,7 +36,7 @@ config PPC_BOOK3S_6xx
select PPC_HAVE_PMU_SUPPORT
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
-   select HAVE_ARCH_VMAP_STACK
+   select HAVE_ARCH_VMAP_STACK if !CONFIG_ADB_PMU
 
 config PPC_BOOK3S_601
bool "PowerPC 601"
-- 
2.25.0



Re: [PATCH v2 25/25] powerpc/signal32: Transform save_user_regs() and save_tm_user_regs() in 'unsafe' version

2020-08-27 Thread Christophe Leroy




On 27/08/2020 at 11:07, kernel test robot wrote:

Hi Christophe,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on linus/master v5.9-rc2 next-20200827]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Christophe-Leroy/powerpc-Switch-signal-32-to-using-unsafe_put_user-and-friends/20200819-012411
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc64-randconfig-r005-20200827 (attached as .config)
compiler: powerpc-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
 wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
 chmod +x ~/bin/make.cross
 # save the attached .config to linux build tree
 COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=powerpc64

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

arch/powerpc/kernel/signal_32.c: In function 'save_user_regs_unsafe':

arch/powerpc/kernel/signal_32.c:314:34: error: macro "unsafe_copy_to_user" 
requires 4 arguments, but only 3 given

  314 | ELF_NEVRREG * sizeof(u32)), failed);
  |  ^
In file included from include/linux/uaccess.h:9,
 from include/linux/sched/task.h:11,
 from include/linux/sched/signal.h:9,
 from include/linux/rcuwait.h:6,
 from include/linux/percpu-rwsem.h:7,
 from include/linux/fs.h:33,
 from include/linux/huge_mm.h:8,
 from include/linux/mm.h:672,
 from arch/powerpc/kernel/signal_32.c:17:
arch/powerpc/include/asm/uaccess.h:605: note: macro "unsafe_copy_to_user" 
defined here
  605 | #define unsafe_copy_to_user(d, s, l, e) \
  |

arch/powerpc/kernel/signal_32.c:313:3: error: 'unsafe_copy_to_user' undeclared 
(first use in this function); did you mean 'raw_copy_to_user'?

   313 |   unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr,
  |   ^~~
  |   raw_copy_to_user
arch/powerpc/kernel/signal_32.c:313:3: note: each undeclared identifier is 
reported only once for each function it appears in

arch/powerpc/kernel/signal_32.c:314:37: error: 'failed' undeclared (first use 
in this function)

  314 | ELF_NEVRREG * sizeof(u32)), failed);
  | ^~
arch/powerpc/kernel/signal_32.c:314:35: warning: left-hand operand of comma 
expression has no effect [-Wunused-value]
  314 | ELF_NEVRREG * sizeof(u32)), failed);
  |   ^

arch/powerpc/kernel/signal_32.c:314:43: error: expected ';' before ')' token

  314 | ELF_NEVRREG * sizeof(u32)), failed);
  |   ^
  |   ;

arch/powerpc/kernel/signal_32.c:314:43: error: expected statement before ')' 
token




Should be fixed by:

diff --git a/arch/powerpc/kernel/signal_32.c 
b/arch/powerpc/kernel/signal_32.c

index f795fe0240a1..123682299d4f 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -311,7 +311,7 @@ static int save_user_regs_unsafe(struct pt_regs 
*regs, struct mcontext __user *f

/* save spe registers */
if (current->thread.used_spe) {
unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr,
-   ELF_NEVRREG * sizeof(u32)), failed);
+   ELF_NEVRREG * sizeof(u32), failed);
/* set MSR_SPE in the saved MSR value to indicate that
   frame->mc_vregs contains valid data */
msr |= MSR_SPE;

---
Christophe
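
For readers unfamiliar with the API being adopted, a sketch of the usage
pattern (the function name and the begin/end pairing are illustrative,
not taken from the actual series): unsafe_*() accessors are only legal
between a successful user access "begin" and the matching "end", and on
a fault they branch to the label passed as the last argument, which is
why the macro takes four arguments.

static int save_evr_regs_sketch(struct mcontext __user *frame)
{
	if (!user_write_access_begin(frame, sizeof(*frame)))
		return 1;

	/* Branches to 'failed' if the store faults. */
	unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr,
			    ELF_NEVRREG * sizeof(u32), failed);

	user_write_access_end();
	return 0;

failed:
	user_write_access_end();
	return 1;
}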


[PATCH v1 6/6] powerpc/vdso: Declare vdso_patches[] as __initdata

2020-08-27 Thread Christophe Leroy
vdso_patches[] table is used only at init time.

Mark it __initdata.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 600df1164a0b..efaaee94f273 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -76,7 +76,7 @@ struct vdso_patch_def
  * Currently, we only change sync_dicache to do nothing on processors
  * with a coherent icache
  */
-static struct vdso_patch_def vdso_patches[] = {
+static struct vdso_patch_def vdso_patches[] __initdata = {
{
CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE,
"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
-- 
2.25.0



[PATCH v1 4/6] powerpc/vdso: Initialise vdso32_kbase at compile time

2020-08-27 Thread Christophe Leroy
Initialise vdso32_kbase at compile time like vdso64_kbase.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index c173c70ca7d2..6390a37dacea 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -37,13 +37,12 @@
 /* The alignment of the vDSO */
 #define VDSO_ALIGNMENT (1 << 16)
 
+extern char vdso32_start, vdso32_end;
 static unsigned int vdso32_pages;
-static void *vdso32_kbase;
+static void *vdso32_kbase = &vdso32_start;
 unsigned long vdso32_sigtramp;
 unsigned long vdso32_rt_sigtramp;
 
-extern char vdso32_start, vdso32_end;
-
 extern char vdso64_start, vdso64_end;
static void *vdso64_kbase = &vdso64_start;
 static unsigned int vdso64_pages;
@@ -691,8 +690,6 @@ static int __init vdso_init(void)
 */
vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
 
-   vdso32_kbase = &vdso32_start;
-
/*
 * Calculate the size of the 32 bits vDSO
 */
-- 
2.25.0



[PATCH v1 3/6] powerpc/vdso: Don't rely on vdso_pages being 0 for failure

2020-08-27 Thread Christophe Leroy
If VDSO initialisation failed, vdso_ready is not set.
Otherwise, vdso_pages is only 0 for a 32-bit task when
CONFIG_VDSO32 is not selected.

As arch_setup_additional_pages() now bails out directly in
that case, we don't need to set vdso_pages to 0.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 23 ++-
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 465150253c31..c173c70ca7d2 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -178,11 +178,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, 
int uses_interp)
 
current->mm->context.vdso_base = 0;
 
-   /* vDSO has a problem and was disabled, just don't "enable" it for the
-* process
-*/
-   if (vdso_pages == 0)
-   return 0;
/* Add a page to the vdso size for the data page */
vdso_pages ++;
 
@@ -712,14 +707,16 @@ static int __init vdso_init(void)
 * Initialize the vDSO images in memory, that is do necessary
 * fixups of vDSO symbols, locate trampolines, etc...
 */
-   if (vdso_setup())
-   goto setup_failed;
+   if (vdso_setup()) {
+   pr_err("vDSO setup failure, not enabled !\n");
+   return 0;
+   }
 
if (IS_ENABLED(CONFIG_VDSO32)) {
/* Make sure pages are in the correct state */
pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *), 
GFP_KERNEL);
if (!pagelist)
-   goto alloc_failed;
+   return 0;
 
pagelist[0] = virt_to_page(vdso_data);
 
@@ -732,7 +729,7 @@ static int __init vdso_init(void)
if (IS_ENABLED(CONFIG_PPC64)) {
pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), 
GFP_KERNEL);
if (!pagelist)
-   goto alloc_failed;
+   return 0;
 
pagelist[0] = virt_to_page(vdso_data);
 
@@ -745,14 +742,6 @@ static int __init vdso_init(void)
smp_wmb();
vdso_ready = 1;
 
-   return 0;
-
-setup_failed:
-   pr_err("vDSO setup failure, not enabled !\n");
-alloc_failed:
-   vdso32_pages = 0;
-   vdso64_pages = 0;
-
return 0;
 }
 arch_initcall(vdso_init);
-- 
2.25.0



[PATCH v1 5/6] powerpc/vdso: Declare constant vars as __ro_after_init

2020-08-27 Thread Christophe Leroy
To avoid any risk of modification of vital VDSO variables,
declare them __ro_after_init.

vdso32_kbase and vdso64_kbase could be made 'const', but that would
have a high impact on all functions using them, as the compiler doesn't
expect the const property to be discarded.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 6390a37dacea..600df1164a0b 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -38,19 +38,19 @@
 #define VDSO_ALIGNMENT (1 << 16)
 
 extern char vdso32_start, vdso32_end;
-static unsigned int vdso32_pages;
-static void *vdso32_kbase = &vdso32_start;
-unsigned long vdso32_sigtramp;
-unsigned long vdso32_rt_sigtramp;
+static unsigned int vdso32_pages __ro_after_init;
+static void *vdso32_kbase __ro_after_init = &vdso32_start;
+unsigned long vdso32_sigtramp __ro_after_init;
+unsigned long vdso32_rt_sigtramp __ro_after_init;
 
 extern char vdso64_start, vdso64_end;
-static void *vdso64_kbase = &vdso64_start;
-static unsigned int vdso64_pages;
+static void *vdso64_kbase __ro_after_init = &vdso64_start;
+static unsigned int vdso64_pages __ro_after_init;
 #ifdef CONFIG_PPC64
-unsigned long vdso64_rt_sigtramp;
+unsigned long vdso64_rt_sigtramp __ro_after_init;
 #endif /* CONFIG_PPC64 */
 
-static int vdso_ready;
+static int vdso_ready __ro_after_init;
 
 /*
  * The vdso data page (aka. systemcfg for old ppc64 fans) is here.
-- 
2.25.0



[PATCH v1 1/6] powerpc/vdso: Remove DBG()

2020-08-27 Thread Christophe Leroy
DBG() is defined as void when DEBUG is not defined,
and DEBUG is explicitly undefined.

This means there is no way to get the messages printed
other than modifying the source code.

It was most likely useful in the first days of the VDSO, but
today the only 3 DBG() calls don't deserve special
handling.

Just remove them. If one day someone needs such messages back,
use a standard pr_debug() or equivalent.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index e8aaeeae9e9f..a44e8e6a4692 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -31,14 +31,6 @@
 #include 
 #include 
 
-#undef DEBUG
-
-#ifdef DEBUG
-#define DBG(fmt...) printk(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
 /* Max supported size for symbol names */
 #define MAX_SYMNAME	64
 
@@ -567,9 +559,6 @@ static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo 
*v32,
if (!match)
continue;
 
-   DBG("replacing %s with %s...\n", patch->gen_name,
-   patch->fix_name ? "NONE" : patch->fix_name);
-
/*
 * Patch the 32 bits and 64 bits symbols. Note that we do not
 * patch the "." symbol on 64 bits.
@@ -704,7 +693,6 @@ static int __init vdso_init(void)
 * Calculate the size of the 64 bits vDSO
 */
vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
-   DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);
 
vdso32_kbase = &vdso32_start;
 
@@ -713,7 +701,6 @@ static int __init vdso_init(void)
 * Calculate the size of the 32 bits vDSO
 */
vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
-   DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
 #endif
 
/*
-- 
2.25.0



[PATCH v1 2/6] powerpc/vdso: Don't reference vdso32 static functions/vars without CONFIG_VDSO32

2020-08-27 Thread Christophe Leroy
When CONFIG_VDSO32 is not selected, just don't reference the static
vdso32 variables and functions.

This allows the compiler to optimise them out, and lets us
drop an #ifdef CONFIG_VDSO32.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/vdso.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index a44e8e6a4692..465150253c31 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -159,11 +159,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, 
int uses_interp)
if (!vdso_ready)
return 0;
 
-   if (is_32bit_task()) {
-   vdso_spec = &vdso32_spec;
-   vdso_pages = vdso32_pages;
-   vdso_base = VDSO32_MBASE;
-   } else {
+   if (!is_32bit_task()) {
+   vdso_spec = &vdso64_spec;
vdso_pages = vdso64_pages;
/*
@@ -172,6 +168,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, 
int uses_interp)
 * and most likely share a SLB entry.
 */
vdso_base = 0;
+   } else if (IS_ENABLED(CONFIG_VDSO32)) {
+   vdso_spec = &vdso32_spec;
+   vdso_pages = vdso32_pages;
+   vdso_base = VDSO32_MBASE;
+   } else {
+   return 0;
}
 
current->mm->context.vdso_base = 0;
@@ -696,12 +698,10 @@ static int __init vdso_init(void)
 
vdso32_kbase = &vdso32_start;
 
-#ifdef CONFIG_VDSO32
/*
 * Calculate the size of the 32 bits vDSO
 */
vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
-#endif
 
/*
 * Setup the syscall map in the vDOS
-- 
2.25.0
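
As an aside, a standalone demo of why the IS_ENABLED() style works (the
macro here is a hardcoded stand-in for the kernel's Kconfig-derived
version): the disabled branch is still parsed and type-checked, so every
referenced symbol must exist, but the compiler sees if (0) and discards
the branch together with anything only it uses.

#include <stdio.h>

#define IS_ENABLED(option) (option)	/* stand-in for the kernel macro */
#define CONFIG_VDSO32 0

static int vdso32_pages = 8;	/* dead when CONFIG_VDSO32 is 0, yet still compiles */

int main(void)
{
	if (IS_ENABLED(CONFIG_VDSO32))
		printf("32-bit vDSO: %d pages\n", vdso32_pages);
	else
		printf("32-bit vDSO disabled\n");
	return 0;
}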



Re: kernel since 5.6 do not boot anymore on Apple PowerBook

2020-08-27 Thread Christophe Leroy

Hi,

On 27/08/2020 at 10:28, Giuseppe Sacco wrote:

On Thu 27/08/2020 at 09.46 +0200, Giuseppe Sacco wrote:

On Thu 27/08/2020 at 00.28 +0200, Giuseppe Sacco wrote:

Hello Christophe,

On Wed 26/08/2020 at 15.53 +0200, Christophe Leroy wrote:
[...]

If there is no warning, then the issue is something else, bad luck.

Could you increase the loglevel and try again both with and without
VMAP_STACK ? Maybe we'll get more information on where it stops.


The problem is related to the CPU frequency changes. This is where the
system stops: cpufreq gets the CPU frequency limits, then starts the
default governor (performance), which then calls
cpufreq_gov_performance_limits(), and that function never returns.

Rebuilding after enabling pr_debug for cpufreq.c, I've got some more
lines of output:

cpufreq: setting new policy for CPU 0: 667000 - 867000 kHz
cpufreq: new min and max freqs are 667000 - 867000 kHz
cpufreq: governor switch
cpufreq: cpufreq_init_governor: for CPU 0
cpufreq: cpufreq_start_governor: for CPU 0
cpufreq: target for CPU 0: 867000 kHz, relation 1, requested 867000 kHz
cpufreq: __target_index: cpu: 0, oldfreq: 667000, new freq: 867000
cpufreq: notification 0 of frequency transition to 867000 kHz
cpufreq: saving 133328 as reference value for loops_per_jiffy; freq is 667000 
kHz

no more lines are printed. I think this output only refers to the
notification sent prior to the frequency change.

I was thinking that selecting a governor that would run at 667 MHz would
probably avoid the problem. I added cpufreq.default_governor=powersave
to the command line parameters but it did not work: the selected
governor was still performance and the system crashed.


I kept following the thread and found that the CPU frequency is changed
in the function pmu_set_cpu_speed() in drivers/cpufreq/pmac32-cpufreq.c.
The first thing the function does is call the preempt_disable() macro,
and this is where it stops.


Sorry, I made a mistake. The real problem is further down in the same
function, where it calls low_sleep_handler(). This is where the problem
probably lies.




Great, you spotted the problem.

I see what it is, it is in low_sleep_handler() in 
arch/powerpc/platforms/powermac/sleep.S


All critical registers are saved on the stack. At restore, they are
restored BEFORE re-enabling the MMU (because they are needed for that). But
when we have VMAP_STACK, the stack can hardly be accessed without the
MMU enabled. tophys() doesn't work for virtual stack addresses.


Therefore, low_sleep_handler() has to be reworked to use an area
in the linear memory instead of the stack.


Christophe
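
To illustrate the constraint (with a made-up PAGE_OFFSET and addresses):
tophys() is a constant-offset translation that is only valid for
linear-map addresses, so it cannot translate a VMAP stack address, which
lives in vmalloc space with no fixed virt-to-phys offset.

#include <stdio.h>

#define PAGE_OFFSET 0xc0000000UL
#define tophys(x) ((x) - PAGE_OFFSET)	/* valid for linear-map addresses only */

int main(void)
{
	unsigned long linear = 0xc1000000UL;	/* linear-map virtual address */
	unsigned long vmap   = 0xf1003f00UL;	/* hypothetical VMAP stack address */

	printf("linear -> %#lx (correct physical address)\n", tophys(linear));
	printf("vmap   -> %#lx (garbage: vmalloc has no fixed offset)\n", tophys(vmap));
	return 0;
}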


Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization

2020-08-27 Thread Christophe Leroy




On 08/26/2020 02:58 PM, Michael Ellerman wrote:

Christophe Leroy  writes:

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index daef14a284a3..bbb69832fd46 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -718,16 +710,14 @@ static int __init vdso_init(void)

...
  
-

-#ifdef CONFIG_VDSO32
vdso32_kbase = &vdso32_start;
  
  	/*

@@ -735,8 +725,6 @@ static int __init vdso_init(void)
 */
vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
-#endif


This didn't build for ppc64le:

   
/opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld:
 arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to `vdso32_end'
   
/opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld:
 arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to `vdso32_start'
   make[1]: *** [/scratch/michael/build/maint/Makefile:1166: vmlinux] Error 1
   make: *** [Makefile:185: __sub-make] Error 2

So I just put that ifdef back.



The problem is that is_32bit_task() can still return true even when
CONFIG_VDSO32 is not set.


The change below fixes the problem:

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index bbb69832fd46..38abff60cbe2 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -132,11 +132,7 @@ int arch_setup_additional_pages(struct linux_binprm 
*bprm, int uses_interp)

if (!vdso_ready)
return 0;

-   if (is_32bit_task()) {
-   vdso_pagelist = vdso32_pagelist;
-   vdso_pages = vdso32_pages;
-   vdso_base = VDSO32_MBASE;
-   } else {
+   if (!is_32bit_task()) {
vdso_pagelist = vdso64_pagelist;
vdso_pages = vdso64_pages;
/*
@@ -145,6 +141,12 @@ int arch_setup_additional_pages(struct linux_binprm 
*bprm, int uses_interp)

 * and most likely share a SLB entry.
 */
vdso_base = 0;
+   } else if (IS_ENABLED(CONFIG_VDSO32)) {
+   vdso_pagelist = vdso32_pagelist;
+   vdso_pages = vdso32_pages;
+   vdso_base = VDSO32_MBASE;
+   } else {
+   vdso_pages = 0;
}

current->mm->context.vdso_base = 0;

With this change all vdso32 static objects (functions and vars) go away 
as expected.


We get a simple conflict with the following patch.

Do you prefer an updated series or a follow-up patch, or would you
rather take the above change yourself?


Christophe


Re: [PATCH v5 08/23] powerpc/book3s64/kuap: Rename MMU_FTR_RADIX_KUAP to MMU_FTR_KUAP

2020-08-26 Thread Christophe Leroy




On 27/08/2020 at 06:09, Aneesh Kumar K.V wrote:

This is in preparation for adding support for KUAP with hash translation.
Rename/move the KUAP-related functions to non-radix names. Also move
the feature bit closer to MMU_FTR_KUEP.

Signed-off-by: Aneesh Kumar K.V 
---
  arch/powerpc/include/asm/book3s/64/kup.h | 18 +-
  arch/powerpc/include/asm/mmu.h   | 16 
  arch/powerpc/mm/book3s64/pkeys.c |  2 +-
  3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup.h 
b/arch/powerpc/include/asm/book3s/64/kup.h
index 918a2fcceee7..5cec202dc42f 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -24,7 +24,7 @@
mtspr   SPRN_AMR, \gpr2
/* No isync required, see kuap_restore_amr() */
  998:
-   END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
+   END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 67)
  #endif
  .endm
  
@@ -36,7 +36,7 @@

sldi    \gpr2, \gpr2, AMR_KUAP_SHIFT
  999:  tdne    \gpr1, \gpr2
EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | 
BUGFLAG_ONCE)
-   END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
+   END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 67)
  #endif
  .endm
  
@@ -56,7 +56,7 @@

mtspr   SPRN_AMR, \gpr2
isync
  99:
-   END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
+   END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_KUAP, 67)
  #endif
  .endm
  
@@ -69,7 +69,7 @@
  
  static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)

  {
-   if (mmu_has_feature(MMU_FTR_RADIX_KUAP) && unlikely(regs->kuap != amr)) 
{
+   if (mmu_has_feature(MMU_FTR_KUAP) && unlikely(regs->kuap != amr)) {
isync();
mtspr(SPRN_AMR, regs->kuap);
/*
@@ -82,7 +82,7 @@ static inline void kuap_restore_amr(struct pt_regs *regs, 
unsigned long amr)
  
  static inline unsigned long kuap_get_and_check_amr(void)

  {
-   if (mmu_has_feature(MMU_FTR_RADIX_KUAP)) {
+   if (mmu_has_feature(MMU_FTR_KUAP)) {
unsigned long amr = mfspr(SPRN_AMR);
if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) /* kuap_check_amr() */
WARN_ON_ONCE(amr != AMR_KUAP_BLOCKED);
@@ -93,7 +93,7 @@ static inline unsigned long kuap_get_and_check_amr(void)
  
  static inline void kuap_check_amr(void)

  {
-   if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && 
mmu_has_feature(MMU_FTR_RADIX_KUAP))
+   if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_KUAP))
WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED);
  }
  
@@ -122,7 +122,7 @@ static inline unsigned long kuap_get_and_check_amr(void)
  
  static inline unsigned long get_kuap(void)

  {
-   if (!early_mmu_has_feature(MMU_FTR_RADIX_KUAP))
+   if (!early_mmu_has_feature(MMU_FTR_KUAP))
return 0;
  
  	return mfspr(SPRN_AMR);

@@ -130,7 +130,7 @@ static inline unsigned long get_kuap(void)
  
  static inline void set_kuap(unsigned long value)

  {
-   if (!early_mmu_has_feature(MMU_FTR_RADIX_KUAP))
+   if (!early_mmu_has_feature(MMU_FTR_KUAP))
return;
  
  	/*

@@ -180,7 +180,7 @@ static inline void restore_user_access(unsigned long flags)
  static inline bool
  bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write)
  {
-   return WARN(mmu_has_feature(MMU_FTR_RADIX_KUAP) &&
+   return WARN(mmu_has_feature(MMU_FTR_KUAP) &&
(regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : 
AMR_KUAP_BLOCK_READ)),
"Bug: %s fault blocked by AMR!", is_write ? "Write" : 
"Read");
  }
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 255a1837e9f7..04e7a65637fb 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -29,7 +29,12 @@
   */
  
  /*

- * Support for KUEP feature.
+ * Supports KUAP (key 0 controlling userspace addresses) on radix
+ */
+#define MMU_FTR_KUAP   ASM_CONST(0x0200)
+
+/*
+ * Suppor for KUEP feature.


Unexpected change I guess. Suppor ==> Support

Christophe


   */
  #define MMU_FTR_KUEP  ASM_CONST(0x0400)
  
@@ -120,11 +125,6 @@

   */
  #define MMU_FTR_1T_SEGMENT    ASM_CONST(0x4000)
  
-/*

- * Supports KUAP (key 0 controlling userspace addresses) on radix
- */
-#define MMU_FTR_RADIX_KUAP ASM_CONST(0x8000)
-
  /* MMU feature bit sets for various CPUs */
  #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \
MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2
@@ -187,10 +187,10 @@ enum {
  #ifdef CONFIG_PPC_RADIX_MMU
MMU_FTR_TYPE_RADIX |
MMU_FTR_GTSE |
+#endif /* CONFIG_PPC_RADIX_MMU */
  #ifdef CONFIG_PPC_KUAP
-   MMU_FTR_RADIX_KUAP |
+   MMU_FTR_KUAP |
  #endif /* CONFIG_PPC_KUAP */
-#endif /* CONFIG_PPC_RADIX_MMU */

Re: [PATCH v2] powerpc: Update documentation of ISA versions for Power10

2020-08-26 Thread Christophe Leroy




On 27/08/2020 at 06:05, Jordan Niethe wrote:

Update the CPU to ISA Version Mapping document to include Power10 and
ISA v3.1.


Maybe Documentation/powerpc/cpu_families.rst should be updated as well.

Christophe





Signed-off-by: Jordan Niethe 
---
v2: Transactional Memory = No
---
  Documentation/powerpc/isa-versions.rst | 4 
  1 file changed, 4 insertions(+)

diff --git a/Documentation/powerpc/isa-versions.rst 
b/Documentation/powerpc/isa-versions.rst
index a363d8c1603c..3873bbba183a 100644
--- a/Documentation/powerpc/isa-versions.rst
+++ b/Documentation/powerpc/isa-versions.rst
@@ -7,6 +7,7 @@ Mapping of some CPU versions to relevant ISA versions.
 ==========  ====================
 CPU         Architecture version
 ==========  ====================
+Power10     Power ISA v3.1
 Power9      Power ISA v3.0B
 Power8      Power ISA v2.07
 Power7      Power ISA v2.06
@@ -32,6 +33,7 @@ Key Features
 ==========  ==================
 CPU         VMX (aka. Altivec)
 ==========  ==================
+Power10     Yes
 Power9      Yes
 Power8      Yes
 Power7      Yes
@@ -47,6 +49,7 @@ PPC970 Yes
 ==========  ===
 CPU         VSX
 ==========  ===
+Power10     Yes
 Power9      Yes
 Power8      Yes
 Power7      Yes
@@ -62,6 +65,7 @@ PPC970 No
 ==========  ====================
 CPU         Transactional Memory
 ==========  ====================
+Power10     No  (* see Power ISA v3.1 Appendix A.)
 Power9      Yes (* see transactional_memory.txt)
 Power8      Yes
 Power7      No


