Re: [PATCH v3 2/2] powerpc/uaccess: Add pre-update addressing to __get_user_asm() and __put_user_asm()

2020-08-12 Thread Christophe Leroy




On 12/08/2020 at 21:37, Segher Boessenkool wrote:

On Wed, Aug 12, 2020 at 12:25:17PM +0000, Christophe Leroy wrote:

Enable pre-update addressing mode in __get_user_asm() and __put_user_asm()

Signed-off-by: Christophe Leroy 
---
v3: new, split out from patch 1.


It still looks fine to me, you can keep my Reviewed-by: :-)



Ah yes thanks, forgot it when I committed it.

Reviewed-by: Segher Boessenkool 

Christophe


Re: [PATCH 16/16] debug_vm_pgtable/ppc64: Add a variant of pfn_pte/pmd

2020-08-12 Thread Anshuman Khandual


On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> The tests expect the _PAGE_PTE bit to be set by the different page table
> accessors. This is not true for the kernel: within the kernel, _PAGE_PTE is
> usually set by set_pte_at(). To make the tests below work correctly, add
> test-specific pfn_pte/pmd helpers that set the _PAGE_PTE bit.
> 
> pte_t pte = pfn_pte(pfn, prot);
> WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
> WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 65 +++
>  1 file changed, 41 insertions(+), 24 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index eea62d5e503b..153c925b5273 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -31,6 +31,23 @@
>  #include 
>  #include 
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
> +static inline pte_t debug_vm_pfn_pte(unsigned long pfn, pgprot_t pgprot)
> +{
> + pte_t pte = pfn_pte(pfn, pgprot);
> + return __pte(pte_val(pte) | _PAGE_PTE);
> +
> +}
> +static inline pmd_t debug_vm_pfn_pmd(unsigned long pfn, pgprot_t pgprot)
> +{
> + pmd_t pmd = pfn_pmd(pfn, pgprot);
> + return __pmd(pmd_val(pmd) | _PAGE_PTE);
> +}
> +#else
> +#define debug_vm_pfn_pte(pfn, pgprot) pfn_pte(pfn, pgprot)
> +#define debug_vm_pfn_pmd(pfn, pgprot) pfn_pmd(pfn, pgprot)
> +#endif

Again, no platform-specific constructs please. This defeats the whole purpose of
this test. If _PAGE_PTE is required by the helpers, then pfn_pmd/pte() could
be modified to accommodate that. We don't see similar issues on other platforms,
so could you please explain why ppc64 is different here?
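
For illustration, a minimal sketch (hypothetical, not from either patch) of
what folding the flag into the arch helper itself could look like, assuming
_PAGE_PTE is the only bit the accessors are missing:

static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
{
	/* set _PAGE_PTE at creation time instead of relying on set_pte_at() */
	return __pte(((pte_basic_t)pfn << PAGE_SHIFT) |
		     pgprot_val(pgprot) | _PAGE_PTE);
}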


Re: [PATCH 13/16] debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> pmd_clear() should not be used to clear pmd level pte entries.

Could you please elaborate on this? The proposed change set does
not match the description here.

> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 061c19bba7f0..529892b9be2f 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -191,6 +191,8 @@ static void __init pmd_advanced_tests(struct mm_struct 
> *mm,
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(pmd_young(pmd));
>  
> + /*  Clear the pte entries  */
> + pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>   pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
>  }
>  
> @@ -313,6 +315,8 @@ static void __init pud_advanced_tests(struct mm_struct 
> *mm,
>   pudp_test_and_clear_young(vma, vaddr, pudp);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(pud_young(pud));
> +
> + pudp_huge_get_and_clear(mm, vaddr, pudp);
>  }
>  
>  static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
> @@ -431,8 +435,6 @@ static void __init pud_populate_tests(struct mm_struct 
> *mm, pud_t *pudp,
>* This entry points to next level page table page.
>* Hence this must not qualify as pud_bad().
>*/
> - pmd_clear(pmdp);
> - pud_clear(pudp);

Both entries are cleared before creating a fresh page table entry.
Why is that a problem?

>   pud_populate(mm, pudp, pmdp);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(pud_bad(pud));
> @@ -564,7 +566,6 @@ static void __init pmd_populate_tests(struct mm_struct 
> *mm, pmd_t *pmdp,
>* This entry points to next level page table page.
>* Hence this must not qualify as pmd_bad().
>*/
> - pmd_clear(pmdp);

Ditto.

>   pmd_populate(mm, pmdp, pgtable);
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(pmd_bad(pmd));
> 
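
For context, a hedged sketch of the distinction the patch relies on (helper
names as in the kernel, usage illustrative):

	/* pmd_clear() is for a pmd that points to a lower-level page table */
	pmd_clear(pmdp);

	/* a huge (leaf) pmd holds a pte-format entry and must go through the
	 * huge-pmd helpers so the arch can do its bookkeeping */
	pmd = pmdp_huge_get_and_clear(mm, vaddr, pmdp);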


Re: [PATCH 10/16] debug_vm_pgtable/thp: Use page table depost/withdraw with THP

2020-08-12 Thread Anshuman Khandual
On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> Architectures like ppc64 use deposited page table while updating the huge pte
> entries.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 8 ++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 644d28861ce9..48475d288df1 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -147,7 +147,7 @@ static void __init pmd_basic_tests(unsigned long pfn, 
> pgprot_t prot)
>  static void __init pmd_advanced_tests(struct mm_struct *mm,
> struct vm_area_struct *vma, pmd_t *pmdp,
> unsigned long pfn, unsigned long vaddr,
> -   pgprot_t prot)
> +   pgprot_t prot, pgtable_t pgtable)
>  {
>   pmd_t pmd;
>  
> @@ -158,6 +158,8 @@ static void __init pmd_advanced_tests(struct mm_struct 
> *mm,
>   /* Align the address wrt HPAGE_PMD_SIZE */
>   vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
>  
> + pgtable_trans_huge_deposit(mm, pmdp, pgtable);
> +
>   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>   set_pmd_at(mm, vaddr, pmdp, pmd);
>   pmdp_set_wrprotect(mm, vaddr, pmdp);
> @@ -188,6 +190,8 @@ static void __init pmd_advanced_tests(struct mm_struct 
> *mm,
>   pmdp_test_and_clear_young(vma, vaddr, pmdp);
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(pmd_young(pmd));
> +
> + pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
>  }
>  
>  static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
> @@ -1002,7 +1006,7 @@ static int __init debug_vm_pgtable(void)
>   pgd_clear_tests(mm, pgdp);
>  
>   pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> - pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
> + pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
>   pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
>   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>  
> 

Makes sense, if it is required for THP to work correctly, but it needs to be
tested across enabled platforms. Why shouldn't the same apply to
pud_advanced_tests() on platforms that support PUD-based THP?
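
For reference, a minimal sketch of the deposit/withdraw pairing the patch adds
(flow as in the diff above, simplified):

	pgtable_trans_huge_deposit(mm, pmdp, pgtable);	/* stash the pte page */
	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
	set_pmd_at(mm, vaddr, pmdp, pmd);
	/* ... exercise the huge pmd ... */
	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);	/* reclaim it before teardown */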


[PATCH] powerpc/papr_scm: Limit the readability of 'perf_stats' sysfs attribute

2020-08-12 Thread Vaibhav Jain
The newly introduced 'perf_stats' attribute uses the default access
mode of 0444, letting non-root users access performance stats of an
nvdimm and potentially force the kernel into issuing a large number of
expensive HCALLs. Since the information exposed by this attribute
cannot be cached, it is better to ward off access to it from users who
don't need these performance statistics.

Hence this patch adds a check in perf_stats_show() to only let users
that are 'perfmon_capable()' read the nvdimm performance statistics.

Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance stats from PHYP')
Reported-by: Aneesh Kumar K.V 
Signed-off-by: Vaibhav Jain 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index f439f0dfea7d1..36c51bf8af9a8 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -792,6 +792,10 @@ static ssize_t perf_stats_show(struct device *dev,
struct nvdimm *dimm = to_nvdimm(dev);
struct papr_scm_priv *p = nvdimm_provider_data(dimm);
 
+   /* Allow access only to perfmon capable users */
+   if (!perfmon_capable())
+   return -EACCES;
+
if (!p->stat_buffer_len)
return -ENOENT;
 
-- 
2.26.2
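
For reference, perfmon_capable() (from include/linux/capability.h) reduces to
a plain capability check, so the attribute stays readable for root and for
users granted CAP_PERFMON:

static inline bool perfmon_capable(void)
{
	return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
}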



[PATCH 3/3] selftests/powerpc: Run tm-tmspr test for longer

2020-08-12 Thread Michael Ellerman
This test creates some threads, which write to TM SPRs, and then makes
sure the registers maintain the correct values across context switches
and contention with other threads.

But currently the test finishes almost instantaneously, which reduces
the chance of it hitting an interesting condition.

So increase the number of loops, so it runs a bit longer, though still
less than 2s on a Power8.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/tm/tm-tmspr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c 
b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
index 2ff329e2fca9..794d574db784 100644
--- a/tools/testing/selftests/powerpc/tm/tm-tmspr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
@@ -33,7 +33,7 @@
 #include "utils.h"
 #include "tm.h"
 
-int	num_loops = 1;
+int	num_loops = 100;
 int	passed = 1;
 
 void tfiar_tfhar(void *in)
-- 
2.25.1



[PATCH 2/3] selftests/powerpc: Don't use setaffinity in tm-tmspr

2020-08-12 Thread Michael Ellerman
This test tries to set affinity to CPUs that don't exist, especially
if the set of online CPUs doesn't start at 0.

But there's no real reason for it to use setaffinity in the first
place, it's just trying to create lots of threads to cause contention.
So drop the setaffinity entirely.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/tm/tm-tmspr.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c 
b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
index 17becf3dcee4..2ff329e2fca9 100644
--- a/tools/testing/selftests/powerpc/tm/tm-tmspr.c
+++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
@@ -38,14 +38,8 @@ int  passed = 1;
 
 void tfiar_tfhar(void *in)
 {
-   int i, cpu;
unsigned long tfhar, tfhar_rd, tfiar, tfiar_rd;
-   cpu_set_t cpuset;
-
-   CPU_ZERO(&cpuset);
-   cpu = (unsigned long)in >> 1;
-   CPU_SET(cpu, &cpuset);
-   sched_setaffinity(0, sizeof(cpuset), &cpuset);
+   int i;
 
/* TFIAR: Last bit has to be high so userspace can read register */
tfiar = ((unsigned long)in) + 1;
-- 
2.25.1



[PATCH 1/3] selftests/powerpc: Fix TM tests when CPU 0 is offline

2020-08-12 Thread Michael Ellerman
Several of the TM tests fail spuriously if CPU 0 is offline, because
they blindly try to affinitise to CPU 0.

Fix them by picking any online CPU and using that instead.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/tm/tm-poison.c  | 11 +++
 tools/testing/selftests/powerpc/tm/tm-trap.c| 10 ++
 tools/testing/selftests/powerpc/tm/tm-unavailable.c |  9 ++---
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c 
b/tools/testing/selftests/powerpc/tm/tm-poison.c
index 977558497c16..29e5f26af7b9 100644
--- a/tools/testing/selftests/powerpc/tm/tm-poison.c
+++ b/tools/testing/selftests/powerpc/tm/tm-poison.c
@@ -26,7 +26,7 @@
 
 int tm_poison_test(void)
 {
-   int pid;
+   int cpu, pid;
cpu_set_t cpuset;
uint64_t poison = 0xdeadbeefc0dec0fe;
uint64_t unknown = 0;
@@ -35,10 +35,13 @@ int tm_poison_test(void)
 
SKIP_IF(!have_htm());
 
-   /* Attach both Child and Parent to CPU 0 */
+   cpu = pick_online_cpu();
+   FAIL_IF(cpu < 0);
+
+   // Attach both Child and Parent to the same CPU
CPU_ZERO(&cpuset);
-   CPU_SET(0, &cpuset);
-   sched_setaffinity(0, sizeof(cpuset), &cpuset);
+   CPU_SET(cpu, &cpuset);
+   FAIL_IF(sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0);
 
pid = fork();
if (!pid) {
diff --git a/tools/testing/selftests/powerpc/tm/tm-trap.c 
b/tools/testing/selftests/powerpc/tm/tm-trap.c
index 601f0c1d450d..c75960af8018 100644
--- a/tools/testing/selftests/powerpc/tm/tm-trap.c
+++ b/tools/testing/selftests/powerpc/tm/tm-trap.c
@@ -247,8 +247,7 @@ void *pong(void *not_used)
 int tm_trap_test(void)
 {
uint16_t k = 1;
-
-   int rc;
+   int cpu, rc;
 
pthread_attr_t attr;
cpu_set_t cpuset;
@@ -267,9 +266,12 @@ int tm_trap_test(void)
usr1_sa.sa_sigaction = usr1_signal_handler;
sigaction(SIGUSR1, &usr1_sa, NULL);
 
-   /* Set only CPU 0 in the mask. Both threads will be bound to cpu 0. */
+   cpu = pick_online_cpu();
+   FAIL_IF(cpu < 0);
+
+   // Set only one CPU in the mask. Both threads will be bound to that CPU.
CPU_ZERO(&cpuset);
-   CPU_SET(0, &cpuset);
+   CPU_SET(cpu, &cpuset);
 
/* Init pthread attribute */
rc = pthread_attr_init(&attr);
diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c 
b/tools/testing/selftests/powerpc/tm/tm-unavailable.c
index 2ca2fccb0a3e..a1348a5f721a 100644
--- a/tools/testing/selftests/powerpc/tm/tm-unavailable.c
+++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c
@@ -338,16 +338,19 @@ void test_fp_vec(int fp, int vec, pthread_attr_t *attr)
 
 int tm_unavailable_test(void)
 {
-   int rc, exception; /* FP = 0, VEC = 1, VSX = 2 */
+   int cpu, rc, exception; /* FP = 0, VEC = 1, VSX = 2 */
pthread_t t1;
pthread_attr_t attr;
cpu_set_t cpuset;
 
SKIP_IF(!have_htm());
 
-   /* Set only CPU 0 in the mask. Both threads will be bound to CPU 0. */
+   cpu = pick_online_cpu();
+   FAIL_IF(cpu < 0);
+
+   // Set only one CPU in the mask. Both threads will be bound to that CPU.
CPU_ZERO(&cpuset);
-   CPU_SET(0, &cpuset);
+   CPU_SET(cpu, &cpuset);
 
/* Init pthread attribute. */
rc = pthread_attr_init(&attr);
-- 
2.25.1
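
pick_online_cpu() comes from the selftest harness
(tools/testing/selftests/powerpc/utils.c). A simplified sketch of the idea,
not the exact implementation (which prefers primary threads):

#define _GNU_SOURCE
#include <sched.h>

/* return any CPU present in our affinity mask, or -1 on failure */
int pick_online_cpu(void)
{
	cpu_set_t mask;
	int cpu;

	if (sched_getaffinity(0, sizeof(mask), &mask))
		return -1;

	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &mask))
			return cpu;

	return -1;
}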



Re: [PATCH v2] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-08-12 Thread Nathan Lynch
Michael Ellerman  writes:
> Tyrel Datwyler  writes:
>> On 8/11/20 6:20 PM, Nathan Lynch wrote:
>>>  
>>> +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb)
>>> +{
>>> +   const unsigned int resched_interval = 20;
>>> +
>>> +   BUG_ON(lmb < drmem_info->lmbs);
>>> +   BUG_ON(lmb >= drmem_info->lmbs + drmem_info->n_lmbs);
>>
>> I think BUG_ON is a pretty big no-no these days unless there is no other 
>> option.
>
> It's complicated, but yes we would like to avoid adding them if we can.
>
> In a case like this there is no other option, *if* the check has to be
> in drmem_lmb_next().
>
> But I don't think we really need to check that there.
>
> If for some reason this was called with an *lmb pointing outside of the
> lmbs array it would confuse the cond_resched() logic, but it's not worth
> crashing the box for that IMHO.

The BUG_ONs are pretty much orthogonal to the cond_resched().

It's not apparent from the context of the change, but some users of the
for_each_drmem_lmb* macros modify elements of the drmem_info->lmbs
array. If the lmb passed to drmem_lmb_next() violates the bounds check
(say, if the callsite inappropriately modifies it within the loop), such
users are guaranteed to corrupt other objects in memory. This was my
thinking in adding the BUG_ONs, and I don't see another place to do
it.


Re: [PATCH] powerpc: Fix P10 PVR revision in /proc/cpuinfo for SMT4 cores

2020-08-12 Thread Michael Neuling
On Mon, 2020-08-03 at 22:41 +1000, Michael Ellerman wrote:
> Michael Neuling  writes:
> > On POWER10 bit 12 in the PVR indicates if the core is SMT4 or
> > SMT8. Bit 12 is set for SMT4.
> > 
> > Without this patch, /proc/cpuinfo on a SMT4 DD1 POWER10 looks like
> > this:
> > cpu : POWER10, altivec supported
> > revision: 17.0 (pvr 0080 1100)
> > 
> > Signed-off-by: Michael Neuling 
> > ---
> >  arch/powerpc/kernel/setup-common.c | 1 +
> >  1 file changed, 1 insertion(+)
> 
> This should have a Fixes: pointing at something so it gets backported.

Yes it should.

Mikey



Re: [PATCH v2] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-08-12 Thread Michael Ellerman
Tyrel Datwyler  writes:
> On 8/11/20 6:20 PM, Nathan Lynch wrote:
>> The drmem lmb list can have hundreds of thousands of entries, and
>> unfortunately lookups take the form of linear searches. As long as
>> this is the case, traversals have the potential to monopolize the CPU
>> and provoke lockup reports, workqueue stalls, and the like unless
>> they explicitly yield.
>> 
>> Rather than placing cond_resched() calls within various
>> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
>> expression of the loop macro itself so users can't omit it.
>> 
>> Call cond_resched() on every 20th element. Each iteration of the loop
>> in DLPAR code paths can involve around ten RTAS calls which can each
>> take up to 250us, so this ensures the check is performed at worst
>> every few milliseconds.
>> 
>> Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm,dynamic-memory data from DT format")
>> Signed-off-by: Nathan Lynch 
>> ---
>>  arch/powerpc/include/asm/drmem.h | 18 +-
>>  1 file changed, 17 insertions(+), 1 deletion(-)
>> 
>> Changes since v1:
>> * Add bounds assertions in drmem_lmb_next().
>> * Call cond_resched() in the iterator on only every 20th element
>>   instead of on every iteration, to reduce overhead in tight loops.
>> 
>> diff --git a/arch/powerpc/include/asm/drmem.h 
>> b/arch/powerpc/include/asm/drmem.h
>> index 17ccc6474ab6..583277e30dd2 100644
>> --- a/arch/powerpc/include/asm/drmem.h
>> +++ b/arch/powerpc/include/asm/drmem.h
>> @@ -8,6 +8,9 @@
>>  #ifndef _ASM_POWERPC_LMB_H
>>  #define _ASM_POWERPC_LMB_H
>>  
>> +#include 
>> +#include 
>> +
>>  struct drmem_lmb {
>>  u64 base_addr;
>>  u32 drc_index;
>> @@ -26,8 +29,21 @@ struct drmem_lmb_info {
>>  
>>  extern struct drmem_lmb_info *drmem_info;
>>  
>> +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb)
>> +{
>> +const unsigned int resched_interval = 20;
>> +
>> +BUG_ON(lmb < drmem_info->lmbs);
>> +BUG_ON(lmb >= drmem_info->lmbs + drmem_info->n_lmbs);
>
> I think BUG_ON is a pretty big no-no these days unless there is no other 
> option.

It's complicated, but yes we would like to avoid adding them if we can.

In a case like this there is no other option, *if* the check has to be
in drmem_lmb_next().

But I don't think we really need to check that there.

If for some reason this was called with an *lmb pointing outside of the
lmbs array it would confuse the cond_resched() logic, but it's not worth
crashing the box for that IMHO.

cheers


[RFC PATCH v1 1/4] powerpc: apm82181: create shared dtsi for APM bluestone

2020-08-12 Thread Christian Lamparter
This patch adds an DTSI-File that can be used by various device-tree
files for APM82181-based devices.

Some of the nodes (like UART, PCIe, SATA) are used by U-Boot and
need to stick with the naming conventions of the old times.
I've added comments whenever this was the case. Unfortunately,
I have to point out that, for this reason, there are some warning
messages when compiling the dtb:

> apm82181.dtsi:440.26-483.5: Warning (pci_bridge): /plb/pciex@d00000000: node
> name is not "pci" or "pcie"
> wd-mybooklive.dtb: Warning (pci_device_bus_num): Failed prerequisite 
> 'pci_bridge'

Signed-off-By: Chris Blake 
Signed-off-by: Christian Lamparter 
---
 arch/powerpc/boot/dts/apm82181.dtsi | 485 
 1 file changed, 485 insertions(+)
 create mode 100644 arch/powerpc/boot/dts/apm82181.dtsi

diff --git a/arch/powerpc/boot/dts/apm82181.dtsi 
b/arch/powerpc/boot/dts/apm82181.dtsi
new file mode 100644
index ..54a9a6fda4af
--- /dev/null
+++ b/arch/powerpc/boot/dts/apm82181.dtsi
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Device Tree template include for various Bluestone (APM82181) boards.
+ *
+ * Copyright (c) 2010, Applied Micro Circuits Corporation
+ * Author: Tirumala R Marri ,
+ *Christian Lamparter ,
+ *Chris Blake 
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/ {
+   #address-cells = <2>;
+   #size-cells = <1>;
+   dcr-parent = <&{/cpus/cpu@0}>;
+   compatible = "apm,bluestone";
+
+   aliases {
+   ethernet0 = &EMAC0; /* needed for BSP u-boot */
+   };
+
+   cpus {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   CPU0: cpu@0 {
+   device_type = "cpu";
+   model = "PowerPC,apm82181";
+   reg = <0x00000000>;
+   clock-frequency = <0>; /* Filled in by U-Boot */
+   timebase-frequency = <0>; /* Filled in by U-Boot */
+   i-cache-line-size = <32>;
+   d-cache-line-size = <32>;
+   i-cache-size = <32768>;
+   d-cache-size = <32768>;
+   dcr-controller;
+   dcr-access-method = "native";
+   next-level-cache = <&L2C0>;
+   };
+   };
+
+   memory {
+   device_type = "memory";
+   reg = <0x00000000 0x00000000 0x00000000>; /* Filled in by U-Boot */
+   };
+
+   UIC0: interrupt-controller0 {
+   compatible = "apm,uic-apm82181", "ibm,uic";
+   interrupt-controller;
+   cell-index = <0>;
+   dcr-reg = <0x0c0 0x009>;
+   #address-cells = <0>;
+   #size-cells = <0>;
+   #interrupt-cells = <2>;
+   };
+
+   UIC1: interrupt-controller1 {
+   compatible = "apm,uic-apm82181", "ibm,uic";
+   interrupt-controller;
+   cell-index = <1>;
+   dcr-reg = <0x0d0 0x009>;
+   #address-cells = <0>;
+   #size-cells = <0>;
+   #interrupt-cells = <2>;
+   interrupts = <0x1e IRQ_TYPE_LEVEL_HIGH>,
+<0x1f IRQ_TYPE_LEVEL_HIGH>; /* cascade */
+   interrupt-parent = <&UIC0>;
+   };
+
+   UIC2: interrupt-controller2 {
+   compatible = "apm,uic-apm82181", "ibm,uic";
+   interrupt-controller;
+   cell-index = <2>;
+   dcr-reg = <0x0e0 0x009>;
+   #address-cells = <0>;
+   #size-cells = <0>;
+   #interrupt-cells = <2>;
+   interrupts = <0x0a IRQ_TYPE_LEVEL_HIGH>,
+<0x0b IRQ_TYPE_LEVEL_HIGH>; /* cascade */
+   interrupt-parent = <&UIC0>;
+   };
+
+   UIC3: interrupt-controller3 {
+   compatible = "apm,uic-apm82181","ibm,uic";
+   interrupt-controller;
+   cell-index = <3>;
+   dcr-reg = <0x0f0 0x009>;
+   #address-cells = <0>;
+   #size-cells = <0>;
+   #interrupt-cells = <2>;
+   interrupts = <0x10 IRQ_TYPE_LEVEL_HIGH>,
+<0x11 IRQ_TYPE_LEVEL_HIGH>; /* cascade */
+   interrupt-parent = <&UIC0>;
+   };
+
+   OCM1: ocm@400040000 {
+   compatible = "apm,ocm-apm82181", "ibm,ocm";
+   status = "okay";
+   cell-index = <1>;
+   /* configured in U-Boot */
+   reg = <4 0x00040000 0x8000>; /* 32K */
+   };
+
+   SDR0: sdr {
+   compatible = "apm,sdr-apm82181", "ibm,sdr-460ex";
+   dcr-reg = <0x00e 0x002>;
+   };
+
+   CPR0: cpr {
+   compatible = "apm,cpr-apm82181", "ibm,cpr-460ex";
+   dcr-reg = <0x00c 0x002>;
+   };
+
+   L2C0: l2c {
+   compatible = 

[RFC PATCH v1 3/4] powerpc: apm82181: add Meraki MR24 AP

2020-08-12 Thread Christian Lamparter
This patch adds the device-tree definitions for Meraki MR24
Accesspoint devices.

Ready to go images and install instruction can be found @OpenWrt.

Signed-off-By: Chris Blake 
Signed-off-by: Christian Lamparter 
---
 arch/powerpc/boot/dts/meraki-mr24.dts  | 237 +
 arch/powerpc/platforms/44x/ppc44x_simple.c |   1 +
 2 files changed, 238 insertions(+)
 create mode 100644 arch/powerpc/boot/dts/meraki-mr24.dts

diff --git a/arch/powerpc/boot/dts/meraki-mr24.dts 
b/arch/powerpc/boot/dts/meraki-mr24.dts
new file mode 100644
index ..945e3e6b2585
--- /dev/null
+++ b/arch/powerpc/boot/dts/meraki-mr24.dts
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Device Tree Source for Meraki MR24 (Ikarem)
+ *
+ * Copyright (C) 2016 Chris Blake 
+ *
+ * Based on Cisco Meraki GPL Release r23-20150601 MR24 DTS
+ */
+
+/dts-v1/;
+
+#include "apm82181.dtsi"
+
+/ {
+   model = "Meraki MR24 Access Point";
+   compatible = "meraki,mr24";
+
+   aliases {
+   serial0 = 
+   led-boot = 
+   led-failsafe = 
+   led-running = 
+   led-upgrade = 
+   };
+
+   chosen {
+   stdout-path = "/plb/opb/serial@ef600400";
+   };
+};
+
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   /* Ikarem has 32MB of NAND */
+   ndfc@1,0 {
+   status = "okay";
+   /* 32 MiB NAND Flash */
+   nand {
+   partition@0 {
+   label = "u-boot";
+   reg = <0x00000000 0x00150000>;
+   read-only;
+   };
+
+   partition@150000 {
+   /*
+* The u-boot environment size is one NAND
+* block (16KiB). u-boot allocates four NAND
+* blocks (64KiB) in order to have spares
+* around for bad block management
+*/
+   label = "u-boot-env";
+   reg = <0x00150000 0x00010000>;
+   read-only;
+   };
+
+   partition@160000 {
+   /*
+* redundant u-boot environment.
+* has to be kept it in sync with the
+* data in "u-boot-env".
+*/
+   label = "u-boot-env-redundant";
+   reg = <0x00160000 0x00010000>;
+   read-only;
+   };
+
+   partition@170000 {
+   label = "oops";
+   reg = <0x00170000 0x00010000>;
+   };
+
+   partition@180000 {
+   label = "ubi";
+   reg = <0x00180000 0x01e80000>;
+   };
+   };
+   };
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+};
+
+ {
+   status = "okay";
+   /* Boot ROM is at 0x52-0x53, do not touch */
+   /* Unknown chip at 0x6e, not sure what it is */
+};
+
+ {
+   status = "okay";
+
+   phy-mode = "rgmii-id";
+   phy-map = <0x2>;
+   phy-address = <0x1>;
+   phy-handle = <>;
+
+   mdio {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   phy: phy@1 {
+   compatible = "ethernet-phy-ieee802.3-c22";
+   reg = <1>;
+   };
+   };
+};
+
+ {
+   leds {
+   compatible = "gpio-leds";
+
+   status: power-green {
+   label = "mr24:green:power";
+   gpios = < 18 GPIO_ACTIVE_LOW>;
+   };
+
+   failsafe: power-orange {
+   label = "mr24:orange:power";
+   gpios = < 19 GPIO_ACTIVE_LOW>;
+   };
+
+   lan {
+   label = "mr24:green:wan";
+   gpios = < 17 GPIO_ACTIVE_LOW>;
+   };
+
+   ssi-0 {
+   label = "mr24:green:wifi1";
+   gpios = < 23 GPIO_ACTIVE_LOW>;
+   };
+
+   ssi-1 {
+   label = "mr24:green:wifi2";
+   gpios = < 22 GPIO_ACTIVE_LOW>;
+   };
+
+   ssi-2 {
+   label = "mr24:green:wifi3";
+   gpios = < 21 GPIO_ACTIVE_LOW>;
+   };
+
+   ssi-3 {
+   label = "mr24:green:wifi4";
+   gpios = < 20 GPIO_ACTIVE_LOW>;

[RFC PATCH v1 0/4] powerpc: apm82181: adding customer devices

2020-08-12 Thread Christian Lamparter
Hello,

I've been holding on to these devices dts' for a while now.
But ever since the recent purge of the PPC405, I'm feeling
the urge to move forward.

The devices in question have been running with OpenWrt since
around 2016/2017. Back then it was linux v4.4 and required
many out-of-tree patches (for WIFI, SATA, CRYPTO...), that
since have been integrated. So, there's nothing else in the
way I think.

Now, I've looked around in the arch/powerpc for recent .dts
and device submissions to get an understanding of what is
required.
From the looks of it, it seems like every device gets a
skeleton defconfig and a CONFIG_$DEVICE symbol (Like:
CONFIG_MERAKI_MR24, CONFIG_WD_MYBOOKLIVE).

Will this be the case? Or would it make sense to further
unite the Bluestone, MR24 and MBL under a common CONFIG_APM82181
and integrate the BLUESTONE device's defconfig into it as well?
(I've stumbled across the special machine compatible
handling of ppc in the Documentation/devicetree/usage-model.rst
already.)

Cheers,
Christian

Note:
If someone has a WD MyBook Live (DUO) and is interested in
giving it a spin with 5.8. I've made a:
"build your own Debian System" sort of script that can be
found on github: 
(the only remaining patch hack is for debian's make-kpkg crossbuild)

Furthermore, the OpenWrt project currently has images for the
following apm82181 devices:
 Cisco Meraki MX60(W) - Needs DSA for the AR8327
 Netgear WNDAP620/WNDAP660 - (Could be next)
 Netgear WNDR4700 - Needs DSA for the AR8327

Note2: I do have a stash of extensive APM82181 related documentation.

Christian Lamparter (4):
  powerpc: apm82181: create shared dtsi for APM bluestone
  powerpc: apm82181: add WD MyBook Live NAS
  powerpc: apm82181: add Meraki MR24 AP
  powerpc: apm82181: integrate bluestone.dts

 arch/powerpc/boot/dts/apm82181.dtsi| 485 +
 arch/powerpc/boot/dts/bluestone.dts| 456 +--
 arch/powerpc/boot/dts/meraki-mr24.dts  | 237 ++
 arch/powerpc/boot/dts/wd-mybooklive.dts| 199 +
 arch/powerpc/platforms/44x/ppc44x_simple.c |   4 +-
 5 files changed, 1033 insertions(+), 348 deletions(-)
 create mode 100644 arch/powerpc/boot/dts/apm82181.dtsi
 create mode 100644 arch/powerpc/boot/dts/meraki-mr24.dts
 create mode 100644 arch/powerpc/boot/dts/wd-mybooklive.dts

-- 
2.28.0



[RFC PATCH v1 4/4] powerpc: apm82181: integrate bluestone.dts

2020-08-12 Thread Christian Lamparter
This patch tries to integrate the existing bluestone.dts into the
apm82181.dtsi framework.

The original bluestone.dts produces a peculiar warning message.
> bluestone.dts:120.10-125.4: Warning (i2c_bus_reg):
>  /plb/opb/i2c@ef600700/sttm@4C: I2C bus unit address format error, expected 
> "4c"
For now, this has been kept alone.

Signed-off-by: Christian Lamparter 
---
 arch/powerpc/boot/dts/bluestone.dts | 456 +++-
 1 file changed, 109 insertions(+), 347 deletions(-)

diff --git a/arch/powerpc/boot/dts/bluestone.dts 
b/arch/powerpc/boot/dts/bluestone.dts
index cc965a1816b6..bf83bdaa8ec9 100644
--- a/arch/powerpc/boot/dts/bluestone.dts
+++ b/arch/powerpc/boot/dts/bluestone.dts
@@ -8,388 +8,150 @@
 
 /dts-v1/;
 
+#include "apm82181.dtsi"
+
 / {
-   #address-cells = <2>;
-   #size-cells = <1>;
model = "apm,bluestone";
compatible = "apm,bluestone";
-   dcr-parent = <&{/cpus/cpu@0}>;
 
aliases {
-   ethernet0 = &EMAC0;
serial0 = &UART0;
serial1 = &UART1;
};
+};
 
-   cpus {
-   #address-cells = <1>;
-   #size-cells = <0>;
-
-   cpu@0 {
-   device_type = "cpu";
-   model = "PowerPC,apm821xx";
-   reg = <0x00000000>;
-   clock-frequency = <0>; /* Filled in by U-Boot */
-   timebase-frequency = <0>; /* Filled in by U-Boot */
-   i-cache-line-size = <32>;
-   d-cache-line-size = <32>;
-   i-cache-size = <32768>;
-   d-cache-size = <32768>;
-   dcr-controller;
-   dcr-access-method = "native";
-   next-level-cache = <&L2C0>;
-   };
-   };
-
-   memory {
-   device_type = "memory";
-   reg = <0x00000000 0x00000000 0x00000000>; /* Filled in by U-Boot */
-   };
+ {
+   status = "okay";
+};
 
-   UIC0: interrupt-controller0 {
-   compatible = "ibm,uic";
-   interrupt-controller;
-   cell-index = <0>;
-   dcr-reg = <0x0c0 0x009>;
-   #address-cells = <0>;
-   #size-cells = <0>;
-   #interrupt-cells = <2>;
-   };
+ {
+   status = "okay";
+};
 
-   UIC1: interrupt-controller1 {
-   compatible = "ibm,uic";
-   interrupt-controller;
-   cell-index = <1>;
-   dcr-reg = <0x0d0 0x009>;
-   #address-cells = <0>;
-   #size-cells = <0>;
-   #interrupt-cells = <2>;
-   interrupts = <0x1e 0x4 0x1f 0x4>; /* cascade */
-   interrupt-parent = <&UIC0>;
-   };
+ {
+   status = "okay";
+};
 
-   UIC2: interrupt-controller2 {
-   compatible = "ibm,uic";
-   interrupt-controller;
-   cell-index = <2>;
-   dcr-reg = <0x0e0 0x009>;
-   #address-cells = <0>;
-   #size-cells = <0>;
-   #interrupt-cells = <2>;
-   interrupts = <0xa 0x4 0xb 0x4>; /* cascade */
-   interrupt-parent = <&UIC0>;
-   };
+ {
+   status = "okay";
 
-   UIC3: interrupt-controller3 {
-   compatible = "ibm,uic";
-   interrupt-controller;
-   cell-index = <3>;
-   dcr-reg = <0x0f0 0x009>;
-   #address-cells = <0>;
-   #size-cells = <0>;
-   #interrupt-cells = <2>;
-   interrupts = <0x10 0x4 0x11 0x4>; /* cascade */
-   interrupt-parent = <&UIC0>;
-   };
-
-   OCM: ocm@400040000 {
-   compatible = "ibm,ocm";
+   nor_flash@0,0 {
status = "okay";
-   cell-index = <1>;
-   /* configured in U-Boot */
-   reg = <4 0x00040000 0x8000>; /* 32K */
-   };
-
-   SDR0: sdr {
-   compatible = "ibm,sdr-apm821xx";
-   dcr-reg = <0x00e 0x002>;
-   };
-
-   CPR0: cpr {
-   compatible = "ibm,cpr-apm821xx";
-   dcr-reg = <0x00c 0x002>;
-   };
-
-   L2C0: l2c {
-   compatible = "ibm,l2-cache-apm82181", "ibm,l2-cache";
-   dcr-reg = <0x020 0x008
-  0x030 0x008>;
-   cache-line-size = <32>;
-   cache-size = <262144>;
-   interrupt-parent = <>;
-   interrupts = <11 1>;
-   };
 
-   plb {
-   compatible = "ibm,plb4";
-   #address-cells = <2>;
+   compatible = "amd,s29gl512n", "cfi-flash";
+   bank-width = <2>;
+   reg = <0x00000000 0x00000000 0x00400000>;
+   #address-cells = <1>;
#size-cells = <1>;
-   ranges;
-   clock-frequency = <0>; /* Filled in by U-Boot */
-
-   SDRAM0: sdram {
-   

[RFC PATCH v1 2/4] powerpc: apm82181: add WD MyBook Live NAS

2020-08-12 Thread Christian Lamparter
This patch adds the device-tree definitions for
Western Digital MyBook Live NAS devices.

Technically, this devicetree file is shared by two, very
similar devices.

There's the My Book Live and the My Book Live Duo. WD's uboot
on the device will enable/disable the nodes for the device.

Ready to go images and install instruction can be found @OpenWrt.org

Signed-off-by: Christian Lamparter 
---
 arch/powerpc/boot/dts/wd-mybooklive.dts| 199 +
 arch/powerpc/platforms/44x/ppc44x_simple.c |   3 +-
 2 files changed, 201 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/boot/dts/wd-mybooklive.dts

diff --git a/arch/powerpc/boot/dts/wd-mybooklive.dts 
b/arch/powerpc/boot/dts/wd-mybooklive.dts
new file mode 100644
index ..0871cc17bc9d
--- /dev/null
+++ b/arch/powerpc/boot/dts/wd-mybooklive.dts
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2008 DENX Software Engineering, Stefan Roese 
+ * (c) Copyright 2010 Western Digital Technologies, Inc. All Rights Reserved.
+ */
+
+/dts-v1/;
+
+#include "apm82181.dtsi"
+
+/ {
+   compatible = "wd,mybooklive";
+   model = "MyBook Live";
+
+   aliases {
+   serial0 = 
+   };
+};
+
+ {
+   ebc {
+   nor_flash@0,0 {
+   status = "okay";
+   compatible = "amd,s29gl512n", "jedec-probe", 
"cfi-flash", "mtd-rom";
+   bank-width = <1>;
+   reg = <0x00000000 0x00000000 0x00080000>;
+   #address-cells = <1>;
+   #size-cells = <1>;
+
+   partition@0 {
+   /* Part of bootrom - Don't use it without a 
jump */
+   label = "free";
+   reg = <0x00000000 0x0001e000>;
+   };
+
+   partition@1e000 {
+   label = "env";
+   reg = <0x0001e000 0x2000>;
+   };
+
+   partition@2 {
+   label = "uboot";
+   reg = <0x0002 0x0005>;
+   };
+   };
+   };
+
+   GPIO1: gpio@e000 {
+   compatible = "wd,mbl-gpio";
+   reg-names = "dat";
+   reg = <0xe000 0x1>;
+   #gpio-cells = <2>;
+   gpio-controller;
+
+   enable-button {
+   /* Defined in u-boot as: NOT_NOR
+* "enables features other than NOR
+* specifically, the buffer at CS2"
+* (button).
+*
+* Note: This option is disabled as
+* it prevents the system from being
+* rebooted successfully.
+*/
+
+   gpio-hog;
+   line-name = "Enable Reset Button, disable NOR";
+   gpios = <1 GPIO_ACTIVE_HIGH>;
+   output-low;
+   };
+   };
+
+   GPIO2: gpio@e010 {
+   compatible = "wd,mbl-gpio";
+   reg-names = "dat";
+   reg = <0xe010 0x1>;
+   #gpio-cells = <2>;
+   gpio-controller;
+   no-output;
+   };
+
+   leds {
+   compatible = "gpio-leds";
+
+   failsafe: power-red {
+   label = "mbl:red:power";
+   gpios = < 4 GPIO_ACTIVE_HIGH>;
+   linux,default-trigger = "panic";
+   };
+
+   status: power-green {
+   label = "mbl:green:power";
+   gpios = < 5 GPIO_ACTIVE_HIGH>;
+   };
+
+   power-blue {
+   label = "mbl:blue:power";
+   gpios = < 6 GPIO_ACTIVE_HIGH>;
+   linux,default-trigger = "disk-activity";
+   };
+   };
+
+   keys {
+   compatible = "gpio-keys-polled";
+   poll-interval = <60>;   /* 3 * 20 = 60ms */
+   autorepeat;
+
+   reset-button {
+   label = "Reset button";
+   linux,code = ;
+   gpios = < 2 GPIO_ACTIVE_LOW>;
+   };
+   };
+
+   usbpwr: usb-regulator {
+   compatible = "regulator-fixed";
+   regulator-name = "Power USB Core";
+   gpios = < 2 GPIO_ACTIVE_LOW>;
+   regulator-min-microvolt = <5000000>;
+   regulator-max-microvolt = <5000000>;
+   };
+
+   sata1pwr: sata1-regulator {
+   compatible = "regulator-fixed";
+   regulator-name = "Power Drive Port 1";
+   gpios = < 3 GPIO_ACTIVE_LOW>;
+   

Re: [PATCH v3 2/2] powerpc/uaccess: Add pre-update addressing to __get_user_asm() and __put_user_asm()

2020-08-12 Thread Segher Boessenkool
On Wed, Aug 12, 2020 at 12:25:17PM +0000, Christophe Leroy wrote:
> Enable pre-update addressing mode in __get_user_asm() and __put_user_asm()
> 
> Signed-off-by: Christophe Leroy 
> ---
> v3: new, split out from patch 1.

It still looks fine to me, you can keep my Reviewed-by: :-)


Segher


Re: [PATCH v2] powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()

2020-08-12 Thread Segher Boessenkool
On Wed, Aug 12, 2020 at 02:32:51PM +0200, Christophe Leroy wrote:
> Anyway, it seems that GCC doesn't make much use of the "m<>" and the 
> pre-update form.

GCC does not use update form outside of inner loops much.  Did you
expect anything else?

> Most of the benefit of flexible addressing seems to be 
> achieved with patch 1 ie without the "m<>" constraint and update form.

Yes.


Segher
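
For readers unfamiliar with the constraint: "m<>" tells GCC the memory operand
may use pre-increment/pre-decrement (update-form) addressing, and the "%U"
output modifier emits the matching 'u' suffix when it does. A stand-alone
sketch (not the kernel's actual macro):

/* store val to *p; GCC may pick an update-form stwu if it helps */
static inline void put_word(unsigned int *p, unsigned int val)
{
	asm ("stw%U0%X0 %1,%0" : "=m<>" (*p) : "r" (val));
}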


Re: [PATCH v2] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-08-12 Thread Tyrel Datwyler
On 8/11/20 6:20 PM, Nathan Lynch wrote:
> The drmem lmb list can have hundreds of thousands of entries, and
> unfortunately lookups take the form of linear searches. As long as
> this is the case, traversals have the potential to monopolize the CPU
> and provoke lockup reports, workqueue stalls, and the like unless
> they explicitly yield.
> 
> Rather than placing cond_resched() calls within various
> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
> expression of the loop macro itself so users can't omit it.
> 
> Call cond_resched() on every 20th element. Each iteration of the loop
> in DLPAR code paths can involve around ten RTAS calls which can each
> take up to 250us, so this ensures the check is performed at worst
> every few milliseconds.
> 
> Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm,dynamic-memory data from DT format")
> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/include/asm/drmem.h | 18 +-
>  1 file changed, 17 insertions(+), 1 deletion(-)
> 
> Changes since v1:
> * Add bounds assertions in drmem_lmb_next().
> * Call cond_resched() in the iterator on only every 20th element
>   instead of on every iteration, to reduce overhead in tight loops.
> 
> diff --git a/arch/powerpc/include/asm/drmem.h 
> b/arch/powerpc/include/asm/drmem.h
> index 17ccc6474ab6..583277e30dd2 100644
> --- a/arch/powerpc/include/asm/drmem.h
> +++ b/arch/powerpc/include/asm/drmem.h
> @@ -8,6 +8,9 @@
>  #ifndef _ASM_POWERPC_LMB_H
>  #define _ASM_POWERPC_LMB_H
>  
> +#include 
> +#include 
> +
>  struct drmem_lmb {
>   u64 base_addr;
>   u32 drc_index;
> @@ -26,8 +29,21 @@ struct drmem_lmb_info {
>  
>  extern struct drmem_lmb_info *drmem_info;
>  
> +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb)
> +{
> + const unsigned int resched_interval = 20;
> +
> + BUG_ON(lmb < drmem_info->lmbs);
> + BUG_ON(lmb >= drmem_info->lmbs + drmem_info->n_lmbs);

I think BUG_ON is a pretty big no-no these days unless there is no other option.

-Tyrel

> +
> + if ((lmb - drmem_info->lmbs) % resched_interval == 0)
> + cond_resched();
> +
> + return ++lmb;
> +}
> +
>  #define for_each_drmem_lmb_in_range(lmb, start, end) \
> - for ((lmb) = (start); (lmb) < (end); (lmb)++)
> + for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb))
>  
>  #define for_each_drmem_lmb(lmb)  \
>   for_each_drmem_lmb_in_range((lmb),  \
> 



Re: [PATCH v3 8/8] mm/vmalloc: Hugepage vmalloc mappings

2020-08-12 Thread Jonathan Cameron
On Wed, 12 Aug 2020 13:25:24 +0100
Jonathan Cameron  wrote:

> On Mon, 10 Aug 2020 12:27:32 +1000
> Nicholas Piggin  wrote:
> 
> > On platforms that define HAVE_ARCH_HUGE_VMAP and support PMD vmaps,
> > vmalloc will attempt to allocate PMD-sized pages first, before falling
> > back to small pages.
> > 
> > Allocations which use something other than PAGE_KERNEL protections are
> > not permitted to use huge pages yet, not all callers expect this (e.g.,
> > module allocations vs strict module rwx).
> > 
> > This reduces TLB misses by nearly 30x on a `git diff` workload on a
> > 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
> > 
> > This can result in more internal fragmentation and memory overhead for a
> > given allocation, an option nohugevmap is added to disable at boot.
> > 
> > Signed-off-by: Nicholas Piggin   
> Hi Nicholas,
> 
> Busy afternoon, but a possible point of interest in line in the meantime.
> 

I did manage to get back to this.

The issue I think is that ARM64 defines THREAD_ALIGN with CONFIG_VMAP_STACK
to be 2 * THREAD_SIZE.  There is a comment in arch/arm64/include/asm/memory.h
that this is to allow cheap checking of overflow.

A quick grep suggests ARM64 is the only architecture to do this...

Jonathan
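
If that reading is right, the arithmetic works out as follows (assuming 16K
kernel stacks with CONFIG_VMAP_STACK on arm64, so THREAD_ALIGN = 32K):

	/* before: size = PAGE_ALIGN(real_size) */
	size = PAGE_ALIGN(16 * 1024);		/* 16K -> 4 pages */
	/* after: size = ALIGN(real_size, align) with align = 32K */
	size = ALIGN(16 * 1024, 32 * 1024);	/* 32K -> 8 pages, nr_pages doubles */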



> 
> ...
> 
> > @@ -2701,22 +2760,45 @@ void *__vmalloc_node_range(unsigned long size, 
> > unsigned long align,
> > pgprot_t prot, unsigned long vm_flags, int node,
> > const void *caller)
> >  {
> > -   struct vm_struct *area;
> > +   struct vm_struct *area = NULL;
> > void *addr;
> > unsigned long real_size = size;
> > +   unsigned long real_align = align;
> > +   unsigned int shift = PAGE_SHIFT;
> >  
> > size = PAGE_ALIGN(size);
> > if (!size || (size >> PAGE_SHIFT) > totalram_pages())
> > goto fail;
> >  
> > -   area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED 
> > |
> > +   if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) {
> > +   unsigned long size_per_node;
> > +
> > +   /*
> > +* Try huge pages. Only try for PAGE_KERNEL allocations,
> > +* others like modules don't yet expect huge pages in
> > +* their allocations due to apply_to_page_range not
> > +* supporting them.
> > +*/
> > +
> > +   size_per_node = size;
> > +   if (node == NUMA_NO_NODE)
> > +   size_per_node /= num_online_nodes();
> > +   if (size_per_node >= PMD_SIZE)
> > +   shift = PMD_SHIFT;
> > +   }
> > +
> > +again:
> > +   align = max(real_align, 1UL << shift);
> > +   size = ALIGN(real_size, align);  
> 
> So my suspicion is that the issue on arm64 is related to this.
> In the relevant call path, align is 32K whilst the size is 16K
> 
> Previously I don't think we force size to be a multiple of align.
> 
> I think this results in nr_pages being double what it was before.
> 
> 
> > +
> > +   area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
> > vm_flags, start, end, node, gfp_mask, caller);
> > if (!area)
> > goto fail;
> >  
> > -   addr = __vmalloc_area_node(area, gfp_mask, prot, node);
> > +   addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
> > if (!addr)
> > -   return NULL;
> > +   goto fail;
> >  
> > /*
> >  * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> > @@ -2730,8 +2812,16 @@ void *__vmalloc_node_range(unsigned long size, 
> > unsigned long align,
> > return addr;
> >  
> >  fail:
> > -   warn_alloc(gfp_mask, NULL,
> > +   if (shift > PAGE_SHIFT) {
> > +   shift = PAGE_SHIFT;
> > +   goto again;
> > +   }
> > +
> > +   if (!area) {
> > +   /* Warn for area allocation, page allocations already warn */
> > +   warn_alloc(gfp_mask, NULL,
> >   "vmalloc: allocation failure: %lu bytes", real_size);
> > +   }
> > return NULL;
> >  }
> >
> 
> 
> 
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel




Re: [PATCH v2] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-08-12 Thread Christophe Leroy




On 12/08/2020 at 15:46, Nathan Lynch wrote:

Hi Christophe,

Christophe Leroy  writes:

+static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb)
+{
+   const unsigned int resched_interval = 20;
+
+   BUG_ON(lmb < drmem_info->lmbs);
+   BUG_ON(lmb >= drmem_info->lmbs + drmem_info->n_lmbs);


BUG_ON() shall be avoided unless absolutely necessary.
Wouldn't WARN_ON() together with an early return be enough?


Not sure what a sensible early return behavior would be. If the iterator
doesn't advance the pointer the behavior will be a hang.


I was thinking about returning lmb++ without checking the cond_resched 
stuff.




BUG_ON a bounds-check failure is appropriate here; many users of this
API will corrupt memory otherwise.


It looks really strange to me to do bounds-checks in an iterator like 
this. Should be checked before entering the loop.





+
+   if ((lmb - drmem_info->lmbs) % resched_interval == 0)
+   cond_resched();


Do you need something that precise? Can't you use 16 or 32 and use a
logical AND instead of a MODULO?


Eh if you're executing in this code you've already lost with respect to
performance considerations at this level, see the discussion on v1. I'll
use 16 since I'm going to reroll the patch though.


And what guarantees that lmb is always an element of a table based at
drmem_info->lmbs?


Well, that's its only intended use right now. There should not be any
other arrays of drmem_lmb objects, and I hope we don't gain any.



What about:

static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb,
struct drmem_lmb *start)
{
const unsigned int resched_interval = 16;

if ((++lmb - start) & resched_interval == 0)

^^^
Did you mean '%' here? The bitwise AND doesn't do what I want.


I meant '& (resched_interval - 1)' indeed. But yes we can leave the %, 
GCC will change a % 16 to an & 15.
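
Two details worth spelling out here: '==' binds tighter than '&', and for a
power-of-two interval the modulo already compiles down to a mask. A quick
sketch (illustrative only):

	/* (++lmb - start) & resched_interval == 0
	 * parses as (++lmb - start) & (resched_interval == 0), i.e. always 0.
	 * Either of these does what was intended: */
	if (((lmb - start) & (resched_interval - 1)) == 0)
		cond_resched();
	if ((lmb - start) % resched_interval == 0)	/* same mask for unsigned counts */
		cond_resched();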




Otherwise, making drmem_lmb_next() more general by adding a 'start'
argument could ease refactoring to come, so I'll do that.


Yes, the main idea is to avoid accessing a global variable in such a
helper.






Christophe


Re: [PATCH 14/16] debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64

2020-08-12 Thread Aneesh Kumar K.V

On 8/12/20 7:04 PM, Anshuman Khandual wrote:



On 08/12/2020 06:46 PM, Aneesh Kumar K.V wrote:

On 8/12/20 6:33 PM, Anshuman Khandual wrote:



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:

The test seems to be missing quite a lot of details w.r.t. allocating
the correct pgtable_t page (huge_pte_alloc()), holding the right
lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA.

ppc64 do have runtime checks within CONFIG_DEBUG_VM for most of these.
Hence disable the test on ppc64.


This test is free from any platform specific #ifdefs which should
never be broken. If hugetlb_advanced_tests() does not work or is
not detailed enough for ppc64, then it would be great if you could
suggest some improvements so that it works for all enabled platforms.




As mentioned, the test is broken. For hugetlb, the pgtable_t pages should be
allocated by huge_pte_alloc(). We need to hold huge_pte_lock() before
updating a huge tlb pte. That takes the hugepage size, which is mostly derived
from the vma. Hence the vma needs to be a hugetlb vma. Some of the functions
also depend on hstate. Also, we should use set_huge_pte_at() when setting up
hugetlb pte entries. I was tempted to remove that test completely, marking it
broken, but avoided that by marking it broken only on PPC64.


The test is not broken, hugetlb helpers on multiple platforms don't complain
about this at all. The tests here emulate 'enough' MM objects required for the
helpers on enabled platforms to perform the primary task, i.e. the page table
transformation they are expected to do. The test does not claim to emulate a
perfect MM environment for a given subsystem's (like HugeTLB) arch helpers.
Now in this case, the MM objects being emulated for the HugeTLB advanced tests
do not seem to be sufficient for ppc64, but that can be improved. That does
not mean the test is broken in its current form for other platforms.



There is nothing ppc64-specific here. It is just that we have
CONFIG_DEBUG_VM-based checks for different possibly wrong usages of
these functions. This was done because we have different page sizes and
two different translations to support, and we want to avoid any wrong usage.
IMHO, expecting hugetlb page table helpers to work with a non-hugetlb VMA
and without holding the hugetlb pte lock is a clear violation of the
hugetlb interface.


-aneesh
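
A hedged sketch of the sequence being described (helper names from the hugetlb
API, arguments illustrative):

	/* inside a real hugetlb VMA, for a given struct hstate *h */
	ptep = huge_pte_alloc(mm, vaddr, huge_page_size(h));	/* right pgtable_t page */
	ptl = huge_pte_lock(h, mm, ptep);			/* size-aware pte lock */
	set_huge_pte_at(mm, vaddr, ptep, mk_huge_pte(page, prot)); /* not set_pte_at() */
	spin_unlock(ptl);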


Re: [PATCH v2] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal

2020-08-12 Thread Nathan Lynch
Hi Christophe,

Christophe Leroy  writes:
>> +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb)
>> +{
>> +const unsigned int resched_interval = 20;
>> +
>> +BUG_ON(lmb < drmem_info->lmbs);
>> +BUG_ON(lmb >= drmem_info->lmbs + drmem_info->n_lmbs);
>
> BUG_ON() shall be avoided unless absolutely necessary.
> Wouldn't WARN_ON() together with an early return be enough?

Not sure what a sensible early return behavior would be. If the iterator
doesn't advance the pointer the behavior will be a hang.

BUG_ON a bounds-check failure is appropriate here; many users of this
API will corrupt memory otherwise.

>> +
>> +if ((lmb - drmem_info->lmbs) % resched_interval == 0)
>> +cond_resched();
>
> Do you need something that precise? Can't you use 16 or 32 and use a
> logical AND instead of a MODULO?

Eh if you're executing in this code you've already lost with respect to
performance considerations at this level, see the discussion on v1. I'll
use 16 since I'm going to reroll the patch though.

> And what guarantees that lmb is always an element of a table based at
> drmem_info->lmbs?

Well, that's its only intended use right now. There should not be any
other arrays of drmem_lmb objects, and I hope we don't gain any.


> What about:
>
> static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb, 
> struct drmem_lmb *start)
> {
>   const unsigned int resched_interval = 16;
>
>   if ((++lmb - start) & resched_interval == 0)
   ^^^
Did you mean '%' here? The bitwise AND doesn't do what I want.

Otherwise, making drmem_lmb_next() more general by adding a 'start'
argument could ease refactoring to come, so I'll do that.


Re: [PATCH v2] powerpc/pseries/hotplug-cpu: wait indefinitely for vCPU death

2020-08-12 Thread Greg Kurz
On Tue, 11 Aug 2020 11:15:44 -0500
Michael Roth  wrote:

> For a power9 KVM guest with XIVE enabled, running a test loop
> where we hotplug 384 vcpus and then unplug them, the following traces
> can be seen (generally within a few loops) either from the unplugged
> vcpu:
> 
>   [ 1767.353447] cpu 65 (hwid 65) Ready to die...
>   [ 1767.952096] Querying DEAD? cpu 66 (66) shows 2
>   [ 1767.952311] list_del corruption. next->prev should be c00a02470208, 
> but was c00a02470048
>   [ 1767.952322] [ cut here ]
>   [ 1767.952323] kernel BUG at lib/list_debug.c:56!
>   [ 1767.952325] Oops: Exception in kernel mode, sig: 5 [#1]
>   [ 1767.952326] LE SMP NR_CPUS=2048 NUMA pSeries
>   [ 1767.952328] Modules linked in: fuse nft_fib_inet nft_fib_ipv4 
> nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject 
> nft_ct nf_tables_set nft_chain_nat_ipv6 nf_nat_ipv6 nft_chain_route_ipv6 
> nft_chain_nat_ipv4 nf_nat_ipv4 nf_nat nf_conntrack nf_defrag_ipv6 
> nf_defrag_ipv4 nft_chain_route_ipv4 ip6_tables nft_compat ip_set nf_tables 
> nfnetlink uio_pdrv_genirq ip_tables xfs libcrc32c sd_mod sg xts vmx_crypto 
> virtio_net net_failover failover virtio_scsi dm_multipath dm_mirror 
> dm_region_hash dm_log dm_mod be2iscsi bnx2i cnic uio cxgb4i cxgb4 libcxgbi 
> libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi 
> scsi_transport_iscsi
>   [ 1767.952352] CPU: 66 PID: 0 Comm: swapper/66 Kdump: loaded Not tainted 
> 4.18.0-221.el8.ppc64le #1
>   [ 1767.952354] NIP:  c07ab50c LR: c07ab508 CTR: 
> 03ac
>   [ 1767.952355] REGS: c009e5a17840 TRAP: 0700   Not tainted  
> (4.18.0-221.el8.ppc64le)
>   [ 1767.952355] MSR:  8282b033   
> CR: 28000842  XER: 2004
>   [ 1767.952360] CFAR: c01ffe64 IRQMASK: 1
>   [ 1767.952360] GPR00: c07ab508 c009e5a17ac0 c1ac0700 
> 0054
>   [ 1767.952360] GPR04: c009f056cf90 c009f05f5628 c009ed40d654 
> c1c90700
>   [ 1767.952360] GPR08: 0007 c009f0573980 0009ef2b 
> 7562202c38303230
>   [ 1767.952360] GPR12:  c007fff6ce80 c00a02470208 
> 
>   [ 1767.952360] GPR16: 0001 c009f05fbb00 0800 
> c009ff3d4980
>   [ 1767.952360] GPR20: c009f05fbb10 5deadbeef100 5deadbeef200 
> 00187961
>   [ 1767.952360] GPR24: c009e5a17b78  0001 
> 
>   [ 1767.952360] GPR28: c00a02470200 c009f05fbb10 c009f05fbb10 
> 
>   [ 1767.952375] NIP [c07ab50c] __list_del_entry_valid+0xac/0x100
>   [ 1767.952376] LR [c07ab508] __list_del_entry_valid+0xa8/0x100
>   [ 1767.952377] Call Trace:
>   [ 1767.952378] [c009e5a17ac0] [c07ab508] 
> __list_del_entry_valid+0xa8/0x100 (unreliable)
>   [ 1767.952381] [c009e5a17b20] [c0476e18] 
> free_pcppages_bulk+0x1f8/0x940
>   [ 1767.952383] [c009e5a17c20] [c047a9a0] 
> free_unref_page+0xd0/0x100
>   [ 1767.952386] [c009e5a17c50] [c00bc2a8] 
> xive_spapr_cleanup_queue+0x148/0x1b0
>   [ 1767.952388] [c009e5a17cf0] [c00b96dc] 
> xive_teardown_cpu+0x1bc/0x240
>   [ 1767.952391] [c009e5a17d30] [c010bf28] 
> pseries_mach_cpu_die+0x78/0x2f0
>   [ 1767.952393] [c009e5a17de0] [c005c8d8] cpu_die+0x48/0x70
>   [ 1767.952394] [c009e5a17e00] [c0021cf0] 
> arch_cpu_idle_dead+0x20/0x40
>   [ 1767.952397] [c009e5a17e20] [c01b4294] do_idle+0x2f4/0x4c0
>   [ 1767.952399] [c009e5a17ea0] [c01b46a8] 
> cpu_startup_entry+0x38/0x40
>   [ 1767.952400] [c009e5a17ed0] [c005c43c] 
> start_secondary+0x7bc/0x8f0
>   [ 1767.952403] [c009e5a17f90] [c000ac70] 
> start_secondary_prolog+0x10/0x14
> 
> or on the worker thread handling the unplug:
> 
>   [ 1538.253044] pseries-hotplug-cpu: Attempting to remove CPU , drc 
> index: 113a
>   [ 1538.360259] Querying DEAD? cpu 314 (314) shows 2
>   [ 1538.360736] BUG: Bad page state in process kworker/u768:3  pfn:95de1
>   [ 1538.360746] cpu 314 (hwid 314) Ready to die...
>   [ 1538.360784] page:c00a02577840 refcount:0 mapcount:-128 
> mapping: index:0x0
>   [ 1538.361881] flags: 0x5c()
>   [ 1538.361908] raw: 005c 5deadbeef100 5deadbeef200 
> 
>   [ 1538.361955] raw:   ff7f 
> 
>   [ 1538.362002] page dumped because: nonzero mapcount
>   [ 1538.362033] Modules linked in: kvm xt_CHECKSUM ipt_MASQUERADE 
> xt_conntrack ipt_REJECT nft_counter nf_nat_tftp nft_objref nf_conntrack_tftp 
> tun bridge stp llc nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib 
> nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nf_tables_set 
> nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip6_tables 
> nft_compat ip_set nf_tables nfnetlink 

Re: [PATCH 14/16] debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 06:46 PM, Aneesh Kumar K.V wrote:
> On 8/12/20 6:33 PM, Anshuman Khandual wrote:
>>
>>
>> On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
>>> The test seems to be missing quite a lot of details w.r.t. allocating
>>> the correct pgtable_t page (huge_pte_alloc()), holding the right
>>> lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA.
>>>
>>> ppc64 do have runtime checks within CONFIG_DEBUG_VM for most of these.
>>> Hence disable the test on ppc64.
>>
>> This test is free from any platform specific #ifdefs which should
>> never be broken. If hugetlb_advanced_tests() does not work or is
>> not detailed enough for ppc64, then it would be great if you could
>> suggest some improvements so that it works for all enabled platforms.
>>
>>
> 
> As mentioned, the test is broken. For hugetlb, the pgtable_t pages should be
> allocated by huge_pte_alloc(). We need to hold huge_pte_lock() before
> updating a huge tlb pte. That takes the hugepage size, which is mostly derived
> from the vma. Hence the vma needs to be a hugetlb vma. Some of the functions
> also depend on hstate. Also, we should use set_huge_pte_at() when setting up
> hugetlb pte entries. I was tempted to remove that test completely, marking it
> broken, but avoided that by marking it broken only on PPC64.

The test is not broken, hugetlb helpers on multiple platforms don't complain
about this at all. The tests here emulate 'enough' MM objects required for the
helpers on enabled platforms to perform the primary task, i.e. the page table
transformation they are expected to do. The test does not claim to emulate a
perfect MM environment for a given subsystem's (like HugeTLB) arch helpers.
Now in this case, the MM objects being emulated for the HugeTLB advanced tests
do not seem to be sufficient for ppc64, but that can be improved. That does
not mean the test is broken in its current form for other platforms.

> 
> 
> 
>>> Signed-off-by: Aneesh Kumar K.V 
>>> ---
>>>   mm/debug_vm_pgtable.c | 6 +-
>>>   1 file changed, 5 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>>> index 529892b9be2f..3e112d0ba1b2 100644
>>> --- a/mm/debug_vm_pgtable.c
>>> +++ b/mm/debug_vm_pgtable.c
>>> @@ -800,6 +800,7 @@ static void __init hugetlb_basic_tests(unsigned long 
>>> pfn, pgprot_t prot)
>>>   #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
>>>   }
>>>   +#ifndef CONFIG_PPC_BOOK3S_64
>>>   static void __init hugetlb_advanced_tests(struct mm_struct *mm,
>>>     struct vm_area_struct *vma,
>>>     pte_t *ptep, unsigned long pfn,
>>> @@ -842,6 +843,7 @@ static void __init hugetlb_advanced_tests(struct 
>>> mm_struct *mm,
>>>   pte = huge_ptep_get(ptep);
>>>   WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
>>>   }
>>> +#endif
>>>   #else  /* !CONFIG_HUGETLB_PAGE */
>>>   static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) 
>>> { }
>>>   static void __init hugetlb_advanced_tests(struct mm_struct *mm,
>>> @@ -1053,7 +1055,9 @@ static int __init debug_vm_pgtable(void)
>>>   pud_populate_tests(mm, pudp, saved_pmdp);
>>>   spin_unlock(ptl);
>>>   -    //hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>>> +#ifndef CONFIG_PPC_BOOK3S_64
>>> +    hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>>> +#endif
>>>     spin_lock(>page_table_lock);
>>>   p4d_clear_tests(mm, p4dp);
>>>
> 
> 


Re: [PATCH 14/16] debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64

2020-08-12 Thread Aneesh Kumar K.V

On 8/12/20 6:33 PM, Anshuman Khandual wrote:



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:

The test seems to be missing quite a lot of details w.r.t allocating
the correct pgtable_t page (huge_pte_alloc()), holding the right
lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA.

ppc64 does have runtime checks within CONFIG_DEBUG_VM for most of these.
Hence disable the test on ppc64.


This test is free from any platform specific #ifdefs which should
never be broken. If hugetlb_advanced_tests() does not work or is
not detailed enough for ppc64, then it would be great if you could
suggest some improvements so that it works for all enabled platforms.




As mentioned, the test is broken. For hugetlb, the pgtable_t pages should 
be allocated by huge_pte_alloc(). We need to hold huge_pte_lock() 
before updating a huge tlb pte. That takes the hugepage size, which is mostly 
derived from the vma. Hence the vma needs to be a hugetlb vma. Some of the 
functions also depend on the hstate. Also, we should use set_huge_pte_at() 
when setting up hugetlb pte entries. I was tempted to remove that test 
completely, marking it broken, but avoided that by marking it broken only 
on PPC64.
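
A minimal sketch of the setup such a test would need (untested, just to
illustrate the point; it assumes the test builds a real hugetlb VMA first):

	struct hstate *h = hstate_vma(vma);
	pte_t *ptep = huge_pte_alloc(mm, vaddr, huge_page_size(h));
	spinlock_t *ptl = huge_pte_lock(h, mm, ptep);

	set_huge_pte_at(mm, vaddr, ptep, mk_huge_pte(page, prot));
	/* ... exercise the huge_ptep_* helpers under ptl ... */
	spin_unlock(ptl);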





Signed-off-by: Aneesh Kumar K.V 
---
  mm/debug_vm_pgtable.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 529892b9be2f..3e112d0ba1b2 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -800,6 +800,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, 
pgprot_t prot)
  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
  }
  
+#ifndef CONFIG_PPC_BOOK3S_64

  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
  struct vm_area_struct *vma,
  pte_t *ptep, unsigned long pfn,
@@ -842,6 +843,7 @@ static void __init hugetlb_advanced_tests(struct mm_struct 
*mm,
pte = huge_ptep_get(ptep);
WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
  }
+#endif
  #else  /* !CONFIG_HUGETLB_PAGE */
  static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
@@ -1053,7 +1055,9 @@ static int __init debug_vm_pgtable(void)
pud_populate_tests(mm, pudp, saved_pmdp);
spin_unlock(ptl);
  
-	//hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);

+#ifndef CONFIG_PPC_BOOK3S_64
+   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+#endif
  
	spin_lock(&mm->page_table_lock);

p4d_clear_tests(mm, p4dp);





[Virtual ppce500] virtio_gpu virtio0: swiotlb buffer is full

2020-08-12 Thread Christian Zigotzky

Hello Daniel,

The VirtIO-GPU doesn't work anymore with the latest Git kernel in a 
virtual e5500 PPC64 QEMU machine [1,2] after the commit "drm/virtio: 
Call the right shmem helpers". [3]

The kernel 5.8 works with the VirtIO-GPU in this virtual machine.

I bisected today [4].

Result: drm/virtio: Call the right shmem helpers
(d323bb44e4d23802eb25d13de1f93f2335bd60d0) [3] is the first bad commit.


I was able to revert the first bad commit. [5] After that I compiled a 
new kernel again. Then I was able to boot Linux with this kernel in a 
virtual e5500 PPC64 QEMU machine with the VirtIO-GPU.


I created a patch. [6] With this patch I can use the VirtIO-GPU again.

Could you please check the first bad commit?

Thanks,
Christian

[1] QEMU command: qemu-system-ppc64 -M ppce500 -cpu e5500 -enable-kvm -m 
1024 -kernel uImage -drive 
format=raw,file=fienix-soar_3.0-2020608-net.img,index=0,if=virtio -nic 
user,model=e1000 -append "rw root=/dev/vda2" -device virtio-vga -device 
virtio-mouse-pci -device virtio-keyboard-pci -device pci-ohci,id=newusb 
-device usb-audio,bus=newusb.0 -smp 4


[2] Error messages:

virtio_gpu virtio0: swiotlb buffer is full (sz: 4096 bytes), total 0 
(slots), used 0 (slots)

BUG: Kernel NULL pointer dereference on read at 0x0010
Faulting instruction address: 0xc00c7324
Oops: Kernel access of bad area, sig: 11 [#1]
BE PAGE_SIZE=4K PREEMPT SMP NR_CPUS=4 QEMU e500
Modules linked in:
CPU: 2 PID: 1678 Comm: kworker/2:2 Not tainted 
5.9-a3_A-EON_X5000-11735-g06a81c1c7db9-dirty #1

Workqueue: events .virtio_gpu_dequeue_ctrl_func
NIP:  c00c7324 LR: c00c72e4 CTR: c0462930
REGS: c0003dba75e0 TRAP: 0300   Not tainted 
(5.9-a3_A-EON_X5000-11735-g06a81c1c7db9-dirty)

MSR:  90029000   CR: 24002288  XER: 
DEAR: 0010 ESR:  IRQMASK: 0
GPR00: c00c6188 c0003dba7870 c17f2300 c0003d893010
GPR04:  0001  
GPR08:    7f7f7f7f7f7f7f7f
GPR12: 24002284 c0003fff9200 c008c3a0 c61566c0
GPR16:    
GPR20:    
GPR24: 0001 0011  
GPR28: c0003d893010   c0003d893010
NIP [c00c7324] .dma_direct_unmap_sg+0x4c/0xd8
LR [c00c72e4] .dma_direct_unmap_sg+0xc/0xd8
Call Trace:
[c0003dba7870] [c0003dba7950] 0xc0003dba7950 (unreliable)
[c0003dba7920] [c00c6188] .dma_unmap_sg_attrs+0x5c/0x98
[c0003dba79d0] [c05cd438] .drm_gem_shmem_free_object+0x98/0xcc
[c0003dba7a50] [c06af5b4] .virtio_gpu_cleanup_object+0xc8/0xd4
[c0003dba7ad0] [c06ad3bc] .virtio_gpu_cmd_unref_cb+0x1c/0x30
[c0003dba7b40] [c06adab8] 
.virtio_gpu_dequeue_ctrl_func+0x208/0x28c

[c0003dba7c10] [c0086b70] .process_one_work+0x1a4/0x258
[c0003dba7cb0] [c00870f4] .worker_thread+0x214/0x284
[c0003dba7d70] [c008c4f0] .kthread+0x150/0x158
[c0003dba7e20] [c82c] .ret_from_kernel_thread+0x58/0x60
Instruction dump:
f821ff51 7cb82b78 7cdb3378 4e00 7cfa3b78 3bc0 7f9ec000 41fc0014
382100b0 81810008 7d808120 48bc1ba8  ebfc0248 833d0018 7fff4850
---[ end trace f28d194d9f0955a8 ]---

virtio_gpu virtio0: swiotlb buffer is full (sz: 4096 bytes), total 0 
(slots), used 0 (slots)
virtio_gpu virtio0: swiotlb buffer is full (sz: 16384 bytes), total 0 
(slots), used 0 (slots)


---

[3] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d323bb44e4d23802eb25d13de1f93f2335bd60d0


[4] https://forum.hyperion-entertainment.com/viewtopic.php?p=51377#p51377

[5] git revert d323bb44e4d23802eb25d13de1f93f2335bd60d0 //Output: 
[master 966950f724e4] Revert "drm/virtio: Call the right shmem helpers" 
1 file changed, 1 insertion(+), 1 deletion(-)


[6]
diff --git a/drivers/gpu/drm/virtio/virtgpu_object.c 
b/drivers/gpu/drm/virtio/virtgpu_object.c

index 6ccbd01cd888..346cef5ce251 100644
--- a/drivers/gpu/drm/virtio/virtgpu_object.c
+++ b/drivers/gpu/drm/virtio/virtgpu_object.c
@@ -150,7 +150,7 @@ static int virtio_gpu_object_shmem_init(struct 
virtio_gpu_device *vgdev,

 if (ret < 0)
 return -EINVAL;

-    shmem->pages = drm_gem_shmem_get_pages_sgt(&bo->base.base);
+    shmem->pages = drm_gem_shmem_get_sg_table(&bo->base.base);
 if (!shmem->pages) {
 drm_gem_shmem_unpin(&bo->base.base);
 return -EINVAL;
---


Re: [PATCH 15/16] debug_vm_pgtable/savedwrite: Use savedwrite test with protnone ptes

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> Saved write support was added to track the write bit of a pte after marking 
> the
> pte protnone. This was done so that AUTONUMA can convert a write pte to 
> protnone
> and still track the old write bit. When converting it back we set the pte 
> write
> bit correctly thereby avoiding a write fault again.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 3e112d0ba1b2..eea62d5e503b 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -1006,8 +1006,8 @@ static int __init debug_vm_pgtable(void)
>   pud_leaf_tests(pud_aligned, prot);
>  
>  #ifdef CONFIG_NUMA_BALANCING
> - pte_savedwrite_tests(pte_aligned, prot);
> - pmd_savedwrite_tests(pmd_aligned, prot);
> + pte_savedwrite_tests(pte_aligned, protnone);
> + pmd_savedwrite_tests(pmd_aligned, protnone);
>  #endif
>   pte_special_tests(pte_aligned, prot);
>   pte_protnone_tests(pte_aligned, protnone);
> 

This can be folded back with [PATCH 5/16] which now checks for
CONFIG_NUMA_BALANCING in pxx_savedwrite_tests().
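
For reference, the round trip these tests are meant to exercise on a protnone
entry looks like this (sketch, assuming CONFIG_NUMA_BALANCING so that the
savedwrite helpers are defined):

	pte_t pte = pfn_pte(pfn, protnone);	/* PROT_NONE pte, as autonuma creates */

	pte = pte_mk_savedwrite(pte);		/* remember the old write bit */
	WARN_ON(!pte_savedwrite(pte));
	WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte)));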


Re: [PATCH 14/16] debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> The test seems to be missing quite a lot of details w.r.t allocating
> the correct pgtable_t page (huge_pte_alloc()), holding the right
> lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA.
> 
> ppc64 does have runtime checks within CONFIG_DEBUG_VM for most of these.
> Hence disable the test on ppc64.

This test is free from any platform specific #ifdefs which should
never be broken. If hugetlb_advanced_tests() does not work or is
not detailed enough for ppc64, then it would be great if you could
suggest some improvements so that it works for all enabled platforms.

> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 529892b9be2f..3e112d0ba1b2 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -800,6 +800,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, 
> pgprot_t prot)
>  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
>  }
>  
> +#ifndef CONFIG_PPC_BOOK3S_64
>  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
> struct vm_area_struct *vma,
> pte_t *ptep, unsigned long pfn,
> @@ -842,6 +843,7 @@ static void __init hugetlb_advanced_tests(struct 
> mm_struct *mm,
>   pte = huge_ptep_get(ptep);
>   WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
>  }
> +#endif
>  #else  /* !CONFIG_HUGETLB_PAGE */
>  static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
>  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
> @@ -1053,7 +1055,9 @@ static int __init debug_vm_pgtable(void)
>   pud_populate_tests(mm, pudp, saved_pmdp);
>   spin_unlock(ptl);
>  
> - //hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> +#ifndef CONFIG_PPC_BOOK3S_64
> + hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> +#endif
>  
> 	spin_lock(&mm->page_table_lock);
>   p4d_clear_tests(mm, p4dp);
> 


Re: [PATCH 09/16] debug_vm_pgtable/set_pud: Don't use set_pud_at to update an existing pud entry

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> set_pud_at() should not be used to set a pte entry at a location that
> already holds a valid pte entry. Architectures like ppc64 don't do a TLB
> invalidate in set_pud_at() and hence expect it to be used only to set
> locations that do not hold a valid PTE.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 60bf876081b8..644d28861ce9 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -278,9 +278,6 @@ static void __init pud_advanced_tests(struct mm_struct 
> *mm,
>   WARN_ON(pud_write(pud));
>  
>  #ifndef __PAGETABLE_PMD_FOLDED
> -
> - pud = pud_mkhuge(pfn_pud(pfn, prot));
> - set_pud_at(mm, vaddr, pudp, pud);
>   pudp_huge_get_and_clear(mm, vaddr, pudp);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(!pud_none(pud));
> @@ -302,6 +299,11 @@ static void __init pud_advanced_tests(struct mm_struct 
> *mm,
>   pud = READ_ONCE(*pudp);
>   WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
>  
> + pudp_huge_get_and_clear_full(vma, vaddr, pudp, 1);
> + pud = READ_ONCE(*pudp);
> + WARN_ON(!pud_none(pud));
> +
> + pud = pud_mkhuge(pfn_pud(pfn, prot));
>   pud = pud_mkyoung(pud);
>   set_pud_at(mm, vaddr, pudp, pud);
>   pudp_test_and_clear_young(vma, vaddr, pudp);

Very similar to [PATCH 3/16] wrt set_pte_at(), please fold it.
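
For anyone reading along, the rule being enforced boils down to this sequence
(sketch, assuming the page table lock is held, as in the test):

	pud_t pud;

	pudp_huge_get_and_clear(mm, vaddr, pudp);	/* clear first, TLB handled */
	pud = pud_mkhuge(pfn_pud(pfn, prot));
	set_pud_at(mm, vaddr, pudp, pud);		/* installs into a none entry */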


Re: [PATCH 08/16] debug_vm_pgtable/set_pmd: Don't use set_pmd_at to update an existing pmd entry

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> set_pmd_at() should not be used to set a pte entry at a location that
> already holds a valid pte entry. Architectures like ppc64 don't do a TLB
> invalidate in set_pmd_at() and hence expect it to be used only to set
> locations that do not hold a valid PTE.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index cd609a212dd4..60bf876081b8 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -164,8 +164,6 @@ static void __init pmd_advanced_tests(struct mm_struct 
> *mm,
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(pmd_write(pmd));
>  
> - pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
> - set_pmd_at(mm, vaddr, pmdp, pmd);
>   pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(!pmd_none(pmd));
> @@ -180,12 +178,11 @@ static void __init pmd_advanced_tests(struct mm_struct 
> *mm,
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
>  
> - pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
> - set_pmd_at(mm, vaddr, pmdp, pmd);
>   pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
>   pmd = READ_ONCE(*pmdp);
>   WARN_ON(!pmd_none(pmd));
>  
> + pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>   pmd = pmd_mkyoung(pmd);
>   set_pmd_at(mm, vaddr, pmdp, pmd);
>   pmdp_test_and_clear_young(vma, vaddr, pmdp);
> 

Very similar to [PATCH 3/16] wrt set_pte_at(), please fold it.


Re: [PATCH v2] powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()

2020-08-12 Thread Christophe Leroy




On 08/07/2020 at 06:49, Christophe Leroy wrote:



On 07/07/2020 at 21:02, Christophe Leroy wrote:



On 07/07/2020 at 14:44, Christophe Leroy wrote:



On 30/06/2020 at 03:19, Michael Ellerman wrote:

Michael Ellerman  writes:

Christophe Leroy  writes:

Hi Michael,

I see this patch is marked as "Deferred" in patchwork, but I can't see
any related discussion. Is that normal?


Because it uses the "m<>" constraint which didn't work on GCC 4.6.

https://github.com/linuxppc/issues/issues/297

So we should be able to pick it up for v5.9 hopefully.


It seems to break the build with the kernel.org 4.9.4 compiler and
corenet64_smp_defconfig:


Most likely a GCC bug?

It seems the problem vanishes with patch 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/173de3b659fa3a5f126a0eb170522cccd909950f.1594125164.git.christophe.le...@csgroup.eu/ 



Same kind of issue in signal_64.c now.

The following patch fixes it: 
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/810bd8840ef990a200f58c9dea9abe767ca02a3a.1594146723.git.christophe.le...@csgroup.eu/ 





This time I confirm, with the two above mentioned patches, it builds OK 
with 4.9, see 
http://kisskb.ellerman.id.au/kisskb/head/810bd8840ef990a200f58c9dea9abe767ca02a3a/ 





I see you've merged the patches that make the issue disappear, but not this 
patch yet. I guess you are still a bit wary about it, so I have split it into 
two parts so you can safely take patch 1 as soon as possible, while the 
"m<>" constraint subject is handled more carefully via 
https://github.com/linuxppc/issues/issues/297 at a later stage.


Anyway, it seems that GCC doesn't make much use of the "m<>" constraint and 
the pre-update form. Most of the benefit of flexible addressing seems to be 
achieved with patch 1, i.e. without the "m<>" constraint and update form.


Christophe


Re: [PATCH v3 8/8] mm/vmalloc: Hugepage vmalloc mappings

2020-08-12 Thread Jonathan Cameron
On Mon, 10 Aug 2020 12:27:32 +1000
Nicholas Piggin  wrote:

> On platforms that define HAVE_ARCH_HUGE_VMAP and support PMD vmaps,
> vmalloc will attempt to allocate PMD-sized pages first, before falling
> back to small pages.
> 
> Allocations which use something other than PAGE_KERNEL protections are
> not permitted to use huge pages yet, not all callers expect this (e.g.,
> module allocations vs strict module rwx).
> 
> This reduces TLB misses by nearly 30x on a `git diff` workload on a
> 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
> 
> This can result in more internal fragmentation and memory overhead for a
> given allocation, an option nohugevmap is added to disable at boot.
> 
> Signed-off-by: Nicholas Piggin 
Hi Nicholas,

Busy afternoon, but a possible point of interest inline in the meantime.


...

> @@ -2701,22 +2760,45 @@ void *__vmalloc_node_range(unsigned long size, 
> unsigned long align,
>   pgprot_t prot, unsigned long vm_flags, int node,
>   const void *caller)
>  {
> - struct vm_struct *area;
> + struct vm_struct *area = NULL;
>   void *addr;
>   unsigned long real_size = size;
> + unsigned long real_align = align;
> + unsigned int shift = PAGE_SHIFT;
>  
>   size = PAGE_ALIGN(size);
>   if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>   goto fail;
>  
> - area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED 
> |
> + if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) {
> + unsigned long size_per_node;
> +
> + /*
> +  * Try huge pages. Only try for PAGE_KERNEL allocations,
> +  * others like modules don't yet expect huge pages in
> +  * their allocations due to apply_to_page_range not
> +  * supporting them.
> +  */
> +
> + size_per_node = size;
> + if (node == NUMA_NO_NODE)
> + size_per_node /= num_online_nodes();
> + if (size_per_node >= PMD_SIZE)
> + shift = PMD_SHIFT;
> + }
> +
> +again:
> + align = max(real_align, 1UL << shift);
> + size = ALIGN(real_size, align);

So my suspicion is that the issue on arm64 is related to this.
In the relevant call path, align is 32K whilst the size is 16K.

Previously I don't think we forced size to be a multiple of align.

I think this results in nr_pages being double what it was before.
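
Worked through with those numbers (illustrative only):

	/* real_size = 16K, 1UL << shift = 32K on this config */
	align = max(real_align, 1UL << shift);	/* = 32K */
	size = ALIGN(real_size, align);		/* ALIGN(16K, 32K) = 32K */

i.e. the rounded size is double the requested size, hence nr_pages doubling.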


> +
> + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>   vm_flags, start, end, node, gfp_mask, caller);
>   if (!area)
>   goto fail;
>  
> - addr = __vmalloc_area_node(area, gfp_mask, prot, node);
> + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
>   if (!addr)
> - return NULL;
> + goto fail;
>  
>   /*
>* In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2730,8 +2812,16 @@ void *__vmalloc_node_range(unsigned long size, 
> unsigned long align,
>   return addr;
>  
>  fail:
> - warn_alloc(gfp_mask, NULL,
> + if (shift > PAGE_SHIFT) {
> + shift = PAGE_SHIFT;
> + goto again;
> + }
> +
> + if (!area) {
> + /* Warn for area allocation, page allocations already warn */
> + warn_alloc(gfp_mask, NULL,
> "vmalloc: allocation failure: %lu bytes", real_size);
> + }
>   return NULL;
>  }
>  




[PATCH v3 1/2] powerpc/uaccess: Use flexible addressing with __put_user()/__get_user()

2020-08-12 Thread Christophe Leroy
At the time being, __put_user()/__get_user() and friends only use
D-form addressing, with 0 offset. Ex:

lwz reg1, 0(reg2)

Give the compiler the opportunity to use other addressing modes
whenever possible, to get more optimised code.

Hereunder is a small example:

struct test {
u32 item1;
u16 item2;
u8 item3;
u64 item4;
};

int set_test_user(struct test __user *from, struct test __user *to)
{
int err;
u32 item1;
u16 item2;
u8 item3;
u64 item4;

err = __get_user(item1, &from->item1);
err |= __get_user(item2, &from->item2);
err |= __get_user(item3, &from->item3);
err |= __get_user(item4, &from->item4);

err |= __put_user(item1, &to->item1);
err |= __put_user(item2, &to->item2);
err |= __put_user(item3, &to->item3);
err |= __put_user(item4, &to->item4);

return err;
}

Before the patch:

0df0 :
 df0:   94 21 ff f0 stwu r1,-16(r1)
 df4:   39 40 00 00 li  r10,0
 df8:   93 c1 00 08 stw r30,8(r1)
 dfc:   93 e1 00 0c stw r31,12(r1)
 e00:   7d 49 53 78 mr  r9,r10
 e04:   80 a3 00 00 lwz r5,0(r3)
 e08:   38 e3 00 04 addi r7,r3,4
 e0c:   7d 46 53 78 mr  r6,r10
 e10:   a0 e7 00 00 lhz r7,0(r7)
 e14:   7d 29 33 78 or  r9,r9,r6
 e18:   39 03 00 06 addi r8,r3,6
 e1c:   7d 46 53 78 mr  r6,r10
 e20:   89 08 00 00 lbz r8,0(r8)
 e24:   7d 29 33 78 or  r9,r9,r6
 e28:   38 63 00 08 addi r3,r3,8
 e2c:   7d 46 53 78 mr  r6,r10
 e30:   83 c3 00 00 lwz r30,0(r3)
 e34:   83 e3 00 04 lwz r31,4(r3)
 e38:   7d 29 33 78 or  r9,r9,r6
 e3c:   7d 43 53 78 mr  r3,r10
 e40:   90 a4 00 00 stw r5,0(r4)
 e44:   7d 29 1b 78 or  r9,r9,r3
 e48:   38 c4 00 04 addi r6,r4,4
 e4c:   7d 43 53 78 mr  r3,r10
 e50:   b0 e6 00 00 sth r7,0(r6)
 e54:   7d 29 1b 78 or  r9,r9,r3
 e58:   38 e4 00 06 addi r7,r4,6
 e5c:   7d 43 53 78 mr  r3,r10
 e60:   99 07 00 00 stb r8,0(r7)
 e64:   7d 23 1b 78 or  r3,r9,r3
 e68:   38 84 00 08 addi r4,r4,8
 e6c:   93 c4 00 00 stw r30,0(r4)
 e70:   93 e4 00 04 stw r31,4(r4)
 e74:   7c 63 53 78 or  r3,r3,r10
 e78:   83 c1 00 08 lwz r30,8(r1)
 e7c:   83 e1 00 0c lwz r31,12(r1)
 e80:   38 21 00 10 addi r1,r1,16
 e84:   4e 80 00 20 blr

After the patch:

0dbc :
 dbc:   39 40 00 00 li  r10,0
 dc0:   7d 49 53 78 mr  r9,r10
 dc4:   80 03 00 00 lwz r0,0(r3)
 dc8:   7d 48 53 78 mr  r8,r10
 dcc:   a1 63 00 04 lhz r11,4(r3)
 dd0:   7d 29 43 78 or  r9,r9,r8
 dd4:   7d 48 53 78 mr  r8,r10
 dd8:   88 a3 00 06 lbz r5,6(r3)
 ddc:   7d 29 43 78 or  r9,r9,r8
 de0:   7d 48 53 78 mr  r8,r10
 de4:   80 c3 00 08 lwz r6,8(r3)
 de8:   80 e3 00 0c lwz r7,12(r3)
 dec:   7d 29 43 78 or  r9,r9,r8
 df0:   7d 43 53 78 mr  r3,r10
 df4:   90 04 00 00 stw r0,0(r4)
 df8:   7d 29 1b 78 or  r9,r9,r3
 dfc:   7d 43 53 78 mr  r3,r10
 e00:   b1 64 00 04 sth r11,4(r4)
 e04:   7d 29 1b 78 or  r9,r9,r3
 e08:   7d 43 53 78 mr  r3,r10
 e0c:   98 a4 00 06 stb r5,6(r4)
 e10:   7d 23 1b 78 or  r3,r9,r3
 e14:   90 c4 00 08 stw r6,8(r4)
 e18:   90 e4 00 0c stw r7,12(r4)
 e1c:   7c 63 53 78 or  r3,r3,r10
 e20:   4e 80 00 20 blr

Signed-off-by: Christophe Leroy 
Reviewed-by: Segher Boessenkool 
---
v3:
- Removed the pre-update form and the <> modifier, because it seems
to be problematic with some older versions of GCC (namely 4.9) and
GCC doesn't seem to generate much code of that form anyway. So better
to get it merged without the pre-update form than not merged at all.
The pre-update form is added in a following patch that may then get
merged later once we have cleared up the issue with it.

v2:
- Added <> modifier in __put_user_asm() and __get_user_asm()
- Removed %U2 in __put_user_asm2() and __get_user_asm2()
- Reworded the commit log

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 64c04ab09112..c546fb36043d 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -159,7 +159,7 @@ extern long __put_user_bad(void);
  */
 #define __put_user_asm(x, addr, err, op)   \
__asm__ __volatile__(   \
-   "1: " op " %1,0(%2) # put_user\n"   \
+   "1: " op "%X2 %1,%2 # put_user\n"   \
"2:\n"  \
".section .fixup,\"ax\"\n"  \
"3: li 

[PATCH v3 2/2] powerpc/uaccess: Add pre-update addressing to __get_user_asm() and __put_user_asm()

2020-08-12 Thread Christophe Leroy
Enable pre-update addressing mode in __get_user_asm() and __put_user_asm()

Signed-off-by: Christophe Leroy 
---
v3: new, split out from patch 1.
---
 arch/powerpc/include/asm/uaccess.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index c546fb36043d..eadb25f4cc9f 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -159,7 +159,7 @@ extern long __put_user_bad(void);
  */
 #define __put_user_asm(x, addr, err, op)   \
__asm__ __volatile__(   \
-   "1: " op "%X2 %1,%2 # put_user\n"   \
+   "1: " op "%U2%X2 %1,%2  # put_user\n"   \
"2:\n"  \
".section .fixup,\"ax\"\n"  \
"3: li %0,%3\n" \
@@ -167,7 +167,7 @@ extern long __put_user_bad(void);
".previous\n"   \
EX_TABLE(1b, 3b)\
: "=r" (err)\
-   : "r" (x), "m" (*addr), "i" (-EFAULT), "0" (err))
+   : "r" (x), "m<>" (*addr), "i" (-EFAULT), "0" (err))
 
 #ifdef __powerpc64__
 #define __put_user_asm2(x, ptr, retval)\
@@ -317,7 +317,7 @@ extern long __get_user_bad(void);
 
 #define __get_user_asm(x, addr, err, op)   \
__asm__ __volatile__(   \
-   "1: "op"%X2 %1, %2  # get_user\n"   \
+   "1: "op"%U2%X2 %1, %2   # get_user\n"   \
"2:\n"  \
".section .fixup,\"ax\"\n"  \
"3: li %0,%3\n" \
@@ -326,7 +326,7 @@ extern long __get_user_bad(void);
".previous\n"   \
EX_TABLE(1b, 3b)\
: "=r" (err), "=r" (x)  \
-   : "m" (*addr), "i" (-EFAULT), "0" (err))
+   : "m<>" (*addr), "i" (-EFAULT), "0" (err))
 
 #ifdef __powerpc64__
 #define __get_user_asm2(x, addr, err)  \
-- 
2.25.0
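
To illustrate how the two patches combine (a minimal sketch, not taken from
the series; the helper name is made up): with the "m<>" constraint GCC may
select a D-form, indexed or pre-update address for the memory operand, and
the %U/%X output modifiers append the matching 'u'/'x' suffix to the mnemonic:

static inline void store_u32(unsigned int val, unsigned int *p)
{
	__asm__ __volatile__("stw%U0%X0 %1,%0" : "=m<>" (*p) : "r" (val));
}

Depending on how *p is addressed at the call site, this may assemble to
stw (D-form), stwx (indexed) or stwu (pre-update).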



[RFC PATCH v1 18/19] powerpc/signal32: Add and use unsafe_put_sigset_t()

2020-08-12 Thread Christophe Leroy
put_sigset_t() calls copy_to_user() for copying two words.

Because INLINE_COPY_TO_USER is not defined on powerpc,
copy_to_user() doesn't get optimised and falls back to
copy_tofrom_user() with the relevant glue. This is terribly
inefficient for copying two words.

By switching to unsafe_put_user(), we end up with something as
simple as:

 3cc:   81 3d 00 00 lwz r9,0(r29)
 3d0:   91 26 00 b4 stw r9,180(r6)
 3d4:   81 3d 00 04 lwz r9,4(r29)
 3d8:   91 26 00 b8 stw r9,184(r6)

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index d03ba3d8eb68..6cbff0293ff4 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -87,6 +87,8 @@ static inline int put_sigset_t(compat_sigset_t __user *uset, 
sigset_t *set)
return put_compat_sigset(uset, set, sizeof(*uset));
 }
 
+#define unsafe_put_sigset_t unsafe_put_compat_sigset
+
 static inline int get_sigset_t(sigset_t *set,
   const compat_sigset_t __user *uset)
 {
@@ -143,6 +145,13 @@ static inline int put_sigset_t(sigset_t __user *uset, 
sigset_t *set)
return copy_to_user(uset, set, sizeof(*uset));
 }
 
+#define unsafe_put_sigset_t(uset, set, label) do { \
+   sigset_t __user *__us = uset;   \
+   const sigset_t *__s = set;  \
+   \
+   unsafe_copy_to_user(__us, __s, sizeof(*__us), label);   \
+} while (0)
+
 static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset)
 {
return copy_from_user(set, uset, sizeof(*uset));
@@ -820,10 +829,11 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
{
unsafe_put_user(0, &rt_sf->uc.uc_link, failed);
}
+
+   unsafe_put_sigset_t(&rt_sf->uc.uc_sigmask, oldset, failed);
+
user_write_access_end();
 
-   if (put_sigset_t(&rt_sf->uc.uc_sigmask, oldset))
-   goto badframe;
if (copy_siginfo_to_user(&rt_sf->info, &ksig->info))
goto badframe;
 
-- 
2.25.0



[RFC PATCH v1 19/19] powerpc/signal32: Switch swap_context() to user_access_begin() logic

2020-08-12 Thread Christophe Leroy
As this was the last user of put_sigset_t(), remove it as well.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 24 ++--
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 6cbff0293ff4..399d823782cf 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -82,11 +82,6 @@
  * Functions for flipping sigsets (thanks to brain dead generic
  * implementation that makes things simple for little endian only)
  */
-static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set)
-{
-   return put_compat_sigset(uset, set, sizeof(*uset));
-}
-
 #define unsafe_put_sigset_t unsafe_put_compat_sigset
 
 static inline int get_sigset_t(sigset_t *set,
@@ -140,11 +135,6 @@ static inline int restore_general_regs(struct pt_regs 
*regs,
 
 #define GP_REGS_SIZE   min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
 
-static inline int put_sigset_t(sigset_t __user *uset, sigset_t *set)
-{
-   return copy_to_user(uset, set, sizeof(*uset));
-}
-
 #define unsafe_put_sigset_t(uset, set, label) do { \
sigset_t __user *__us = uset;   \
const sigset_t *__s = set;  \
@@ -1012,11 +1002,13 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, 
old_ctx,
 */
mctx = (struct mcontext __user *)
((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
-   if (!access_ok(old_ctx, ctx_size)
-   || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
-   || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
-   || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
+   if (save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region))
+   return -EFAULT;
+   if (!user_write_access_begin(old_ctx, ctx_size))
return -EFAULT;
+   unsafe_put_sigset_t(&old_ctx->uc_sigmask, &current->blocked, 
failed);
+   unsafe_put_user(to_user_ptr(mctx), &old_ctx->uc_regs, failed);
+   user_write_access_end();
}
if (new_ctx == NULL)
return 0;
@@ -1040,6 +1032,10 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, 
old_ctx,
 
set_thread_flag(TIF_RESTOREALL);
return 0;
+
+failed:
+   user_write_access_end();
+   return -EFAULT;
 }
 
 #ifdef CONFIG_PPC64
-- 
2.25.0



[RFC PATCH v1 17/19] signal: Add unsafe_put_compat_sigset()

2020-08-12 Thread Christophe Leroy
Implement 'unsafe' version of put_compat_sigset()

For big endian, use unsafe_put_user() directly
to avoid an intermediate copy through the stack.

For little endian, use a straight unsafe_copy_to_user().
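
Like the other unsafe_*() helpers, it must only be used between
user_write_access_begin()/user_write_access_end() with a local error label.
An illustrative caller (sketch; 'uset'/'set' assumed already validated):

	if (!user_write_access_begin(uset, sizeof(*uset)))
		return -EFAULT;
	unsafe_put_compat_sigset(uset, set, failed);
	user_write_access_end();
	return 0;

failed:
	user_write_access_end();
	return -EFAULT;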

Signed-off-by: Christophe Leroy 
Cc: Dmitry V. Levin 
Cc: Al Viro 
---
 include/linux/compat.h | 32 
 1 file changed, 32 insertions(+)

diff --git a/include/linux/compat.h b/include/linux/compat.h
index c4255d8a4a8a..bbe5f9658ed1 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -442,6 +442,38 @@ put_compat_sigset(compat_sigset_t __user *compat, const 
sigset_t *set,
 #endif
 }
 
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define unsafe_put_compat_sigset(compat, set, label) do {  \
+   compat_sigset_t __user *__c = compat;   \
+   const sigset_t *__s = set;  \
+   \
+   switch (_NSIG_WORDS) {  \
+   case 4: \
+   unsafe_put_user(__s->sig[3] >> 32, &__c->sig[7], label); \
+   unsafe_put_user(__s->sig[3], &__c->sig[6], label);   \
+   fallthrough;\
+   case 3: \
+   unsafe_put_user(__s->sig[2] >> 32, &__c->sig[5], label); \
+   unsafe_put_user(__s->sig[2], &__c->sig[4], label);   \
+   fallthrough;\
+   case 2: \
+   unsafe_put_user(__s->sig[1] >> 32, &__c->sig[3], label); \
+   unsafe_put_user(__s->sig[1], &__c->sig[2], label);   \
+   fallthrough;\
+   case 1: \
+   unsafe_put_user(__s->sig[0] >> 32, &__c->sig[1], label); \
+   unsafe_put_user(__s->sig[0], &__c->sig[0], label);   \
+   }   \
+} while (0)
+#else
+#define unsafe_put_compat_sigset(compat, set, label) do {  \
+   compat_sigset_t __user *__c = compat;   \
+   const sigset_t *__s = set;  \
+   \
+   unsafe_copy_to_user(__c, __s, sizeof(*__c), label); \
+} while (0)
+#endif
+
 extern int compat_ptrace_request(struct task_struct *child,
 compat_long_t request,
 compat_ulong_t addr, compat_ulong_t data);
-- 
2.25.0



[RFC PATCH v1 16/19] powerpc/signal32: Switch handle_rt_signal32() to user_access_begin() logic

2020-08-12 Thread Christophe Leroy
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 47 +
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 4ea83578ba9d..d03ba3d8eb68 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -58,8 +58,6 @@
 #define mcontext   mcontext32
 #define ucontext   ucontext32
 
-#define __save_altstack __compat_save_altstack
-
 /*
  * Userspace code may pass a ucontext which doesn't include VSX added
  * at the end.  We need to check for this case.
@@ -797,16 +795,36 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
rt_sf = get_sigframe(ksig, tsk, sizeof(*rt_sf), 1);
-   if (!access_ok(rt_sf, sizeof(*rt_sf)))
+   if (!user_write_access_begin(rt_sf, sizeof(*rt_sf)))
goto badframe;
 
/* Put the siginfo & fill in most of the ucontext */
-   if (copy_siginfo_to_user(&rt_sf->info, &ksig->info)
-   || __put_user(0, &rt_sf->uc.uc_flags)
-   || __save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1])
-   || __put_user(to_user_ptr(&rt_sf->uc.uc_mcontext),
-   &rt_sf->uc.uc_regs)
-   || put_sigset_t(&rt_sf->uc.uc_sigmask, oldset))
+   unsafe_put_user(0, &rt_sf->uc.uc_flags, failed);
+#ifdef CONFIG_PPC64
+   unsafe_compat_save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1], failed);
+#else
+   unsafe_save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1], failed);
+#endif
+   unsafe_put_user(to_user_ptr(&rt_sf->uc.uc_mcontext), 
&rt_sf->uc.uc_regs, failed);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   tm_frame = &rt_sf->uc_transact.uc_mcontext;
+   if (MSR_TM_ACTIVE(msr)) {
+   unsafe_put_user((unsigned long)&rt_sf->uc_transact,
+   &rt_sf->uc.uc_link, failed);
+   unsafe_put_user((unsigned long)tm_frame,
+   &rt_sf->uc_transact.uc_regs, failed);
+   }
+   else
+#endif
+   {
+   unsafe_put_user(0, &rt_sf->uc.uc_link, failed);
+   }
+   user_write_access_end();
+
+   if (put_sigset_t(&rt_sf->uc.uc_sigmask, oldset))
+   goto badframe;
+   if (copy_siginfo_to_user(&rt_sf->info, &ksig->info))
goto badframe;
 
/* Save user registers on the stack */
@@ -820,21 +838,13 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   tm_frame = _sf->uc_transact.uc_mcontext;
if (MSR_TM_ACTIVE(msr)) {
-   if (__put_user((unsigned long)&rt_sf->uc_transact,
-  &rt_sf->uc.uc_link) ||
-   __put_user((unsigned long)tm_frame,
-  &rt_sf->uc_transact.uc_regs))
-   goto badframe;
if (save_tm_user_regs(regs, frame, tm_frame, sigret, msr))
goto badframe;
}
else
 #endif
{
-   if (__put_user(0, &rt_sf->uc.uc_link))
-   goto badframe;
if (save_user_regs(regs, frame, tm_frame, sigret, 1))
goto badframe;
}
@@ -861,6 +871,9 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
regs->msr |= (MSR_KERNEL & MSR_LE);
return 0;
 
+failed:
+   user_write_access_end();
+
 badframe:
signal_fault(tsk, regs, "handle_rt_signal32", rt_sf);
 
-- 
2.25.0



[RFC PATCH v1 11/19] powerpc/signal32: Simplify logging in handle_rt_signal32()

2020-08-12 Thread Christophe Leroy
If something is bad in the frame, there is no point in
knowing which part of the frame exactly is wrong as it
got allocated as a single block.

Always print the root address of the frame in case of
failed user access, just like handle_signal32().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index d1087dd87174..495bee1b713d 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -754,7 +754,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
struct rt_sigframe __user *rt_sf;
struct mcontext __user *frame;
struct mcontext __user *tm_frame = NULL;
-   void __user *addr;
unsigned long newsp = 0;
int sigret;
unsigned long tramp;
@@ -769,7 +768,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
rt_sf = get_sigframe(ksig, tsk, sizeof(*rt_sf), 1);
-   addr = rt_sf;
if (!access_ok(rt_sf, sizeof(*rt_sf)))
goto badframe;
 
@@ -784,7 +782,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Save user registers on the stack */
frame = &rt_sf->uc.uc_mcontext;
-   addr = frame;
if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
sigret = 0;
tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp;
@@ -820,7 +817,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* create a stack frame for the caller of the handler */
newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16);
-   addr = (void __user *)regs->gpr[1];
if (put_user(regs->gpr[1], (u32 __user *)newsp))
goto badframe;
 
@@ -837,7 +833,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 0;
 
 badframe:
-   signal_fault(tsk, regs, "handle_rt_signal32", addr);
+   signal_fault(tsk, regs, "handle_rt_signal32", rt_sf);
 
return 1;
 }
-- 
2.25.0



[RFC PATCH v1 15/19] powerpc/signal32: Switch handle_signal32() to user_access_begin() logic

2020-08-12 Thread Christophe Leroy
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 0d076c2a9f6c..4ea83578ba9d 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1241,23 +1241,23 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
-   if (!access_ok(frame, sizeof(*frame)))
+   if (!user_write_access_begin(frame, sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) &frame->sctx;
 
 #if _NSIG != 64
 #error "Please adjust handle_signal()"
 #endif
-   if (__put_user(to_user_ptr(ksig->ka.sa.sa_handler), &sc->handler)
-   || __put_user(oldset->sig[0], &sc->oldmask)
+   unsafe_put_user(to_user_ptr(ksig->ka.sa.sa_handler), &sc->handler, 
failed);
+   unsafe_put_user(oldset->sig[0], &sc->oldmask, failed);
 #ifdef CONFIG_PPC64
-   || __put_user((oldset->sig[0] >> 32), &sc->_unused[3])
+   unsafe_put_user((oldset->sig[0] >> 32), &sc->_unused[3], failed);
 #else
-   || __put_user(oldset->sig[1], &sc->_unused[3])
+   unsafe_put_user(oldset->sig[1], &sc->_unused[3], failed);
 #endif
-   || __put_user(to_user_ptr(&frame->mctx), &sc->regs)
-   || __put_user(ksig->sig, &sc->signal))
-   goto badframe;
+   unsafe_put_user(to_user_ptr(&frame->mctx), &sc->regs, failed);
+   unsafe_put_user(ksig->sig, &sc->signal, failed);
+   user_write_access_end();
 
if (vdso32_sigtramp && tsk->mm->context.vdso_base) {
sigret = 0;
@@ -1300,6 +1300,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
regs->msr &= ~MSR_LE;
return 0;
 
+failed:
+   user_write_access_end();
+
 badframe:
signal_fault(tsk, regs, "handle_signal32", frame);
 
-- 
2.25.0



[RFC PATCH v1 14/19] powerpc/signal32: Switch save_user_regs() and save_tm_user_regs() to user_access_begin() logic

2020-08-12 Thread Christophe Leroy
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 168 
 1 file changed, 84 insertions(+), 84 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 2c3d5d4400ec..0d076c2a9f6c 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -98,7 +98,7 @@ static inline int get_sigset_t(sigset_t *set,
 #define to_user_ptr(p) ptr_to_compat(p)
 #define from_user_ptr(p)   compat_ptr(p)
 
-static inline int save_general_regs(struct pt_regs *regs,
+static __always_inline int save_general_regs(struct pt_regs *regs,
struct mcontext __user *frame)
 {
elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
@@ -113,10 +113,12 @@ static inline int save_general_regs(struct pt_regs *regs,
else
val = gregs[i];
 
-   if (__put_user(val, &frame->mc_gregs[i]))
-   return -EFAULT;
+   unsafe_put_user(val, &frame->mc_gregs[i], failed);
}
return 0;
+
+failed:
+   return 1;
 }
 
 static inline int restore_general_regs(struct pt_regs *regs,
@@ -151,11 +153,15 @@ static inline int get_sigset_t(sigset_t *set, const 
sigset_t __user *uset)
 #define to_user_ptr(p) ((unsigned long)(p))
 #define from_user_ptr(p)   ((void __user *)(p))
 
-static inline int save_general_regs(struct pt_regs *regs,
+static __always_inline int save_general_regs(struct pt_regs *regs,
struct mcontext __user *frame)
 {
WARN_ON(!FULL_REGS(regs));
-   return __copy_to_user(&frame->mc_gregs, regs, GP_REGS_SIZE);
+   unsafe_copy_to_user(&frame->mc_gregs, regs, GP_REGS_SIZE, failed);
+   return 0;
+
+failed:
+   return 1;
 }
 
 static inline int restore_general_regs(struct pt_regs *regs,
@@ -258,16 +264,18 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
flush_spe_to_thread(current);
 #endif
 
+   if (!user_write_access_begin(frame, sizeof(*frame)))
+   return 1;
+
/* save general registers */
if (save_general_regs(regs, frame))
-   return 1;
+   goto failed;
 
 #ifdef CONFIG_ALTIVEC
/* save altivec registers */
if (current->thread.used_vr) {
-   if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state,
-  ELF_NVRREG * sizeof(vector128)))
-   return 1;
+   unsafe_copy_to_user(&frame->mc_vregs, &current->thread.vr_state,
+   ELF_NVRREG * sizeof(vector128), failed);
/* set MSR_VEC in the saved MSR value to indicate that
   frame->mc_vregs contains valid data */
msr |= MSR_VEC;
@@ -280,11 +288,9 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 * most significant bits of that same vector. --BenH
 * Note that the current VRSAVE value is in the SPR at this point.
 */
-   if (__put_user(current->thread.vrsave, (u32 __user 
*)&frame->mc_vregs[32]))
-   return 1;
+   unsafe_put_user(current->thread.vrsave, (u32 __user 
*)&frame->mc_vregs[32], failed);
 #endif /* CONFIG_ALTIVEC */
-   if (copy_fpr_to_user(&frame->mc_fregs, current))
-   return 1;
+   unsafe_copy_fpr_to_user(&frame->mc_fregs, current, failed);
 
/*
 * Clear the MSR VSX bit to indicate there is no valid state attached
@@ -299,17 +305,15 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 * contains valid data
 */
if (current->thread.used_vsr && ctx_has_vsx_region) {
-   if (copy_vsx_to_user(&frame->mc_vsregs, current))
-   return 1;
+   unsafe_copy_vsx_to_user(&frame->mc_vsregs, current, failed);
msr |= MSR_VSX;
}
 #endif /* CONFIG_VSX */
 #ifdef CONFIG_SPE
/* save spe registers */
if (current->thread.used_spe) {
-   if (__copy_to_user(&frame->mc_vregs, current->thread.evr,
-  ELF_NEVRREG * sizeof(u32)))
-   return 1;
+   unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr,
+   ELF_NEVRREG * sizeof(u32), failed);
/* set MSR_SPE in the saved MSR value to indicate that
   frame->mc_vregs contains valid data */
msr |= MSR_SPE;
@@ -317,19 +321,18 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
/* else assert((regs->msr & MSR_SPE) == 0) */
 
/* We always copy to/from spefscr */
-   if (__put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs 
+ ELF_NEVRREG))
-   return 1;
+   unsafe_put_user(current->thread.spefscr, (u32 __user *)&frame->mc_vregs 
+ ELF_NEVRREG, failed);
 #endif /* CONFIG_SPE */
 
-   if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
-   return 1;
+   

[RFC PATCH v1 13/19] powerpc/signal32: Create 'unsafe' versions of copy_[ck][fpr/vsx]_to_user()

2020-08-12 Thread Christophe Leroy
For the non VSX version, that's trivial. Just use unsafe_copy_to_user()
instead of __copy_to_user().

For the VSX version, remove the intermediate step through a buffer and
use unsafe_put_user() directly. This generates a far smaller code which
is acceptable to inline, see below:

Standard VSX version:

 <.copy_fpr_to_user>:
   0:   7c 08 02 a6 mflr r0
   4:   fb e1 ff f8 std r31,-8(r1)
   8:   39 00 00 20 li  r8,32
   c:   39 24 0b 80 addi r9,r4,2944
  10:   7d 09 03 a6 mtctr   r8
  14:   f8 01 00 10 std r0,16(r1)
  18:   f8 21 fe 71 stdu r1,-400(r1)
  1c:   39 41 00 68 addi r10,r1,104
  20:   e9 09 00 00 ld  r8,0(r9)
  24:   39 4a 00 08 addi r10,r10,8
  28:   39 29 00 10 addi r9,r9,16
  2c:   f9 0a 00 00 std r8,0(r10)
  30:   42 00 ff f0 bdnz20 <.copy_fpr_to_user+0x20>
  34:   e9 24 0d 80 ld  r9,3456(r4)
  38:   3d 42 00 00 addis   r10,r2,0
3a: R_PPC64_TOC16_HA.toc
  3c:   eb ea 00 00 ld  r31,0(r10)
3e: R_PPC64_TOC16_LO_DS .toc
  40:   f9 21 01 70 std r9,368(r1)
  44:   e9 3f 00 00 ld  r9,0(r31)
  48:   81 29 00 20 lwz r9,32(r9)
  4c:   2f 89 00 00 cmpwi   cr7,r9,0
  50:   40 9c 00 18 bge cr7,68 <.copy_fpr_to_user+0x68>
  54:   4c 00 01 2c isync
  58:   3d 20 40 00 lis r9,16384
  5c:   79 29 07 c6 rldicr  r9,r9,32,31
  60:   7d 3d 03 a6 mtspr   29,r9
  64:   4c 00 01 2c isync
  68:   38 a0 01 08 li  r5,264
  6c:   38 81 00 70 addir4,r1,112
  70:   48 00 00 01 bl  70 <.copy_fpr_to_user+0x70>
70: R_PPC64_REL24   .__copy_tofrom_user
  74:   60 00 00 00 nop
  78:   e9 3f 00 00 ld  r9,0(r31)
  7c:   81 29 00 20 lwz r9,32(r9)
  80:   2f 89 00 00 cmpwi   cr7,r9,0
  84:   40 9c 00 18 bge cr7,9c <.copy_fpr_to_user+0x9c>
  88:   4c 00 01 2c isync
  8c:   39 20 ff ff li  r9,-1
  90:   79 29 00 44 rldicr  r9,r9,0,1
  94:   7d 3d 03 a6 mtspr   29,r9
  98:   4c 00 01 2c isync
  9c:   38 21 01 90 addi r1,r1,400
  a0:   e8 01 00 10 ld  r0,16(r1)
  a4:   eb e1 ff f8 ld  r31,-8(r1)
  a8:   7c 08 03 a6 mtlr r0
  ac:   4e 80 00 20 blr

'unsafe' simulated VSX version (The ... are only nops) using
unsafe_copy_fpr_to_user() macro:

unsigned long copy_fpr_to_user(void __user *to,
   struct task_struct *task)
{
unsafe_copy_fpr_to_user(to, task, failed);
return 0;
failed:
return 1;
}

 <.copy_fpr_to_user>:
   0:   39 00 00 20 li  r8,32
   4:   39 44 0b 80 addi r10,r4,2944
   8:   7d 09 03 a6 mtctr   r8
   c:   7c 69 1b 78 mr  r9,r3
...
  20:   e9 0a 00 00 ld  r8,0(r10)
  24:   f9 09 00 00 std r8,0(r9)
  28:   39 4a 00 10 addi r10,r10,16
  2c:   39 29 00 08 addi r9,r9,8
  30:   42 00 ff f0 bdnz20 <.copy_fpr_to_user+0x20>
  34:   e9 24 0d 80 ld  r9,3456(r4)
  38:   f9 23 01 00 std r9,256(r3)
  3c:   38 60 00 00 li  r3,0
  40:   4e 80 00 20 blr
...
  50:   38 60 00 01 li  r3,1
  54:   4e 80 00 20 blr

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.h | 47 
 1 file changed, 47 insertions(+)

diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index f610cfafa478..abe570f06c12 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -32,7 +32,49 @@ unsigned long copy_fpr_to_user(void __user *to, struct 
task_struct *task);
 unsigned long copy_ckfpr_to_user(void __user *to, struct task_struct *task);
 unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from);
 unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user 
*from);
+
+#define unsafe_copy_fpr_to_user(to, task, label)   do {\
+   u64 __user *buf = (u64 __user *)to; \
+   int i;  \
+   \
+   for (i = 0; i < ELF_NFPREG - 1 ; i++)   \
+   unsafe_put_user(task->thread.TS_FPR(i), &buf[i], label);\
+   unsafe_put_user(task->thread.fp_state.fpscr, &buf[i], label);   \
+} while (0)
+
+#define unsafe_copy_vsx_to_user(to, task, label)   do {\
+   u64 __user *buf = (u64 __user *)to; \
+   int i;  \
+   \
+   for (i = 0; i < ELF_NVSRHALFREG ; i++)  \
+   unsafe_put_user(task->thread.fp_state.fpr[i][TS_VSRLOWOFFSET], \
+   &buf[i], label);\
+} while (0)
+
+#ifdef 

[RFC PATCH v1 12/19] powerpc/signal32: Regroup copies in save_user_regs() and save_tm_user_regs()

2020-08-12 Thread Christophe Leroy
Reorder actions in save_user_regs() and save_tm_user_regs() to
regroup copies together in order to switch to user_access_begin()
logic in a later patch.

In save_tm_user_regs(), first perform copies to frame, then
perform copies to tm_frame.
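
The target shape (sketch of where the series is heading, names as in patch 14
of this series) is: all flush_*_to_thread() style work first, then a single
user access window around nothing but copies:

	flush_fp_to_thread(current);
	/* ... remaining flushes ... */

	if (!user_write_access_begin(frame, sizeof(*frame)))
		return 1;
	/* only unsafe_*() accesses in here */
	user_write_access_end();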

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 153 +++-
 1 file changed, 91 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 495bee1b713d..2c3d5d4400ec 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -243,6 +243,20 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 
/* Make sure floating point registers are stored in regs */
flush_fp_to_thread(current);
+#ifdef CONFIG_ALTIVEC
+   if (current->thread.used_vr)
+   flush_altivec_to_thread(current);
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   current->thread.vrsave = mfspr(SPRN_VRSAVE);
+#endif
+#ifdef CONFIG_VSX
+   if (current->thread.used_vsr && ctx_has_vsx_region)
+   flush_vsx_to_thread(current);
+#endif
+#ifdef CONFIG_SPE
+   if (current->thread.used_spe)
+   flush_spe_to_thread(current);
+#endif
 
/* save general registers */
if (save_general_regs(regs, frame))
@@ -251,7 +265,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 #ifdef CONFIG_ALTIVEC
/* save altivec registers */
if (current->thread.used_vr) {
-   flush_altivec_to_thread(current);
-   if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state,
   ELF_NVRREG * sizeof(vector128)))
return 1;
@@ -267,8 +280,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 * most significant bits of that same vector. --BenH
 * Note that the current VRSAVE value is in the SPR at this point.
 */
-   if (cpu_has_feature(CPU_FTR_ALTIVEC))
-   current->thread.vrsave = mfspr(SPRN_VRSAVE);
if (__put_user(current->thread.vrsave, (u32 __user 
*)&frame->mc_vregs[32]))
return 1;
 #endif /* CONFIG_ALTIVEC */
@@ -288,7 +299,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 * contains valid data
 */
if (current->thread.used_vsr && ctx_has_vsx_region) {
-   flush_vsx_to_thread(current);
-   if (copy_vsx_to_user(&frame->mc_vsregs, current))
return 1;
msr |= MSR_VSX;
@@ -297,7 +307,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 #ifdef CONFIG_SPE
/* save spe registers */
if (current->thread.used_spe) {
-   flush_spe_to_thread(current);
-   if (__copy_to_user(&frame->mc_vregs, current->thread.evr,
   ELF_NEVRREG * sizeof(u32)))
return 1;
@@ -314,20 +323,22 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 
if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
return 1;
-   /* We need to write 0 the MSR top 32 bits in the tm frame so that we
-* can check it on the restore to see if TM is active
-*/
-   if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
-   return 1;
 
if (sigret) {
/* Set up the sigreturn trampoline: li 0,sigret; sc */
if (__put_user(PPC_INST_ADDI + sigret, &frame->tramp[0])
|| __put_user(PPC_INST_SC, &frame->tramp[1]))
return 1;
+   }
+   if (sigret)
flush_icache_range((unsigned long) &frame->tramp[0],
   (unsigned long) &frame->tramp[2]);
-   }
+
+   /* We need to write 0 the MSR top 32 bits in the tm frame so that we
+* can check it on the restore to see if TM is active
+*/
+   if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
+   return 1;
 
return 0;
 }
@@ -349,18 +360,16 @@ static int save_tm_user_regs(struct pt_regs *regs,
 {
WARN_ON(tm_suspend_disabled);
 
-   /* Save both sets of general registers */
-   if (save_general_regs(&current->thread.ckpt_regs, frame)
-   || save_general_regs(regs, tm_frame))
-   return 1;
+#ifdef CONFIG_ALTIVEC
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   current->thread.ckvrsave = mfspr(SPRN_VRSAVE);
+#endif
+#ifdef CONFIG_SPE
+   if (current->thread.used_spe)
+   flush_spe_to_thread(current);
+#endif
 
-   /* Stash the top half of the 64bit MSR into the 32bit MSR word
-* of the transactional mcontext.  This way we have a 
backward-compatible
-* MSR in the 'normal' (checkpointed) mcontext and additionally one can
-* also look at what type of transaction (T or S) was 

[RFC PATCH v1 05/19] powerpc/signal: Don't manage floating point regs when no FPU

2020-08-12 Thread Christophe Leroy
There is no point in copying floating point regs when there
is no FPU and MATH_EMULATION is not selected.

Create a new CONFIG_PPC_FPU_REGS bool that is selected by
CONFIG_MATH_EMULATION and CONFIG_PPC_FPU, and use it to
opt out everything related to fp_state in thread_struct.

The asm consts used only by fpu.S are opted out with CONFIG_PPC_FPU,
as the fpu.S build is conditional on CONFIG_PPC_FPU.

The following app spends approx 8.1 seconds system time on
an 8xx without the patch, and 7.0 seconds with the patch.

#include <signal.h>
#include <stdlib.h>

void sigusr1(int sig) { }

int main(int argc, char **argv)
{
int i = 10;

signal(SIGUSR1, sigusr1);
for (;i--;)
raise(SIGUSR1);
exit(0);
}

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/include/asm/processor.h |  2 ++
 arch/powerpc/kernel/asm-offsets.c|  2 ++
 arch/powerpc/kernel/process.c|  4 
 arch/powerpc/kernel/ptrace/Makefile  |  4 ++--
 arch/powerpc/kernel/ptrace/ptrace-decl.h | 14 ++
 arch/powerpc/kernel/ptrace/ptrace-view.c |  2 ++
 arch/powerpc/kernel/signal.h | 14 +-
 arch/powerpc/kernel/signal_32.c  |  4 
 arch/powerpc/kernel/traps.c  |  2 ++
 arch/powerpc/platforms/Kconfig.cputype   |  4 
 11 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f48bbfb3ce9..a2611880b904 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -416,6 +416,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 config MATH_EMULATION
bool "Math emulation"
depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
+   select PPC_FPU_REGS
help
  Some PowerPC chips designed for embedded applications do not have
  a floating-point unit and therefore do not implement the
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index ed0d633ab5aa..e20b0c5abe62 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -175,8 +175,10 @@ struct thread_struct {
 #endif
/* Debug Registers */
struct debug_reg debug;
+#ifdef CONFIG_PPC_FPU_REGS
struct thread_fp_state  fp_state;
struct thread_fp_state  *fp_save_area;
+#endif
int fpexc_mode; /* floating-point exception mode */
unsigned intalign_ctl;  /* alignment handling control */
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 8711c2164b45..6cb36c341c70 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -110,9 +110,11 @@ int main(void)
 #ifdef CONFIG_BOOKE
OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]);
 #endif
+#ifdef CONFIG_PPC_FPU
OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode);
OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr);
OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area);
+#endif
OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr);
OFFSET(THREAD_LOAD_FP, thread_struct, load_fp);
 #ifdef CONFIG_ALTIVEC
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 016bd831908e..7e0082ac0a39 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1694,7 +1694,9 @@ int copy_thread(unsigned long clone_flags, unsigned long 
usp,
p->thread.ptrace_bps[i] = NULL;
 #endif
 
+#ifdef CONFIG_PPC_FPU_REGS
p->thread.fp_save_area = NULL;
+#endif
 #ifdef CONFIG_ALTIVEC
p->thread.vr_save_area = NULL;
 #endif
@@ -1821,8 +1823,10 @@ void start_thread(struct pt_regs *regs, unsigned long 
start, unsigned long sp)
 #endif
current->thread.load_slb = 0;
current->thread.load_fp = 0;
+#ifdef CONFIG_PPC_FPU_REGS
memset(>thread.fp_state, 0, sizeof(current->thread.fp_state));
current->thread.fp_save_area = NULL;
+#endif
 #ifdef CONFIG_ALTIVEC
memset(>thread.vr_state, 0, sizeof(current->thread.vr_state));
current->thread.vr_state.vscr.u[3] = 0x0001; /* Java mode disabled 
*/
diff --git a/arch/powerpc/kernel/ptrace/Makefile 
b/arch/powerpc/kernel/ptrace/Makefile
index 77abd1a5a508..8ebc11d1168d 100644
--- a/arch/powerpc/kernel/ptrace/Makefile
+++ b/arch/powerpc/kernel/ptrace/Makefile
@@ -6,11 +6,11 @@
 CFLAGS_ptrace-view.o   += -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
 obj-y  += ptrace.o ptrace-view.o
-obj-y  += ptrace-fpu.o
+obj-$(CONFIG_PPC_FPU_REGS) += ptrace-fpu.o
 obj-$(CONFIG_COMPAT)   += ptrace32.o
 obj-$(CONFIG_VSX)  += ptrace-vsx.o
 ifneq ($(CONFIG_VSX),y)
-obj-y  += ptrace-novsx.o
+obj-$(CONFIG_PPC_FPU_REGS) += ptrace-novsx.o
 endif
 obj-$(CONFIG_ALTIVEC)  += ptrace-altivec.o
 

[RFC PATCH v1 10/19] powerpc/signal: Refactor bad frame logging

2020-08-12 Thread Christophe Leroy
The logging of a bad frame appears half a dozen times
and is pretty similar.

Create a signal_fault() function to perform that logging.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.c| 11 +++
 arch/powerpc/kernel/signal.h|  3 +++
 arch/powerpc/kernel/signal_32.c | 35 +
 arch/powerpc/kernel/signal_64.c | 15 ++
 4 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 5edded5c5d20..53b4987a45b5 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -355,3 +355,14 @@ static unsigned long get_tm_stackpointer(struct 
task_struct *tsk)
 #endif
return ret;
 }
+
+static const char fmt32[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %08lx 
lr %08lx\n";
+static const char fmt64[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %016lx 
lr %016lx\n";
+
+void signal_fault(struct task_struct *tsk, struct pt_regs *regs,
+ const char *where, void __user *ptr)
+{
+   if (show_unhandled_signals)
+   printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, 
tsk->comm,
+   task_pid_nr(tsk), where, ptr, regs->nip, 
regs->link);
+}
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index fb98731348c3..f610cfafa478 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -93,4 +93,7 @@ static inline int handle_rt_signal64(struct ksignal *ksig, 
sigset_t *set,
 
 #endif /* !defined(CONFIG_PPC64) */
 
+void signal_fault(struct task_struct *tsk, struct pt_regs *regs,
+ const char *where, void __user *ptr);
+
 #endif  /* _POWERPC_ARCH_SIGNAL_H */
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 3356f6aba4ae..d1087dd87174 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -837,12 +837,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 0;
 
 badframe:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in handle_rt_signal32: "
-  "%p nip %08lx lr %08lx\n",
-  tsk->comm, tsk->pid,
-  addr, regs->nip, regs->link);
+   signal_fault(tsk, regs, "handle_rt_signal32", addr);
 
return 1;
 }
@@ -1094,12 +1089,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
return 0;
 
  bad:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in sys_rt_sigreturn: "
-  "%p nip %08lx lr %08lx\n",
-  current->comm, current->pid,
-  rt_sf, regs->nip, regs->link);
+   signal_fault(current, regs, "sys_rt_sigreturn", rt_sf);
 
force_sig(SIGSEGV);
return 0;
@@ -1183,12 +1173,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user 
*, ctx,
 * We kill the task with a SIGSEGV in this situation.
 */
if (do_setcontext(ctx, regs, 1)) {
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO "%s[%d]: bad frame in "
-  "sys_debug_setcontext: %p nip %08lx "
-  "lr %08lx\n",
-  current->comm, current->pid,
-  ctx, regs->nip, regs->link);
+   signal_fault(current, regs, "sys_debug_setcontext", ctx);
 
force_sig(SIGSEGV);
goto out;
@@ -1291,12 +1276,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 0;
 
 badframe:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in handle_signal32: "
-  "%p nip %08lx lr %08lx\n",
-  tsk->comm, tsk->pid,
-  frame, regs->nip, regs->link);
+   signal_fault(tsk, regs, "handle_signal32", frame);
 
return 1;
 }
@@ -1367,12 +1347,7 @@ SYSCALL_DEFINE0(sigreturn)
return 0;
 
 badframe:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in sys_sigreturn: "
-  "%p nip %08lx lr %08lx\n",
-  current->comm, current->pid,
-  addr, regs->nip, regs->link);
+   signal_fault(current, regs, "sys_sigreturn", addr);
 
force_sig(SIGSEGV);
return 0;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 92d152c1155c..a10b0bb14131 100644
--- 

[RFC PATCH v1 09/19] powerpc/signal: Call get_tm_stackpointer() from get_sigframe()

2020-08-12 Thread Christophe Leroy
Instead of calling get_tm_stackpointer() from the caller, call it
directly from get_sigframe(). This avoids a double call and
allows get_tm_stackpointer() to become static and be inlined
into get_sigframe() by GCC.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.c| 9 ++---
 arch/powerpc/kernel/signal.h| 6 ++
 arch/powerpc/kernel/signal_32.c | 4 ++--
 arch/powerpc/kernel/signal_64.c | 2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index a295d482adec..5edded5c5d20 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -144,10 +144,13 @@ int show_unhandled_signals = 1;
 /*
  * Allocate space for the signal frame
  */
-void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
-  size_t frame_size, int is_32)
+static unsigned long get_tm_stackpointer(struct task_struct *tsk);
+
+void __user *get_sigframe(struct ksignal *ksig, struct task_struct *tsk,
+ size_t frame_size, int is_32)
 {
 unsigned long oldsp, newsp;
+   unsigned long sp = get_tm_stackpointer(tsk);
 
 /* Default to using normal stack */
if (is_32)
@@ -304,7 +307,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long 
thread_info_flags)
user_enter();
 }
 
-unsigned long get_tm_stackpointer(struct task_struct *tsk)
+static unsigned long get_tm_stackpointer(struct task_struct *tsk)
 {
/* When in an active transaction that takes a signal, we need to be
 * careful with the stack.  It's possible that the stack has moved back
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index 6c2a33ab042c..fb98731348c3 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -10,8 +10,8 @@
 #ifndef _POWERPC_ARCH_SIGNAL_H
 #define _POWERPC_ARCH_SIGNAL_H
 
-extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
- size_t frame_size, int is_32);
+void __user *get_sigframe(struct ksignal *ksig, struct task_struct *tsk,
+ size_t frame_size, int is_32);
 
 extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
   struct task_struct *tsk);
@@ -19,8 +19,6 @@ extern int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
  struct task_struct *tsk);
 
-extern unsigned long get_tm_stackpointer(struct task_struct *tsk);
-
 #ifdef CONFIG_VSX
 extern unsigned long copy_vsx_to_user(void __user *to,
  struct task_struct *task);
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 5a838188a181..3356f6aba4ae 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -768,7 +768,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
-   rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1);
+   rt_sf = get_sigframe(ksig, tsk, sizeof(*rt_sf), 1);
addr = rt_sf;
if (!access_ok(rt_sf, sizeof(*rt_sf)))
goto badframe;
@@ -1230,7 +1230,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
BUG_ON(tsk != current);
 
/* Set up Signal Frame */
-   frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1);
+   frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) &frame->sctx;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index ec259a0efe24..92d152c1155c 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -824,7 +824,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 
BUG_ON(tsk != current);
 
-   frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
+   frame = get_sigframe(ksig, tsk, sizeof(*frame), 0);
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
 
-- 
2.25.0
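
For reference, the pattern this patch uses, a forward declaration so that
an earlier caller can reach a static function defined later in the same
file, reduces to the following sketch (hypothetical names, not the kernel
code):

static unsigned long get_sp(void);		/* forward declaration */

static unsigned long frame_of(unsigned long size)
{
	/* same translation unit + static: the compiler may inline get_sp() */
	return get_sp() - size;
}

static unsigned long get_sp(void)
{
	return 0xc0de0000UL;
}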



[RFC PATCH v1 06/19] powerpc/32s: Allow deselecting CONFIG_PPC_FPU on mpc832x

2020-08-12 Thread Christophe Leroy
The e300c2 core, which is embedded in the mpc832x CPU, doesn't
have an FPU.

Make it possible to not select CONFIG_PPC_FPU when building a
kernel dedicated to that target.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/platforms/Kconfig.cputype | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 40ffcdba42b8..d4fd109f177e 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -32,7 +32,7 @@ choice
 config PPC_BOOK3S_6xx
bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601"
select PPC_BOOK3S_32
-   select PPC_FPU
+   imply PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
@@ -229,9 +229,16 @@ config PPC_FPU_REGS
bool
 
 config PPC_FPU
-   bool
+   bool "Support for Floating Point Unit (FPU)" if PPC_MPC832x
default y if PPC64
select PPC_FPU_REGS
+	help
+	  This must be enabled to support the Floating Point Unit.
+	  Most 6xx cores have an FPU, but the e300c2 core (mpc832x)
+	  doesn't, so when building an embedded kernel for that
+	  target you can disable FPU support.
+
+	  If unsure say Y.
 
 config FSL_EMB_PERFMON
bool "Freescale Embedded Perfmon"
-- 
2.25.0
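
For readers unfamiliar with the distinction this patch relies on: 'select'
forces the target symbol on unconditionally, while 'imply' only sets its
default and leaves it user-visible. A minimal sketch with made-up symbols:

config FOO_PLATFORM
	bool "Foo platform"
	imply BAR_FEATURE

config BAR_FEATURE
	bool "Bar feature"
	help
	  Defaults to y when FOO_PLATFORM is set, but unlike a
	  'select' the user can still turn it off.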



[RFC PATCH v1 07/19] powerpc/signal: Move access_ok() out of get_sigframe()

2020-08-12 Thread Christophe Leroy
This access_ok() will soon be performed by user_access_begin(),
so move it out of get_sigframe().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.c| 4 
 arch/powerpc/kernel/signal_32.c | 4 ++--
 arch/powerpc/kernel/signal_64.c | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 3b56db02b762..1be5fd01f866 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -154,10 +154,6 @@ void __user *get_sigframe(struct ksignal *ksig, unsigned 
long sp,
oldsp = sigsp(oldsp, ksig);
newsp = (oldsp - frame_size) & ~0xFUL;
 
-   /* Check access */
-   if (!access_ok((void __user *)newsp, oldsp - newsp))
-   return NULL;
-
 return (void __user *)newsp;
 }
 
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 7b291707eb31..5a838188a181 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -770,7 +770,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
/* Put a Real Time Context onto stack */
rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1);
addr = rt_sf;
-   if (unlikely(rt_sf == NULL))
+   if (!access_ok(rt_sf, sizeof(*rt_sf)))
goto badframe;
 
/* Put the siginfo & fill in most of the ucontext */
@@ -1231,7 +1231,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1);
-   if (unlikely(frame == NULL))
+   if (!access_ok(frame, sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) &frame->sctx;
 
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index bfc939360bad..ec259a0efe24 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -825,7 +825,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
BUG_ON(tsk != current);
 
frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
-   if (unlikely(frame == NULL))
+   if (!access_ok(frame, sizeof(*frame)))
goto badframe;
 
err |= __put_user(>info, >pinfo);
-- 
2.25.0
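
For context, the user_access_begin() pattern this prepares for looks
roughly like the sketch below; the field names are made up and the real
conversion comes later in the series:

	if (!user_access_begin(frame, sizeof(*frame)))
		goto badframe;
	/* accesses inside the window skip the per-access KUAP unlock/lock */
	unsafe_put_user(regs->nip, &frame->nip, failed);
	unsafe_put_user(regs->link, &frame->link, failed);
	user_access_end();
	return 0;

failed:
	user_access_end();
badframe:
	return 1;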



[RFC PATCH v1 08/19] powerpc/signal: Remove get_clean_sp()

2020-08-12 Thread Christophe Leroy
get_clean_sp() is only used once in kernel/signal.c.

And GCC is smart enough to see that x & 0xffffffff is a nop
calculation on PPC32, so there is no need for a special PPC32
trivial version.

Include the logic from the PPC64 version of get_clean_sp() directly
in get_sigframe().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/processor.h | 14 --
 arch/powerpc/kernel/signal.c |  5 -
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index e20b0c5abe62..8320aedbdca3 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -406,20 +406,6 @@ static inline void prefetchw(const void *x)
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
 
-#ifdef CONFIG_PPC64
-static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
-{
-   if (is_32)
-   return sp & 0x0ffffffffUL;
-   return sp;
-}
-#else
-static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
-{
-   return sp;
-}
-#endif
-
 /* asm stubs */
 extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val);
 extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 1be5fd01f866..a295d482adec 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -150,7 +150,10 @@ void __user *get_sigframe(struct ksignal *ksig, unsigned 
long sp,
 unsigned long oldsp, newsp;
 
 /* Default to using normal stack */
-oldsp = get_clean_sp(sp, is_32);
+   if (is_32)
+   oldsp = sp & 0x0ffffffffUL;
+   else
+   oldsp = sp;
oldsp = sigsp(oldsp, ksig);
newsp = (oldsp - frame_size) & ~0xFUL;
 
-- 
2.25.0
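
The claim about GCC is easy to check with a standalone sketch: on a
32-bit target unsigned long has only 32 bits, so the AND below compiles
to nothing, while on PPC64 it truncates to the low word:

unsigned long clean_sp(unsigned long sp, int is_32)
{
	/* a no-op when unsigned long is 32 bits wide */
	return is_32 ? sp & 0xffffffffUL : sp;
}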



[RFC PATCH v1 01/19] powerpc/signal: Move inline functions in signal.h

2020-08-12 Thread Christophe Leroy
To really be inlined, the functions need to be defined in the
same C file as the caller, or in an included header.

Move the functions defined inline in signal.c into signal.h

Signed-off-by: Christophe Leroy 
Fixes: 3dd4eb83a9c0 ("powerpc: move common register copy functions from 
signal_32.c to signal.c")
---
 arch/powerpc/kernel/signal.c | 30 --
 arch/powerpc/kernel/signal.h | 41 +---
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index d15a98c758b8..3b56db02b762 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -133,36 +133,6 @@ unsigned long copy_ckvsx_from_user(struct task_struct 
*task,
return 0;
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-#else
-inline unsigned long copy_fpr_to_user(void __user *to,
- struct task_struct *task)
-{
-   return __copy_to_user(to, task->thread.fp_state.fpr,
- ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_fpr_from_user(struct task_struct *task,
-   void __user *from)
-{
-   return __copy_from_user(task->thread.fp_state.fpr, from,
- ELF_NFPREG * sizeof(double));
-}
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-inline unsigned long copy_ckfpr_to_user(void __user *to,
-struct task_struct *task)
-{
-   return __copy_to_user(to, task->thread.ckfp_state.fpr,
- ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
-void __user *from)
-{
-   return __copy_from_user(task->thread.ckfp_state.fpr, from,
-   ELF_NFPREG * sizeof(double));
-}
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 #endif
 
 /* Log an error when sending an unhandled signal to a process. Controlled
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index d396efca4068..4626d39cc0f0 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -19,14 +19,6 @@ extern int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
  struct task_struct *tsk);
 
-extern unsigned long copy_fpr_to_user(void __user *to,
- struct task_struct *task);
-extern unsigned long copy_ckfpr_to_user(void __user *to,
-  struct task_struct *task);
-extern unsigned long copy_fpr_from_user(struct task_struct *task,
-   void __user *from);
-extern unsigned long copy_ckfpr_from_user(struct task_struct *task,
-void __user *from);
 extern unsigned long get_tm_stackpointer(struct task_struct *tsk);
 
 #ifdef CONFIG_VSX
@@ -38,6 +30,39 @@ extern unsigned long copy_vsx_from_user(struct task_struct 
*task,
void __user *from);
 extern unsigned long copy_ckvsx_from_user(struct task_struct *task,
 void __user *from);
+unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task);
+unsigned long copy_ckfpr_to_user(void __user *to, struct task_struct *task);
+unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from);
+unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user 
*from);
+#else
+static inline unsigned long
+copy_fpr_to_user(void __user *to, struct task_struct *task)
+{
+   return __copy_to_user(to, task->thread.fp_state.fpr,
+ ELF_NFPREG * sizeof(double));
+}
+
+static inline unsigned long
+copy_fpr_from_user(struct task_struct *task, void __user *from)
+{
+   return __copy_from_user(task->thread.fp_state.fpr, from,
+ ELF_NFPREG * sizeof(double));
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline unsigned long
+copy_ckfpr_to_user(void __user *to, struct task_struct *task)
+{
+   return __copy_to_user(to, task->thread.ckfp_state.fpr,
+ ELF_NFPREG * sizeof(double));
+}
+
+static inline unsigned long
+copy_ckfpr_from_user(struct task_struct *task, void __user *from)
+{
+   return __copy_from_user(task->thread.ckfp_state.fpr, from,
+   ELF_NFPREG * sizeof(double));
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 #endif
 
 #ifdef CONFIG_PPC64
-- 
2.25.0
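
The C rule behind this patch: a function defined with a bare 'inline' in
one .c file provides no inlinable definition to other translation units;
the portable idiom is 'static inline' in a shared header, e.g. (sketch):

/* some_header.h - every includer gets its own inlinable copy */
static inline unsigned long double_it(unsigned long v)
{
	return v * 2;
}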



[RFC PATCH v1 02/19] powerpc/ptrace: Move declaration of ptrace_get_reg() and ptrace_set_reg()

2020-08-12 Thread Christophe Leroy
ptrace_get_reg() and ptrace_put_reg() are only used internally by
ptrace.

Move them into arch/powerpc/kernel/ptrace/ptrace-decl.h

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/ptrace.h| 6 --
 arch/powerpc/kernel/ptrace/ptrace-decl.h | 3 +++
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index 155a197c0aa1..3c3cf537c3bf 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -171,12 +171,6 @@ static inline void regs_set_return_value(struct pt_regs 
*regs, unsigned long rc)
set_thread_flag(TIF_NOERROR); \
} while(0)
 
-struct task_struct;
-extern int ptrace_get_reg(struct task_struct *task, int regno,
- unsigned long *data);
-extern int ptrace_put_reg(struct task_struct *task, int regno,
- unsigned long data);
-
 #define current_pt_regs() \
((struct pt_regs *)((unsigned long)task_stack_page(current) + 
THREAD_SIZE) - 1)
 
diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h 
b/arch/powerpc/kernel/ptrace/ptrace-decl.h
index 67447a6197eb..2ddc68412fa8 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-decl.h
+++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h
@@ -159,6 +159,9 @@ int tm_cgpr32_set(struct task_struct *target, const struct 
user_regset *regset,
 
 /* ptrace-view */
 
+int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data);
+int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data);
+
 extern const struct user_regset_view user_ppc_native_view;
 
 /* ptrace-(no)adv */
-- 
2.25.0



[RFC PATCH v1 00/19] powerpc: Switch signal 32 to using user_access_begin() and friends

2020-08-12 Thread Christophe Leroy
This series replaces copies to user space with unsafe_put_user()
and friends, using the user_write_access_begin() dance, in signal32.

The advantages are:
- No KUAP unlock/lock at every copy
- More readable code.
- Better generated code.

Copying Al Viro, who did it on x86 and may have suggestions,
and Dmitry V. Levin, who introduced put_compat_sigset().

Christophe Leroy (19):
  powerpc/signal: Move inline functions in signal.h
  powerpc/ptrace: Move declaration of ptrace_get_reg() and
ptrace_set_reg()
  powerpc/ptrace: Consolidate reg index calculation
  powerpc/ptrace: Create ptrace_get_fpr() and ptrace_put_fpr()
  powerpc/signal: Don't manage floating point regs when no FPU
  powerpc/32s: Allow deselecting CONFIG_PPC_FPU on mpc832x
  powerpc/signal: Move access_ok() out of get_sigframe()
  powerpc/signal: Remove get_clean_sp()
  powerpc/signal: Call get_tm_stackpointer() from get_sigframe()
  powerpc/signal: Refactor bad frame logging
  powerpc/signal32: Simplify logging in handle_rt_signal32()
  powerpc/signal32: Regroup copies in save_user_regs() and
save_tm_user_regs()
  powerpc/signal32: Create 'unsafe' versions of
copy_[ck][fpr/vsx]_to_user()
  powerpc/signal32: Switch save_user_regs() and save_tm_user_regs() to
user_access_begin() logic
  powerpc/signal32: Switch handle_signal32() to user_access_begin()
logic
  powerpc/signal32: Switch handle_rt_signal32() to user_access_begin()
logic
  signal: Add unsafe_put_compat_sigset()
  powerpc/signal32: Add and use unsafe_put_sigset_t()
  powerpc/signal32: Switch swap_context() to user_access_begin() logic

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/processor.h |  16 +-
 arch/powerpc/include/asm/ptrace.h|   6 -
 arch/powerpc/kernel/asm-offsets.c|   2 +
 arch/powerpc/kernel/process.c|   4 +
 arch/powerpc/kernel/ptrace/Makefile  |   3 +-
 arch/powerpc/kernel/ptrace/ptrace-decl.h |  21 ++
 arch/powerpc/kernel/ptrace/ptrace-fpu.c  |  40 +++
 arch/powerpc/kernel/ptrace/ptrace-view.c |   2 +
 arch/powerpc/kernel/ptrace/ptrace.c  |  54 +---
 arch/powerpc/kernel/signal.c |  59 ++--
 arch/powerpc/kernel/signal.h | 109 ++-
 arch/powerpc/kernel/signal_32.c  | 386 ---
 arch/powerpc/kernel/signal_64.c  |  19 +-
 arch/powerpc/kernel/traps.c  |   2 +
 arch/powerpc/platforms/Kconfig.cputype   |  15 +-
 include/linux/compat.h   |  32 ++
 17 files changed, 462 insertions(+), 309 deletions(-)
 create mode 100644 arch/powerpc/kernel/ptrace/ptrace-fpu.c

-- 
2.25.0



[RFC PATCH v1 04/19] powerpc/ptrace: Create ptrace_get_fpr() and ptrace_put_fpr()

2020-08-12 Thread Christophe Leroy
On the same model as ptrace_get_reg() and ptrace_put_reg(),
create ptrace_get_fpr() and ptrace_put_fpr() to get/set
the floating point registers.

We move the boundary checks into them.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/ptrace/Makefile  |  1 +
 arch/powerpc/kernel/ptrace/ptrace-decl.h |  4 +++
 arch/powerpc/kernel/ptrace/ptrace-fpu.c  | 40 
 arch/powerpc/kernel/ptrace/ptrace.c  | 38 +++---
 4 files changed, 56 insertions(+), 27 deletions(-)
 create mode 100644 arch/powerpc/kernel/ptrace/ptrace-fpu.c

diff --git a/arch/powerpc/kernel/ptrace/Makefile 
b/arch/powerpc/kernel/ptrace/Makefile
index c2f2402ebc8c..77abd1a5a508 100644
--- a/arch/powerpc/kernel/ptrace/Makefile
+++ b/arch/powerpc/kernel/ptrace/Makefile
@@ -6,6 +6,7 @@
 CFLAGS_ptrace-view.o   += -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
 obj-y  += ptrace.o ptrace-view.o
+obj-y  += ptrace-fpu.o
 obj-$(CONFIG_COMPAT)   += ptrace32.o
 obj-$(CONFIG_VSX)  += ptrace-vsx.o
 ifneq ($(CONFIG_VSX),y)
diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h 
b/arch/powerpc/kernel/ptrace/ptrace-decl.h
index 2ddc68412fa8..eafe5f0f6289 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-decl.h
+++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h
@@ -164,6 +164,10 @@ int ptrace_put_reg(struct task_struct *task, int regno, 
unsigned long data);
 
 extern const struct user_regset_view user_ppc_native_view;
 
+/* ptrace-fpu */
+int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data);
+int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data);
+
 /* ptrace-(no)adv */
 void ppc_gethwdinfo(struct ppc_debug_info *dbginfo);
 int ptrace_get_debugreg(struct task_struct *child, unsigned long addr,
diff --git a/arch/powerpc/kernel/ptrace/ptrace-fpu.c 
b/arch/powerpc/kernel/ptrace/ptrace-fpu.c
new file mode 100644
index ..8301cb52dd99
--- /dev/null
+++ b/arch/powerpc/kernel/ptrace/ptrace-fpu.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include 
+
+#include 
+
+#include "ptrace-decl.h"
+
+int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data)
+{
+   unsigned int fpidx = index - PT_FPR0;
+
+   if (index > PT_FPSCR)
+   return -EIO;
+
+   flush_fp_to_thread(child);
+   if (fpidx < (PT_FPSCR - PT_FPR0))
+   memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long));
+   else
+   *data = child->thread.fp_state.fpscr;
+
+   return 0;
+}
+
+int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data)
+{
+   unsigned int fpidx = index - PT_FPR0;
+
+   if (index > PT_FPSCR)
+   return -EIO;
+
+   flush_fp_to_thread(child);
+   if (fpidx < (PT_FPSCR - PT_FPR0))
+   memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long));
+   else
+   child->thread.fp_state.fpscr = data;
+
+   return 0;
+}
+
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
b/arch/powerpc/kernel/ptrace/ptrace.c
index 0b4645a7a1b4..3d44b73adb83 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -56,24 +56,17 @@ long arch_ptrace(struct task_struct *child, long request,
ret = -EIO;
/* convert to index and check */
index = addr / sizeof(long);
-   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR) || 
!child->thread.regs)
+   if ((addr & (sizeof(long) - 1)) || !child->thread.regs)
break;
 
CHECK_FULL_REGS(child->thread.regs);
-   if (index < PT_FPR0) {
+   if (index < PT_FPR0)
ret = ptrace_get_reg(child, (int) index, &tmp);
-   if (ret)
-   break;
-   } else {
-   unsigned int fpidx = index - PT_FPR0;
-
-   flush_fp_to_thread(child);
-   if (fpidx < (PT_FPSCR - PT_FPR0))
-   memcpy(, >thread.TS_FPR(fpidx),
-  sizeof(long));
-   else
-   tmp = child->thread.fp_state.fpscr;
-   }
+   else
+   ret = ptrace_get_fpr(child, index, &tmp);
+
+   if (ret)
+   break;
ret = put_user(tmp, datalp);
break;
}
@@ -85,23 +78,14 @@ long arch_ptrace(struct task_struct *child, long request,
ret = -EIO;
/* convert to index and check */
index = addr / sizeof(long);
-   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR) || 
!child->thread.regs)
+   if ((addr & (sizeof(long) - 1)) || !child->thread.regs)
break;
 
CHECK_FULL_REGS(child->thread.regs);
-   

[RFC PATCH v1 03/19] powerpc/ptrace: Consolidate reg index calculation

2020-08-12 Thread Christophe Leroy
Today we have:

#ifdef CONFIG_PPC32
index = addr >> 2;
if ((addr & 3) || child->thread.regs == NULL)
#else
index = addr >> 3;
if ((addr & 7))
#endif

sizeof(long) has value 4 for PPC32 and value 8 for PPC64.

Dividing by 4 is equivalent to >> 2 and dividing by 8 is equivalent
to >> 3.

And 3 and 7 are respectively (sizeof(long) - 1).

Use sizeof(long) to get rid of the #ifdef CONFIG_PPC32 and consolidate
the calculation and checking.

thread.regs has to be non-NULL on both PPC32 and PPC64, so adding
that test on PPC64 is harmless.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/ptrace/ptrace.c | 20 
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
b/arch/powerpc/kernel/ptrace/ptrace.c
index f6e51be47c6e..0b4645a7a1b4 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -55,14 +55,8 @@ long arch_ptrace(struct task_struct *child, long request,
 
ret = -EIO;
/* convert to index and check */
-#ifdef CONFIG_PPC32
-   index = addr >> 2;
-   if ((addr & 3) || (index > PT_FPSCR)
-   || (child->thread.regs == NULL))
-#else
-   index = addr >> 3;
-   if ((addr & 7) || (index > PT_FPSCR))
-#endif
+   index = addr / sizeof(long);
+   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR) || 
!child->thread.regs)
break;
 
CHECK_FULL_REGS(child->thread.regs);
@@ -90,14 +84,8 @@ long arch_ptrace(struct task_struct *child, long request,
 
ret = -EIO;
/* convert to index and check */
-#ifdef CONFIG_PPC32
-   index = addr >> 2;
-   if ((addr & 3) || (index > PT_FPSCR)
-   || (child->thread.regs == NULL))
-#else
-   index = addr >> 3;
-   if ((addr & 7) || (index > PT_FPSCR))
-#endif
+   index = addr / sizeof(long);
+   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR) || 
!child->thread.regs)
break;
 
CHECK_FULL_REGS(child->thread.regs);
-- 
2.25.0
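
The equivalences the patch relies on can be seen in a tiny sketch that
works unchanged whether long is 4 or 8 bytes:

unsigned long reg_index(unsigned long addr)
{
	/* addr >> 2 on PPC32, addr >> 3 on PPC64 */
	return addr / sizeof(long);
}

int reg_misaligned(unsigned long addr)
{
	/* (addr & 3) on PPC32, (addr & 7) on PPC64 */
	return (addr & (sizeof(long) - 1)) != 0;
}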



Re: [PATCH 07/16] debug_vm_pgtable/THP: Mark the pte entry huge before using set_pud_at

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> The kernel expects entries to be marked huge before we use set_pud_at().
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 11 +++
>  1 file changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index b6aca2526e01..cd609a212dd4 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -265,7 +265,7 @@ static void __init pud_advanced_tests(struct mm_struct 
> *mm,
> unsigned long pfn, unsigned long vaddr,
> pgprot_t prot)
>  {
> - pud_t pud = pfn_pud(pfn, prot);
> + pud_t pud;
>  
>   if (!has_transparent_hugepage())
>   return;
> @@ -274,25 +274,28 @@ static void __init pud_advanced_tests(struct mm_struct 
> *mm,
>   /* Align the address wrt HPAGE_PUD_SIZE */
>   vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
>  
> + pud = pud_mkhuge(pfn_pud(pfn, prot));
>   set_pud_at(mm, vaddr, pudp, pud);
>   pudp_set_wrprotect(mm, vaddr, pudp);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(pud_write(pud));
>  
>  #ifndef __PAGETABLE_PMD_FOLDED
> - pud = pfn_pud(pfn, prot);
> +
> + pud = pud_mkhuge(pfn_pud(pfn, prot));
>   set_pud_at(mm, vaddr, pudp, pud);
>   pudp_huge_get_and_clear(mm, vaddr, pudp);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(!pud_none(pud));
>  
> - pud = pfn_pud(pfn, prot);
> + pud = pud_mkhuge(pfn_pud(pfn, prot));
>   set_pud_at(mm, vaddr, pudp, pud);
>   pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
>   pud = READ_ONCE(*pudp);
>   WARN_ON(!pud_none(pud));
>  #endif /* __PAGETABLE_PMD_FOLDED */
> - pud = pfn_pud(pfn, prot);
> +
> + pud = pud_mkhuge(pfn_pud(pfn, prot));
>   pud = pud_wrprotect(pud);
>   pud = pud_mkclean(pud);
>   set_pud_at(mm, vaddr, pudp, pud);
> 

Looks very similar to the previous patch, hence please fold it back.
Seems fair enough that pxx_set_at() expects the given entry to be huge.
But we would need to run them across the enabled platforms to be sure.


Re: [PATCH 05/16] debug_vm_pgtable/savedwrite: Enable savedwrite test with CONFIG_NUMA_BALANCING

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> Saved write support was added to track the write bit of a pte after marking 
> the
> pte protnone. This was done so that AUTONUMA can convert a write pte to 
> protnone
> and still track the old write bit. When converting it back we set the pte 
> write
> bit correctly thereby avoiding a write fault again. Hence enable the test only
> when CONFIG_NUMA_BALANCING is enabled.

This has not been a problem on architectures that do not define pxx_savedwrite()
as they fall back on the standard pxx_write() helpers. But ppc64 defines a fallback
for pte_savedwrite() when !CONFIG_NUMA_BALANCING, which always returns false.
Agreed, it makes sense to add CONFIG_NUMA_BALANCING check. But a simple runtime
check will do as the functions are always visible/defined.

if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
return;

> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 679bb3d289a3..de8a62d0a931 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -110,6 +110,7 @@ static void __init pte_advanced_tests(struct mm_struct 
> *mm,
>   WARN_ON(pte_young(pte));
>  }
>  
> +#ifdef CONFIG_NUMA_BALANCING
>  static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
>  {
>   pte_t pte = pfn_pte(pfn, prot);
> @@ -118,6 +119,8 @@ static void __init pte_savedwrite_tests(unsigned long 
> pfn, pgprot_t prot)
>   WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
>   WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
>  }
> +#endif
> +
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
>  {
> @@ -221,6 +224,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned 
> long pfn, pgprot_t prot)
>   WARN_ON(!pmd_none(pmd));
>  }
>  
> +#ifdef CONFIG_NUMA_BALANCING
>  static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
>  {
>   pmd_t pmd = pfn_pmd(pfn, prot);
> @@ -229,6 +233,7 @@ static void __init pmd_savedwrite_tests(unsigned long 
> pfn, pgprot_t prot)
>   WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
>   WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
>  }
> +#endif
>  
>  #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>  static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
> @@ -1005,8 +1010,10 @@ static int __init debug_vm_pgtable(void)
>   pmd_huge_tests(pmdp, pmd_aligned, prot);
>   pud_huge_tests(pudp, pud_aligned, prot);
>  
> +#ifdef CONFIG_NUMA_BALANCING
>   pte_savedwrite_tests(pte_aligned, prot);
>   pmd_savedwrite_tests(pmd_aligned, prot);
> +#endif

BTW, config  wrappers should be avoided at the call site. If and when needed
a stub is defined for !CONFIG_XXX_XXX.

>  
>   pte_unmap_unlock(ptep, ptl);
>  
> 
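
The stub convention referred to above would look like this for the test
in question (sketch):

#ifdef CONFIG_NUMA_BALANCING
static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
	pte_t pte = pfn_pte(pfn, prot);

	WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
	WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
}
#else
static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
#endif

so that the call site stays a plain pte_savedwrite_tests(pte_aligned, prot)
with no #ifdef around it.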


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-08-12 Thread peterz
On Wed, Aug 12, 2020 at 06:18:28PM +1000, Nicholas Piggin wrote:
> Excerpts from pet...@infradead.org's message of August 7, 2020 9:11 pm:
> > 
> > What's wrong with something like this?
> > 
> > AFAICT there's no reason to actually try and add IRQ tracing here, it's
> > just a handful of instructions at the most.
> 
> Because we may want to use that in other places as well, so it would
> be nice to have tracing.
> 
> Hmm... also, I thought NMI context was free to call local_irq_save/restore
> anyway so the bug would still be there in those cases?

NMI code has in_nmi() true, in which case the IRQ tracing is disabled
(except for x86 which has CONFIG_TRACE_IRQFLAGS_NMI).


Re: [PATCH 04/16] debug_vm_pgtables/hugevmap: Use the arch helper to identify huge vmap support.

2020-08-12 Thread Anshuman Khandual


On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> ppc64 supports huge vmap only with radix translation. Hence use arch helper
> to determine the huge vmap support.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 02a7c20aa4a2..679bb3d289a3 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -206,7 +206,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned 
> long pfn, pgprot_t prot)
>  {
>   pmd_t pmd;
>  
> - if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
> + if (!arch_ioremap_pmd_supported())
>   return;
>  
>   pr_debug("Validating PMD huge\n");
> 

Problem is that the arch_ioremap_pmd_supported() symbol, which should also be
explicitly included via <linux/io.h>, is not available without
CONFIG_HAVE_ARCH_HUGE_VMAP. ioremap_pmd_enabled() would have
been better here and has a fallback for !CONFIG_HAVE_ARCH_HUGE_VMAP.
But then the symbol is local to that file. Unless we would like to
make ioremap_pxx_enabled generally available, the remaining option
would be to wrap pxx_huge_tests() with CONFIG_HAVE_ARCH_HUGE_VMAP.
Similar changes should also be done for pud_huge_tests() as well.


Re: [PATCH 03/16] debug_vm_pgtable/set_pte: Don't use set_pte_at to update an existing pte entry

2020-08-12 Thread Aneesh Kumar K.V

On 8/12/20 2:42 PM, Anshuman Khandual wrote:



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:

set_pte_at() should not be used to set a pte entry at locations that
already hold a valid pte entry. Architectures like ppc64 don't do TLB
invalidate in set_pte_at() and hence expect it to be used to set locations
that are not a valid PTE.


Even though set_pte_at() is not really an arch page table helper and is
very much arch specific, I just wonder why this deviation on ppc64
as compared to other platforms. Detecting such semantics variation
across platforms is an objective of this test.


Not sure what you mean by set_pte_at() not being a page table helper. The
generic kernel uses that helper to set a pte entry.


Now w.r.t. the ppc64 behavior, this was discussed multiple times. I guess
the Linux kernel has always used set_pte_at() on a none pte entry. We had
some exceptions in the recent past, but all were fixed when noticed.



383321ab8578dfe3bbcc0bc5604c0f8ae08a5c98
mm/hugetlb/migration: use set_huge_pte_at instead of set_pte_at

cee216a696b2004017a5ecb583366093d90b1568
mm/autonuma: don't use set_pte_at when updating protnone ptes

56eecdb912b536a4fa97fb5bfe5a940a54d79be6
mm: Use ptep/pmdp_set_numa() for updating _PAGE_NUMA bit

Yes. Having a testcase like this help



A small nit:

Please follow the existing subject format for all patches in here.
It will improve readability and also help understand these changes
better, later on.

mm/debug_vm_pgtable: 



Signed-off-by: Aneesh Kumar K.V 
---
  mm/debug_vm_pgtable.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 4c32063a8acf..02a7c20aa4a2 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -81,8 +81,6 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
pte = ptep_get(ptep);
WARN_ON(pte_write(pte));
  
-	pte = pfn_pte(pfn, prot);
-	set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));


This makes sense. But could you please fold this code stanza with
the previous one in order to imply that 'ptep' did have some valid
entry before being cleared and checked against pte_none().



will do that


@@ -97,12 +95,14 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
pte = ptep_get(ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
  
-	pte = pfn_pte(pfn, prot);
-	set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear_full(mm, vaddr, ptep, 1);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));


Same, please fold back.



ok



+   /*
+* We should clear pte before we do set_pte_at
+*/
+   pte = ptep_get_and_clear(mm, vaddr, ptep);
pte = pte_mkyoung(pte);
set_pte_at(mm, vaddr, ptep, pte);
ptep_test_and_clear_young(vma, vaddr, ptep);



The comment above should also explain the details that are mentioned
in the commit message, i.e. how platforms such as ppc64 expect a
clear pte entry for set_pte_at() to work.



I don't think it is specific to ppc64. There is nothing specific to the
ppc64 architecture in there. It is an optimization used in the kernel to
help architectures avoid a TLB flush. I will update the comment as below:



/* We should clear the pte before we do set_pte_at so that set_pte_at
doesn't find a valid pte at ptep */


is that good?

-aneesh



Re: [PATCH 03/16] debug_vm_pgtable/set_pte: Don't use set_pte_at to update an existing pte entry

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> set_pte_at() should not be used to set a pte entry at locations that
> already hold a valid pte entry. Architectures like ppc64 don't do TLB
> invalidate in set_pte_at() and hence expect it to be used to set locations
> that are not a valid PTE.

Even though set_pte_at() is not really an arch page table helper and is
very much arch specific, I just wonder why this deviation on ppc64
as compared to other platforms. Detecting such semantics variation
across platforms is an objective of this test.

A small nit:

Please follow the existing subject format for all patches in here.
It will improve readability and also help understand these changes
better, later on.

mm/debug_vm_pgtable: 

> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 4c32063a8acf..02a7c20aa4a2 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -81,8 +81,6 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
>   pte = ptep_get(ptep);
>   WARN_ON(pte_write(pte));
>  
> - pte = pfn_pte(pfn, prot);
> - set_pte_at(mm, vaddr, ptep, pte);
>   ptep_get_and_clear(mm, vaddr, ptep);
>   pte = ptep_get(ptep);
>   WARN_ON(!pte_none(pte));

This makes sense. But could you please fold this code stanza with
the previous one in order to imply that 'ptep' did have some valid
entry before being cleared and checked against pte_none().

> @@ -97,12 +95,14 @@ static void __init pte_advanced_tests(struct mm_struct 
> *mm,
>   pte = ptep_get(ptep);
>   WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
>  
> - pte = pfn_pte(pfn, prot);
> - set_pte_at(mm, vaddr, ptep, pte);
>   ptep_get_and_clear_full(mm, vaddr, ptep, 1);
>   pte = ptep_get(ptep);
>   WARN_ON(!pte_none(pte));

Same, please fold back.

>  
> + /*
> +  * We should clear pte before we do set_pte_at
> +  */
> + pte = ptep_get_and_clear(mm, vaddr, ptep);
>   pte = pte_mkyoung(pte);
>   set_pte_at(mm, vaddr, ptep, pte);
>   ptep_test_and_clear_young(vma, vaddr, ptep);
>

The comment above should also explain details that are mentioned
in the commit message i.e how platforms such as ppc64 expects a
clear pte entry for set_pte_at() to work.


Re: [PATCH 01/16] powerpc/mm: Add DEBUG_VM WARN for pmd_clear

2020-08-12 Thread Aneesh Kumar K.V

On 8/12/20 1:16 PM, Anshuman Khandual wrote:

On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:

With the hash page table, the kernel should not use pmd_clear for clearing
huge pte entries. Add a DEBUG_VM WARN to catch the wrong usage.

Signed-off-by: Aneesh Kumar K.V 


This particular change is very much powerpc specific. Hence please drop it from
the series, which otherwise changes the page table test. Also, this series, which
is not an RFC, still lacks a proper cover letter with diff stats, the tree/tag on
which this applies, summary about the proposal etc. All those information will
be helpful in reviewing this series better. For now, assuming that this applies
cleanly on current master branch. But again, please do include a cover letter
in the next version.



The patch series includes all sorts of fixes. There is no special theme
for the series. So all that the cover letter would have is "fixes to
make debug_vm_pgtable work on ppc64".


I tried to keep each patch simple, explaining why the current code is
wrong.






---
  arch/powerpc/include/asm/book3s/64/pgtable.h | 14 ++
  1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 6de56c3b33c4..079211968987 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -868,6 +868,13 @@ static inline bool pte_ci(pte_t pte)
  
  static inline void pmd_clear(pmd_t *pmdp)
  {
+   if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+   /*
+* Don't use this if we can possibly have a hash page table
+* entry mapping this.
+*/
+   WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == 
(H_PAGE_HASHPTE | _PAGE_PTE));
+   }
*pmdp = __pmd(0);
  }
  
@@ -916,6 +923,13 @@ static inline int pmd_bad(pmd_t pmd)
  
  static inline void pud_clear(pud_t *pudp)
  {
+   if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+   /*
+* Don't use this if we can possibly have a hash page table
+* entry mapping this.
+*/
+   WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == 
(H_PAGE_HASHPTE | _PAGE_PTE));
+   }
*pudp = __pud(0);
  }
  



-aneesh



Re: [PATCH 02/16] debug_vm_pgtable/ppc64: Avoid setting top bits in random value

2020-08-12 Thread Aneesh Kumar K.V

On 8/12/20 1:42 PM, Anshuman Khandual wrote:



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:

ppc64 uses bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting
that bit in a random value.

Signed-off-by: Aneesh Kumar K.V 
---
  mm/debug_vm_pgtable.c | 5 ++++-
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..4c32063a8acf 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -45,9 +45,12 @@
   * pxx_clear() because of how dynamic page table folding works on s390. So
   * while loading up the entries do not change the lower 4 bits. It does not
   * have affect any other platform.
+ *
+ * Also avoid the 62nd bit on ppc64 that is used to mark a pte entry.
   */


Please move and fold the above line with the existing paragraph.


  #define S390_MASK_BITS	4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define PPC_MASK_BITS  2


s/PPC/PPC64/


+#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, 
S390_MASK_BITS)
  #define RANDOM_NZVALUE	GENMASK(7, 0)
  
  static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)




With this change, RANDOM_ORVALUE will be (0x3ffffffffffffff0) which discards
both bit 63 and 62. If only bit 62 is to be avoided for ppc64 the mask should
be (0xbffffffffffffff0) instead. The following change on this patch should do
the trick.

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index eb059fef89c2..1499181fb0e9 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -45,13 +45,13 @@
   * entry type. But these bits might affect the ability to clear entries with
   * pxx_clear() because of how dynamic page table folding works on s390. So
   * while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
- *
- * Also avoid the 62nd bit on ppc64 that is used to mark a pte entry.
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
   */
-#define S390_MASK_BITS 4
-#define PPC_MASK_BITS  2
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, 
S390_MASK_BITS)
+#define S390_SKIP_MASK GENMASK(3, 0)
+#define PPC64_SKIP_MASK	GENMASK(62, 62)
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
  #define RANDOM_NZVALUE GENMASK(7, 0)
  
  static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)




I will switch to this.

-aneesh


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-08-12 Thread Nicholas Piggin
Excerpts from pet...@infradead.org's message of August 7, 2020 9:11 pm:
> 
> What's wrong with something like this?
> 
> AFAICT there's no reason to actually try and add IRQ tracing here, it's
> just a handful of instructions at the most.

Because we may want to use that in other places as well, so it would
be nice to have tracing.

Hmm... also, I thought NMI context was free to call local_irq_save/restore
anyway so the bug would still be there in those cases?

Thanks,
Nick

> 
> ---
> 
> diff --git a/arch/powerpc/include/asm/hw_irq.h 
> b/arch/powerpc/include/asm/hw_irq.h
> index 3a0db7b0b46e..6be22c1838e2 100644
> --- a/arch/powerpc/include/asm/hw_irq.h
> +++ b/arch/powerpc/include/asm/hw_irq.h
> @@ -196,33 +196,6 @@ static inline bool arch_irqs_disabled(void)
>   arch_local_irq_restore(flags);  \
>   } while(0)
>  
> -#ifdef CONFIG_TRACE_IRQFLAGS
> -#define powerpc_local_irq_pmu_save(flags)\
> -  do {   \
> - raw_local_irq_pmu_save(flags);  \
> - trace_hardirqs_off();   \
> - } while(0)
> -#define powerpc_local_irq_pmu_restore(flags) \
> - do {\
> - if (raw_irqs_disabled_flags(flags)) {   \
> - raw_local_irq_pmu_restore(flags);   \
> - trace_hardirqs_off();   \
> - } else {\
> - trace_hardirqs_on();\
> - raw_local_irq_pmu_restore(flags);   \
> - }   \
> - } while(0)
> -#else
> -#define powerpc_local_irq_pmu_save(flags)\
> - do {\
> - raw_local_irq_pmu_save(flags);  \
> - } while(0)
> -#define powerpc_local_irq_pmu_restore(flags) \
> - do {\
> - raw_local_irq_pmu_restore(flags);   \
> - } while (0)
> -#endif  /* CONFIG_TRACE_IRQFLAGS */
> -
>  #endif /* CONFIG_PPC_BOOK3S */
>  
>  #ifdef CONFIG_PPC_BOOK3E
> diff --git a/arch/powerpc/include/asm/local.h 
> b/arch/powerpc/include/asm/local.h
> index bc4bd19b7fc2..b357a35672b1 100644
> --- a/arch/powerpc/include/asm/local.h
> +++ b/arch/powerpc/include/asm/local.h
> @@ -32,9 +32,9 @@ static __inline__ void local_##op(long i, local_t *l)   
> \
>  {\
>   unsigned long flags;\
>   \
> - powerpc_local_irq_pmu_save(flags);  \
> + raw_powerpc_local_irq_pmu_save(flags);  \
>   l->v c_op i;\
> - powerpc_local_irq_pmu_restore(flags);   \
> + raw_powerpc_local_irq_pmu_restore(flags);   
> \
>  }
>  
>  #define LOCAL_OP_RETURN(op, c_op)\
> @@ -43,9 +43,9 @@ static __inline__ long local_##op##_return(long a, local_t 
> *l)  \
>   long t; \
>   unsigned long flags;\
>   \
> - powerpc_local_irq_pmu_save(flags);  \
> + raw_powerpc_local_irq_pmu_save(flags);  \
>   t = (l->v c_op a);  \
> - powerpc_local_irq_pmu_restore(flags);   \
> + raw_powerpc_local_irq_pmu_restore(flags);   
> \
>   \
>   return t;   \
>  }
> @@ -81,11 +81,11 @@ static __inline__ long local_cmpxchg(local_t *l, long o, 
> long n)
>   long t;
>   unsigned long flags;
>  
> - powerpc_local_irq_pmu_save(flags);
> + raw_powerpc_local_irq_pmu_save(flags);
>   t = l->v;
>   if (t == o)
>   l->v = n;
> - powerpc_local_irq_pmu_restore(flags);
> + raw_powerpc_local_irq_pmu_restore(flags);
>  
>   return t;
>  }
> @@ -95,10 +95,10 @@ static __inline__ long local_xchg(local_t *l, long n)
>   long t;
>   unsigned long flags;
>  
> - powerpc_local_irq_pmu_save(flags);
> + raw_powerpc_local_irq_pmu_save(flags);
>   t = l->v;
>   l->v = n;
> - powerpc_local_irq_pmu_restore(flags);
> + raw_powerpc_local_irq_pmu_restore(flags);
>  
>   return t;
>  }
> @@ 

Re: [PATCH 02/16] debug_vm_pgtable/ppc64: Avoid setting top bits in random value

2020-08-12 Thread Anshuman Khandual



On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> ppc64 uses bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting that bit
> in a random value.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  mm/debug_vm_pgtable.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 086309fb9b6f..4c32063a8acf 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -45,9 +45,12 @@
>   * pxx_clear() because of how dynamic page table folding works on s390. So
>   * while loading up the entries do not change the lower 4 bits. It does not
>   * have affect any other platform.
> + *
> + * Also avoid the 62nd bit on ppc64 that is used to mark a pte entry.
>   */

Please move and fold the above line with the existing paragraph.

>  #define S390_MASK_BITS   4
> -#define RANDOM_ORVALUE   GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
> +#define PPC_MASK_BITS	2

s/PPC/PPC64/

> +#define RANDOM_ORVALUE   GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, 
> S390_MASK_BITS)
>  #define RANDOM_NZVALUE   GENMASK(7, 0)
>  
>  static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
> 

With this change, RANDOM_ORVALUE will be (0x3ffffffffffffff0) which discards
both bit 63 and 62. If only bit 62 is to be avoided for ppc64 the mask should
be (0xbffffffffffffff0) instead. The following change on this patch should do
the trick.

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index eb059fef89c2..1499181fb0e9 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -45,13 +45,13 @@
  * entry type. But these bits might affect the ability to clear entries with
  * pxx_clear() because of how dynamic page table folding works on s390. So
  * while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
- *
- * Also avoid the 62nd bit on ppc64 that is used to mark a pte entry.
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
  */
-#define S390_MASK_BITS 4
-#define PPC_MASK_BITS  2
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, 
S390_MASK_BITS)
+#define S390_SKIP_MASK GENMASK(3, 0)
+#define PPC64_SKIP_MASK	GENMASK(62, 62)
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
 #define RANDOM_NZVALUE GENMASK(7, 0)
 
 static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
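
FWIW, the resulting mask is easy to sanity check with a small userspace
program (GENMASK re-implemented locally, assuming a 64-bit unsigned long):

#include <stdio.h>

#define GENMASK(h, l) (((~0UL) << (l)) & (~0UL >> (63 - (h))))

int main(void)
{
	unsigned long skip = GENMASK(3, 0) | GENMASK(62, 62);
	unsigned long orval = GENMASK(63, 0) & ~skip;

	printf("0x%lx\n", orval);	/* prints 0xbffffffffffffff0 */
	return 0;
}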


Re: [PATCH v3 0/8] huge vmalloc mappings

2020-08-12 Thread Nicholas Piggin
Excerpts from Zefan Li's message of August 12, 2020 11:07 am:
> On 2020/8/12 0:32, Jonathan Cameron wrote:
>> On Mon, 10 Aug 2020 12:27:24 +1000
>> Nicholas Piggin  wrote:
>> 
>>> Not tested on x86 or arm64, would appreciate a quick test there so I can
>>> ask Andrew to put it in -mm. Other option is I can disable huge vmallocs
>>> for them for the time being.
>> 
>> Hi Nicholas,
>> 
>> For arm64 testing with a Kunpeng920.
>> 
>> I ran a quick sanity test with this series on top of mainline (yes mid merge 
>> window
>> so who knows what state is...).  Could I be missing some dependency?
>> 
>> Without them it boots, with them it doesn't.  Any immediate guesses?
>> 
> 
> I've already reported this bug in v2, and yeah I also tested it on arm64
> (not Kunpeng though), so looks like it still hasn't been fixed.

Huh, I thought I did fix it but seems not. vmap stacks shouldn't be 
big enough to use huge pages though, so I don't know what's going on
there. I'll dig around a bit more.

> 
> ...
>>>
>>> Since v2:
>>> - Rebased on vmalloc cleanups, split series into simpler pieces.
>>> - Fixed several compile errors and warnings
>>> - Keep the page array and accounting in small page units because
>>>   struct vm_struct is an interface (this should fix x86 vmap stack debug
>>>   assert). [Thanks Zefan]
> 
> though the changelog says it's fixed for x86.

Yes, my mistake that was supposed to say arm64.

Thanks,
Nick



[PATCH 2/2] powerpc: unrel_branch_check.sh: enable the use of llvm-objdump v9, 10 or 11

2020-08-12 Thread Stephen Rothwell
Currently, using llvm-objdump, this script just silently succeeds without
actually doing the intended checking.  So this updates it to work properly.

Firstly, llvm-objdump does not add target symbol names to the end
of branches in its asm output, so we have to drop the branch to
__start_initialization_multiplatform using its address.

Secondly, v9 and 10 specify branch targets as .+<offset>, so we convert
those to actual addresses.

Thirdly, v10 and 11 error out on a vmlinux if given the -R option,
complaining that it is "not a dynamic object".  The -R does not make
any difference to the asm output, so remove it.

Lastly, v11 produces asm that is very similar to GNU objdump (at least
as far as branches are concerned), so no further changes are necessary
to make it work.

Cc: Nicholas Piggin 
Cc: Bill Wendling 
Signed-off-by: Stephen Rothwell 
---
 arch/powerpc/tools/unrel_branch_check.sh | 34 
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/tools/unrel_branch_check.sh 
b/arch/powerpc/tools/unrel_branch_check.sh
index 0369eb2e7e4b..8301efee1e6c 100755
--- a/arch/powerpc/tools/unrel_branch_check.sh
+++ b/arch/powerpc/tools/unrel_branch_check.sh
@@ -18,12 +18,16 @@ if [ "$end_intr" = "0x" ]; then
exit 0
 fi
 
-$objdump -R -D --no-show-raw-insn --start-address="$kstart" 
--stop-address="$end_intr" "$vmlinux" |
+# we know that there is a correct branch to
+# __start_initialization_multiplatform, so find its address
+# so we can exclude it.
+sim=0x$($nm -p "$vmlinux" |
+   sed -E -n 
'/\s+[[:alpha:]]\s+__start_initialization_multiplatform\s*$/{s///p;q}')
+
+$objdump -D --no-show-raw-insn --start-address="$kstart" 
--stop-address="$end_intr" "$vmlinux" |
 sed -E -n '
 # match lines that start with a kernel address
 /^c[0-9a-f]*:\s*b/ {
-   # drop a target that we do not care about
-   /\<__start_initialization_multiplatform>/d
# drop branches via ctr or lr
	/\<b.?.?(ct|l)r/d
[...]
+	case "$to" in
+	c*)	to="0x$to"
+		;;
+	.+*)
+		to=${to#.+}
+		if [ "$branch" = 'b' ]; then
+			if (( to >= 0x2000000 )); then
+				to=$(( to - 0x4000000 ))
+			fi
+		elif (( to >= 0x8000 )); then
+			to=$(( to - 0x10000 ))
+		fi
+		printf -v to '0x%x' $(( "0x$from" + to ))
+		;;
+	*)	printf 'Unknown branch format\n'
+   ;;
+   esac
+   if [ "$to" = "$sim" ]; then
+   continue
+   fi
if (( to > end_intr )); then
if $all_good; then
printf '%s\n' 'WARNING: Unrelocated relative branches'
-- 
2.28.0
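
The offset arithmetic in the loop is a plain sign extension: I-form
branches ('b') carry a 26-bit signed displacement and conditional B-form
branches a 16-bit one. The same logic as a standalone bash sketch:

#!/bin/bash
# sign-extend a ".+<offset>" displacement the way the script does
sign_extend() {
	local branch="$1" to="$2"
	if [ "$branch" = 'b' ]; then
		(( to >= 0x2000000 )) && to=$(( to - 0x4000000 ))
	else
		(( to >= 0x8000 )) && to=$(( to - 0x10000 ))
	fi
	printf '%d\n' "$to"
}

sign_extend b 0x3fffffc		# -> -4
sign_extend beq 0xfffc		# -> -4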



[PATCH 1/2] powerpc: unrel_branch_check.sh: use nm to find symbol value

2020-08-12 Thread Stephen Rothwell
This is considerably faster then parsing the objdump asm output.  It will
also make the enabling of llvm-objdump a little easier.

Cc: Nicholas Piggin 
Cc: Bill Wendling 
Signed-off-by: Stephen Rothwell 
---
 arch/powerpc/Makefile.postlink   |  2 +-
 arch/powerpc/tools/unrel_branch_check.sh | 13 +
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
index 2268396ff4bb..a6c77f4d32b2 100644
--- a/arch/powerpc/Makefile.postlink
+++ b/arch/powerpc/Makefile.postlink
@@ -18,7 +18,7 @@ quiet_cmd_relocs_check = CHKREL  $@
 ifdef CONFIG_PPC_BOOK3S_64
   cmd_relocs_check =   \
$(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh 
"$(OBJDUMP)" "$(NM)" "$@" ; \
-   $(BASH) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh 
"$(OBJDUMP)" "$@"
+   $(BASH) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh 
"$(OBJDUMP)" "$(NM)" "$@"
 else
   cmd_relocs_check =   \
$(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh 
"$(OBJDUMP)" "$(NM)" "$@"
diff --git a/arch/powerpc/tools/unrel_branch_check.sh 
b/arch/powerpc/tools/unrel_branch_check.sh
index 70da90270c78..0369eb2e7e4b 100755
--- a/arch/powerpc/tools/unrel_branch_check.sh
+++ b/arch/powerpc/tools/unrel_branch_check.sh
@@ -5,18 +5,15 @@
 # This script checks the unrelocated code of a vmlinux for "suspicious"
 # branches to relocated code (head_64.S code).
 
-# Have Kbuild supply the path to objdump so we handle cross compilation.
+# Have Kbuild supply the path to objdump and nm so we handle cross compilation.
 objdump="$1"
-vmlinux="$2"
+nm="$2"
+vmlinux="$3"
 
-#__end_interrupts should be located within the first 64K
 kstart=0xc000000000000000
-printf -v kend '0x%x' $(( kstart + 0x10000 ))
 
-end_intr=0x$(
-$objdump -R -d --start-address="$kstart" --stop-address="$kend" "$vmlinux" 
2>/dev/null |
-awk '$2 == "<__end_interrupts>:" { print $1 }'
-)
+end_intr=0x$($nm -p "$vmlinux" |
+   sed -E -n '/\s+[[:alpha:]]\s+__end_interrupts\s*$/{s///p;q}')
 if [ "$end_intr" = "0x" ]; then
exit 0
 fi
-- 
2.28.0
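
To see what the sed expression is doing: nm -p emits "address type symbol"
lines, the regex matches everything from the whitespace before the type
letter to end of line, and s///p deletes that match and prints what is
left, i.e. just the address. For example (sketch):

printf 'c000000000008000 T __end_interrupts\n' |
sed -E -n '/\s+[[:alpha:]]\s+__end_interrupts\s*$/{s///p;q}'
# -> c000000000008000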



[PATCH 0/2] powerpc: unrel_branch_check.sh: enable llvm-objdump

2020-08-12 Thread Stephen Rothwell
These 2 patches enable this script to work properly when llvm-objdump
is being used.

They depend on my previous series that makes this script suck less.

Cc: Nicholas Piggin 
Cc: Bill Wendling 



Re: [PATCH 01/16] powerpc/mm: Add DEBUG_VM WARN for pmd_clear

2020-08-12 Thread Anshuman Khandual
On 08/12/2020 12:03 PM, Aneesh Kumar K.V wrote:
> With the hash page table, the kernel should not use pmd_clear for clearing
> huge pte entries. Add a DEBUG_VM WARN to catch the wrong usage.
> 
> Signed-off-by: Aneesh Kumar K.V 

This particular change is very much powerpc specific. Hence please drop it from
the series, which otherwise changes the page table test. Also, this series, which
is not an RFC, still lacks a proper cover letter with diff stats, the tree/tag on
which this applies, summary about the proposal etc. All those information will
be helpful in reviewing this series better. For now, assuming that this applies
cleanly on current master branch. But again, please do include a cover letter
in the next version.

> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h | 14 ++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 6de56c3b33c4..079211968987 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -868,6 +868,13 @@ static inline bool pte_ci(pte_t pte)
>  
>  static inline void pmd_clear(pmd_t *pmdp)
>  {
> + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
> + /*
> +  * Don't use this if we can possibly have a hash page table
> +  * entry mapping this.
> +  */
> +	WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
> + }
>   *pmdp = __pmd(0);
>  }
>  
> @@ -916,6 +923,13 @@ static inline int pmd_bad(pmd_t pmd)
>  
>  static inline void pud_clear(pud_t *pudp)
>  {
> + if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
> + /*
> +  * Don't use this if we can possibly have a hash page table
> +  * entry mapping this.
> +  */
> +	WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
> + }
>   *pudp = __pud(0);
>  }
>  
> 


Re: [PATCH] powerpc/papr_scm: Make access mode of 'perf_stats' attribute file to '0400'

2020-08-12 Thread Vaibhav Jain
Hi Mpe,

Thanks for reviewing this patch. My responses below:

Michael Ellerman  writes:

> Vaibhav Jain  writes:
>> The newly introduced 'perf_stats' attribute uses the default access
>> mode of 0444, letting non-root users access performance stats of an
>> nvdimm and potentially force the kernel into issuing a large number of
>> expensive HCALLs. Since the information exposed by this attribute
>> cannot be cached, it is better to ward off access to this attribute
>> from non-root users.
>>
>> Hence this patch updates the access mode of the 'perf_stats' sysfs
>> attribute file to 0400 to make it readable only to root users.
>
> Or should we ratelimit it?
Ideal consumers of this data will be users with CAP_PERFMON or
CAP_SYS_ADMIN. They also need up-to-date values for these performance
stats, as the values can be time sensitive.

So rate limiting may not be a complete solution, since a user running
'perf' might be throttled by another user who is simply reading the
sysfs file contents.

So instead of setting the attribute mode to 0400, I will add a check for
'perfmon_capable()' in perf_stats_show(), denying read access to users
without CAP_PERFMON or CAP_SYS_ADMIN.
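
For illustration, a minimal sketch of what I have in mind (the helper
drv_fetch_perf_stats() is hypothetical; only the capability gate is the
point here):

static ssize_t perf_stats_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	/* Deny read access without CAP_PERFMON or CAP_SYS_ADMIN */
	if (!perfmon_capable())
		return -EPERM;

	return drv_fetch_perf_stats(dev, buf);	/* hypothetical helper */
}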


> Fixes: ??
Right. I will add this in v2.

>
>> Reported-by: Aneesh Kumar K.V 
>> Signed-off-by: Vaibhav Jain 
>
> cheers
>

-- 
Cheers
~ Vaibhav


Re: [PATCH 02/16] debug_vm_pgtable/ppc64: Avoid setting top bits in random value

2020-08-12 Thread Aneesh Kumar K.V

On 8/12/20 12:10 PM, Christophe Leroy wrote:



On 12/08/2020 at 08:33, Aneesh Kumar K.V wrote:
ppc64 uses bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting
that bit in the random value.

Signed-off-by: Aneesh Kumar K.V 
---
  mm/debug_vm_pgtable.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..4c32063a8acf 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -45,9 +45,12 @@
  * pxx_clear() because of how dynamic page table folding works on s390. So
  * while loading up the entries do not change the lower 4 bits. It does not
  * have affect any other platform.
+ *
+ * Also avoid the 62nd bit on ppc64 that is used to mark a pte entry.
   */
  #define S390_MASK_BITS    4
-#define RANDOM_ORVALUE    GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define PPC_MASK_BITS    2
+#define RANDOM_ORVALUE    GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, S390_MASK_BITS)


Do you mean:

#define RANDOM_ORVALUE    GENMASK(BITS_PER_LONG - 1, PPC_MASK_BITS | S390_MASK_BITS)



IIUC GENMASK(hi, lo) generates a mask from bit hi down to bit lo. Since I want
to avoid bit 62, I am forcing it to generate bits (61, 4).
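
For example, on 64-bit with PPC_MASK_BITS = 2 and S390_MASK_BITS = 4:

	GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, S390_MASK_BITS)
		== GENMASK(61, 4)
		== 0x3ffffffffffffff0UL		/* bits 63-62 and 3-0 stay clear */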



-aneesh


Re: [PATCH 02/16] debug_vm_pgtable/ppc64: Avoid setting top bits in random value

2020-08-12 Thread Christophe Leroy




On 12/08/2020 at 08:33, Aneesh Kumar K.V wrote:

ppc64 uses bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting that bit in
the random value.

Signed-off-by: Aneesh Kumar K.V 
---
  mm/debug_vm_pgtable.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..4c32063a8acf 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -45,9 +45,12 @@
   * pxx_clear() because of how dynamic page table folding works on s390. So
   * while loading up the entries do not change the lower 4 bits. It does not
   * have affect any other platform.
+ *
+ * Also avoid the 62nd bit on ppc64 that is used to mark a pte entry.
   */
  #define S390_MASK_BITS4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define PPC_MASK_BITS  2
+#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, S390_MASK_BITS)


Do you mean:

#define RANDOM_ORVALUE	GENMASK(BITS_PER_LONG - 1, PPC_MASK_BITS | S390_MASK_BITS)


Christophe


  #define RANDOM_NZVALUEGENMASK(7, 0)
  
  static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)




[PATCH 16/16] debug_vm_pgtable/ppc64: Add a variant of pfn_pte/pmd

2020-08-12 Thread Aneesh Kumar K.V
The tests expect the _PAGE_PTE bit to be set by the various page table
accessors. This is not how the kernel behaves: within the kernel, _PAGE_PTE is
usually set by set_pte_at(). To make the below tests work correctly, add
test-specific pfn_pte/pmd helpers that set the _PAGE_PTE bit.

pte_t pte = pfn_pte(pfn, prot);
WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 65 +++
 1 file changed, 41 insertions(+), 24 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index eea62d5e503b..153c925b5273 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -31,6 +31,23 @@
 #include 
 #include 
 
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline pte_t debug_vm_pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
+   pte_t pte = pfn_pte(pfn, pgprot);
+   return __pte(pte_val(pte) | _PAGE_PTE);
+
+}
+static inline pmd_t debug_vm_pfn_pmd(unsigned long pfn, pgprot_t pgprot)
+{
+   pmd_t pmd = pfn_pmd(pfn, pgprot);
+   return __pmd(pmd_val(pmd) | _PAGE_PTE);
+}
+#else
+#define debug_vm_pfn_pte(pfn, pgprot) pfn_pte(pfn, pgprot)
+#define debug_vm_pfn_pmd(pfn, pgprot) pfn_pmd(pfn, pgprot)
+#endif
+
 /*
  * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics
  * expectations that are being validated here. All future changes in here
@@ -55,7 +72,7 @@
 
 static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
 {
-   pte_t pte = pfn_pte(pfn, prot);
+   pte_t pte = debug_vm_pfn_pte(pfn, prot);
 
pr_debug("Validating PTE basic\n");
WARN_ON(!pte_same(pte, pte));
@@ -72,10 +89,10 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
  unsigned long pfn, unsigned long vaddr,
  pgprot_t prot)
 {
-   pte_t pte = pfn_pte(pfn, prot);
+   pte_t pte = debug_vm_pfn_pte(pfn, prot);
 
pr_debug("Validating PTE advanced\n");
-   pte = pfn_pte(pfn, prot);
+   pte = debug_vm_pfn_pte(pfn, prot);
set_pte_at(mm, vaddr, ptep, pte);
ptep_set_wrprotect(mm, vaddr, ptep);
pte = ptep_get(ptep);
@@ -85,7 +102,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
 
-   pte = pfn_pte(pfn, prot);
+   pte = debug_vm_pfn_pte(pfn, prot);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(mm, vaddr, ptep, pte);
@@ -113,7 +130,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
 #ifdef CONFIG_NUMA_BALANCING
 static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
-   pte_t pte = pfn_pte(pfn, prot);
+   pte_t pte = debug_vm_pfn_pte(pfn, prot);
 
pr_debug("Validating PTE saved write\n");
 	WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
@@ -124,7 +141,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 {
-   pmd_t pmd = pfn_pmd(pfn, prot);
+   pmd_t pmd = debug_vm_pfn_pmd(pfn, prot);
 
if (!has_transparent_hugepage())
return;
@@ -160,7 +177,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 
pgtable_trans_huge_deposit(mm, pmdp, pgtable);
 
-   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+   pmd = pmd_mkhuge(debug_vm_pfn_pmd(pfn, prot));
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
@@ -170,7 +187,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
 
-   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+   pmd = pmd_mkhuge(debug_vm_pfn_pmd(pfn, prot));
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
@@ -184,7 +201,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
 
-   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+   pmd = pmd_mkhuge(debug_vm_pfn_pmd(pfn, prot));
pmd = pmd_mkyoung(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_test_and_clear_young(vma, vaddr, pmdp);
@@ -198,7 +215,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 
 static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
 {
-   pmd_t pmd = pfn_pmd(pfn, prot);
+   pmd_t pmd = debug_vm_pfn_pmd(pfn, prot);
 
pr_debug("Validating PMD leaf\n");
/*
@@ -230,7 +247,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
 #ifdef CONFIG_NUMA_BALANCING
 static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
-   pmd_t pmd = pfn_pmd(pfn, prot);
+ 

[PATCH 15/16] debug_vm_pgtable/savedwrite: Use savedwrite test with protnone ptes

2020-08-12 Thread Aneesh Kumar K.V
Saved write support was added to track the write bit of a pte after marking the
pte protnone. This was done so that AUTONUMA can convert a writable pte to
protnone and still track the old write bit. When converting it back, we set the
pte write bit correctly, thereby avoiding another write fault.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 3e112d0ba1b2..eea62d5e503b 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1006,8 +1006,8 @@ static int __init debug_vm_pgtable(void)
pud_leaf_tests(pud_aligned, prot);
 
 #ifdef CONFIG_NUMA_BALANCING
-   pte_savedwrite_tests(pte_aligned, prot);
-   pmd_savedwrite_tests(pmd_aligned, prot);
+   pte_savedwrite_tests(pte_aligned, protnone);
+   pmd_savedwrite_tests(pmd_aligned, protnone);
 #endif
pte_special_tests(pte_aligned, prot);
pte_protnone_tests(pte_aligned, protnone);
-- 
2.26.2



[PATCH 14/16] debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64

2020-08-12 Thread Aneesh Kumar K.V
The test seems to be missing quite a lot of details w.r.t. allocating
the correct pgtable_t page (huge_pte_alloc()), holding the right
lock (huge_pte_lock()), etc. The vma used is also not a hugetlb VMA.

ppc64 does have runtime checks within CONFIG_DEBUG_VM for most of these.
Hence disable the test on ppc64.
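
A correct version of the advanced test would need something along these
lines (a sketch only, where h is the relevant hstate; this is not what the
current test does):

	ptep = huge_pte_alloc(mm, vaddr, huge_page_size(h));	/* correct pgtable_t page */
	ptl = huge_pte_lock(h, mm, ptep);			/* correct page table lock */
	/* ... exercise the huge_ptep_* helpers on a real hugetlb VMA ... */
	spin_unlock(ptl);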

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 529892b9be2f..3e112d0ba1b2 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -800,6 +800,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 }
 
+#ifndef CONFIG_PPC_BOOK3S_64
 static void __init hugetlb_advanced_tests(struct mm_struct *mm,
  struct vm_area_struct *vma,
  pte_t *ptep, unsigned long pfn,
@@ -842,6 +843,7 @@ static void __init hugetlb_advanced_tests(struct mm_struct *mm,
pte = huge_ptep_get(ptep);
WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
 }
+#endif
 #else  /* !CONFIG_HUGETLB_PAGE */
 static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init hugetlb_advanced_tests(struct mm_struct *mm,
@@ -1053,7 +1055,9 @@ static int __init debug_vm_pgtable(void)
pud_populate_tests(mm, pudp, saved_pmdp);
spin_unlock(ptl);
 
-   //hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+#ifndef CONFIG_PPC_BOOK3S_64
+   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+#endif
 
 	spin_lock(&mm->page_table_lock);
p4d_clear_tests(mm, p4dp);
-- 
2.26.2



[PATCH 13/16] debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries

2020-08-12 Thread Aneesh Kumar K.V
pmd_clear()/pud_clear() should not be used to clear huge (pte-level) pmd/pud
entries; tear those down with the huge get-and-clear helpers instead.
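
In other words, the rule being applied (sketch):

	pmdp_huge_get_and_clear(mm, vaddr, pmdp);	/* for a huge (pte-level) pmd entry */
	pmd_clear(pmdp);				/* only for a pmd pointing to a pte page table */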

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 061c19bba7f0..529892b9be2f 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -191,6 +191,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_young(pmd));
 
+   /*  Clear the pte entries  */
+   pmdp_huge_get_and_clear(mm, vaddr, pmdp);
pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
 }
 
@@ -313,6 +315,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pudp_test_and_clear_young(vma, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_young(pud));
+
+   pudp_huge_get_and_clear(mm, vaddr, pudp);
 }
 
 static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -431,8 +435,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, 
pud_t *pudp,
 * This entry points to next level page table page.
 * Hence this must not qualify as pud_bad().
 */
-   pmd_clear(pmdp);
-   pud_clear(pudp);
pud_populate(mm, pudp, pmdp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_bad(pud));
@@ -564,7 +566,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, 
pmd_t *pmdp,
 * This entry points to next level page table page.
 * Hence this must not qualify as pmd_bad().
 */
-   pmd_clear(pmdp);
pmd_populate(mm, pmdp, pgtable);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_bad(pmd));
-- 
2.26.2



[PATCH 12/16] debug_vm_pgtable/locks: Take correct page table lock

2020-08-12 Thread Aneesh Kumar K.V
Make sure we call the pte accessors with the correct lock held.
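
For reference, the lock pairing used in the reordered code below:

	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);	/* pte level */
	...
	pte_unmap_unlock(ptep, ptl);

	ptl = pmd_lock(mm, pmdp);				/* pmd level */
	...
	spin_unlock(ptl);

	ptl = pud_lock(mm, pudp);				/* pud level */
	...
	spin_unlock(ptl);

	spin_lock(&mm->page_table_lock);			/* p4d/pgd level */
	...
	spin_unlock(&mm->page_table_lock);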

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 34 --
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 90e9c2d3a092..061c19bba7f0 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1027,33 +1027,39 @@ static int __init debug_vm_pgtable(void)
pmd_thp_tests(pmd_aligned, prot);
pud_thp_tests(pud_aligned, prot);
 
+   hugetlb_basic_tests(pte_aligned, prot);
+
/*
 * Page table modifying tests
 */
-   pte_clear_tests(mm, ptep, vaddr);
-   pmd_clear_tests(mm, pmdp);
-   pud_clear_tests(mm, pudp);
-   p4d_clear_tests(mm, p4dp);
-   pgd_clear_tests(mm, pgdp);
 
 	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+   pte_clear_tests(mm, ptep, vaddr);
pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-   pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
-   pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
-   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
+   pte_unmap_unlock(ptep, ptl);
 
+   ptl = pmd_lock(mm, pmdp);
+   pmd_clear_tests(mm, pmdp);
+   pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
pmd_huge_tests(pmdp, pmd_aligned, prot);
+   pmd_populate_tests(mm, pmdp, saved_ptep);
+   spin_unlock(ptl);
+
+   ptl = pud_lock(mm, pudp);
+   pud_clear_tests(mm, pudp);
+   pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
pud_huge_tests(pudp, pud_aligned, prot);
+   pud_populate_tests(mm, pudp, saved_pmdp);
+   spin_unlock(ptl);
 
-   pte_unmap_unlock(ptep, ptl);
+   //hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
 
-   pmd_populate_tests(mm, pmdp, saved_ptep);
-   pud_populate_tests(mm, pudp, saved_pmdp);
+	spin_lock(&mm->page_table_lock);
+   p4d_clear_tests(mm, p4dp);
+   pgd_clear_tests(mm, pgdp);
p4d_populate_tests(mm, p4dp, saved_pudp);
pgd_populate_tests(mm, pgdp, saved_p4dp);
-
-   hugetlb_basic_tests(pte_aligned, prot);
+	spin_unlock(&mm->page_table_lock);
 
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
-- 
2.26.2



[PATCH 11/16] debug_vm_pgtable/locks: Move non page table modifying test together

2020-08-12 Thread Aneesh Kumar K.V
This will help in adding proper locks in a later patch.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 53 +++
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 48475d288df1..90e9c2d3a092 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -979,7 +979,7 @@ static int __init debug_vm_pgtable(void)
p4dp = p4d_alloc(mm, pgdp, vaddr);
pudp = pud_alloc(mm, p4dp, vaddr);
pmdp = pmd_alloc(mm, pudp, vaddr);
-	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+   ptep = pte_alloc_map(mm, pmdp, vaddr);
 
/*
 * Save all the page table page addresses as the page table
@@ -999,35 +999,13 @@ static int __init debug_vm_pgtable(void)
p4d_basic_tests(p4d_aligned, prot);
pgd_basic_tests(pgd_aligned, prot);
 
-   pte_clear_tests(mm, ptep, vaddr);
-   pmd_clear_tests(mm, pmdp);
-   pud_clear_tests(mm, pudp);
-   p4d_clear_tests(mm, p4dp);
-   pgd_clear_tests(mm, pgdp);
-
-   pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-   pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
-   pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
-   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
pmd_leaf_tests(pmd_aligned, prot);
pud_leaf_tests(pud_aligned, prot);
 
-   pmd_huge_tests(pmdp, pmd_aligned, prot);
-   pud_huge_tests(pudp, pud_aligned, prot);
-
 #ifdef CONFIG_NUMA_BALANCING
pte_savedwrite_tests(pte_aligned, prot);
pmd_savedwrite_tests(pmd_aligned, prot);
 #endif
-
-   pte_unmap_unlock(ptep, ptl);
-
-   pmd_populate_tests(mm, pmdp, saved_ptep);
-   pud_populate_tests(mm, pudp, saved_pmdp);
-   p4d_populate_tests(mm, p4dp, saved_pudp);
-   pgd_populate_tests(mm, pgdp, saved_p4dp);
-
pte_special_tests(pte_aligned, prot);
pte_protnone_tests(pte_aligned, protnone);
pmd_protnone_tests(pmd_aligned, protnone);
@@ -1045,11 +1023,38 @@ static int __init debug_vm_pgtable(void)
pmd_swap_tests(pmd_aligned, prot);
 
swap_migration_tests();
-   hugetlb_basic_tests(pte_aligned, prot);
 
pmd_thp_tests(pmd_aligned, prot);
pud_thp_tests(pud_aligned, prot);
 
+   /*
+* Page table modifying tests
+*/
+   pte_clear_tests(mm, ptep, vaddr);
+   pmd_clear_tests(mm, pmdp);
+   pud_clear_tests(mm, pudp);
+   p4d_clear_tests(mm, p4dp);
+   pgd_clear_tests(mm, pgdp);
+
+	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+   pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+   pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
+   pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+   hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+
+   pmd_huge_tests(pmdp, pmd_aligned, prot);
+   pud_huge_tests(pudp, pud_aligned, prot);
+
+   pte_unmap_unlock(ptep, ptl);
+
+   pmd_populate_tests(mm, pmdp, saved_ptep);
+   pud_populate_tests(mm, pudp, saved_pmdp);
+   p4d_populate_tests(mm, p4dp, saved_pudp);
+   pgd_populate_tests(mm, pgdp, saved_p4dp);
+
+   hugetlb_basic_tests(pte_aligned, prot);
+
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
pmd_free(mm, saved_pmdp);
-- 
2.26.2



[PATCH 10/16] debug_vm_pgtable/thp: Use page table depost/withdraw with THP

2020-08-12 Thread Aneesh Kumar K.V
Architectures like ppc64 use a deposited page table while updating huge pte
entries.
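
The pairing as used in this patch (sketch):

	pgtable_trans_huge_deposit(mm, pmdp, pgtable);		/* stash a pte page before the huge pmd updates */
	/* ... set_pmd_at()/pmdp_* exercises ... */
	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);	/* take it back when done */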

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 644d28861ce9..48475d288df1 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -147,7 +147,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 static void __init pmd_advanced_tests(struct mm_struct *mm,
  struct vm_area_struct *vma, pmd_t *pmdp,
  unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
 {
pmd_t pmd;
 
@@ -158,6 +158,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PMD_SIZE */
vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
 
+   pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
@@ -188,6 +190,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmdp_test_and_clear_young(vma, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_young(pmd));
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
 }
 
 static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -1002,7 +1006,7 @@ static int __init debug_vm_pgtable(void)
pgd_clear_tests(mm, pgdp);
 
pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-   pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+   pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
 
-- 
2.26.2



[PATCH 09/16] debug_vm_pgtable/set_pud: Don't use set_pud_at to update an existing pud entry

2020-08-12 Thread Aneesh Kumar K.V
set_pud_at() should not be used to set a pte entry at locations that
already hold a valid pte entry. Architectures like ppc64 don't do a TLB
invalidate in set_pud_at() and hence expect it to be used only on locations
that do not hold a valid PTE.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 60bf876081b8..644d28861ce9 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -278,9 +278,6 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
WARN_ON(pud_write(pud));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-
-   pud = pud_mkhuge(pfn_pud(pfn, prot));
-   set_pud_at(mm, vaddr, pudp, pud);
pudp_huge_get_and_clear(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
@@ -302,6 +299,11 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pud = READ_ONCE(*pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
 
+   pudp_huge_get_and_clear_full(vma, vaddr, pudp, 1);
+   pud = READ_ONCE(*pudp);
+   WARN_ON(!pud_none(pud));
+
+   pud = pud_mkhuge(pfn_pud(pfn, prot));
pud = pud_mkyoung(pud);
set_pud_at(mm, vaddr, pudp, pud);
pudp_test_and_clear_young(vma, vaddr, pudp);
-- 
2.26.2



[PATCH 08/16] debug_vm_pgtable/set_pmd: Don't use set_pmd_at to update an existing pmd entry

2020-08-12 Thread Aneesh Kumar K.V
set_pmd_at() should not be used to set a pte entry at locations that
already hold a valid pte entry. Architectures like ppc64 don't do a TLB
invalidate in set_pmd_at() and hence expect it to be used only on locations
that do not hold a valid PTE.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index cd609a212dd4..60bf876081b8 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -164,8 +164,6 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_write(pmd));
 
-   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
-   set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
@@ -180,12 +178,11 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmd = READ_ONCE(*pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
 
-   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
-   set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
 
+   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
pmd = pmd_mkyoung(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_test_and_clear_young(vma, vaddr, pmdp);
-- 
2.26.2



[PATCH 07/16] debug_vm_pgtable/THP: Mark the pte entry huge before using set_pud_at

2020-08-12 Thread Aneesh Kumar K.V
The kernel expects entries to be marked huge before set_pud_at() is used.
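
That is (sketch):

	pud = pud_mkhuge(pfn_pud(pfn, prot));	/* mark the entry huge first */
	set_pud_at(mm, vaddr, pudp, pud);	/* only then install it */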

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index b6aca2526e01..cd609a212dd4 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -265,7 +265,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
  unsigned long pfn, unsigned long vaddr,
  pgprot_t prot)
 {
-   pud_t pud = pfn_pud(pfn, prot);
+   pud_t pud;
 
if (!has_transparent_hugepage())
return;
@@ -274,25 +274,28 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PUD_SIZE */
vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
 
+   pud = pud_mkhuge(pfn_pud(pfn, prot));
set_pud_at(mm, vaddr, pudp, pud);
pudp_set_wrprotect(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_write(pud));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-   pud = pfn_pud(pfn, prot);
+
+   pud = pud_mkhuge(pfn_pud(pfn, prot));
set_pud_at(mm, vaddr, pudp, pud);
pudp_huge_get_and_clear(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
 
-   pud = pfn_pud(pfn, prot);
+   pud = pud_mkhuge(pfn_pud(pfn, prot));
set_pud_at(mm, vaddr, pudp, pud);
pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
-   pud = pfn_pud(pfn, prot);
+
+   pud = pud_mkhuge(pfn_pud(pfn, prot));
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(mm, vaddr, pudp, pud);
-- 
2.26.2



[PATCH 06/16] debug_vm_pgtable/THP: Mark the pte entry huge before using set_pmd_at

2020-08-12 Thread Aneesh Kumar K.V
The kernel expects entries to be marked huge before set_pmd_at() is used.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index de8a62d0a931..b6aca2526e01 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -149,7 +149,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
  unsigned long pfn, unsigned long vaddr,
  pgprot_t prot)
 {
-   pmd_t pmd = pfn_pmd(pfn, prot);
+   pmd_t pmd;
 
if (!has_transparent_hugepage())
return;
@@ -158,19 +158,19 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PMD_SIZE */
vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
 
-   pmd = pfn_pmd(pfn, prot);
+   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_write(pmd));
 
-   pmd = pfn_pmd(pfn, prot);
+   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
 
-   pmd = pfn_pmd(pfn, prot);
+   pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
-- 
2.26.2



[PATCH 05/16] debug_vm_pgtable/savedwrite: Enable savedwrite test with CONFIG_NUMA_BALANCING

2020-08-12 Thread Aneesh Kumar K.V
Saved write support was added to track the write bit of a pte after marking the
pte protnone. This was done so that AUTONUMA can convert a writable pte to
protnone and still track the old write bit. When converting it back, we set the
pte write bit correctly, thereby avoiding another write fault. Hence enable the
test only when CONFIG_NUMA_BALANCING is enabled.
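
Conceptually (a sketch of the helper semantics, not the kernel's actual
AUTONUMA code path):

	pte = pte_mk_savedwrite(pte);		/* going protnone: remember the write bit */
	WARN_ON(!pte_savedwrite(pte));		/* the old write bit is still tracked */
	pte = pte_clear_savedwrite(pte);	/* converting back: drop the saved bit */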

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 679bb3d289a3..de8a62d0a931 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -110,6 +110,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
WARN_ON(pte_young(pte));
 }
 
+#ifdef CONFIG_NUMA_BALANCING
 static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
pte_t pte = pfn_pte(pfn, prot);
@@ -118,6 +119,8 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 	WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
 	WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
 }
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 {
@@ -221,6 +224,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_none(pmd));
 }
 
+#ifdef CONFIG_NUMA_BALANCING
 static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
pmd_t pmd = pfn_pmd(pfn, prot);
@@ -229,6 +233,7 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 	WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
 	WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
 }
+#endif
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
@@ -1005,8 +1010,10 @@ static int __init debug_vm_pgtable(void)
pmd_huge_tests(pmdp, pmd_aligned, prot);
pud_huge_tests(pudp, pud_aligned, prot);
 
+#ifdef CONFIG_NUMA_BALANCING
pte_savedwrite_tests(pte_aligned, prot);
pmd_savedwrite_tests(pmd_aligned, prot);
+#endif
 
pte_unmap_unlock(ptep, ptl);
 
-- 
2.26.2



[PATCH 04/16] debug_vm_pgtables/hugevmap: Use the arch helper to identify huge vmap support.

2020-08-12 Thread Aneesh Kumar K.V
ppc64 supports huge vmap only with radix translation. Hence use the arch helper
to determine huge vmap support.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 02a7c20aa4a2..679bb3d289a3 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -206,7 +206,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
 {
pmd_t pmd;
 
-   if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+   if (!arch_ioremap_pmd_supported())
return;
 
pr_debug("Validating PMD huge\n");
-- 
2.26.2



[PATCH 03/16] debug_vm_pgtable/set_pte: Don't use set_pte_at to update an existing pte entry

2020-08-12 Thread Aneesh Kumar K.V
set_pte_at() should not be used to set a pte entry at locations that
already hold a valid pte entry. Architectures like ppc64 don't do a TLB
invalidate in set_pte_at() and hence expect it to be used only on locations
that do not hold a valid PTE.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 4c32063a8acf..02a7c20aa4a2 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -81,8 +81,6 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
pte = ptep_get(ptep);
WARN_ON(pte_write(pte));
 
-   pte = pfn_pte(pfn, prot);
-   set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
@@ -97,12 +95,14 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
pte = ptep_get(ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
 
-   pte = pfn_pte(pfn, prot);
-   set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear_full(mm, vaddr, ptep, 1);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
 
+	/*
+	 * We should clear the pte before calling set_pte_at()
+	 */
+   pte = ptep_get_and_clear(mm, vaddr, ptep);
pte = pte_mkyoung(pte);
set_pte_at(mm, vaddr, ptep, pte);
ptep_test_and_clear_young(vma, vaddr, ptep);
-- 
2.26.2



[PATCH 02/16] debug_vm_pgtable/ppc64: Avoid setting top bits in random value

2020-08-12 Thread Aneesh Kumar K.V
ppc64 uses bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting that bit in
the random value.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..4c32063a8acf 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -45,9 +45,12 @@
  * pxx_clear() because of how dynamic page table folding works on s390. So
  * while loading up the entries do not change the lower 4 bits. It does not
  * have affect any other platform.
+ *
+ * Also avoid the 62nd bit on ppc64 that is used to mark a pte entry.
  */
 #define S390_MASK_BITS 4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define PPC_MASK_BITS  2
+#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1 - PPC_MASK_BITS, S390_MASK_BITS)
 #define RANDOM_NZVALUE GENMASK(7, 0)
 
 static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
-- 
2.26.2



[PATCH 01/16] powerpc/mm: Add DEBUG_VM WARN for pmd_clear

2020-08-12 Thread Aneesh Kumar K.V
With the hash page table, the kernel should not use pmd_clear for clearing
huge pte entries. Add a DEBUG_VM WARN to catch the wrong usage.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 6de56c3b33c4..079211968987 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -868,6 +868,13 @@ static inline bool pte_ci(pte_t pte)
 
 static inline void pmd_clear(pmd_t *pmdp)
 {
+   if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+   /*
+* Don't use this if we can possibly have a hash page table
+* entry mapping this.
+*/
+		WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+   }
*pmdp = __pmd(0);
 }
 
@@ -916,6 +923,13 @@ static inline int pmd_bad(pmd_t pmd)
 
 static inline void pud_clear(pud_t *pudp)
 {
+   if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+   /*
+* Don't use this if we can possibly have a hash page table
+* entry mapping this.
+*/
+		WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+   }
*pudp = __pud(0);
 }
 
-- 
2.26.2



Re: [PATCH v5 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline

2020-08-12 Thread Srikar Dronamraju
Hi Andrew, Michal, David

* Andrew Morton  [2020-08-06 21:32:11]:

> On Fri, 3 Jul 2020 18:28:23 +0530 Srikar Dronamraju 
>  wrote:
> 
> > > The memory hotplug changes that somehow because you can hotremove numa
> > > nodes and therefore make the nodemask sparse but that is not a common
> > > case. I am not sure what would happen if a completely new node was added
> > > and its corresponding node was already used by the renumbered one
> > > though. It would likely conflate the two I am afraid. But I am not sure
> > > this is really possible with x86 and a lack of a bug report would
> > > suggest that nobody is doing that at least.
> > > 
> > 
> > JFYI,
> > Satheesh, copied on this mail chain, had opened a bug a year ago on a crash
> > with vcpu hotplug on a memoryless node. 
> > 
> > https://bugzilla.kernel.org/show_bug.cgi?id=202187
> 
> So...  do we merge this patch or not?  Seems that the overall view is
> "risky but nobody is likely to do anything better any time soon"?

Can we decide on this one way or the other?

-- 
Thanks and Regards
Srikar Dronamraju