Re: [PATCH v2 2/7] powerpc/prom: Introduce early_reserve_mem_old()

2020-09-15 Thread Cédric Le Goater
On 9/15/20 6:46 PM, Christophe Leroy wrote:
> Cédric Le Goater  a écrit :
> 
>> and condition its call with IS_ENABLED(CONFIG_PPC32). This fixes a
>> compile error with W=1.
>>
>> arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’:
>> arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not 
>> used [-Werror=unused-but-set-variable]
>>   __be64 *reserve_map;
>>   ^~~
>> cc1: all warnings being treated as errors
>>
>> Cc: Christophe Leroy 
> 
> @csgroup.eu instead of @c-s.fr please
> 
>> Signed-off-by: Cédric Le Goater 
>> ---
>>  arch/powerpc/kernel/prom.c | 37 -
>>  1 file changed, 20 insertions(+), 17 deletions(-)
> 
> That's a lot of changes for a tiny warning.
> 
> You could make it easy by just replacing the #ifdef by:
> 
>     if (!IS_ENABLED(CONFIG_PPC32))
>     return;

It's equivalent and it moves out the reserve_map variable of the main routine
which I think is better.

>>
>> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
>> index d8a2fb87ba0c..c958b67cf1a5 100644
>> --- a/arch/powerpc/kernel/prom.c
>> +++ b/arch/powerpc/kernel/prom.c
>> @@ -620,27 +620,14 @@ static void __init early_reserve_mem_dt(void)
>>  }
>>  }
>>
>> -static void __init early_reserve_mem(void)
>> +static void __init early_reserve_mem_old(void)
> 
> Why _old ? Do you mean ppc32 are old ? Modern ADSL boxes like for instance 
> the famous French freebox have powerpc32 microcontroller.
> Eventually you could name it _ppc32, but I don't think that's the good way, 
> see above.

I choose old because of the comment ' ... booting from an old kexec ... ', 
but I agree _ppc32 might be a better choice.

Thanks,

C. 

> Christophe
> 
>>  {
>>  __be64 *reserve_map;
>>
>>  reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
>>  fdt_off_mem_rsvmap(initial_boot_params));
>>
>> -    /* Look for the new "reserved-regions" property in the DT */
>> -    early_reserve_mem_dt();
>> -
>> -#ifdef CONFIG_BLK_DEV_INITRD
>> -    /* Then reserve the initrd, if any */
>> -    if (initrd_start && (initrd_end > initrd_start)) {
>> -    memblock_reserve(ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
>> -    ALIGN(initrd_end, PAGE_SIZE) -
>> -    ALIGN_DOWN(initrd_start, PAGE_SIZE));
>> -    }
>> -#endif /* CONFIG_BLK_DEV_INITRD */
>> -
>> -#ifdef CONFIG_PPC32
>> -    /*
>> +    /*
>>   * Handle the case where we might be booting from an old kexec
>>   * image that setup the mem_rsvmap as pairs of 32-bit values
>>   */
>> @@ -658,9 +645,25 @@ static void __init early_reserve_mem(void)
>>  DBG("reserving: %x -> %x\n", base_32, size_32);
>>  memblock_reserve(base_32, size_32);
>>  }
>> -    return;
>>  }
>> -#endif
>> +}
>> +
>> +static void __init early_reserve_mem(void)
>> +{
>> +    /* Look for the new "reserved-regions" property in the DT */
>> +    early_reserve_mem_dt();
>> +
>> +#ifdef CONFIG_BLK_DEV_INITRD
>> +    /* Then reserve the initrd, if any */
>> +    if (initrd_start && (initrd_end > initrd_start)) {
>> +    memblock_reserve(ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
>> +    ALIGN(initrd_end, PAGE_SIZE) -
>> +    ALIGN_DOWN(initrd_start, PAGE_SIZE));
>> +    }
>> +#endif /* CONFIG_BLK_DEV_INITRD */
>> +
>> +    if (IS_ENABLED(CONFIG_PPC32))
>> +    early_reserve_mem_old();
>>  }
>>
>>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>> -- 
>> 2.25.4
> 
> 



[PATCH v2 2/2] powerpc/64s: Add cp_abort after tlbiel to invalidate copy-buffer address

2020-09-15 Thread Nicholas Piggin
The copy buffer is implemented as a real address in the nest which is
translated from EA by copy, and used for memory access by paste. This
requires that it be invalidated by TLB invalidation.

TLBIE does invalidate the copy buffer, but TLBIEL does not. Add cp_abort
to the tlbiel sequence.

Signed-off-by: Nicholas Piggin 
---
v2:
- Untangle headers that were causing build failures.
- Improve the comment a bit.
- Exempt POWER9 from the workaround, as described by the comment (we
  worked this out already but I forgot about it when doing v1!)

 arch/powerpc/include/asm/synch.h   | 19 ++-
 arch/powerpc/mm/book3s64/hash_native.c |  8 
 arch/powerpc/mm/book3s64/radix_tlb.c   | 12 ++--
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h
index aca70fb43147..a2e89f27d547 100644
--- a/arch/powerpc/include/asm/synch.h
+++ b/arch/powerpc/include/asm/synch.h
@@ -3,8 +3,9 @@
 #define _ASM_POWERPC_SYNCH_H 
 #ifdef __KERNEL__
 
+#include 
 #include 
-#include 
+#include 
 
 #ifndef __ASSEMBLY__
 extern unsigned int __start___lwsync_fixup, __stop___lwsync_fixup;
@@ -20,6 +21,22 @@ static inline void isync(void)
 {
__asm__ __volatile__ ("isync" : : : "memory");
 }
+
+static inline void ppc_after_tlbiel_barrier(void)
+{
+asm volatile("ptesync": : :"memory");
+   /*
+* POWER9, POWER10 need a cp_abort after tlbiel to ensure the copy
+* is invalidated correctly. If this is not done, the paste can take
+* data from the physical address that was translated at copy time.
+*
+* POWER9 in practice does not need this, because address spaces
+* with accelerators mapped will use tlbie (which does invalidate
+* the copy) to invalidate translations. It's not possible to limit
+* POWER10 this way due to local copy-paste.
+*/
+asm volatile(ASM_FTR_IFSET(PPC_CP_ABORT, "", %0) : : "i" 
(CPU_FTR_ARCH_31) : "memory");
+}
 #endif /* __ASSEMBLY__ */
 
 #if defined(__powerpc64__)
diff --git a/arch/powerpc/mm/book3s64/hash_native.c 
b/arch/powerpc/mm/book3s64/hash_native.c
index cf20e5229ce1..0203cdf48c54 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -82,7 +82,7 @@ static void tlbiel_all_isa206(unsigned int num_sets, unsigned 
int is)
for (set = 0; set < num_sets; set++)
tlbiel_hash_set_isa206(set, is);
 
-   asm volatile("ptesync": : :"memory");
+   ppc_after_tlbiel_barrier();
 }
 
 static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
@@ -110,7 +110,7 @@ static void tlbiel_all_isa300(unsigned int num_sets, 
unsigned int is)
 */
tlbiel_hash_set_isa300(0, is, 0, 2, 1);
 
-   asm volatile("ptesync": : :"memory");
+   ppc_after_tlbiel_barrier();
 
asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
 }
@@ -303,7 +303,7 @@ static inline void tlbie(unsigned long vpn, int psize, int 
apsize,
asm volatile("ptesync": : :"memory");
if (use_local) {
__tlbiel(vpn, psize, apsize, ssize);
-   asm volatile("ptesync": : :"memory");
+   ppc_after_tlbiel_barrier();
} else {
__tlbie(vpn, psize, apsize, ssize);
fixup_tlbie_vpn(vpn, psize, apsize, ssize);
@@ -879,7 +879,7 @@ static void native_flush_hash_range(unsigned long number, 
int local)
__tlbiel(vpn, psize, psize, ssize);
} pte_iterate_hashed_end();
}
-   asm volatile("ptesync":::"memory");
+   ppc_after_tlbiel_barrier();
} else {
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index 0d233763441f..5c9d2fccacc7 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -65,7 +65,7 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned 
int is)
for (set = 1; set < num_sets; set++)
tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
 
-   asm volatile("ptesync": : :"memory");
+   ppc_after_tlbiel_barrier();
 }
 
 void radix__tlbiel_all(unsigned int action)
@@ -296,7 +296,7 @@ static __always_inline void _tlbiel_pid(unsigned long pid, 
unsigned long ric)
 
/* For PWC, only one flush is needed */
if (ric == RIC_FLUSH_PWC) {
-   asm volatile("ptesync": : :"memory");
+   ppc_after_tlbiel_barrier();
return;
}
 
@@ -304,7 +304,7 @@ static __always_inline void _tlbiel_pid(unsigned long pid, 
unsigned long ric)
for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
__tlbiel_pid(pid, set, RIC_FLUSH_TLB);
 
-   asm volatile("ptesync": : :"memory");
+   

[PATCH v2 1/2] powerpc: untangle cputable mce include

2020-09-15 Thread Nicholas Piggin
Having cputable.h include mce.h means it pulls in a bunch of low level
headers (e.g., synch.h) which then can't use CPU_FTR_ definitions.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/cputable.h | 5 -
 arch/powerpc/kernel/cputable.c  | 1 +
 arch/powerpc/kernel/dt_cpu_ftrs.c   | 1 +
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 32a15dc49e8c..f89205eff691 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -9,11 +9,6 @@
 
 #ifndef __ASSEMBLY__
 
-/*
- * Added to include __machine_check_early_realmode_* functions
- */
-#include 
-
 /* This structure can grow, it's real size is used by head.S code
  * via the mkdefs mechanism.
  */
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 2aa89c6b2896..b5bc2edef440 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include   /* for PTRRELOC on ARCH=ppc */
+#include 
 #include 
 #include 
 
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index f204ad79b6b5..1098863e17ee 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -17,6 +17,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-- 
2.23.0



[PATCH -next] powerpc/pseries: convert to use DEFINE_SEQ_ATTRIBUTE macro

2020-09-15 Thread Liu Shixin
Use DEFINE_SEQ_ATTRIBUTE macro to simplify the code.

Signed-off-by: Liu Shixin 
---
 arch/powerpc/platforms/pseries/hvCall_inst.c | 23 +++-
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c 
b/arch/powerpc/platforms/pseries/hvCall_inst.c
index c40c62ec432e..2c59b4986ea5 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -70,31 +70,14 @@ static int hc_show(struct seq_file *m, void *p)
return 0;
 }
 
-static const struct seq_operations hcall_inst_seq_ops = {
+static const struct seq_operations hcall_inst_sops = {
 .start = hc_start,
 .next  = hc_next,
 .stop  = hc_stop,
 .show  = hc_show
 };
 
-static int hcall_inst_seq_open(struct inode *inode, struct file *file)
-{
-   int rc;
-   struct seq_file *seq;
-
-   rc = seq_open(file, _inst_seq_ops);
-   seq = file->private_data;
-   seq->private = file_inode(file)->i_private;
-
-   return rc;
-}
-
-static const struct file_operations hcall_inst_seq_fops = {
-   .open = hcall_inst_seq_open,
-   .read = seq_read,
-   .llseek = seq_lseek,
-   .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(hcall_inst);
 
 #defineHCALL_ROOT_DIR  "hcall_inst"
 #define CPU_NAME_BUF_SIZE  32
@@ -149,7 +132,7 @@ static int __init hcall_inst_init(void)
snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu);
debugfs_create_file(cpu_name_buf, 0444, hcall_root,
per_cpu(hcall_stats, cpu),
-   _inst_seq_fops);
+   _inst_fops);
}
 
return 0;
-- 
2.25.1



RE: [PATCH v1] soc: fsl: rcpm: Add ACPI support

2020-09-15 Thread Ran Wang
Hi Ard,

On Tuesday, September 15, 2020 7:10 PM, Ard Biesheuvel wrote:
> Subject: Re: [PATCH v1] soc: fsl: rcpm: Add ACPI support
> 
> On 9/15/20 1:06 PM, kuldip dwivedi wrote:
> > Add ACPI support in fsl RCPM driver. This is required to support ACPI
> > S3 state. S3 is the ACPI sleep state that is known as "sleep" or
> > "suspend to RAM".
> > It essentially turns off most power of the system but keeps memory
> > powered.
> >
> > Signed-off-by: tanveer 
> > Signed-off-by: kuldip dwivedi 
> 
> Why does the OS need to program this device? Can't this be done by
> firmware?

This device is use to tell HW which IP (such as USB, SDHC, SATA, etc) should 
not be
clock gated during system enter low power state (to allow that IP work as a
wakeup source). And user does this configuration in device tree. So implement
this RCPM driver to do it in kernel rather than firmware.

Regards,
Ran

> > ---
> >
> > Notes:
> >  1. Add ACPI match table
> >  2. NXP team members are added for confirming HID changes
> >  3. There is only one node in ACPI so no need to check for
> > current device explicitly
> >  4. These changes are tested on LX2160A and LS1046A platforms
> >
> >   drivers/soc/fsl/rcpm.c | 22 +++---
> >   1 file changed, 19 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c index
> > a093dbe6d2cb..e75a436fb159 100644
> > --- a/drivers/soc/fsl/rcpm.c
> > +++ b/drivers/soc/fsl/rcpm.c
> > @@ -2,10 +2,12 @@
> >   //
> >   // rcpm.c - Freescale QorIQ RCPM driver
> >   //
> > -// Copyright 2019 NXP
> > +// Copyright 2019-2020 NXP
> > +// Copyright 2020 Puresoftware Ltd.
> >   //
> >   // Author: Ran Wang 
> >
> > +#include 
> >   #include 
> >   #include 
> >   #include 
> > @@ -57,8 +59,13 @@ static int rcpm_pm_prepare(struct device *dev)
> > rcpm->wakeup_cells + 1);
> >
> > /*  Wakeup source should refer to current rcpm device */
> > -   if (ret || (np->phandle != value[0]))
> > -   continue;
> > +   if (is_acpi_node(dev->fwnode)) {
> > +   if (ret)
> > +   continue;
> > +   } else {
> > +   if (ret || (np->phandle != value[0]))
> > +   continue;
> > +   }
> >
> > /* Property "#fsl,rcpm-wakeup-cells" of rcpm node defines the
> >  * number of IPPDEXPCR register cells, and "fsl,rcpm-wakeup"
> > @@ -139,10 +146,19 @@ static const struct of_device_id rcpm_of_match[]
> = {
> >   };
> >   MODULE_DEVICE_TABLE(of, rcpm_of_match);
> >
> > +#ifdef CONFIG_ACPI
> > +static const struct acpi_device_id rcpm_acpi_match[] = {
> > +   { "NXP0015", },
> > +   { }
> > +};
> > +MODULE_DEVICE_TABLE(acpi, rcpm_acpi_match); #endif
> > +
> >   static struct platform_driver rcpm_driver = {
> > .driver = {
> > .name = "rcpm",
> > .of_match_table = rcpm_of_match,
> > +   .acpi_match_table = ACPI_PTR(rcpm_acpi_match),
> > .pm = _pm_ops,
> > },
> > .probe = rcpm_probe,
> >



RE: [PATCH v1] soc: fsl: rcpm: Add ACPI support

2020-09-15 Thread Ran Wang
Hi Kuldip,

On Tuesday, September 15, 2020 7:07 PM, kuldip dwivedi wrote:
> Subject: [PATCH v1] soc: fsl: rcpm: Add ACPI support

Actually I also post a patch for this recently: 
https://lore.kernel.org/patchwork/patch/1299959/  :)

Regards,
Ran

> Add ACPI support in fsl RCPM driver. This is required to support ACPI S3 
> state.
> S3 is the ACPI sleep state that is known as "sleep" or "suspend to RAM".
> It essentially turns off most power of the system but keeps memory powered.

Actually the low power mode is to gate clocks rather than power down on 
Layerscape platforms.

> Signed-off-by: tanveer 
> Signed-off-by: kuldip dwivedi 
> ---
> 
> Notes:
> 1. Add ACPI match table
> 2. NXP team members are added for confirming HID changes
> 3. There is only one node in ACPI so no need to check for
>current device explicitly
> 4. These changes are tested on LX2160A and LS1046A platforms
> 
>  drivers/soc/fsl/rcpm.c | 22 +++---
>  1 file changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c index
> a093dbe6d2cb..e75a436fb159 100644
> --- a/drivers/soc/fsl/rcpm.c
> +++ b/drivers/soc/fsl/rcpm.c
> @@ -2,10 +2,12 @@
>  //
>  // rcpm.c - Freescale QorIQ RCPM driver  // -// Copyright 2019 NXP
> +// Copyright 2019-2020 NXP
> +// Copyright 2020 Puresoftware Ltd.
>  //
>  // Author: Ran Wang 
> 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -57,8 +59,13 @@ static int rcpm_pm_prepare(struct device *dev)
>   rcpm->wakeup_cells + 1);
> 
>   /*  Wakeup source should refer to current rcpm device */
> - if (ret || (np->phandle != value[0]))
> - continue;
> + if (is_acpi_node(dev->fwnode)) {
> + if (ret)
> + continue;
> + } else {
> + if (ret || (np->phandle != value[0]))
> + continue;
> + }
>   /* Property "#fsl,rcpm-wakeup-cells" of rcpm node defines the
>* number of IPPDEXPCR register cells, and "fsl,rcpm-wakeup"
> @@ -139,10 +146,19 @@ static const struct of_device_id rcpm_of_match[] =
> {  };  MODULE_DEVICE_TABLE(of, rcpm_of_match);
> 
> +#ifdef CONFIG_ACPI
> +static const struct acpi_device_id rcpm_acpi_match[] = {
> + { "NXP0015", },
> + { }
> +};
> +MODULE_DEVICE_TABLE(acpi, rcpm_acpi_match); #endif
> +
>  static struct platform_driver rcpm_driver = {
>   .driver = {
>   .name = "rcpm",
>   .of_match_table = rcpm_of_match,
> + .acpi_match_table = ACPI_PTR(rcpm_acpi_match),
>   .pm = _pm_ops,
>   },
>   .probe = rcpm_probe,
> --
> 2.17.1



Re: Injecting SLB miltihit crashes kernel 5.9.0-rc5

2020-09-15 Thread Nicholas Piggin
Excerpts from Michael Ellerman's message of September 15, 2020 10:54 pm:
> Michal Suchánek  writes:
>> Hello,
>>
>> Using the SLB mutihit injection test module (which I did not write so I
>> do not want to post it here) to verify updates on my 5.3 frankernekernel
>> I found that the kernel crashes with Oops: kernel bad access.
>>
>> I tested on latest upstream kernel build that I have at hand and the
>> result is te same (minus the message - nothing was logged and the kernel
>> simply rebooted).
> 
> That's disappointing.

It seems to work okay with qemu and mambo injection on upstream
(powernv_defconfig). I wonder why that nmi_enter is crashing.
Can you post the output of a successful test with the patch
reverted?


qemu injection test output - 
[  195.279885][C0] Disabling lock debugging due to kernel taint
[  195.280891][C0] MCE: CPU0: machine check (Warning) Host SLB Multihit 
DAR: deadbeef [Recovered]
[  195.282117][C0] MCE: CPU0: NIP: [c003c2b4] 
isa300_idle_stop_mayloss+0x68/0x6c
[  195.283631][C0] MCE: CPU0: Initiator CPU
[  195.284432][C0] MCE: CPU0: Probable Software error (some chance of 
hardware cause)
[  220.711577][   T90] MCE: CPU0: machine check (Warning) Host SLB Multihit 
DAR: deadbeef [Recovered]
[  220.712805][   T90] MCE: CPU0: PID: 90 Comm: yes NIP: [7fff7fdac2e0]
[  220.713553][   T90] MCE: CPU0: Initiator CPU
[  220.714021][   T90] MCE: CPU0: Probable Software error (some chance of 
hardware cause)

Thanks,
Nick


Re: [PATCH] ibmvfc: Avoid link down on FS9100 canister reboot

2020-09-15 Thread Martin K. Petersen


Brian,

> When a canister on a FS9100, or similar storage, running in NPIV mode,
> is rebooted, its WWPNs will fail over to another canister.

[...]

Applied to 5.10/scsi-staging, thanks! I fixed a bunch of checkpatch
warnings.

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH net] ibmvnic: update MAINTAINERS

2020-09-15 Thread David Miller
From: Dany Madden 
Date: Mon, 14 Sep 2020 20:35:35 -0400

> Update supporters for IBM Power SRIOV Virtual NIC Device Driver. 
> Thomas Falcon is moving on to other works. Dany Madden, Lijun Pan 
> and Sukadev Bhattiprolu are the current supporters.
> 
> Signed-off-by: Dany Madden 

Applied.


Re: [PATCH] scsi: ibmvfc: Fix error return in ibmvfc_probe()

2020-09-15 Thread Martin K. Petersen
On Mon, 7 Sep 2020 16:39:49 +0800, Jing Xiangfeng wrote:

> Fix to return error code PTR_ERR() from the error handling case instead
> of 0.

Applied to 5.10/scsi-queue, thanks!

[1/1] scsi: ibmvfc: Fix error return in ibmvfc_probe()
  https://git.kernel.org/mkp/scsi/c/5e48a084f4e8

-- 
Martin K. Petersen  Oracle Linux Engineering


[PATCH v3] pseries/hotplug-memory: hot-add: skip redundant LMB lookup

2020-09-15 Thread Scott Cheloha
During memory hot-add, dlpar_add_lmb() calls memory_add_physaddr_to_nid()
to determine which node id (nid) to use when later calling __add_memory().

This is wasteful.  On pseries, memory_add_physaddr_to_nid() finds an
appropriate nid for a given address by looking up the LMB containing the
address and then passing that LMB to of_drconf_to_nid_single() to get the
nid.  In dlpar_add_lmb() we get this address from the LMB itself.

In short, we have a pointer to an LMB and then we are searching for
that LMB *again* in order to find its nid.

If we call of_drconf_to_nid_single() directly from dlpar_add_lmb() we
can skip the redundant lookup.  The only error handling we need to
duplicate from memory_add_physaddr_to_nid() is the fallback to the
default nid when drconf_to_nid_single() returns -1 (NUMA_NO_NODE) or
an invalid nid.

Skipping the extra lookup makes hot-add operations faster, especially
on machines with many LMBs.

Consider an LPAR with 126976 LMBs.  In one test, hot-adding 126000
LMBs on an upatched kernel took ~3.5 hours while a patched kernel
completed the same operation in ~2 hours:

Unpatched (12450 seconds):
Sep  9 04:06:31 ltc-brazos1 drmgr[810169]: drmgr: -c mem -a -q 126000
Sep  9 04:06:31 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 
126000 LMB(s)
[...]
Sep  9 07:34:01 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 2000 
(drc index 8002) was hot-added

Patched (7065 seconds):
Sep  8 21:49:57 ltc-brazos1 drmgr[877703]: drmgr: -c mem -a -q 126000
Sep  8 21:49:57 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 
126000 LMB(s)
[...]
Sep  8 23:27:42 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 2000 
(drc index 8002) was hot-added

It should be noted that the speedup grows more substantial when
hot-adding LMBs at the end of the drconf range.  This is because we
are skipping a linear LMB search.

To see the distinction, consider smaller hot-add test on the same
LPAR.  A perf-stat run with 10 iterations showed that hot-adding 4096
LMBs completed less than 1 second faster on a patched kernel:

Unpatched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

104,753.42 msec task-clock#0.992 CPUs utilized  
  ( +-  0.55% )
 4,708  context-switches  #0.045 K/sec  
  ( +-  0.69% )
 2,444  cpu-migrations#0.023 K/sec  
  ( +-  1.25% )
   394  page-faults   #0.004 K/sec  
  ( +-  0.22% )
   445,902,503,057  cycles#4.257 GHz
  ( +-  0.55% )  (66.67%)
 8,558,376,740  stalled-cycles-frontend   #1.92% frontend cycles 
idle ( +-  0.88% )  (49.99%)
   300,346,181,651  stalled-cycles-backend#   67.36% backend cycles 
idle  ( +-  0.76% )  (50.01%)
   258,091,488,691  instructions  #0.58  insn per cycle
  #1.16  stalled cycles per 
insn  ( +-  0.22% )  (66.67%)
70,568,169,256  branches  #  673.660 M/sec  
  ( +-  0.17% )  (50.01%)
 3,100,725,426  branch-misses #4.39% of all branches
  ( +-  0.20% )  (49.99%)

   105.583 +- 0.589 seconds time elapsed  ( +-  0.56% )

Patched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

104,055.69 msec task-clock#0.993 CPUs utilized  
  ( +-  0.32% )
 4,606  context-switches  #0.044 K/sec  
  ( +-  0.20% )
 2,463  cpu-migrations#0.024 K/sec  
  ( +-  0.93% )
   394  page-faults   #0.004 K/sec  
  ( +-  0.25% )
   442,951,129,921  cycles#4.257 GHz
  ( +-  0.32% )  (66.66%)
 8,710,413,329  stalled-cycles-frontend   #1.97% frontend cycles 
idle ( +-  0.47% )  (50.06%)
   299,656,905,836  stalled-cycles-backend#   67.65% backend cycles 
idle  ( +-  0.39% )  (50.02%)
   252,731,168,193  instructions  #0.57  insn per cycle
  #1.19  stalled cycles per 
insn  ( +-  0.20% )  (66.66%)
68,902,851,121  branches  #  662.173 M/sec  
  ( +-  0.13% )  (49.94%)
 3,100,242,882  branch-misses #4.50% of all branches
  ( +-  0.15% )  (49.98%)

   104.829 +- 0.325 seconds time elapsed  ( +-  0.31% )

This is consistent.  An add-by-count hot-add operation adds LMBs
greedily, so LMBs near the start of the drconf range are considered
first.  On an otherwise idle LPAR with so many LMBs we would expect to
find the LMBs we need near the start of the drconf range, hence the
smaller speedup.

Signed-off-by: Scott Cheloha 
---

Re: [PATCH v2 3/4] sparc64: remove mm_cpumask clearing to fix kthread_use_mm race

2020-09-15 Thread David Miller
From: Nicholas Piggin 
Date: Tue, 15 Sep 2020 13:24:07 +1000

> Excerpts from David Miller's message of September 15, 2020 5:59 am:
>> From: Nicholas Piggin 
>> Date: Mon, 14 Sep 2020 14:52:18 +1000
>> 
>>  ...
>>> The basic fix for sparc64 is to remove its mm_cpumask clearing code. The
>>> optimisation could be effectively restored by sending IPIs to mm_cpumask
>>> members and having them remove themselves from mm_cpumask. This is more
>>> tricky so I leave it as an exercise for someone with a sparc64 SMP.
>>> powerpc has a (currently similarly broken) example.
>>> 
>>> Signed-off-by: Nicholas Piggin 
>> 
>> Sad to see this optimization go away, but what can I do:
>> 
>> Acked-by: David S. Miller 
>> 
> 
> Thanks Dave, any objection if we merge this via the powerpc tree
> to keep the commits together?

No objection.


Re: [PATCH] Revert "powerpc/64s: machine check interrupt update NMI accounting"

2020-09-15 Thread peterz
On Tue, Sep 15, 2020 at 08:06:59PM +0200, Michal Suchanek wrote:
> This reverts commit 116ac378bb3ff844df333e7609e7604651a0db9d.
> 
> This commit causes the kernel to oops and reboot when injecting a SLB
> multihit which causes a MCE.
> 
> Before this commit a SLB multihit was corrected by the kernel and the
> system continued to operate normally.
> 
> cc: sta...@vger.kernel.org
> Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI 
> accounting")
> Signed-off-by: Michal Suchanek 

Ever since 69ea03b56ed2 ("hardirq/nmi: Allow nested nmi_enter()")
nmi_enter() supports nesting natively.


[PATCH] Revert "powerpc/64s: machine check interrupt update NMI accounting"

2020-09-15 Thread Michal Suchanek
This reverts commit 116ac378bb3ff844df333e7609e7604651a0db9d.

This commit causes the kernel to oops and reboot when injecting a SLB
multihit which causes a MCE.

Before this commit a SLB multihit was corrected by the kernel and the
system continued to operate normally.

cc: sta...@vger.kernel.org
Fixes: 116ac378bb3f ("powerpc/64s: machine check interrupt update NMI 
accounting")
Signed-off-by: Michal Suchanek 
---
 arch/powerpc/kernel/mce.c   |  7 ---
 arch/powerpc/kernel/traps.c | 18 +++---
 2 files changed, 3 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index ada59f6c4298..2e13528dcc92 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -591,14 +591,10 @@ EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 long notrace machine_check_early(struct pt_regs *regs)
 {
long handled = 0;
-   bool nested = in_nmi();
u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
 
this_cpu_set_ftrace_enabled(0);
 
-   if (!nested)
-   nmi_enter();
-
hv_nmi_check_nonrecoverable(regs);
 
/*
@@ -607,9 +603,6 @@ long notrace machine_check_early(struct pt_regs *regs)
if (ppc_md.machine_check_early)
handled = ppc_md.machine_check_early(regs);
 
-   if (!nested)
-   nmi_exit();
-
this_cpu_set_ftrace_enabled(ftrace_enabled);
 
return handled;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index d1ebe152f210..7853b770918d 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -827,19 +827,7 @@ void machine_check_exception(struct pt_regs *regs)
 {
int recover = 0;
 
-   /*
-* BOOK3S_64 does not call this handler as a non-maskable interrupt
-* (it uses its own early real-mode handler to handle the MCE proper
-* and then raises irq_work to call this handler when interrupts are
-* enabled).
-*
-* This is silly. The BOOK3S_64 should just call a different function
-* rather than expecting semantics to magically change. Something
-* like 'non_nmi_machine_check_exception()', perhaps?
-*/
-   const bool nmi = !IS_ENABLED(CONFIG_PPC_BOOK3S_64);
-
-   if (nmi) nmi_enter();
+   nmi_enter();
 
__this_cpu_inc(irq_stat.mce_exceptions);
 
@@ -865,7 +853,7 @@ void machine_check_exception(struct pt_regs *regs)
if (check_io_access(regs))
goto bail;
 
-   if (nmi) nmi_exit();
+   nmi_exit();
 
die("Machine check", regs, SIGBUS);
 
@@ -876,7 +864,7 @@ void machine_check_exception(struct pt_regs *regs)
return;
 
 bail:
-   if (nmi) nmi_exit();
+   nmi_exit();
 }
 
 void SMIException(struct pt_regs *regs)
-- 
2.28.0



[PATCH v2] pseries/hotplug-memory: hot-add: skip redundant LMB lookup

2020-09-15 Thread Scott Cheloha
During memory hot-add, dlpar_add_lmb() calls memory_add_physaddr_to_nid()
to determine which node id (nid) to use when later calling __add_memory().

This is wasteful.  On pseries, memory_add_physaddr_to_nid() finds an
appropriate nid for a given address by looking up the LMB containing the
address and then passing that LMB to of_drconf_to_nid_single() to get the
nid.  In dlpar_add_lmb() we get this address from the LMB itself.

In short, we have a pointer to an LMB and then we are searching for
that LMB *again* in order to find its nid.

If we call of_drconf_to_nid_single() directly from dlpar_add_lmb() we
can skip the redundant lookup.  The only error handling we need to
duplicate from memory_add_physaddr_to_nid() is the fallback to the
default nid when drconf_to_nid_single() returns -1 (NUMA_NO_NODE) or
an invalid nid.

Skipping the extra lookup makes hot-add operations faster, especially
on machines with many LMBs.

Consider an LPAR with 126976 LMBs.  In one test, hot-adding 126000
LMBs on an upatched kernel took ~3.5 hours while a patched kernel
completed the same operation in ~2 hours:

Unpatched (12450 seconds):
Sep  9 04:06:31 ltc-brazos1 drmgr[810169]: drmgr: -c mem -a -q 126000
Sep  9 04:06:31 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 
126000 LMB(s)
[...]
Sep  9 07:34:01 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 2000 
(drc index 8002) was hot-added

Patched (7065 seconds):
Sep  8 21:49:57 ltc-brazos1 drmgr[877703]: drmgr: -c mem -a -q 126000
Sep  8 21:49:57 ltc-brazos1 kernel: pseries-hotplug-mem: Attempting to hot-add 
126000 LMB(s)
[...]
Sep  8 23:27:42 ltc-brazos1 kernel: pseries-hotplug-mem: Memory at 2000 
(drc index 8002) was hot-added

It should be noted that the speedup grows more substantial when
hot-adding LMBs at the end of the drconf range.  This is because we
are skipping a linear LMB search.

To see the distinction, consider smaller hot-add test on the same
LPAR.  A perf-stat run with 10 iterations showed that hot-adding 4096
LMBs completed less than 1 second faster on a patched kernel:

Unpatched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

104,753.42 msec task-clock#0.992 CPUs utilized  
  ( +-  0.55% )
 4,708  context-switches  #0.045 K/sec  
  ( +-  0.69% )
 2,444  cpu-migrations#0.023 K/sec  
  ( +-  1.25% )
   394  page-faults   #0.004 K/sec  
  ( +-  0.22% )
   445,902,503,057  cycles#4.257 GHz
  ( +-  0.55% )  (66.67%)
 8,558,376,740  stalled-cycles-frontend   #1.92% frontend cycles 
idle ( +-  0.88% )  (49.99%)
   300,346,181,651  stalled-cycles-backend#   67.36% backend cycles 
idle  ( +-  0.76% )  (50.01%)
   258,091,488,691  instructions  #0.58  insn per cycle
  #1.16  stalled cycles per 
insn  ( +-  0.22% )  (66.67%)
70,568,169,256  branches  #  673.660 M/sec  
  ( +-  0.17% )  (50.01%)
 3,100,725,426  branch-misses #4.39% of all branches
  ( +-  0.20% )  (49.99%)

   105.583 +- 0.589 seconds time elapsed  ( +-  0.56% )

Patched:
 Performance counter stats for 'drmgr -c mem -a -q 4096' (10 runs):

104,055.69 msec task-clock#0.993 CPUs utilized  
  ( +-  0.32% )
 4,606  context-switches  #0.044 K/sec  
  ( +-  0.20% )
 2,463  cpu-migrations#0.024 K/sec  
  ( +-  0.93% )
   394  page-faults   #0.004 K/sec  
  ( +-  0.25% )
   442,951,129,921  cycles#4.257 GHz
  ( +-  0.32% )  (66.66%)
 8,710,413,329  stalled-cycles-frontend   #1.97% frontend cycles 
idle ( +-  0.47% )  (50.06%)
   299,656,905,836  stalled-cycles-backend#   67.65% backend cycles 
idle  ( +-  0.39% )  (50.02%)
   252,731,168,193  instructions  #0.57  insn per cycle
  #1.19  stalled cycles per 
insn  ( +-  0.20% )  (66.66%)
68,902,851,121  branches  #  662.173 M/sec  
  ( +-  0.13% )  (49.94%)
 3,100,242,882  branch-misses #4.50% of all branches
  ( +-  0.15% )  (49.98%)

   104.829 +- 0.325 seconds time elapsed  ( +-  0.31% )

This is consistent.  An add-by-count hot-add operation adds LMBs
greedily, so LMBs near the start of the drconf range are considered
first.  On an otherwise idle LPAR with so many LMBs we would expect to
find the LMBs we need near the start of the drconf range, hence the
smaller speedup.

Signed-off-by: Scott Cheloha 
---

Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread John Hubbard

On 9/11/20 1:36 PM, Vasily Gorbik wrote:

Currently to make sure that every page table entry is read just once
gup_fast walks perform READ_ONCE and pass pXd value down to the next
gup_pXd_range function by value e.g.:

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
  unsigned int flags, struct page **pages, int *nr)
...
	pudp = pud_offset(&p4d, addr);

This function passes a reference on that local value copy to pXd_offset,
and might get the very same pointer in return. This happens when the
level is folded (on most arches), and that pointer should not be iterated.

On s390 due to the fact that each task might have different 5,4 or
3-level address translation and hence different levels folded the logic
is more complex and non-iteratable pointer to a local copy leads to
severe problems.

Here is an example of what happens with gup_fast on s390, for a task
with 3-levels paging, crossing a 2 GB pud boundary:

// addr = 0x1007000, end = 0x10080001000
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
  unsigned int flags, struct page **pages, int *nr)
{
 unsigned long next;
 pud_t *pudp;

	// pud_offset returns &p4d itself (a pointer to a value on stack)
	pudp = pud_offset(&p4d, addr);
 do {
		// on second iteration reading "random" stack value
 pud_t pud = READ_ONCE(*pudp);

 // next = 0x1008000, due to PUD_SIZE/MASK != 
PGDIR_SIZE/MASK on s390
 next = pud_addr_end(addr, end);
 ...
 } while (pudp++, addr = next, addr != end); // pudp++ iterating over 
stack

 return 1;
}

This happens since s390 moved to common gup code with
commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
and commit 1a42010cdc26 ("s390/mm: convert to the generic
get_user_pages_fast code"). s390 tried to mimic static level folding by
changing pXd_offset primitives to always calculate top level page table
offset in pgd_offset and just return the value passed when pXd_offset
has to act as folded.

What is crucial for gup_fast and what has been overlooked is
that PxD_SIZE/MASK and thus pXd_addr_end should also change
correspondingly. And the latter is not possible with dynamic folding.

To fix the issue in addition to pXd values pass original
pXdp pointers down to gup_pXd_range functions. And introduce
pXd_offset_lockless helpers, which take an additional pXd
entry value parameter. This has already been discussed in
https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1

Cc:  # 5.2+
Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast code")
Reviewed-by: Gerald Schaefer 
Reviewed-by: Alexander Gordeev 
Signed-off-by: Vasily Gorbik 
---


Looks cleaner than I'd dared hope for. :)

Reviewed-by: John Hubbard 


thanks,
--
John Hubbard
NVIDIA


v2: added brackets &pgd -> &(pgd)

  arch/s390/include/asm/pgtable.h | 42 +++--
  include/linux/pgtable.h | 10 
  mm/gup.c| 18 +++---
  3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7eb01a5459cd..b55561cc8786 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, 
unsigned long address)
  
  #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
  
-static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)

+static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long 
address)
  {
-   if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
-   return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
-   return (p4d_t *) pgd;
+   if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
+   return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
+   return (p4d_t *) pgdp;
  }
+#define p4d_offset_lockless p4d_offset_lockless
  
-static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)

+static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
  {
-   if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
-   return (pud_t *) p4d_deref(*p4d) + pud_index(address);
-   return (pud_t *) p4d;
+   return p4d_offset_lockless(pgdp, *pgdp, address);
+}
+
+static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long 
address)
+{
+   if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
+   return (pud_t *) p4d_deref(p4d) + pud_index(address);
+   return (pud_t *) p4dp;
+}
+#define pud_offset_lockless pud_offset_lockless
+
+static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long address)
+{
+   return pud_offset_lockless(p4dp, *p4dp, 

Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread Mike Rapoport
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently to make sure that every page table entry is read just once
> gup_fast walks perform READ_ONCE and pass pXd value down to the next
> gup_pXd_range function by value e.g.:
> 
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>  unsigned int flags, struct page **pages, int *nr)
> ...
> pudp = pud_offset(&p4d, addr);
> 
> This function passes a reference on that local value copy to pXd_offset,
> and might get the very same pointer in return. This happens when the
> level is folded (on most arches), and that pointer should not be iterated.
> 
> On s390 due to the fact that each task might have different 5,4 or
> 3-level address translation and hence different levels folded the logic
> is more complex and non-iteratable pointer to a local copy leads to
> severe problems.
> 
> Here is an example of what happens with gup_fast on s390, for a task
> with 3-levels paging, crossing a 2 GB pud boundary:
> 
> // addr = 0x1007000, end = 0x10080001000
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>  unsigned int flags, struct page **pages, int *nr)
> {
> unsigned long next;
> pud_t *pudp;
> 
> // pud_offset returns &p4d itself (a pointer to a value on stack)
> pudp = pud_offset(&p4d, addr);
> do {
> // on second iteration reading "random" stack value
> pud_t pud = READ_ONCE(*pudp);
> 
> // next = 0x1008000, due to PUD_SIZE/MASK != 
> PGDIR_SIZE/MASK on s390
> next = pud_addr_end(addr, end);
> ...
> } while (pudp++, addr = next, addr != end); // pudp++ iterating over 
> stack
> 
> return 1;
> }
> 
> This happens since s390 moved to common gup code with
> commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
> and commit 1a42010cdc26 ("s390/mm: convert to the generic
> get_user_pages_fast code"). s390 tried to mimic static level folding by
> changing pXd_offset primitives to always calculate top level page table
> offset in pgd_offset and just return the value passed when pXd_offset
> has to act as folded.
> 
> What is crucial for gup_fast and what has been overlooked is
> that PxD_SIZE/MASK and thus pXd_addr_end should also change
> correspondingly. And the latter is not possible with dynamic folding.
> 
> To fix the issue in addition to pXd values pass original
> pXdp pointers down to gup_pXd_range functions. And introduce
> pXd_offset_lockless helpers, which take an additional pXd
> entry value parameter. This has already been discussed in
> https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
> 
> Cc:  # 5.2+
> Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast 
> code")
> Reviewed-by: Gerald Schaefer 
> Reviewed-by: Alexander Gordeev 
> Signed-off-by: Vasily Gorbik 

Reviewed-by: Mike Rapoport 

> ---
> v2: added brackets &pgd -> &(pgd)
> 
>  arch/s390/include/asm/pgtable.h | 42 +++--
>  include/linux/pgtable.h | 10 
>  mm/gup.c| 18 +++---
>  3 files changed, 49 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
> index 7eb01a5459cd..b55561cc8786 100644
> --- a/arch/s390/include/asm/pgtable.h
> +++ b/arch/s390/include/asm/pgtable.h
> @@ -1260,26 +1260,44 @@ static inline pgd_t *pgd_offset_raw(pgd_t *pgd, 
> unsigned long address)
>  
>  #define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address)
>  
> -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
> +static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned 
> long address)
>  {
> - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
> - return (p4d_t *) pgd_deref(*pgd) + p4d_index(address);
> - return (p4d_t *) pgd;
> + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1)
> + return (p4d_t *) pgd_deref(pgd) + p4d_index(address);
> + return (p4d_t *) pgdp;
>  }
> +#define p4d_offset_lockless p4d_offset_lockless
>  
> -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
> +static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long address)
>  {
> - if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
> - return (pud_t *) p4d_deref(*p4d) + pud_index(address);
> - return (pud_t *) p4d;
> + return p4d_offset_lockless(pgdp, *pgdp, address);
> +}
> +
> +static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned 
> long address)
> +{
> + if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2)
> + return (pud_t *) p4d_deref(p4d) + pud_index(address);
> + return (pud_t *) p4dp;
> +}
> +#define pud_offset_lockless pud_offset_lockless
> +
> 

Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread Jason Gunthorpe
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently to make sure that every page table entry is read just once
> gup_fast walks perform READ_ONCE and pass pXd value down to the next
> gup_pXd_range function by value e.g.:
> 
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>  unsigned int flags, struct page **pages, int *nr)
> ...
> pudp = pud_offset(&p4d, addr);
> 
> This function passes a reference on that local value copy to pXd_offset,
> and might get the very same pointer in return. This happens when the
> level is folded (on most arches), and that pointer should not be iterated.
> 
> On s390 due to the fact that each task might have different 5,4 or
> 3-level address translation and hence different levels folded the logic
> is more complex and non-iteratable pointer to a local copy leads to
> severe problems.
> 
> Here is an example of what happens with gup_fast on s390, for a task
> with 3-levels paging, crossing a 2 GB pud boundary:
> 
> // addr = 0x1007000, end = 0x10080001000
> static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
>  unsigned int flags, struct page **pages, int *nr)
> {
> unsigned long next;
> pud_t *pudp;
> 
> // pud_offset returns &p4d itself (a pointer to a value on stack)
> pudp = pud_offset(&p4d, addr);
> do {
> // on second iteration reading "random" stack value
> pud_t pud = READ_ONCE(*pudp);
> 
> // next = 0x1008000, due to PUD_SIZE/MASK != 
> PGDIR_SIZE/MASK on s390
> next = pud_addr_end(addr, end);
> ...
> } while (pudp++, addr = next, addr != end); // pudp++ iterating over 
> stack
> 
> return 1;
> }
> 
> This happens since s390 moved to common gup code with
> commit d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust")
> and commit 1a42010cdc26 ("s390/mm: convert to the generic
> get_user_pages_fast code"). s390 tried to mimic static level folding by
> changing pXd_offset primitives to always calculate top level page table
> offset in pgd_offset and just return the value passed when pXd_offset
> has to act as folded.
> 
> What is crucial for gup_fast and what has been overlooked is
> that PxD_SIZE/MASK and thus pXd_addr_end should also change
> correspondingly. And the latter is not possible with dynamic folding.
> 
> To fix the issue in addition to pXd values pass original
> pXdp pointers down to gup_pXd_range functions. And introduce
> pXd_offset_lockless helpers, which take an additional pXd
> entry value parameter. This has already been discussed in
> https://lkml.kernel.org/r/20190418100218.0a4afd51@mschwideX1
> 
> Cc:  # 5.2+
> Fixes: 1a42010cdc26 ("s390/mm: convert to the generic get_user_pages_fast 
> code")
> Reviewed-by: Gerald Schaefer 
> Reviewed-by: Alexander Gordeev 
> Signed-off-by: Vasily Gorbik 
> ---
> v2: added brackets &pgd -> &(pgd)

Reviewed-by: Jason Gunthorpe 

Regards,
Jason


Re: [PATCH v2] mm/gup: fix gup_fast with dynamic page table folding

2020-09-15 Thread Vasily Gorbik
On Fri, Sep 11, 2020 at 10:36:43PM +0200, Vasily Gorbik wrote:
> Currently to make sure that every page table entry is read just once
> gup_fast walks perform READ_ONCE and pass pXd value down to the next
> gup_pXd_range function by value e.g.:
...snip...
> ---
> v2: added brackets &pgd -> &(pgd)
> 
>  arch/s390/include/asm/pgtable.h | 42 +++--
>  include/linux/pgtable.h | 10 
>  mm/gup.c| 18 +++---
>  3 files changed, 49 insertions(+), 21 deletions(-)

Andrew, any chance you would pick this up?

There is an Ack from Linus. And I haven't seen any objections from Jason or 
John.
This seems to be as safe for other architectures as possible.

@Jason and John
Any acks/nacks?

Thank you,
Vasily


Re: [PATCH v2 2/7] powerpc/prom: Introduce early_reserve_mem_old()

2020-09-15 Thread Christophe Leroy

Cédric Le Goater  a écrit :


and condition its call with IS_ENABLED(CONFIG_PPC32). This fixes a
compile error with W=1.

arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’:
arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set  
but not used [-Werror=unused-but-set-variable]

  __be64 *reserve_map;
  ^~~
cc1: all warnings being treated as errors

Cc: Christophe Leroy 


@csgroup.eu instead of @c-s.fr please


Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kernel/prom.c | 37 -
 1 file changed, 20 insertions(+), 17 deletions(-)


That's a lot of changes for a tiny warning.

You could make it easy by just replacing the #ifdef by:

if (!IS_ENABLED(CONFIG_PPC32))
return;



diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d8a2fb87ba0c..c958b67cf1a5 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -620,27 +620,14 @@ static void __init early_reserve_mem_dt(void)
}
 }

-static void __init early_reserve_mem(void)
+static void __init early_reserve_mem_old(void)


Why _old ? Do you mean ppc32 are old ? Modern ADSL boxes like for  
instance the famous French freebox have powerpc32 microcontroller.
Eventually you could name it _ppc32, but I don't think that's the good  
way, see above.


Christophe


 {
__be64 *reserve_map;

reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
fdt_off_mem_rsvmap(initial_boot_params));

-   /* Look for the new "reserved-regions" property in the DT */
-   early_reserve_mem_dt();
-
-#ifdef CONFIG_BLK_DEV_INITRD
-   /* Then reserve the initrd, if any */
-   if (initrd_start && (initrd_end > initrd_start)) {
-   memblock_reserve(ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
-   ALIGN(initrd_end, PAGE_SIZE) -
-   ALIGN_DOWN(initrd_start, PAGE_SIZE));
-   }
-#endif /* CONFIG_BLK_DEV_INITRD */
-
-#ifdef CONFIG_PPC32
-   /*
+   /*
 * Handle the case where we might be booting from an old kexec
 * image that setup the mem_rsvmap as pairs of 32-bit values
 */
@@ -658,9 +645,25 @@ static void __init early_reserve_mem(void)
DBG("reserving: %x -> %x\n", base_32, size_32);
memblock_reserve(base_32, size_32);
}
-   return;
}
-#endif
+}
+
+static void __init early_reserve_mem(void)
+{
+   /* Look for the new "reserved-regions" property in the DT */
+   early_reserve_mem_dt();
+
+#ifdef CONFIG_BLK_DEV_INITRD
+   /* Then reserve the initrd, if any */
+   if (initrd_start && (initrd_end > initrd_start)) {
+   memblock_reserve(ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
+   ALIGN(initrd_end, PAGE_SIZE) -
+   ALIGN_DOWN(initrd_start, PAGE_SIZE));
+   }
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+   if (IS_ENABLED(CONFIG_PPC32))
+   early_reserve_mem_old();
 }

 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
--
2.25.4





Re: [5.9.0-rc5-20200914] Kernel crash while running LTP(mlock201)

2020-09-15 Thread Sachin Sant


> On 15-Sep-2020, at 6:39 PM, Matthew Wilcox  wrote:
> 
> On Tue, Sep 15, 2020 at 09:24:38PM +1000, Michael Ellerman wrote:
>> Sachin Sant  writes:
>>> While running LTP tests (specifically mlock201) against next-20200914 tree
>>> on a POWER9 LPAR results in following crash.
>> 
>> Looks the same as:
>> 
>> https://lore.kernel.org/linux-mm/20200914085545.GB28738@shao2-debian/
> 
> https://lore.kernel.org/linux-mm/20200914112738.gm6...@casper.infradead.org/

Thanks. The patch fixes the problem for me.

Tested-by: Sachin Sant 

thanks
-Sachin



Re: [PATCH 14/15] selftests/clone3: Avoid OS-defined clone_args

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:19AM -0700, Kees Cook wrote:
> As the UAPI headers start to appear in distros, we need to avoid
> outdated versions of struct clone_args to be able to test modern
> features. Additionally pull in the syscall numbers correctly.
> 
> Signed-off-by: Kees Cook 
> ---

Hm, with this patch applied I'm getting:

gcc -g -I../../../../usr/include/clone3_set_tid.c 
/home/brauner/src/git/linux/linux/tools/testing/selftests/kselftest_harness.h 
/home/brauner/src/git/linux/linux/tools/testing/selftests/kselftest.h -lcap -o 
/home/brauner/src/git/linux/linux/tools/testing/selftests/clone3/clone3_set_tid
In file included from clone3_set_tid.c:24:
clone3_selftests.h:37:8: error: redefinition of ‘struct clone_args’
   37 | struct clone_args {
  |^~
In file included from clone3_set_tid.c:12:
/usr/include/linux/sched.h:92:8: note: originally defined here
   92 | struct clone_args {
  |^~
make: *** [../lib.mk:140: 
/home/brauner/src/git/linux/linux/tools/testing/selftests/clone3/clone3_set_tid]
 Error 1

One trick to avoid this could be:

#ifndef CLONE_ARGS_SIZE_VER0
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#endif

#ifndef CLONE_ARGS_SIZE_VER1
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#endif

#ifndef CLONE_ARGS_SIZE_VER2
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
#endif

struct __clone_args {
__aligned_u64 flags;
__aligned_u64 pidfd;
__aligned_u64 child_tid;
__aligned_u64 parent_tid;
__aligned_u64 exit_signal;
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
__aligned_u64 cgroup;
};

static pid_t sys_clone3(struct __clone_args *args, size_t size)
{
return syscall(__NR_clone3, args, size);
}

Christian


Re: [PATCH 15/15] selftests/seccomp: Use __NR_mknodat instead of __NR_mknod

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:20AM -0700, Kees Cook wrote:
> The __NR_mknod syscall doesn't exist on arm64 (only __NR_mknodat).
> Switch to the modern syscall.
> 
> Fixes: ad5682184a81 ("selftests/seccomp: Check for EPOLLHUP for user_notif")
> Signed-off-by: Kees Cook 
> ---

Thanks! Looks good.
Acked-by: Christian Brauner 


Re: [PATCH 11/15] selftests/seccomp: Remove SYSCALL_NUM_RET_SHARE_REG in favor of SYSCALL_RET_SET

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:16AM -0700, Kees Cook wrote:
> Instead of special-casing the specific case of shared registers, create
> a default SYSCALL_RET_SET() macro (mirroring SYSCALL_NUM_SET()), that
> writes to the SYSCALL_RET register. For architectures that can't set the
> return value (for whatever reason), they can define SYSCALL_RET_SET()
> without an associated SYSCALL_RET() macro. This also paves the way for
> architectures that need to do special things to set the return value
> (e.g. powerpc).
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 10/15] selftests/seccomp: Avoid redundant register flushes

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:15AM -0700, Kees Cook wrote:
> When none of the registers have changed, don't flush them back. This can
> happen if the architecture uses a non-register way to change the syscall
> (e.g. arm64) , and a return value hasn't been written.
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 09/15] selftests/seccomp: Convert REGSET calls into ARCH_GETREG/ARCH_SETREG

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:14AM -0700, Kees Cook wrote:
> Consolidate the REGSET logic into the new ARCH_GETREG() and
> ARCH_SETREG() macros, avoiding more #ifdef code in function bodies.
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH v2] powerpc/papr_scm: Fix warning triggered by perf_stats_show()

2020-09-15 Thread Vaibhav Jain
Michael Ellerman  writes:

> Vaibhav Jain  writes:
>> A warning is reported by the kernel in case perf_stats_show() returns
>> an error code. The warning is of the form below:
>>
>>  papr_scm ibm,persistent-memory:ibm,pmemory@4411:
>>Failed to query performance stats, Err:-10
>>  dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
>>  fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count
>>
>> On investigation it looks like that the compiler is silently truncating the
>> return value of drc_pmem_query_stats() from 'long' to 'int', since the
>> variable used to store the return code 'rc' is an 'int'. This
>> truncated value is then returned back as a 'ssize_t' back from
>> perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
>> unsigned number and triggers this warning..
>>
>> To fix this we update the type of variable 'rc' from 'int' to
>> 'ssize_t' that prevents the compiler from truncating the return value
>> of drc_pmem_query_stats() and returning correct signed value back from
>> perf_stats_show().
>>
>> Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
>>stats from PHYP')
>
> Please don't word wrap the Fixes tag it breaks b4.
>
> I've fixed it up this time.

Thanks Mpe

>
> cheers

-- 
Cheers
~ Vaibhav


Re: [PATCH 08/15] selftests/seccomp: Convert HAVE_GETREG into ARCH_GETREG/ARCH_SETREG

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:13AM -0700, Kees Cook wrote:
> Instead of special-casing the get/set-registers routines, move the
> HAVE_GETREG logic into the new ARCH_GETREG() and ARCH_SETREG() macros.
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 07/15] selftests/seccomp: Remove syscall setting #ifdefs

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:12AM -0700, Kees Cook wrote:
> With all architectures now using the common SYSCALL_NUM_SET() macro, the
> arch-specific #ifdef can be removed from change_syscall() itself.
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 06/15] selftests/seccomp: mips: Remove O32-specific macro

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:11AM -0700, Kees Cook wrote:
> Instead of having the mips O32 macro special-cased, pull the logic into
> the SYSCALL_NUM() macro. Additionally include the ABI headers, since
> these appear to have been missing, leaving __NR_O32_Linux undefined.
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 05/15] selftests/seccomp: arm64: Define SYSCALL_NUM_SET macro

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:10AM -0700, Kees Cook wrote:
> Remove the arm64 special-case in change_syscall().
> 
> Signed-off-by: Kees Cook 
> ---

We're using iovecs in ptrace()??

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 04/15] selftests/seccomp: arm: Define SYSCALL_NUM_SET macro

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:09AM -0700, Kees Cook wrote:
> Remove the arm special-case in change_syscall().
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 03/15] selftests/seccomp: mips: Define SYSCALL_NUM_SET macro

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:08AM -0700, Kees Cook wrote:
> Remove the mips special-case in change_syscall().
> 
> Signed-off-by: Kees Cook 
> ---
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 17 +
>  1 file changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
> b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 1c83e743bfb1..02a9a6599746 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -1742,6 +1742,13 @@ TEST_F(TRACE_poke, getpid_runs_normally)
>  # define ARCH_REGS   struct pt_regs
>  # define SYSCALL_NUM(_regs)  (_regs).regs[2]
>  # define SYSCALL_SYSCALL_NUM regs[4]
> +# define SYSCALL_NUM_SET(_regs, _nr) \
> + do {\
> + if ((_regs).regs[2] == __NR_O32_Linux)  \
> + (_regs).regs[4] = _nr;  \
> + else\
> + (_regs).regs[2] = _nr;  \
> + } while (0)

I think that

# define SYSCALL_NUM_SET(_regs, _nr)\
do {\
if (SYSCALL_NUM(_regs) == __NR_O32_Linux)   \
(_regs).regs[4] = _nr;  \
else\
(_regs).regs[2] = _nr;  \
} while (0)

would read better but that's just a matter of taste. :)

Looks good!
Acked-by: Christian Brauner 


Re: [PATCH 02/15] selftests/seccomp: Provide generic syscall setting macro

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:07AM -0700, Kees Cook wrote:
> In order to avoid "#ifdef"s in the main function bodies, create a new
> macro, SYSCALL_NUM_SET(), where arch-specific logic can live.
> 
> Signed-off-by: Kees Cook 
> ---

SYSCALL_SWITCH(_regs, nr)?

But looks good either way!
Acked-by: Christian Brauner 


Re: [PATCH 01/15] selftests/seccomp: Refactor arch register macros to avoid xtensa special case

2020-09-15 Thread Christian Brauner
On Sat, Sep 12, 2020 at 04:08:06AM -0700, Kees Cook wrote:
> To avoid an xtensa special-case, refactor all arch register macros to
> take the register variable instead of depending on the macro expanding
> as a struct member name.
> 
> Signed-off-by: Kees Cook 
> ---

Looks good!
Acked-by: Christian Brauner 


[Bug 209277] powerpc: obsolete driver: Marvell MV64X60 MPSC

2020-09-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=209277

--- Comment #1 from Necip Fazil Yildiran (fazilyildi...@gmail.com) ---
The config MV64X60 in arch/powerpc/platforms/embedded6xx/Kconfig is 
non-prompt selected nowhere -- thus, cannot be enabled.

In addition, a few other configs cannot be enabled due to their dependency
on MV64X60, e.g., EDAC_MV64X60.

The last to use this driver was by PrPMC 280/2800, for which the support
was ended with the commit 3c8464a9b12b 
("powerpc: Delete old PrPMC 280/2800 support").

This looks like the related configs (e.g., MV64X60, EDAC_MV64X60) and the
code (e.g., drivers/edac/mv64x60_edac.c) for Marvell MV64X60 MPSC are now
obsolete.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 209277] New: Dead code :q

2020-09-15 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=209277

Bug ID: 209277
   Summary: Dead code :q
   Product: Platform Specific/Hardware
   Version: 2.5
Kernel Version: 5.9-rc4
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: enhancement
  Priority: P1
 Component: PPC-32
  Assignee: platform_ppc...@kernel-bugs.osdl.org
  Reporter: fazilyildi...@gmail.com
Regression: No

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: [5.9.0-rc5-20200914] Kernel crash while running LTP(mlock201)

2020-09-15 Thread Matthew Wilcox
On Tue, Sep 15, 2020 at 09:24:38PM +1000, Michael Ellerman wrote:
> Sachin Sant  writes:
> > While running LTP tests (specifically mlock201) against next-20200914 tree
> > on a POWER9 LPAR results in following crash.
> 
> Looks the same as:
> 
> https://lore.kernel.org/linux-mm/20200914085545.GB28738@shao2-debian/

https://lore.kernel.org/linux-mm/20200914112738.gm6...@casper.infradead.org/


Re: Injecting SLB miltihit crashes kernel 5.9.0-rc5

2020-09-15 Thread Michael Ellerman
Michal Suchánek  writes:
> Hello,
>
> Using the SLB multihit injection test module (which I did not write so I
> do not want to post it here) to verify updates on my 5.3 frankenkernel
> I found that the kernel crashes with Oops: kernel bad access.
>
> I tested on latest upstream kernel build that I have at hand and the
> result is the same (minus the message - nothing was logged and the kernel
> simply rebooted).

That's disappointing.

> Since the whole effort to write a real mode MCE handler was supposed to
> prevent this maybe the SLB injection module should be added to the
> kernel selftests?

Yes I'd like to see it upstream. I think it should be integrated into
LKDTM, which contains other dangerous things like that and is designed
for testing how the kernel handles/recovers from bad conditions.

cheers


Re: [PATCH 00/15] selftests/seccomp: Refactor change_syscall()

2020-09-15 Thread Michael Ellerman
Kees Cook  writes:
> On Mon, Sep 14, 2020 at 10:15:18PM +1000, Michael Ellerman wrote:
>> Kees Cook  writes:
>> > Hi,
>> >
>> > This refactors the seccomp selftest macros used in change_syscall(),
>> > in an effort to remove special cases for mips, arm, arm64, and xtensa,
>> > which paves the way for powerpc fixes.
>> >
>> > I'm not entirely done testing, but all-arch build tests and x86_64
>> > selftests pass. I'll be doing arm, arm64, and i386 selftests shortly,
>> > but I currently don't have an easy way to check xtensa, mips, nor
>> > powerpc. Any help there would be appreciated!
>> 
>> The series builds fine for me, and all the tests pass (see below).
>> 
>> Thanks for picking up those changes to deal with powerpc being oddball.
>> 
>> Tested-by: Michael Ellerman  (powerpc)
>
> Awesome; thanks!
>
> However...
>
>> ./seccomp_bpf
>> TAP version 13
>> 1..86
>> # Starting 86 tests from 7 test cases.
>> #  RUN   global.kcmp ...
>> #OK  global.kcmp
>> ok 1 global.kcmp
>> [...]
>> #  RUN   global.KILL_thread ...
>> TAP version 13
>> 1..86
>> # Starting 86 tests from 7 test cases.
>
> Was this a mis-paste, or has something very very bad happened here in
> global.KILL_one_arg_six finishes?
>
...
>> TAP version 13
>> 1..86
>> # Starting 86 tests from 7 test cases.
>> [...]
>> # PASSED: 86 / 86 tests passed.
>> # Totals: pass:86 fail:0 xfail:0 xpass:0 skip:0 error:0
>
> And after every user_notification test? O_O

Haha, I thought that was normal :)

It's because of redirection, I run the tests with:

  find . -executable -type f -print -execdir '{}' ';' | tee test.log

If I just run it directly on the terminal everything is normal.

It'll be fork() vs libc buffering.

I can fix it with:

$ stdbuf -oL ./seccomp_bpf | tee test.log

Or the patch below.

I can send a proper patch for that tomorrow, I don't know that harness
code, but I think that's the right fix.

cheers


diff --git a/tools/testing/selftests/kselftest_harness.h 
b/tools/testing/selftests/kselftest_harness.h
index 4f78e4805633..b1bd00ff3d94 100644
--- a/tools/testing/selftests/kselftest_harness.h
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -971,6 +971,7 @@ void __run_test(struct __fixture_metadata *f,
 
ksft_print_msg(" RUN   %s%s%s.%s ...\n",
   f->name, variant->name[0] ? "." : "", variant->name, t->name);
+   fflush(stdout);
t->pid = fork();
if (t->pid < 0) {
ksft_print_msg("ERROR SPAWNING TEST CHILD\n");


[PATCH] powerpc/64s: move the last of the page fault handling logic to C

2020-09-15 Thread Nicholas Piggin
The page fault handling still has some complex logic particularly around
hash table handling, in asm. Implement this in C instead.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/bug.h|   1 +
 arch/powerpc/kernel/exceptions-64s.S  | 131 +-
 arch/powerpc/mm/book3s64/hash_utils.c |  77 +--
 arch/powerpc/mm/fault.c   |  55 ++-
 4 files changed, 124 insertions(+), 140 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 338f36cd9934..d714d83bbc7c 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -112,6 +112,7 @@
 
 struct pt_regs;
 extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
+extern int hash__do_page_fault(struct pt_regs *, unsigned long, unsigned long);
 extern void bad_page_fault(struct pt_regs *, unsigned long, int);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
 extern void _exception_pkey(struct pt_regs *, unsigned long, int);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index f7d748b88705..f830b893fe03 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1403,14 +1403,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
  *
  * Handling:
  * - Hash MMU
- *   Go to do_hash_page first to see if the HPT can be filled from an entry in
- *   the Linux page table. Hash faults can hit in kernel mode in a fairly
+ *   Go to do_hash_fault, which attempts to fill the HPT from an entry in the
+ *   Linux page table. Hash faults can hit in kernel mode in a fairly
  *   arbitrary state (e.g., interrupts disabled, locks held) when accessing
  *   "non-bolted" regions, e.g., vmalloc space. However these should always be
- *   backed by Linux page tables.
+ *   backed by Linux page table entries.
  *
- *   If none is found, do a Linux page fault. Linux page faults can happen in
- *   kernel mode due to user copy operations of course.
+ *   If no entry is found the Linux page fault handler is invoked (by
+ *   do_hash_fault). Linux page faults can happen in kernel mode due to user
+ *   copy operations of course.
  *
  * - Radix MMU
  *   The hardware loads from the Linux page table directly, so a fault goes
@@ -1438,13 +1439,17 @@ EXC_COMMON_BEGIN(data_access_common)
GEN_COMMON data_access
ld  r4,_DAR(r1)
ld  r5,_DSISR(r1)
+   addir3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-   ld  r6,_MSR(r1)
-   li  r3,0x300
-   b   do_hash_page/* Try to handle as hpte fault */
+   bl  do_hash_fault
 MMU_FTR_SECTION_ELSE
-   b   handle_page_fault
+   bl  do_page_fault
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+cmpdi  r3,0
+   beq+interrupt_return
+   /* We need to restore NVGPRS */
+   REST_NVGPRS(r1)
+   b   interrupt_return
 
GEN_KVM data_access
 
@@ -1539,13 +1544,17 @@ EXC_COMMON_BEGIN(instruction_access_common)
GEN_COMMON instruction_access
ld  r4,_DAR(r1)
ld  r5,_DSISR(r1)
+   addir3,r1,STACK_FRAME_OVERHEAD
 BEGIN_MMU_FTR_SECTION
-   ld  r6,_MSR(r1)
-   li  r3,0x400
-   b   do_hash_page/* Try to handle as hpte fault */
+   bl  do_hash_fault
 MMU_FTR_SECTION_ELSE
-   b   handle_page_fault
+   bl  do_page_fault
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+cmpdi  r3,0
+   beq+interrupt_return
+   /* We need to restore NVGPRS */
+   REST_NVGPRS(r1)
+   b   interrupt_return
 
GEN_KVM instruction_access
 
@@ -3197,99 +3206,3 @@ disable_machine_check:
RFI_TO_KERNEL
 1: mtlrr0
blr
-
-/*
- * Hash table stuff
- */
-   .balign IFETCH_ALIGN_BYTES
-do_hash_page:
-#ifdef CONFIG_PPC_BOOK3S_64
-   lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h
-   ori r0,r0,DSISR_BAD_FAULT_64S@l
-   and.r0,r5,r0/* weird error? */
-   bne-handle_page_fault   /* if not, try to insert a HPTE */
-
-   /*
-* If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
-* don't call hash_page, just fail the fault. This is required to
-* prevent re-entrancy problems in the hash code, namely perf
-* interrupts hitting while something holds H_PAGE_BUSY, and taking a
-* hash fault. See the comment in hash_preload().
-*/
-   ld  r11, PACA_THREAD_INFO(r13)
-   lwz r0,TI_PREEMPT(r11)
-   andis.  r0,r0,NMI_MASK@h
-   bne 77f
-
-   /*
-* r3 contains the trap number
-* r4 contains the faulting address
-* r5 contains dsisr
-* r6 msr
-*
-* at return r3 = 0 for success, 1 for page fault, negative for error
-*/
-   bl  __hash_page /* build 

[PATCH 6/6] powerpc/64: irq replay remove decrementer overflow check

2020-09-15 Thread Nicholas Piggin
This is an ad-hoc way to catch some cases of decrementer overflow. It
won't catch cases where interrupts were hard disabled before any soft
masked interrupts fired, for example. And it doesn't catch cases that
have overflowed an even number of times.

It's not clear exactly what problem is being solved here. A lost
timer when we have an IRQ off latency of more than ~4.3 seconds could
be avoided (so long as it's also less than ~8.6s) but this is already
a hard lockup order of magnitude event, and the decrementer will wrap
again and provide a timer interrupt within the same latency magnitude.

So the test catches some cases of lost decrementers in very exceptional
(buggy) latency event cases, reducing timer interrupt latency in that
case by up to 4.3 seconds. And for large decrementer, it's useless. It
is performed in potentially quite a hot path, reading the TB can be
a noticeable overhead.

Perhaps more importantly it allows the clunky MSR[EE] vs
PACA_IRQ_HARD_DIS incoherency to be removed.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/irq.c | 50 +--
 1 file changed, 1 insertion(+), 49 deletions(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 631e6d236c97..d7162f142f24 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -102,14 +102,6 @@ static inline notrace unsigned long get_irq_happened(void)
return happened;
 }
 
-static inline notrace int decrementer_check_overflow(void)
-{
-   u64 now = get_tb_or_rtc();
-   u64 *next_tb = this_cpu_ptr(_next_tb);
- 
-   return now >= *next_tb;
-}
-
 #ifdef CONFIG_PPC_BOOK3E
 
 /* This is called whenever we are re-enabling interrupts
@@ -142,35 +134,6 @@ notrace unsigned int __check_irq_replay(void)
trace_hardirqs_on();
trace_hardirqs_off();
 
-   /*
-* We are always hard disabled here, but PACA_IRQ_HARD_DIS may
-* not be set, which means interrupts have only just been hard
-* disabled as part of the local_irq_restore or interrupt return
-* code. In that case, skip the decrementr check becaus it's
-* expensive to read the TB.
-*
-* HARD_DIS then gets cleared here, but it's reconciled later.
-* Either local_irq_disable will replay the interrupt and that
-* will reconcile state like other hard interrupts. Or interrupt
-* retur will replay the interrupt and in that case it sets
-* PACA_IRQ_HARD_DIS by hand (see comments in entry_64.S).
-*/
-   if (happened & PACA_IRQ_HARD_DIS) {
-   local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
-
-   /*
-* We may have missed a decrementer interrupt if hard disabled.
-* Check the decrementer register in case we had a rollover
-* while hard disabled.
-*/
-   if (!(happened & PACA_IRQ_DEC)) {
-   if (decrementer_check_overflow()) {
-   local_paca->irq_happened |= PACA_IRQ_DEC;
-   happened |= PACA_IRQ_DEC;
-   }
-   }
-   }
-
if (happened & PACA_IRQ_DEC) {
local_paca->irq_happened &= ~PACA_IRQ_DEC;
return 0x900;
@@ -229,18 +192,6 @@ void replay_soft_interrupts(void)
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
WARN_ON_ONCE(mfmsr() & MSR_EE);
 
-   if (happened & PACA_IRQ_HARD_DIS) {
-   /*
-* We may have missed a decrementer interrupt if hard disabled.
-* Check the decrementer register in case we had a rollover
-* while hard disabled.
-*/
-   if (!(happened & PACA_IRQ_DEC)) {
-   if (decrementer_check_overflow())
-   happened |= PACA_IRQ_DEC;
-   }
-   }
-
/*
 * Force the delivery of pending soft-disabled interrupts on PS3.
 * Any HV call will have this side effect.
@@ -345,6 +296,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
WARN_ON_ONCE(!(mfmsr() & MSR_EE));
__hard_irq_disable();
+   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
} else {
/*
 * We should already be hard disabled here. We had bugs
-- 
2.23.0



[PATCH 5/6] powerpc/64: make restore_interrupts 64e only

2020-09-15 Thread Nicholas Piggin
This is not used by 64s.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/irq.c | 37 +++--
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index b725509f9073..631e6d236c97 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -191,6 +191,25 @@ notrace unsigned int __check_irq_replay(void)
 
return 0;
 }
+
+/*
+ * This is specifically called by assembly code to re-enable interrupts
+ * if they are currently disabled. This is typically called before
+ * schedule() or do_signal() when returning to userspace. We do it
+ * in C to avoid the burden of dealing with lockdep etc...
+ *
+ * NOTE: This is called with interrupts hard disabled but not marked
+ * as such in paca->irq_happened, so we need to resync this.
+ */
+void notrace restore_interrupts(void)
+{
+   if (irqs_disabled()) {
+   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+   local_irq_enable();
+   } else
+   __hard_irq_enable();
+}
+
 #endif /* CONFIG_PPC_BOOK3E */
 
 void replay_soft_interrupts(void)
@@ -364,24 +383,6 @@ notrace void arch_local_irq_restore(unsigned long mask)
 }
 EXPORT_SYMBOL(arch_local_irq_restore);
 
-/*
- * This is specifically called by assembly code to re-enable interrupts
- * if they are currently disabled. This is typically called before
- * schedule() or do_signal() when returning to userspace. We do it
- * in C to avoid the burden of dealing with lockdep etc...
- *
- * NOTE: This is called with interrupts hard disabled but not marked
- * as such in paca->irq_happened, so we need to resync this.
- */
-void notrace restore_interrupts(void)
-{
-   if (irqs_disabled()) {
-   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
-   local_irq_enable();
-   } else
-   __hard_irq_enable();
-}
-
 /*
  * This is a helper to use when about to go into idle low-power
  * when the latter has the side effect of re-enabling interrupts
-- 
2.23.0



[PATCH 4/6] powerpc/64e: remove 64s specific interrupt soft-mask code

2020-09-15 Thread Nicholas Piggin
Since the assembly soft-masking code was moved to 64e specific, there
are some 64s specific interrupt types still there. Remove them.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64e.S | 10 --
 arch/powerpc/kernel/irq.c|  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index ca444ca82b8d..f579ce46eef2 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -1302,16 +1302,6 @@ fast_exception_return:
addir3,r1,STACK_FRAME_OVERHEAD;
bl  do_IRQ
b   ret_from_except
-1: cmpwi   cr0,r3,0xf00
-   bne 1f
-   addir3,r1,STACK_FRAME_OVERHEAD;
-   bl  performance_monitor_exception
-   b   ret_from_except
-1: cmpwi   cr0,r3,0xe60
-   bne 1f
-   addir3,r1,STACK_FRAME_OVERHEAD;
-   bl  handle_hmi_exception
-   b   ret_from_except
 1: cmpwi   cr0,r3,0x900
bne 1f
addir3,r1,STACK_FRAME_OVERHEAD;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 736a6b56e7d6..b725509f9073 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -113,7 +113,7 @@ static inline notrace int decrementer_check_overflow(void)
 #ifdef CONFIG_PPC_BOOK3E
 
 /* This is called whenever we are re-enabling interrupts
- * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if
+ * and returns either 0 (nothing to do) or 500/900/280 if
  * there's an EE, DEC or DBELL to generate.
  *
  * This is called in two contexts: From arch_local_irq_restore()
-- 
2.23.0



[PATCH 3/6] powerpc/64e: remove PACA_IRQ_EE_EDGE

2020-09-15 Thread Nicholas Piggin
This is not used anywhere.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/hw_irq.h|  5 ++---
 arch/powerpc/kernel/exceptions-64e.S |  1 -
 arch/powerpc/kernel/irq.c| 23 ---
 3 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index 35060be09073..50dc35711db3 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -25,9 +25,8 @@
 #define PACA_IRQ_DBELL 0x02
 #define PACA_IRQ_EE0x04
 #define PACA_IRQ_DEC   0x08 /* Or FIT */
-#define PACA_IRQ_EE_EDGE   0x10 /* BookE only */
-#define PACA_IRQ_HMI   0x20
-#define PACA_IRQ_PMI   0x40
+#define PACA_IRQ_HMI   0x10
+#define PACA_IRQ_PMI   0x20
 
 /*
  * Some soft-masked interrupts must be hard masked until they are replayed
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index d9ed79415100..ca444ca82b8d 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -988,7 +988,6 @@ kernel_dbg_exc:
 .endm
 
 masked_interrupt_book3e_0x500:
-   // XXX When adding support for EPR, use PACA_IRQ_EE_EDGE
masked_interrupt_book3e PACA_IRQ_EE 1
 
 masked_interrupt_book3e_0x900:
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 3fdad9336885..736a6b56e7d6 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -181,16 +181,6 @@ notrace unsigned int __check_irq_replay(void)
return 0x500;
}
 
-   /*
-* Check if an EPR external interrupt happened this bit is typically
-* set if we need to handle another "edge" interrupt from within the
-* MPIC "EPR" handler.
-*/
-   if (happened & PACA_IRQ_EE_EDGE) {
-   local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE;
-   return 0x500;
-   }
-
if (happened & PACA_IRQ_DBELL) {
local_paca->irq_happened &= ~PACA_IRQ_DBELL;
return 0x280;
@@ -270,19 +260,6 @@ void replay_soft_interrupts(void)
hard_irq_disable();
}
 
-   /*
-* Check if an EPR external interrupt happened this bit is typically
-* set if we need to handle another "edge" interrupt from within the
-* MPIC "EPR" handler.
-*/
-   if (IS_ENABLED(CONFIG_PPC_BOOK3E) && (happened & PACA_IRQ_EE_EDGE)) {
-   local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE;
-   regs.trap = 0x500;
-   do_IRQ();
-   if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS))
-   hard_irq_disable();
-   }
-
if (IS_ENABLED(CONFIG_PPC_DOORBELL) && (happened & PACA_IRQ_DBELL)) {
local_paca->irq_happened &= ~PACA_IRQ_DBELL;
if (IS_ENABLED(CONFIG_PPC_BOOK3E))
-- 
2.23.0



[PATCH 1/6] powerpc/64: fix irq replay missing preempt

2020-09-15 Thread Nicholas Piggin
Prior to commit 3282a3da25bd ("powerpc/64: Implement soft interrupt
replay in C"), replayed interrupts returned by the regular interrupt
exit code, which performs preemption in case an interrupt had set
need_resched.

This logic was missed by the conversion. Adding preempt_disable/enable
around the interrupt replay and final irq enable will reschedule if
needed.

Fixes: 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in C")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/irq.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index bf21ebd36190..77019699606a 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -368,6 +368,12 @@ notrace void arch_local_irq_restore(unsigned long mask)
}
}
 
+   /*
+* Disable preempt here, so that the below preempt_enable will
+* perform resched if required (a replayed interrupt may set
+* need_resched).
+*/
+   preempt_disable();
irq_soft_mask_set(IRQS_ALL_DISABLED);
trace_hardirqs_off();
 
@@ -377,6 +383,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
trace_hardirqs_on();
irq_soft_mask_set(IRQS_ENABLED);
__hard_irq_enable();
+   preempt_enable();
 }
 EXPORT_SYMBOL(arch_local_irq_restore);
 
-- 
2.23.0



[PATCH 2/6] powerpc/64: fix irq replay pt_regs->softe value

2020-09-15 Thread Nicholas Piggin
Replayed interrupts get an "artificial" struct pt_regs constructed to
pass to interrupt handler functions. This did not get the softe field
set correctly, it's as though the interrupt has hit while irqs are
disabled. It should be IRQS_ENABLED.

This is possibly harmless, asynchronous handlers should not be testing
if irqs were disabled, but it might be possible for example some code
is shared with synchronous or NMI handlers, and it makes more sense if
debug output looks at this.

Fixes: 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in C")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 77019699606a..3fdad9336885 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -214,7 +214,7 @@ void replay_soft_interrupts(void)
struct pt_regs regs;
 
ppc_save_regs();
-   regs.softe = IRQS_ALL_DISABLED;
+   regs.softe = IRQS_ENABLED;
 
 again:
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
-- 
2.23.0



Re: [PATCH v2] powerpc/papr_scm: Fix warning triggered by perf_stats_show()

2020-09-15 Thread Michael Ellerman
Vaibhav Jain  writes:
> A warning is reported by the kernel in case perf_stats_show() returns
> an error code. The warning is of the form below:
>
>  papr_scm ibm,persistent-memory:ibm,pmemory@4411:
> Failed to query performance stats, Err:-10
>  dev_attr_show: perf_stats_show+0x0/0x1c0 [papr_scm] returned bad count
>  fill_read_buffer: dev_attr_show+0x0/0xb0 returned bad count
>
> On investigation it looks like that the compiler is silently truncating the
> return value of drc_pmem_query_stats() from 'long' to 'int', since the
> variable used to store the return code 'rc' is an 'int'. This
> truncated value is then returned back as a 'ssize_t' back from
> perf_stats_show() to 'dev_attr_show()' which thinks of it as a large
> unsigned number and triggers this warning..
>
> To fix this we update the type of variable 'rc' from 'int' to
> 'ssize_t' that prevents the compiler from truncating the return value
> of drc_pmem_query_stats() and returning correct signed value back from
> perf_stats_show().
>
> Fixes: 2d02bf835e573 ('powerpc/papr_scm: Fetch nvdimm performance
>stats from PHYP')

Please don't word wrap the Fixes tag it breaks b4.

I've fixed it up this time.

cheers


Re: [PATCH v2 1/4] mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race

2020-09-15 Thread Michael Ellerman
Nicholas Piggin  writes:
> Excerpts from pet...@infradead.org's message of September 14, 2020 8:56 pm:
>> On Mon, Sep 14, 2020 at 02:52:16PM +1000, Nicholas Piggin wrote:
>>> Reading and modifying current->mm and current->active_mm and switching
>>> mm should be done with irqs off, to prevent races seeing an intermediate
>>> state.
...
>>> 
>>> Signed-off-by: Nicholas Piggin 
>> 
>> Acked-by: Peter Zijlstra (Intel) 
>> 
>> I'm thinking we want this selected on x86 as well. Andy?
>
> Thanks for the ack. The plan was to take it through the powerpc tree,
> but if you'd want x86 to select it, maybe a topic branch? Although
> Michael will be away during the next merge window so I don't want to
> get too fancy. Would you mind doing it in a follow up merge after
> powerpc, being that it's (I think) a small change?

Or get akpm to take the series, including the x86 change.

cheers


Re: [5.9.0-rc5-20200914] Kernel crash while running LTP(mlock201)

2020-09-15 Thread Michael Ellerman
Sachin Sant  writes:
> While running LTP tests (specifically mlock201) against next-20200914 tree
> on a POWER9 LPAR results in following crash.

Looks the same as:

https://lore.kernel.org/linux-mm/20200914085545.GB28738@shao2-debian/

cheers

> BUG: Kernel NULL pointer dereference on read at 0x
> Faulting instruction address: 0xc0454248
> Oops: Kernel access of bad area, sig: 11 [#1]
> LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> Modules linked in: af_packet(E) nft_ct(E) nf_conntrack(E) nf_defrag_ipv6(E) 
> nf_defrag_ipv4(E) libcrc32c(E) ip6_tables(E) nft_compat(E) ip_set(E) 
> rfkill(E) nf_tables(E) nfnetlink(E) vmx_crypto(E) uio_pdrv_genirq(E) 
> gf128mul(E) uio(E) rtc_generic(E) crct10dif_vpmsum(E) sch_fq_codel(E) 
> ip_tables(E) x_tables(E) ext4(E) crc16(E) mbcache(E) jbd2(E) sd_mod(E) 
> t10_pi(E) sg(E) ibmvscsi(E) scsi_transport_srp(E) scsi_mod(E) ibmveth(E) 
> crc32c_vpmsum(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E) autofs4(E)
> CPU: 11 PID: 26435 Comm: mlock201 Tainted: GE 
> 5.9.0-rc5-next-20200914-281.gf529200-default #1
> NIP:  c0454248 LR: c0445a74 CTR: c0413150
> REGS: c008e645b770 TRAP: 0300   Tainted: GE  
> (5.9.0-rc5-next-20200914-281.gf529200-default)
> MSR:  80009033   CR: 28002482  XER: 2004
> CFAR: c000fbb0 DAR:  DSISR: 4000 IRQMASK: 0 
> GPR00: c0445a74 c008e645ba00 c17c4500  
> GPR04: 0001 c008ea109e98 c008f0c4  
> GPR08:    0003 
> GPR12: c0413150 c0001ec70200  c1502038 
> GPR16: 7fff9c61 7fff9c61 7fff9c61 c0cb02f8 
> GPR20: 7fff9c5c 7fff9c62 c008e645bcd8 c008f0c4 
> GPR24: c00c023c0d00 fe7f  c008f0c4 
> GPR28: c008ea109e98 0001 c008ea9288a8  
> NIP [c0454248] PageHuge+0x8/0x60
> LR [c0445a74] find_get_incore_page+0x114/0x160
> Call Trace:
> [c008e645ba00] [c0445994] find_get_incore_page+0x34/0x160 
> (unreliable)
> [c008e645ba40] [c0412e54] mincore_page+0x24/0x160
> [c008e645ba70] [c0413020] __mincore_unmapped_range+0x90/0x160
> [c008e645bac0] [c0413680] mincore_pte_range+0x530/0x5d0
> [c008e645bb40] [c0422a38] walk_pgd_range+0x4e8/0xae0
> [c008e645bc30] [c04230c4] __walk_page_range+0x94/0x250
> [c008e645bcb0] [c04233d8] walk_page_range+0x158/0x1e0
> [c008e645bd40] [c041386c] sys_mincore+0x14c/0x370
> [c008e645bdc0] [c0033eb8] system_call_exception+0xf8/0x200
> [c008e645be20] [c000d140] system_call_common+0xf0/0x27c
> Instruction dump:
> e8410018 38210020 e8010010 7c0803a6 4e800020 6000 3d41 7d435378 
> 4e800020 6000 7c0802a6 6000  75290001 40820010 e9230008 
> ---[ end trace 357eb14a3b22eab2 ]—
>
>
> The function find_get_incore_page() was introduced with 
> 3fcbe4eb49a0406e6202e8c8c3560f30965a8e79 
>
> mm: factor find_get_incore_page out of mincore_page
>
>
> Thanks
> -Sachin


Re: [PATCH v1] soc: fsl: rcpm: Add ACPI support

2020-09-15 Thread Ard Biesheuvel

On 9/15/20 1:06 PM, kuldip dwivedi wrote:

Add ACPI support in fsl RCPM driver. This is required
to support ACPI S3 state. S3 is the ACPI sleep state
that is known as "sleep" or "suspend to RAM".
It essentially turns off most power of the system but
keeps memory powered.

Signed-off-by: tanveer 
Signed-off-by: kuldip dwivedi 


Why does the OS need to program this device? Can't this be done by firmware?


---

Notes:
 1. Add ACPI match table
 2. NXP team members are added for confirming HID changes
 3. There is only one node in ACPI so no need to check for
current device explicitly
 4. These changes are tested on LX2160A and LS1046A platforms

  drivers/soc/fsl/rcpm.c | 22 +++---
  1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c
index a093dbe6d2cb..e75a436fb159 100644
--- a/drivers/soc/fsl/rcpm.c
+++ b/drivers/soc/fsl/rcpm.c
@@ -2,10 +2,12 @@
  //
  // rcpm.c - Freescale QorIQ RCPM driver
  //
-// Copyright 2019 NXP
+// Copyright 2019-2020 NXP
+// Copyright 2020 Puresoftware Ltd.
  //
  // Author: Ran Wang 
  
+#include 

  #include 
  #include 
  #include 
@@ -57,8 +59,13 @@ static int rcpm_pm_prepare(struct device *dev)
rcpm->wakeup_cells + 1);
  
  		/*  Wakeup source should refer to current rcpm device */

-   if (ret || (np->phandle != value[0]))
-   continue;
+   if (is_acpi_node(dev->fwnode)) {
+   if (ret)
+   continue;
+   } else {
+   if (ret || (np->phandle != value[0]))
+   continue;
+   }
  
  		/* Property "#fsl,rcpm-wakeup-cells" of rcpm node defines the

 * number of IPPDEXPCR register cells, and "fsl,rcpm-wakeup"
@@ -139,10 +146,19 @@ static const struct of_device_id rcpm_of_match[] = {
  };
  MODULE_DEVICE_TABLE(of, rcpm_of_match);
  
+#ifdef CONFIG_ACPI

+static const struct acpi_device_id rcpm_acpi_match[] = {
+   { "NXP0015", },
+   { }
+};
+MODULE_DEVICE_TABLE(acpi, rcpm_acpi_match);
+#endif
+
  static struct platform_driver rcpm_driver = {
.driver = {
.name = "rcpm",
.of_match_table = rcpm_of_match,
+   .acpi_match_table = ACPI_PTR(rcpm_acpi_match),
.pm = _pm_ops,
},
.probe = rcpm_probe,





[PATCH v1] soc: fsl: rcpm: Add ACPI support

2020-09-15 Thread kuldip dwivedi
Add ACPI support in fsl RCPM driver. This is required
to support ACPI S3 state. S3 is the ACPI sleep state
that is known as "sleep" or "suspend to RAM".
It essentially turns off most power of the system but
keeps memory powered.

Signed-off-by: tanveer 
Signed-off-by: kuldip dwivedi 
---

Notes:
1. Add ACPI match table
2. NXP team members are added for confirming HID changes
3. There is only one node in ACPI so no need to check for
   current device explicitly
4. These changes are tested on LX2160A and LS1046A platforms

 drivers/soc/fsl/rcpm.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c
index a093dbe6d2cb..e75a436fb159 100644
--- a/drivers/soc/fsl/rcpm.c
+++ b/drivers/soc/fsl/rcpm.c
@@ -2,10 +2,12 @@
 //
 // rcpm.c - Freescale QorIQ RCPM driver
 //
-// Copyright 2019 NXP
+// Copyright 2019-2020 NXP
+// Copyright 2020 Puresoftware Ltd.
 //
 // Author: Ran Wang 
 
+#include 
 #include 
 #include 
 #include 
@@ -57,8 +59,13 @@ static int rcpm_pm_prepare(struct device *dev)
rcpm->wakeup_cells + 1);
 
/*  Wakeup source should refer to current rcpm device */
-   if (ret || (np->phandle != value[0]))
-   continue;
+   if (is_acpi_node(dev->fwnode)) {
+   if (ret)
+   continue;
+   } else {
+   if (ret || (np->phandle != value[0]))
+   continue;
+   }
 
/* Property "#fsl,rcpm-wakeup-cells" of rcpm node defines the
 * number of IPPDEXPCR register cells, and "fsl,rcpm-wakeup"
@@ -139,10 +146,19 @@ static const struct of_device_id rcpm_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, rcpm_of_match);
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id rcpm_acpi_match[] = {
+   { "NXP0015", },
+   { }
+};
+MODULE_DEVICE_TABLE(acpi, rcpm_acpi_match);
+#endif
+
 static struct platform_driver rcpm_driver = {
.driver = {
.name = "rcpm",
.of_match_table = rcpm_of_match,
+   .acpi_match_table = ACPI_PTR(rcpm_acpi_match),
.pm = _pm_ops,
},
.probe = rcpm_probe,
-- 
2.17.1



Re: [PATCH 00/15] selftests/seccomp: Refactor change_syscall()

2020-09-15 Thread Max Filippov
On Mon, Sep 14, 2020 at 1:32 PM Kees Cook  wrote:
> On Mon, Sep 14, 2020 at 10:15:18PM +1000, Michael Ellerman wrote:
> > Kees Cook  writes:
> However...
>
> >
> > cheers
> >
> >
> > ./seccomp_bpf
> > TAP version 13
> > 1..86
> > # Starting 86 tests from 7 test cases.
> > #  RUN   global.kcmp ...
> > #OK  global.kcmp
> > ok 1 global.kcmp
> > [...]
> > #  RUN   global.KILL_thread ...
> > TAP version 13
> > 1..86
> > # Starting 86 tests from 7 test cases.
>
> Was this a mis-paste, or has something very very bad happened here in
> global.KILL_one_arg_six finishes?

I observe similar output corruption on xtensa when I redirect test output
into a file or pipe it to 'cat'. When it goes to the terminal it looks normal.

-- 
Thanks.
-- Max


Re: [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-15 Thread Miquel Raynal
Hi Joe,

For MTD:

>  drivers/mtd/nand/raw/nandsim.c|  2 +-

Reviewed-by: Miquel Raynal 


Thanks,
Miquèl


Re: [Intel-gfx] [trivial PATCH] treewide: Convert switch/case fallthrough; to break;

2020-09-15 Thread Jani Nikula
On Wed, 09 Sep 2020, Joe Perches  wrote:
> diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c 
> b/drivers/gpu/drm/i915/display/intel_sprite.c
> index 5ac0dbf0e03d..35ac539cc2b1 100644
> --- a/drivers/gpu/drm/i915/display/intel_sprite.c
> +++ b/drivers/gpu/drm/i915/display/intel_sprite.c
> @@ -2861,7 +2861,7 @@ static bool gen12_plane_format_mod_supported(struct 
> drm_plane *_plane,
>   case I915_FORMAT_MOD_Y_TILED_GEN12_MC_CCS:
>   if (!gen12_plane_supports_mc_ccs(dev_priv, plane->id))
>   return false;
> - fallthrough;
> + break;
>   case DRM_FORMAT_MOD_LINEAR:
>   case I915_FORMAT_MOD_X_TILED:
>   case I915_FORMAT_MOD_Y_TILED:

Acked-by: Jani Nikula 

for merging via whichever tree seems best.

BR,
Jani.


-- 
Jani Nikula, Intel Open Source Graphics Center


Re: [PATCH 00/15] selftests/seccomp: Refactor change_syscall()

2020-09-15 Thread Max Filippov
Hello,

On Sat, Sep 12, 2020 at 4:08 AM Kees Cook  wrote:
> This refactors the seccomp selftest macros used in change_syscall(),
> in an effort to remove special cases for mips, arm, arm64, and xtensa,
> which paves the way for powerpc fixes.
>
> I'm not entirely done testing, but all-arch build tests and x86_64
> selftests pass. I'll be doing arm, arm64, and i386 selftests shortly,
> but I currently don't have an easy way to check xtensa, mips, nor
> powerpc. Any help there would be appreciated!

I've built and tested this series on xtensa. I had to disable two tests:
user_notification_addfd and user_notification_addfd_rlimit because
they use memfd_create and prlimit which are not available in uClibc.
With this change I've got all 86 tests passing with the following log:

./seccomp_bpf
TAP version 13
1..86
# Starting 86 tests from 7 test cases.
#  RUN   TRAP.dfl ...
#OK  TRAP.dfl
ok 1 TRAP.dfl
#  RUN   TRAP.ign ...
#OK  TRAP.ign
ok 2 TRAP.ign
#  RUN   TRAP.handler ...
#OK  TRAP.handler
ok 3 TRAP.handler
#  RUN   precedence.allow_ok ...
#OK  precedence.allow_ok
ok 4 precedence.allow_ok
#  RUN   precedence.kill_is_highest ...
#OK  precedence.kill_is_highest
ok 5 precedence.kill_is_highest
#  RUN   precedence.kill_is_highest_in_any_order ...
#OK  precedence.kill_is_highest_in_any_order
ok 6 precedence.kill_is_highest_in_any_order
#  RUN   precedence.trap_is_second ...
#OK  precedence.trap_is_second
ok 7 precedence.trap_is_second
#  RUN   precedence.trap_is_second_in_any_order ...
#OK  precedence.trap_is_second_in_any_order
ok 8 precedence.trap_is_second_in_any_order
#  RUN   precedence.errno_is_third ...
#OK  precedence.errno_is_third
ok 9 precedence.errno_is_third
#  RUN   precedence.errno_is_third_in_any_order ...
#OK  precedence.errno_is_third_in_any_order
ok 10 precedence.errno_is_third_in_any_order
#  RUN   precedence.trace_is_fourth ...
#OK  precedence.trace_is_fourth
ok 11 precedence.trace_is_fourth
#  RUN   precedence.trace_is_fourth_in_any_order ...
#OK  precedence.trace_is_fourth_in_any_order
ok 12 precedence.trace_is_fourth_in_any_order
#  RUN   precedence.log_is_fifth ...
#OK  precedence.log_is_fifth
ok 13 precedence.log_is_fifth
#  RUN   precedence.log_is_fifth_in_any_order ...
#OK  precedence.log_is_fifth_in_any_order
ok 14 precedence.log_is_fifth_in_any_order
#  RUN   TRACE_poke.read_has_side_effects ...
#OK  TRACE_poke.read_has_side_effects
ok 15 TRACE_poke.read_has_side_effects
#  RUN   TRACE_poke.getpid_runs_normally ...
#OK  TRACE_poke.getpid_runs_normally
ok 16 TRACE_poke.getpid_runs_normally
#  RUN   TRACE_syscall.ptrace.negative_ENOSYS ...
#OK  TRACE_syscall.ptrace.negative_ENOSYS
ok 17 TRACE_syscall.ptrace.negative_ENOSYS
#  RUN   TRACE_syscall.ptrace.syscall_allowed ...
#OK  TRACE_syscall.ptrace.syscall_allowed
ok 18 TRACE_syscall.ptrace.syscall_allowed
#  RUN   TRACE_syscall.ptrace.syscall_redirected ...
#OK  TRACE_syscall.ptrace.syscall_redirected
ok 19 TRACE_syscall.ptrace.syscall_redirected
#  RUN   TRACE_syscall.ptrace.syscall_errno ...
#OK  TRACE_syscall.ptrace.syscall_errno
ok 20 TRACE_syscall.ptrace.syscall_errno
#  RUN   TRACE_syscall.ptrace.syscall_faked ...
#OK  TRACE_syscall.ptrace.syscall_faked
ok 21 TRACE_syscall.ptrace.syscall_faked
#  RUN   TRACE_syscall.ptrace.skip_after ...
#OK  TRACE_syscall.ptrace.skip_after
ok 22 TRACE_syscall.ptrace.skip_after
#  RUN   TRACE_syscall.ptrace.kill_after ...
#OK  TRACE_syscall.ptrace.kill_after
ok 23 TRACE_syscall.ptrace.kill_after
#  RUN   TRACE_syscall.seccomp.negative_ENOSYS ...
#OK  TRACE_syscall.seccomp.negative_ENOSYS
ok 24 TRACE_syscall.seccomp.negative_ENOSYS
#  RUN   TRACE_syscall.seccomp.syscall_allowed ...
#OK  TRACE_syscall.seccomp.syscall_allowed
ok 25 TRACE_syscall.seccomp.syscall_allowed
#  RUN   TRACE_syscall.seccomp.syscall_redirected ...
#OK  TRACE_syscall.seccomp.syscall_redirected
ok 26 TRACE_syscall.seccomp.syscall_redirected
#  RUN   TRACE_syscall.seccomp.syscall_errno ...
#OK  TRACE_syscall.seccomp.syscall_errno
ok 27 TRACE_syscall.seccomp.syscall_errno
#  RUN   TRACE_syscall.seccomp.syscall_faked ...
#OK  TRACE_syscall.seccomp.syscall_faked
ok 28 TRACE_syscall.seccomp.syscall_faked
#  RUN   TRACE_syscall.seccomp.skip_after ...
#OK  TRACE_syscall.seccomp.skip_after
ok 29 TRACE_syscall.seccomp.skip_after
#  RUN   TRACE_syscall.seccomp.kill_after ...
#OK  TRACE_syscall.seccomp.kill_after
ok 30 

Injecting SLB multihit crashes kernel 5.9.0-rc5

2020-09-15 Thread Michal Suchánek
Hello,

Using the SLB multihit injection test module (which I did not write so I
do not want to post it here) to verify updates on my 5.3 frankenkernel
I found that the kernel crashes with Oops: kernel bad access.

I tested on the latest upstream kernel build that I have at hand and the
result is the same (minus the message - nothing was logged and the kernel
simply rebooted).

Since the whole effort to write a real mode MCE handler was supposed to
prevent this maybe the SLB injection module should be added to the
kernel selftests?

Thanks

Michal


[powerpc:next-test] BUILD SUCCESS d7d40595a2568d199396c863460cecd5ae676c34

2020-09-15 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
next-test
branch HEAD: d7d40595a2568d199396c863460cecd5ae676c34  Merge coregroup support 
into next

elapsed time: 1137m

configs tested: 175
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm defconfig
arm64allyesconfig
arm64   defconfig
arm  allyesconfig
arm  allmodconfig
m68k  hp300_defconfig
sh  sdk7786_defconfig
powerpc  g5_defconfig
arm   tegra_defconfig
openrisc simple_smp_defconfig
powerpcicon_defconfig
mips  cavium_octeon_defconfig
arc  axs103_smp_defconfig
m68km5407c3_defconfig
powerpc  ppc6xx_defconfig
arm  pxa255-idp_defconfig
arc  axs103_defconfig
m68k amcore_defconfig
mipsmaltaup_xpa_defconfig
mips   rbtx49xx_defconfig
powerpc   currituck_defconfig
riscv  rv32_defconfig
armclps711x_defconfig
arm  iop32x_defconfig
powerpc wii_defconfig
arm lpc32xx_defconfig
sh  r7780mp_defconfig
powerpcmvme5100_defconfig
um i386_defconfig
archsdk_defconfig
powerpcgamecube_defconfig
powerpc mpc836x_mds_defconfig
powerpc xes_mpc85xx_defconfig
x86_64  defconfig
powerpc   eiger_defconfig
mipsnlm_xlr_defconfig
arm shannon_defconfig
powerpc  pcm030_defconfig
pariscgeneric-64bit_defconfig
shdreamcast_defconfig
armmps2_defconfig
powerpc mpc8540_ads_defconfig
mips pnx8335_stb225_defconfig
powerpc linkstation_defconfig
powerpc  storcenter_defconfig
alpha   defconfig
m68k   sun3_defconfig
armvt8500_v6_v7_defconfig
powerpc  mgcoge_defconfig
ia64defconfig
mipsbcm47xx_defconfig
arcnsimosci_defconfig
arm   efm32_defconfig
sh  polaris_defconfig
arm   stm32_defconfig
arm   spear13xx_defconfig
powerpc mpc8315_rdb_defconfig
powerpc  obs600_defconfig
riscvalldefconfig
armmulti_v5_defconfig
powerpc tqm8541_defconfig
arm hackkit_defconfig
nds32 allnoconfig
arm ezx_defconfig
m68kmvme16x_defconfig
arm   omap1_defconfig
arm   multi_v4t_defconfig
powerpc  mpc866_ads_defconfig
m68k alldefconfig
mips   jazz_defconfig
arm  zx_defconfig
mips  bmips_stb_defconfig
arm   aspeed_g5_defconfig
sparc   sparc64_defconfig
powerpc tqm8560_defconfig
riscvallmodconfig
armspear3xx_defconfig
m68km5307c3_defconfig
c6x  alldefconfig
armtrizeps4_defconfig
armmulti_v7_defconfig
powerpc  tqm8xx_defconfig
sh  defconfig
m68k  sun3x_defconfig
powerpc   ppc64_defconfig
powerpc stx_gp3_defconfig
arm  gemini_defconfig
mips  maltasmvp_eva_defconfig
xtensa  iss_defconfig
xtensa   alldefconfig
ia64 allyesconfig
ia64 allmodconfig
m68kdefconfig
m68k allmodconfig
m68k allyesconfig
nios2   defconfig
c6x  allyesconfig
arc  allyesconfig
nds32   defconfig
cskydefconfig
alpha

[powerpc:merge] BUILD SUCCESS 27e2fbcd815a088d7d83c7158f76b6e95ab07c50

2020-09-15 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
merge
branch HEAD: 27e2fbcd815a088d7d83c7158f76b6e95ab07c50  Automatic merge of 
'master', 'next' and 'fixes' (2020-09-14 23:28)

elapsed time: 1138m

configs tested: 154
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm defconfig
arm64allyesconfig
arm64   defconfig
arm  allyesconfig
arm  allmodconfig
powerpc  ppc40x_defconfig
mipsvocore2_defconfig
m68k amcore_defconfig
mipsmaltaup_xpa_defconfig
mips   rbtx49xx_defconfig
powerpc   currituck_defconfig
riscv  rv32_defconfig
armclps711x_defconfig
arm  iop32x_defconfig
powerpc wii_defconfig
arm lpc32xx_defconfig
powerpcicon_defconfig
powerpc redwood_defconfig
arm s3c6400_defconfig
sh   j2_defconfig
powerpc sbc8548_defconfig
powerpc  ppc6xx_defconfig
sh  rsk7201_defconfig
arm shannon_defconfig
powerpc  pcm030_defconfig
pariscgeneric-64bit_defconfig
shdreamcast_defconfig
armmps2_defconfig
arm  pxa3xx_defconfig
sh  defconfig
m68km5307c3_defconfig
powerpc  mgcoge_defconfig
sparc   defconfig
arm nhk8815_defconfig
armvt8500_v6_v7_defconfig
ia64defconfig
mipsbcm47xx_defconfig
arcnsimosci_defconfig
m68k  hp300_defconfig
powerpc  obs600_defconfig
riscvalldefconfig
armmulti_v5_defconfig
powerpc tqm8541_defconfig
arm hackkit_defconfig
arm ezx_defconfig
nds32 allnoconfig
m68kmvme16x_defconfig
arm   omap1_defconfig
arm   multi_v4t_defconfig
powerpc  mpc866_ads_defconfig
m68k alldefconfig
mips   jazz_defconfig
arm  zx_defconfig
mips  bmips_stb_defconfig
sh  r7780mp_defconfig
arm   aspeed_g5_defconfig
sparc   sparc64_defconfig
powerpc tqm8560_defconfig
riscvallmodconfig
armspear3xx_defconfig
powerpc mpc8540_ads_defconfig
powerpcgamecube_defconfig
c6x  alldefconfig
armtrizeps4_defconfig
armmulti_v7_defconfig
powerpc  mpc885_ads_defconfig
arm   milbeaut_m10v_defconfig
microblaze  defconfig
ia64 allmodconfig
ia64 allyesconfig
m68kdefconfig
m68k allmodconfig
m68k allyesconfig
nios2   defconfig
c6x  allyesconfig
arc  allyesconfig
nds32   defconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
nios2allyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
s390 allyesconfig
parisc   allyesconfig
s390defconfig
i386 allyesconfig
sparcallyesconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a004-20200914
x86_64   randconfig-a006-20200914
x86_64   randconfig-a003-20200914
x86_64 

Re: [PATCH kernel] powerpc/dma: Fix dma_map_ops::get_required_mask

2020-09-15 Thread Christoph Hellwig
On Wed, Sep 09, 2020 at 07:36:04PM +1000, Alexey Kardashevskiy wrote:
> I want dma_get_required_mask() to return the bigger mask always.
> 
> Now it depends on (in dma_alloc_direct()):
> 1. dev->dma_ops_bypass: set via pci_set_(coherent_)dma_mask();
> 2. dev->coherent_dma_mask - the same;
> 3. dev->bus_dma_limit - usually not set at all.
> 
> So until we set the mask, dma_get_required_mask() returns smaller mask.
> So aacraid and likes (which calls dma_get_required_mask() before setting
> it) will remain prone for breaks.

Well, the original intent of dma_get_required_mask is to return the
mask that the driver then uses to figure out what to set, so what aacraid
does fits that use case.  Of course that idea is pretty bogus for
PCIe devices.

I suspect the right fix is to just not query dma_get_required_mask for
PCIe devices in aacraid (and other drivers that do something similar).


[5.9.0-rc5-20200914] Kernel crash while running LTP(mlock201)

2020-09-15 Thread Sachin Sant
While running LTP tests (specifically mlock201) against next-20200914 tree
on a POWER9 LPAR results in following crash.

BUG: Kernel NULL pointer dereference on read at 0x
Faulting instruction address: 0xc0454248
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: af_packet(E) nft_ct(E) nf_conntrack(E) nf_defrag_ipv6(E) 
nf_defrag_ipv4(E) libcrc32c(E) ip6_tables(E) nft_compat(E) ip_set(E) rfkill(E) 
nf_tables(E) nfnetlink(E) vmx_crypto(E) uio_pdrv_genirq(E) gf128mul(E) uio(E) 
rtc_generic(E) crct10dif_vpmsum(E) sch_fq_codel(E) ip_tables(E) x_tables(E) 
ext4(E) crc16(E) mbcache(E) jbd2(E) sd_mod(E) t10_pi(E) sg(E) ibmvscsi(E) 
scsi_transport_srp(E) scsi_mod(E) ibmveth(E) crc32c_vpmsum(E) dm_mirror(E) 
dm_region_hash(E) dm_log(E) dm_mod(E) autofs4(E)
CPU: 11 PID: 26435 Comm: mlock201 Tainted: GE 
5.9.0-rc5-next-20200914-281.gf529200-default #1
NIP:  c0454248 LR: c0445a74 CTR: c0413150
REGS: c008e645b770 TRAP: 0300   Tainted: GE  
(5.9.0-rc5-next-20200914-281.gf529200-default)
MSR:  80009033   CR: 28002482  XER: 2004
CFAR: c000fbb0 DAR:  DSISR: 4000 IRQMASK: 0 
GPR00: c0445a74 c008e645ba00 c17c4500  
GPR04: 0001 c008ea109e98 c008f0c4  
GPR08:    0003 
GPR12: c0413150 c0001ec70200  c1502038 
GPR16: 7fff9c61 7fff9c61 7fff9c61 c0cb02f8 
GPR20: 7fff9c5c 7fff9c62 c008e645bcd8 c008f0c4 
GPR24: c00c023c0d00 fe7f  c008f0c4 
GPR28: c008ea109e98 0001 c008ea9288a8  
NIP [c0454248] PageHuge+0x8/0x60
LR [c0445a74] find_get_incore_page+0x114/0x160
Call Trace:
[c008e645ba00] [c0445994] find_get_incore_page+0x34/0x160 
(unreliable)
[c008e645ba40] [c0412e54] mincore_page+0x24/0x160
[c008e645ba70] [c0413020] __mincore_unmapped_range+0x90/0x160
[c008e645bac0] [c0413680] mincore_pte_range+0x530/0x5d0
[c008e645bb40] [c0422a38] walk_pgd_range+0x4e8/0xae0
[c008e645bc30] [c04230c4] __walk_page_range+0x94/0x250
[c008e645bcb0] [c04233d8] walk_page_range+0x158/0x1e0
[c008e645bd40] [c041386c] sys_mincore+0x14c/0x370
[c008e645bdc0] [c0033eb8] system_call_exception+0xf8/0x200
[c008e645be20] [c000d140] system_call_common+0xf0/0x27c
Instruction dump:
e8410018 38210020 e8010010 7c0803a6 4e800020 6000 3d41 7d435378 
4e800020 6000 7c0802a6 6000  75290001 40820010 e9230008 
---[ end trace 357eb14a3b22eab2 ]---


The function find_get_incore_page() was introduced with 
3fcbe4eb49a0406e6202e8c8c3560f30965a8e79 

mm: factor find_get_incore_page out of mincore_page


Thanks
-Sachin