Re: Fwd: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Doru Iorgulescu
Awesome, thanks!

On Wed, May 24, 2023, 6:03 PM Michael Ellerman  wrote:

> Bagas Sanjaya  writes:
> > On 5/24/23 17:58, Bagas Sanjaya wrote:
> >> Anyway, I'm adding it to regzbot:
> >>
> >> #regzbot introduced: 23baf831a32c04f
> https://bugzilla.kernel.org/show_bug.cgi?id=217477
> >> #regzbot title: Allocator MAX_ORDER exceeds SECTION_SIZE caused by
> MAX_ORDER redefinition
> >>
> >
> > From bugzilla [1], the reporter had successfully tried the proposed
> > kernel config fix, so:
> >
> > #regzbot resolve: reducing CONFIG_ARCH_FORCE_MAX_ORDER to 8 resolves the
> regression
>
> Should be fixed properly by:
>
>
> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20230519113806.370635-1-...@ellerman.id.au/
>
> Which is in powerpc-fixes as 358e526a1648.
>
> cheers
>


Re: Fwd: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Doru Iorgulescu
Glad to hear it!
Thank you

On Wed, May 24, 2023, 3:58 PM Bagas Sanjaya  wrote:

> On 5/24/23 17:58, Bagas Sanjaya wrote:
> > Anyway, I'm adding it to regzbot:
> >
> > #regzbot introduced: 23baf831a32c04f
> https://bugzilla.kernel.org/show_bug.cgi?id=217477
> > #regzbot title: Allocator MAX_ORDER exceeds SECTION_SIZE caused by
> MAX_ORDER redefinition
> >
>
> From bugzilla [1], the reporter had successfully tried the proposed
> kernel config fix, so:
>
> #regzbot resolve: reducing CONFIG_ARCH_FORCE_MAX_ORDER to 8 resolves the
> regression
>
> Thanks for all who participates in this regression report!
>
> [1]: https://bugzilla.kernel.org/show_bug.cgi?id=217477#c8
>
> --
> An old man doll... just what I always wanted! - Clara
>
>


Re: Fwd: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Bagas Sanjaya
On Thu, May 25, 2023 at 01:03:22AM +1000, Michael Ellerman wrote:
> Should be fixed properly by:
> 
> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20230519113806.370635-1-...@ellerman.id.au/
> 
> Which is in powerpc-fixes as 358e526a1648.

Telling regzbot:

#regzbot fix: 358e526a1648cd

Thanks.

-- 
An old man doll... just what I always wanted! - Clara




Re: [PATCH v9 3/4] of: kexec: Refactor IMA buffer related functions to make them reusable

2023-05-24 Thread Stefan Berger




On 5/24/23 19:16, Jerry Snitselaar wrote:

On Tue, Apr 18, 2023 at 09:44:08AM -0400, Stefan Berger wrote:

Refactor IMA buffer related functions to make them reusable for carrying
TPM logs across kexec.

Signed-off-by: Stefan Berger 
Cc: Rob Herring 
Cc: Frank Rowand 
Cc: Mimi Zohar 
Reviewed-by: Mimi Zohar 
Reviewed-by: Rob Herring 
Tested-by: Nageswara R Sastry 
Tested-by: Coiby Xu 



Reviewed-by: Jerry Snitselaar 


Thanks a lot for looking at the patches. Unfortunately this series only
resolves the issue with the newer kexec call, but we have seen systems where
the older one is used for setting a crash kernel, and when the crash happens
we're back to square one. I have been trying to embed the log into the memory
of the flattened of-device tree, but that also turns out to be not so
straightforward.

Stefan

---
v6:
  - Add __init to get_kexec_buffer as suggested by Jonathan

v5:
  - Rebased on Jonathan McDowell's commit "b69a2afd5afc x86/kexec: Carry
forward IMA measurement log on kexec"
v4:
  - Move debug output into setup_buffer()
---
  drivers/of/kexec.c | 126 ++---
  1 file changed, 74 insertions(+), 52 deletions(-)

diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index 1373d7e0a9b3..fa8c0c75adf9 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -117,45 +117,57 @@ static int do_get_kexec_buffer(const void *prop, int len, 
unsigned long *addr,
  }
  
  #ifdef CONFIG_HAVE_IMA_KEXEC

-/**
- * ima_get_kexec_buffer - get IMA buffer from the previous kernel
- * @addr:  On successful return, set to point to the buffer contents.
- * @size:  On successful return, set to the buffer size.
- *
- * Return: 0 on success, negative errno on error.
- */
-int __init ima_get_kexec_buffer(void **addr, size_t *size)
+static int __init get_kexec_buffer(const char *name, unsigned long *addr,
+  size_t *size)
  {
int ret, len;
-   unsigned long tmp_addr;
unsigned long start_pfn, end_pfn;
-   size_t tmp_size;
const void *prop;
  
-	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);

+   prop = of_get_property(of_chosen, name, &len);
if (!prop)
return -ENOENT;
  
-	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);

+   ret = do_get_kexec_buffer(prop, len, addr, size);
if (ret)
return ret;
  
-	/* Do some sanity on the returned size for the ima-kexec buffer */

-   if (!tmp_size)
+   /* Do some sanity on the returned size for the kexec buffer */
+   if (!*size)
return -ENOENT;
  
  	/*

 * Calculate the PFNs for the buffer and ensure
 * they are with in addressable memory.
 */
-   start_pfn = PHYS_PFN(tmp_addr);
-   end_pfn = PHYS_PFN(tmp_addr + tmp_size - 1);
+   start_pfn = PHYS_PFN(*addr);
+   end_pfn = PHYS_PFN(*addr + *size - 1);
if (!page_is_ram(start_pfn) || !page_is_ram(end_pfn)) {
-   pr_warn("IMA buffer at 0x%lx, size = 0x%zx beyond memory\n",
-   tmp_addr, tmp_size);
+   pr_warn("%s buffer at 0x%lx, size = 0x%zx beyond memory\n",
+   name, *addr, *size);
return -EINVAL;
}
  
+	return 0;

+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr:  On successful return, set to point to the buffer contents.
+ * @size:  On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __init ima_get_kexec_buffer(void **addr, size_t *size)
+{
+   int ret;
+   unsigned long tmp_addr;
+   size_t tmp_size;
+
+   ret = get_kexec_buffer("linux,ima-kexec-buffer", &tmp_addr, &tmp_size);
+   if (ret)
+   return ret;
+
*addr = __va(tmp_addr);
*size = tmp_size;
  
@@ -188,72 +200,82 @@ int __init ima_free_kexec_buffer(void)

  }
  #endif
  
-/**

- * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
- *
- * @fdt: Flattened Device Tree to update
- * @chosen_node: Offset to the chosen node in the device tree
- *
- * The IMA measurement buffer is of no use to a subsequent kernel, so we always
- * remove it from the device tree.
- */
-static void remove_ima_buffer(void *fdt, int chosen_node)
+static int remove_buffer(void *fdt, int chosen_node, const char *name)
  {
int ret, len;
unsigned long addr;
size_t size;
const void *prop;
  
-	if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))

-   return;
-
-   prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
+   prop = fdt_getprop(fdt, chosen_node, name, &len);
if (!prop)
-   return;
+   return -ENOENT;
  
  	ret = do_get_kexec_buffer(prop, len, &addr, &size);

-   fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
+   fdt_delprop(fdt, chosen_node, name);
if (ret)
-   

Re: [PATCH v9 3/4] of: kexec: Refactor IMA buffer related functions to make them reusable

2023-05-24 Thread Jerry Snitselaar
On Tue, Apr 18, 2023 at 09:44:08AM -0400, Stefan Berger wrote:
> Refactor IMA buffer related functions to make them reusable for carrying
> TPM logs across kexec.
> 
> Signed-off-by: Stefan Berger 
> Cc: Rob Herring 
> Cc: Frank Rowand 
> Cc: Mimi Zohar 
> Reviewed-by: Mimi Zohar 
> Reviewed-by: Rob Herring 
> Tested-by: Nageswara R Sastry 
> Tested-by: Coiby Xu 
> 

Reviewed-by: Jerry Snitselaar 

> ---
> v6:
>  - Add __init to get_kexec_buffer as suggested by Jonathan
> 
> v5:
>  - Rebased on Jonathan McDowell's commit "b69a2afd5afc x86/kexec: Carry
>forward IMA measurement log on kexec"
> v4:
>  - Move debug output into setup_buffer()
> ---
>  drivers/of/kexec.c | 126 ++---
>  1 file changed, 74 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
> index 1373d7e0a9b3..fa8c0c75adf9 100644
> --- a/drivers/of/kexec.c
> +++ b/drivers/of/kexec.c
> @@ -117,45 +117,57 @@ static int do_get_kexec_buffer(const void *prop, int 
> len, unsigned long *addr,
>  }
>  
>  #ifdef CONFIG_HAVE_IMA_KEXEC
> -/**
> - * ima_get_kexec_buffer - get IMA buffer from the previous kernel
> - * @addr:On successful return, set to point to the buffer contents.
> - * @size:On successful return, set to the buffer size.
> - *
> - * Return: 0 on success, negative errno on error.
> - */
> -int __init ima_get_kexec_buffer(void **addr, size_t *size)
> +static int __init get_kexec_buffer(const char *name, unsigned long *addr,
> +size_t *size)
>  {
>   int ret, len;
> - unsigned long tmp_addr;
>   unsigned long start_pfn, end_pfn;
> - size_t tmp_size;
>   const void *prop;
>  
> - prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
> + prop = of_get_property(of_chosen, name, &len);
>   if (!prop)
>   return -ENOENT;
>  
> - ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
> + ret = do_get_kexec_buffer(prop, len, addr, size);
>   if (ret)
>   return ret;
>  
> - /* Do some sanity on the returned size for the ima-kexec buffer */
> - if (!tmp_size)
> + /* Do some sanity on the returned size for the kexec buffer */
> + if (!*size)
>   return -ENOENT;
>  
>   /*
>* Calculate the PFNs for the buffer and ensure
>* they are with in addressable memory.
>*/
> - start_pfn = PHYS_PFN(tmp_addr);
> - end_pfn = PHYS_PFN(tmp_addr + tmp_size - 1);
> + start_pfn = PHYS_PFN(*addr);
> + end_pfn = PHYS_PFN(*addr + *size - 1);
>   if (!page_is_ram(start_pfn) || !page_is_ram(end_pfn)) {
> - pr_warn("IMA buffer at 0x%lx, size = 0x%zx beyond memory\n",
> - tmp_addr, tmp_size);
> + pr_warn("%s buffer at 0x%lx, size = 0x%zx beyond memory\n",
> + name, *addr, *size);
>   return -EINVAL;
>   }
>  
> + return 0;
> +}
> +
> +/**
> + * ima_get_kexec_buffer - get IMA buffer from the previous kernel
> + * @addr:On successful return, set to point to the buffer contents.
> + * @size:On successful return, set to the buffer size.
> + *
> + * Return: 0 on success, negative errno on error.
> + */
> +int __init ima_get_kexec_buffer(void **addr, size_t *size)
> +{
> + int ret;
> + unsigned long tmp_addr;
> + size_t tmp_size;
> +
> + ret = get_kexec_buffer("linux,ima-kexec-buffer", &tmp_addr, &tmp_size);
> + if (ret)
> + return ret;
> +
>   *addr = __va(tmp_addr);
>   *size = tmp_size;
>  
> @@ -188,72 +200,82 @@ int __init ima_free_kexec_buffer(void)
>  }
>  #endif
>  
> -/**
> - * remove_ima_buffer - remove the IMA buffer property and reservation from 
> @fdt
> - *
> - * @fdt: Flattened Device Tree to update
> - * @chosen_node: Offset to the chosen node in the device tree
> - *
> - * The IMA measurement buffer is of no use to a subsequent kernel, so we 
> always
> - * remove it from the device tree.
> - */
> -static void remove_ima_buffer(void *fdt, int chosen_node)
> +static int remove_buffer(void *fdt, int chosen_node, const char *name)
>  {
>   int ret, len;
>   unsigned long addr;
>   size_t size;
>   const void *prop;
>  
> - if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
> - return;
> -
> - prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
> + prop = fdt_getprop(fdt, chosen_node, name, &len);
>   if (!prop)
> - return;
> + return -ENOENT;
>  
>   ret = do_get_kexec_buffer(prop, len, &addr, &size);
> - fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
> + fdt_delprop(fdt, chosen_node, name);
>   if (ret)
> - return;
> + return ret;
>  
>   ret = fdt_find_and_del_mem_rsv(fdt, addr, size);
>   if (!ret)
> - pr_debug("Removed old IMA buffer reservation.\n");
> + pr_debug("Remove old %s buffer reserveration", name);
> + return ret;
>  }
>  
> 

Re: [PATCH v9 2/4] tpm: of: Make of-tree specific function commonly available

2023-05-24 Thread Jerry Snitselaar
On Tue, Apr 18, 2023 at 09:44:07AM -0400, Stefan Berger wrote:
> Simplify tpm_read_log_of() by moving reusable parts of the code into
> an inline function that makes it commonly available so it can be
> used also for kexec support. Call the new of_tpm_get_sml_parameters()
> function from the TPM Open Firmware driver.
> 
> Signed-off-by: Stefan Berger 
> Cc: Jarkko Sakkinen 
> Cc: Jason Gunthorpe 
> Cc: Rob Herring 
> Cc: Frank Rowand 
> Reviewed-by: Mimi Zohar 
> Tested-by: Nageswara R Sastry 
> Tested-by: Coiby Xu 
> Acked-by: Jarkko Sakkinen 
> 

Reviewed-by: Jerry Snitselaar 

> ---
> v9:
>  - Rebased on 6.3-rc7; call tpm_read_log_memory_region if -ENODEV returned
> 
> v7:
>  - Added original comment back into inlined function
> 
> v4:
>  - converted to inline function
> ---
>  drivers/char/tpm/eventlog/of.c | 30 +---
>  include/linux/tpm.h| 36 ++
>  2 files changed, 41 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/char/tpm/eventlog/of.c b/drivers/char/tpm/eventlog/of.c
> index 930fe43d5daf..41688d9cbd3b 100644
> --- a/drivers/char/tpm/eventlog/of.c
> +++ b/drivers/char/tpm/eventlog/of.c
> @@ -51,11 +51,10 @@ static int tpm_read_log_memory_region(struct tpm_chip 
> *chip)
>  int tpm_read_log_of(struct tpm_chip *chip)
>  {
>   struct device_node *np;
> - const u32 *sizep;
> - const u64 *basep;
>   struct tpm_bios_log *log;
>   u32 size;
>   u64 base;
> + int ret;
>  
>   log = &chip->log;
>   if (chip->dev.parent && chip->dev.parent->of_node)
> @@ -66,30 +65,11 @@ int tpm_read_log_of(struct tpm_chip *chip)
>   if (of_property_read_bool(np, "powered-while-suspended"))
>   chip->flags |= TPM_CHIP_FLAG_ALWAYS_POWERED;
>  
> - sizep = of_get_property(np, "linux,sml-size", NULL);
> - basep = of_get_property(np, "linux,sml-base", NULL);
> - if (sizep == NULL && basep == NULL)
> - ret = of_tpm_get_sml_parameters(np, &base, &size);
> + if (ret == -ENODEV)
>   return tpm_read_log_memory_region(chip);
> - if (sizep == NULL || basep == NULL)
> - return -EIO;
> -
> - /*
> -  * For both vtpm/tpm, firmware has log addr and log size in big
> -  * endian format. But in case of vtpm, there is a method called
> -  * sml-handover which is run during kernel init even before
> -  * device tree is setup. This sml-handover function takes care
> -  * of endianness and writes to sml-base and sml-size in little
> -  * endian format. For this reason, vtpm doesn't need conversion
> -  * but physical tpm needs the conversion.
> -  */
> - if (of_property_match_string(np, "compatible", "IBM,vtpm") < 0 &&
> - of_property_match_string(np, "compatible", "IBM,vtpm20") < 0) {
> - size = be32_to_cpup((__force __be32 *)sizep);
> - base = be64_to_cpup((__force __be64 *)basep);
> - } else {
> - size = *sizep;
> - base = *basep;
> - }
> + if (ret < 0)
> + return ret;
>  
>   if (size == 0) {
>   dev_warn(&chip->dev, "%s: Event log area empty\n", __func__);
> diff --git a/include/linux/tpm.h b/include/linux/tpm.h
> index 4dc97b9f65fb..dd9a376a1dbb 100644
> --- a/include/linux/tpm.h
> +++ b/include/linux/tpm.h
> @@ -461,4 +461,40 @@ static inline struct tpm_chip *tpm_default_chip(void)
>   return NULL;
>  }
>  #endif
> +
> +#ifdef CONFIG_OF
> +static inline int of_tpm_get_sml_parameters(struct device_node *np,
> + u64 *base, u32 *size)
> +{
> + const u32 *sizep;
> + const u64 *basep;
> +
> + sizep = of_get_property(np, "linux,sml-size", NULL);
> + basep = of_get_property(np, "linux,sml-base", NULL);
> + if (sizep == NULL && basep == NULL)
> + return -ENODEV;
> + if (sizep == NULL || basep == NULL)
> + return -EIO;
> +
> + /*
> +  * For both vtpm/tpm, firmware has log addr and log size in big
> +  * endian format. But in case of vtpm, there is a method called
> +  * sml-handover which is run during kernel init even before
> +  * device tree is setup. This sml-handover function takes care
> +  * of endianness and writes to sml-base and sml-size in little
> +  * endian format. For this reason, vtpm doesn't need conversion
> +  * but physical tpm needs the conversion.
> +  */
> + if (of_property_match_string(np, "compatible", "IBM,vtpm") < 0 &&
> + of_property_match_string(np, "compatible", "IBM,vtpm20") < 0) {
> + *size = be32_to_cpup((__force __be32 *)sizep);
> + *base = be64_to_cpup((__force __be64 *)basep);
> + } else {
> + *size = *sizep;
> + *base = *basep;
> + }
> + return 0;
> +}
> +#endif
> +
>  #endif
> -- 
> 2.38.1
> 



Re: [PATCH v9 1/4] drivers: of: kexec ima: Support 32-bit platforms

2023-05-24 Thread Jerry Snitselaar
On Tue, Apr 18, 2023 at 09:44:06AM -0400, Stefan Berger wrote:
> From: Palmer Dabbelt 
> 
> RISC-V recently added kexec_file() support, which uses enables kexec
> IMA.  We're the first 32-bit platform to support this, so we found a
> build bug.
> 
> Acked-by: Rob Herring 
> Signed-off-by: Palmer Dabbelt 
> Reviewed-by: Mimi Zohar 

Reviewed-by: Jerry Snitselaar 

> ---
>  drivers/of/kexec.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
> index f26d2ba8a371..1373d7e0a9b3 100644
> --- a/drivers/of/kexec.c
> +++ b/drivers/of/kexec.c
> @@ -250,8 +250,8 @@ static int setup_ima_buffer(const struct kimage *image, 
> void *fdt,
>   if (ret)
>   return -EINVAL;
>  
> - pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
> -  image->ima_buffer_addr, image->ima_buffer_size);
> + pr_debug("IMA buffer at 0x%pa, size = 0x%zx\n",
> +  &image->ima_buffer_addr, image->ima_buffer_size);
>  
>   return 0;
>  }
> -- 
> 2.38.1
> 



Re: [PATCH v4 22/23] PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler

2023-05-24 Thread Bjorn Helgaas
On Tue, May 23, 2023 at 06:22:13PM -0500, Terry Bowman wrote:
> From: Robert Richter 
> 
> In Restricted CXL Device (RCD) mode a CXL device is exposed as an
> RCiEP, but CXL downstream and upstream ports are not enumerated and
> not visible in the PCIe hierarchy. Protocol and link errors are sent
> to an RCEC.
>
> Restricted CXL host (RCH) downstream port-detected errors are signaled
> as internal AER errors, either Uncorrectable Internal Error (UIE) or
> Corrected Internal Errors (CIE). 

From the parallelism with RCD above, I first thought that RCH devices
were non-RCD mode and *were* enumerated as part of the PCIe hierarchy,
but actually I suspect it's more like the following?

  ... but CXL downstream and upstream ports are not enumerated and not
  visible in the PCIe hierarchy.

  Protocol and link errors from these non-enumerated ports are
  signaled as internal AER errors ... via a CXL RCEC.

> The error source is the id of the RCEC.

This seems odd; I assume this refers to the RCEC's AER Error Source
Identification register, and the ERR_COR or ERR_FATAL/NONFATAL Source
Identification would ordinarily be the Requester ID of the RCiEP that
"sent" the Error Message.  But you're saying it's actually the ID of
the *RCEC*, not the RCiEP?

We're going to call pci_aer_handle_error() as well, to handle the
non-internal errors, and I'm pretty sure that path expects the RCiEP
ID there.

Whatever the answer, I'm not sure this sentence is actually relevant
to this patch, since this patch doesn't read PCI_ERR_ROOT_ERR_SRC or
look at struct aer_err_source.id.

> A CXL handler must then inspect the error status in various CXL
> registers residing in the dport's component register space (CXL RAS
> capability) or the dport's RCRB (PCIe AER extended capability). [1]
> 
> Errors showing up in the RCEC's error handler must be handled and
> connected to the CXL subsystem. Implement this by forwarding the error
> to all CXL devices below the RCEC. Since the entire CXL device is
> controlled only using PCIe Configuration Space of device 0, function
> 0, only pass it there [2]. The error handling is limited to currently
> supported devices with the Memory Device class code set
> (PCI_CLASS_MEMORY_CXL, 502h), where the handler can be implemented in
> the existing cxl_pci driver. Support of CXL devices (e.g. a CXL.cache
> device) can be enabled later.

I assume the Memory Devices are CXL devices, so maybe "Error handling
for *other* CXL devices ... can be enabled later"?  

IIUC, this happens via cxl_rch_handle_error_iter() calling
pci_error_handlers for CXL RCiEPs.  Maybe the is_cxl_mem_dev() check
belongs inside those handlers, since that driver claimed the RCiEP and
should know its functionality?  Maybe is_internal_error() and
cxl_error_is_native(), too?

> In addition to errors directed to the CXL endpoint device, a handler
> must also inspect the CXL RAS and PCIe AER capabilities of the CXL
> downstream port that is connected to the device.
> 
> Since CXL downstream port errors are signaled using internal errors,
> the handler requires those errors to be unmasked. This is subject of a
> follow-on patch.
> 
> The reason for choosing this implementation is that a CXL RCEC device
> is bound to the AER port driver,

  ... is that the AER service driver claims the CXL RCEC device, but
  does not allow registration of a CXL sub-service driver ...

> but the driver does not allow it to
> register a custom specific handler to support CXL. Connecting the RCEC
> hard-wired with a CXL handler does not work, as the CXL subsystem
> might not be present all the time. The alternative to add an
> implementation to the portdrv to allow the registration of a custom
> RCEC error handler isn't worth doing it as CXL would be its only user.
> Instead, just check for an CXL RCEC and pass it down to the connected
> CXL device's error handler. With this approach the code can entirely
> be implemented in the PCIe AER driver and is independent of the CXL
> subsystem. The CXL driver only provides the handler.
> 
> [1] CXL 3.0 spec, 12.2.1.1 RCH Downstream Port-detected Errors
> [2] CXL 3.0 spec, 8.1.3 PCIe DVSEC for CXL Devices
> 
> Co-developed-by: Terry Bowman 
> Signed-off-by: Terry Bowman 
> Signed-off-by: Robert Richter 
> Cc: "Oliver O'Halloran" 
> Cc: Bjorn Helgaas 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-...@vger.kernel.org

Given the questions are minor:

Acked-by: Bjorn Helgaas 

> ---
>  drivers/pci/pcie/Kconfig |  12 +
>  drivers/pci/pcie/aer.c   | 100 ++-
>  2 files changed, 110 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
> index 228652a59f27..4f0e70fafe2d 100644
> --- a/drivers/pci/pcie/Kconfig
> +++ b/drivers/pci/pcie/Kconfig
> @@ -49,6 +49,18 @@ config PCIEAER_INJECT
> gotten from:
>
> https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
>  
> +config PCIEAER_CXL
> + bool "PCI Express 

Re: [PATCH v5 13/18] watchdog/hardlockup: Have the perf hardlockup use __weak functions more cleanly

2023-05-24 Thread Doug Anderson
Hi,

On Wed, May 24, 2023 at 6:59 AM Petr Mladek  wrote:
>
> On Fri 2023-05-19 10:18:37, Douglas Anderson wrote:
> > The fact that there watchdog_hardlockup_enable(),
> > watchdog_hardlockup_disable(), and watchdog_hardlockup_probe() are
> > declared __weak means that the configured hardlockup detector can
> > define non-weak versions of those functions if it needs to. Instead of
> > doing this, the perf hardlockup detector hooked itself into the
> > default __weak implementation, which was a bit awkward. Clean this up.
> >
> > From comments, it looks as if the original design was done because the
> > __weak function were expected to implemented by the architecture and
> > not by the configured hardlockup detector. This got awkward when we
> > tried to add the buddy lockup detector which was not arch-specific but
> > wanted to hook into those same functions.
> >
> > This is not expected to have any functional impact.
> >
> > @@ -187,27 +187,33 @@ static inline void watchdog_hardlockup_kick(void) { }
> >  #endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */
> >
> >  /*
> > - * These functions can be overridden if an architecture implements its
> > - * own hardlockup detector.
> > + * These functions can be overridden based on the configured hardlockdup 
> > detector.
> >   *
> >   * watchdog_hardlockup_enable/disable can be implemented to start and stop 
> > when
> > - * softlockup watchdog start and stop. The arch must select the
> > + * softlockup watchdog start and stop. The detector must select the
> >   * SOFTLOCKUP_DETECTOR Kconfig.
> >   */
> > -void __weak watchdog_hardlockup_enable(unsigned int cpu)
> > -{
> > - hardlockup_detector_perf_enable();
> > -}
> > +void __weak watchdog_hardlockup_enable(unsigned int cpu) { }
> >
> > -void __weak watchdog_hardlockup_disable(unsigned int cpu)
> > -{
> > - hardlockup_detector_perf_disable();
> > -}
> > +void __weak watchdog_hardlockup_disable(unsigned int cpu) { }
> >
> >  /* Return 0, if a hardlockup watchdog is available. Error code otherwise */
> >  int __weak __init watchdog_hardlockup_probe(void)
> >  {
> > - return hardlockup_detector_perf_init();
> > + /*
> > +  * If CONFIG_HAVE_NMI_WATCHDOG is defined then an architecture
> > +  * is assumed to have the hard watchdog available and we return 0.
> > +  */
> > + if (IS_ENABLED(CONFIG_HAVE_NMI_WATCHDOG))
> > + return 0;
> > +
> > + /*
> > +  * Hardlockup detectors other than those using 
> > CONFIG_HAVE_NMI_WATCHDOG
> > +  * are required to implement a non-weak version of this probe function
> > +  * to tell whether they are available. If they don't override then
> > +  * we'll return -ENODEV.
> > +  */
> > + return -ENODEV;
> >  }
>
> When thinking more about it. It is weird that we need to handle
> CONFIG_HAVE_NMI_WATCHDOG in this default week function.
>
> It should be handled in watchdog_hardlockup_probe() implemented
> in kernel/watchdog_perf.c.
>
> IMHO, the default __weak function could always return -ENODEV;
>
> Would it make sense, please?

I don't quite understand. I'd agree that the special case for
CONFIG_HAVE_NMI_WATCHDOG is ugly, but it was also ugly before. IMO
it's actually a little less ugly / easier to understand after my
patch. ...but let me walk through how I think this special case works
and maybe you can tell me where I'm confused.

The first thing to understand is that CONFIG_HARDLOCKUP_DETECTOR_PERF
and CONFIG_HAVE_NMI_WATCHDOG are mutually exclusive from each other.
This was true before any of my patches and is still true after them.
Specifically, if CONFIG_HAVE_NMI_WATCHDOG is defined then an
architecture implements arch_touch_nmi_watchdog() (as documented in
the Kconfig docs for HAVE_NMI_WATCHDOG). Looking at the tree before my
series you can see that the perf hardlockup detector also implemented
arch_touch_nmi_watchdog(). This would have caused a conflict. The
mutual exclusion was presumably enforced by an architecture not
defining both HAVE_NMI_WATCHDOG and HAVE_HARDLOCKUP_DETECTOR_PERF.

The second thing to understand is that an architecture that defines
CONFIG_HAVE_NMI_WATCHDOG is _not_ required to implement
watchdog_hardlockup_probe() (used to be called watchdog_nmi_probe()).
Maybe this should change, but at the very least it appears that
SPARC64 defines HAVE_NMI_WATCHDOG but doesn't define
watchdog_hardlockup_probe() AKA watchdog_nmi_probe(). Anyone who
defines CONFIG_HAVE_NMI_WATCHDOG and doesn't implement
watchdog_hardlockup_probe() is claiming that their watchdog needs no
probing and is always available.
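
For reference, a minimal sketch of the __weak override pattern being
described here (illustrative only; the detector init helper name is made
up and this is not the actual kernel code):

  /* Default in kernel/watchdog.c: no hardlockup detector available. */
  int __weak __init watchdog_hardlockup_probe(void)
  {
          return -ENODEV;
  }

  /* A configured detector supplies a strong definition in its own
   * object file; at link time it wins over the __weak default. */
  int __init watchdog_hardlockup_probe(void)
  {
          return my_detector_init();      /* hypothetical init helper */
  }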

So with that context:

1. We can't handle any special cases for CONFIG_HAVE_NMI_WATCHDOG in
"kernel/watchdog_perf.c". The special cases that we need to handle are
all for the cases where CONFIG_HARDLOCKUP_DETECTOR_PERF isn't defined
and that means "kernel/watchdog_perf.c" isn't included.

2. We can't have the default __weak function return -ENODEV because
CONFIG_HAVE_NMI_WATCHDOG doesn't require an 

Re: [PATCH 13/14] thread_info: move function declarations to linux/thread_info.h

2023-05-24 Thread Catalin Marinas
On Wed, May 17, 2023 at 03:11:01PM +0200, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> There are a few __weak functions in kernel/fork.c, which architectures
> can override. If there is no prototype, the compiler warns about them:
> 
> kernel/fork.c:164:13: error: no previous prototype for 
> 'arch_release_task_struct' [-Werror=missing-prototypes]
> kernel/fork.c:991:20: error: no previous prototype for 'arch_task_cache_init' 
> [-Werror=missing-prototypes]
> kernel/fork.c:1086:12: error: no previous prototype for 
> 'arch_dup_task_struct' [-Werror=missing-prototypes]
> 
> There are already prototypes in a number of architecture specific headers
> that have addressed those warnings before, but it's much better to have
> these in a single place so the warning no longer shows up anywhere.
> 
> Signed-off-by: Arnd Bergmann 

For arm64:

Acked-by: Catalin Marinas 


Re: [PATCH 10/14] suspend: add a arch_resume_nosmt() prototype

2023-05-24 Thread Rafael J. Wysocki
On Wed, May 17, 2023 at 4:52 PM Arnd Bergmann  wrote:
>
> On Wed, May 17, 2023, at 15:48, Rafael J. Wysocki wrote:
> > On Wed, May 17, 2023 at 3:12 PM Arnd Bergmann  wrote:
> >>
> >> From: Arnd Bergmann 
> >>
> >> The arch_resume_nosmt() has a __weak definition, plus an x86
> >> specific override, but no prototype that ensures the two have
> >> the same arguments. This causes a W=1 warning:
> >>
> >> arch/x86/power/hibernate.c:189:5: error: no previous prototype for 
> >> 'arch_resume_nosmt' [-Werror=missing-prototypes]
> >>
> >> Add the prototype in linux/suspend.h, which is included in
> >> both places.
> >>
> >> Signed-off-by: Arnd Bergmann 
> >
> > Do you want me to pick this up?
>
> Yes, please do. Thanks,

Done, thanks!


[PATCH 9/9] powerpc/pseries: Honour current SMT state when DLPAR onlining CPUs

2023-05-24 Thread Michael Ellerman
Integrate with the generic SMT support, so that when a CPU is DLPAR
onlined it is brought up with the correct SMT mode.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 61fb7cb00880..e62835a12d73 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -398,6 +398,14 @@ static int dlpar_online_cpu(struct device_node *dn)
for_each_present_cpu(cpu) {
if (get_hard_smp_processor_id(cpu) != thread)
continue;
+
+   if (!topology_is_primary_thread(cpu)) {
+   if (cpu_smt_control != CPU_SMT_ENABLED)
+   break;
+   if (!topology_smt_thread_allowed(cpu))
+   break;
+   }
+
cpu_maps_update_done();
find_and_update_cpu_nid(cpu);
rc = device_online(get_cpu_device(cpu));
-- 
2.40.1



[PATCH 8/9] powerpc: Add HOTPLUG_SMT support

2023-05-24 Thread Michael Ellerman
Add support for HOTPLUG_SMT, which enables the generic sysfs SMT support
files in /sys/devices/system/cpu/smt, as well as the "nosmt" boot
parameter.

Implement the recently added hooks to allow partial SMT states, allow
any number of threads per core.

Tie the config symbol to HOTPLUG_CPU, which enables it on the major
platforms that support SMT. If there are other platforms that want the
SMT support that can be tweaked in future.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/Kconfig|  1 +
 arch/powerpc/include/asm/topology.h | 25 +
 arch/powerpc/kernel/smp.c   |  3 +++
 3 files changed, 29 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 539d1f03ff42..5cf87ca10a9c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -273,6 +273,7 @@ config PPC
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_VIRT_CPU_ACCOUNTING_GEN
+   select HOTPLUG_SMT  if HOTPLUG_CPU
select HUGETLB_PAGE_SIZE_VARIABLE   if PPC_BOOK3S_64 && HUGETLB_PAGE
select IOMMU_HELPER if PPC64
select IRQ_DOMAIN
diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index 8a4d4f4d9749..1e9117a22d14 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -143,5 +143,30 @@ static inline int cpu_to_coregroup_id(int cpu)
 #endif
 #endif
 
+#ifdef CONFIG_HOTPLUG_SMT
+#include 
+#include 
+
+static inline bool topology_smt_supported(void)
+{
+   return threads_per_core > 1;
+}
+
+static inline bool topology_smt_threads_supported(unsigned int num_threads)
+{
+   return num_threads <= threads_per_core;
+}
+
+static inline bool topology_is_primary_thread(unsigned int cpu)
+{
+   return cpu == cpu_first_thread_sibling(cpu);
+}
+
+static inline bool topology_smt_thread_allowed(unsigned int cpu)
+{
+   return cpu_thread_in_core(cpu) < cpu_smt_num_threads;
+}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_TOPOLOGY_H */
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 265801a3e94c..eed20b9253b7 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1154,6 +1154,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
if (smp_ops && smp_ops->probe)
smp_ops->probe();
+
+   // Initialise the generic SMT topology support
+   cpu_smt_check_topology(threads_per_core);
 }
 
 void smp_prepare_boot_cpu(void)
-- 
2.40.1



[PATCH 7/9] powerpc/pseries: Initialise CPU hotplug callbacks earlier

2023-05-24 Thread Michael Ellerman
As part of the generic HOTPLUG_SMT code, there is support for disabling
secondary SMT threads at boot time, by passing "nosmt" on the kernel
command line.

The way that is implemented is the secondary threads are brought partly
online, and then taken back offline again. That is done to support x86
CPUs needing certain initialisation done on all threads. However powerpc
has similar needs, see commit d70a54e2d085 ("powerpc/powernv: Ignore
smt-enabled on Power8 and later").

For that to work the powerpc CPU hotplug callbacks need to be registered
before secondary CPUs are brought online, otherwise __cpu_disable()
fails due to smp_ops->cpu_disable being NULL.

So split the basic initialisation into pseries_cpu_hotplug_init() which
can be called early from setup_arch(). The DLPAR related initialisation
can still be done later, because it needs to do allocations.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 22 
 arch/powerpc/platforms/pseries/pseries.h |  2 ++
 arch/powerpc/platforms/pseries/setup.c   |  2 ++
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 1a3cb313976a..61fb7cb00880 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -845,15 +845,9 @@ static struct notifier_block pseries_smp_nb = {
.notifier_call = pseries_smp_notifier,
 };
 
-static int __init pseries_cpu_hotplug_init(void)
+void __init pseries_cpu_hotplug_init(void)
 {
int qcss_tok;
-   unsigned int node;
-
-#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-   ppc_md.cpu_probe = dlpar_cpu_probe;
-   ppc_md.cpu_release = dlpar_cpu_release;
-#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 
rtas_stop_self_token = rtas_function_token(RTAS_FN_STOP_SELF);
qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE);
@@ -862,12 +856,22 @@ static int __init pseries_cpu_hotplug_init(void)
qcss_tok == RTAS_UNKNOWN_SERVICE) {
printk(KERN_INFO "CPU Hotplug not supported by firmware "
"- disabling.\n");
-   return 0;
+   return;
}
 
smp_ops->cpu_offline_self = pseries_cpu_offline_self;
smp_ops->cpu_disable = pseries_cpu_disable;
smp_ops->cpu_die = pseries_cpu_die;
+}
+
+static int __init pseries_dlpar_init(void)
+{
+   unsigned int node;
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+   ppc_md.cpu_probe = dlpar_cpu_probe;
+   ppc_md.cpu_release = dlpar_cpu_release;
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 
/* Processors can be added/removed only on LPAR */
if (firmware_has_feature(FW_FEATURE_LPAR)) {
@@ -886,4 +890,4 @@ static int __init pseries_cpu_hotplug_init(void)
 
return 0;
 }
-machine_arch_initcall(pseries, pseries_cpu_hotplug_init);
+machine_arch_initcall(pseries, pseries_dlpar_init);
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index f8bce40ebd0c..f8893ba46e83 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -75,11 +75,13 @@ static inline int dlpar_hp_pmem(struct pseries_hp_errorlog 
*hp_elog)
 
 #ifdef CONFIG_HOTPLUG_CPU
 int dlpar_cpu(struct pseries_hp_errorlog *hp_elog);
+void pseries_cpu_hotplug_init(void);
 #else
 static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog)
 {
return -EOPNOTSUPP;
 }
+static inline void pseries_cpu_hotplug_init(void) { }
 #endif
 
 /* PCI root bridge prepare function override for pseries */
diff --git a/arch/powerpc/platforms/pseries/setup.c 
b/arch/powerpc/platforms/pseries/setup.c
index e2a57cfa6c83..41451b76c6e5 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -816,6 +816,8 @@ static void __init pSeries_setup_arch(void)
/* Discover PIC type and setup ppc_md accordingly */
smp_init_pseries();
 
+   // Setup CPU hotplug callbacks
+   pseries_cpu_hotplug_init();
 
if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE))
if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
-- 
2.40.1



[PATCH 6/9] cpu/SMT: Allow enabling partial SMT states via sysfs

2023-05-24 Thread Michael Ellerman
Add support to the /sys/devices/system/cpu/smt/control interface for
enabling a specified number of SMT threads per core, including partial
SMT states where not all threads are brought online.

The current interface accepts "on" and "off", to enable either 1 or all
SMT threads per core.

This commit allows writing an integer, between 1 and the number of SMT
threads supported by the machine. Writing 1 is a synonym for "off", 2 or
more enables SMT with the specified number of threads.

When reading the file, if all threads are online "on" is returned, to
avoid changing behaviour for existing users. If some other number of
threads is online then the integer value is returned.

There is a hook which allows arch code to control how many threads per
core are supported. To retain the existing behaviour, the x86 hook only
supports 1 thread or all threads.

Signed-off-by: Michael Ellerman 
---
 .../ABI/testing/sysfs-devices-system-cpu  |  1 +
 kernel/cpu.c  | 39 ---
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu 
b/Documentation/ABI/testing/sysfs-devices-system-cpu
index f54867cadb0f..3c4cfb59d495 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -555,6 +555,7 @@ Description:Control Symmetric Multi Threading (SMT)
  
=
 "on" SMT is enabled
 "off"SMT is disabled
+""SMT is enabled with N threads per 
core.
 "forceoff"   SMT is force disabled. Cannot be 
changed.
 "notsupported"   SMT is not supported by the CPU
 "notimplemented" SMT runtime toggling is not
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 72b9a03a4fef..aca23c7b4547 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2497,7 +2497,7 @@ static ssize_t
 __store_smt_control(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
 {
-   int ctrlval, ret;
+   int ctrlval, ret, num_threads, orig_threads;
 
if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
return -EPERM;
@@ -2505,20 +2505,38 @@ __store_smt_control(struct device *dev, struct 
device_attribute *attr,
if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
return -ENODEV;
 
-   if (sysfs_streq(buf, "on"))
+   if (sysfs_streq(buf, "on")) {
ctrlval = CPU_SMT_ENABLED;
-   else if (sysfs_streq(buf, "off"))
+   num_threads = cpu_smt_max_threads;
+   } else if (sysfs_streq(buf, "off")) {
ctrlval = CPU_SMT_DISABLED;
-   else if (sysfs_streq(buf, "forceoff"))
+   num_threads = 1;
+   } else if (sysfs_streq(buf, "forceoff")) {
ctrlval = CPU_SMT_FORCE_DISABLED;
-   else
+   num_threads = 1;
+   } else if (kstrtoint(buf, 10, &num_threads) == 0) {
+   if (num_threads == 1)
+   ctrlval = CPU_SMT_DISABLED;
+   else if (num_threads > 1 && 
topology_smt_threads_supported(num_threads))
+   ctrlval = CPU_SMT_ENABLED;
+   else
+   return -EINVAL;
+   } else {
return -EINVAL;
+   }
 
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
 
-   if (ctrlval != cpu_smt_control) {
+   orig_threads = cpu_smt_num_threads;
+   cpu_smt_num_threads = num_threads;
+
+   if (num_threads > orig_threads) {
+   ret = cpuhp_smt_enable();
+   } else if (num_threads < orig_threads) {
+   ret = cpuhp_smt_disable(ctrlval);
+   } else if (ctrlval != cpu_smt_control) {
switch (ctrlval) {
case CPU_SMT_ENABLED:
ret = cpuhp_smt_enable();
@@ -2556,6 +2574,15 @@ static ssize_t control_show(struct device *dev,
 {
const char *state = smt_states[cpu_smt_control];
 
+   /*
+* If SMT is enabled but not all threads are enabled then show the
+* number of threads. If all threads are enabled show "on". Otherwise
+* show the state name.
+*/
+   if (cpu_smt_control == CPU_SMT_ENABLED &&
+   cpu_smt_num_threads != cpu_smt_max_threads)
+   return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
+
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
 }
 
-- 
2.40.1
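
As a usage illustration of the interface above (assuming an architecture
whose hook permits partial SMT states, e.g. powerpc with the rest of this
series): writing a thread count such as "echo 2 >
/sys/devices/system/cpu/smt/control" brings the system to two threads per
core, and reading the file back then reports "2", unless two happens to be
the maximum supported, in which case it reports "on" as before.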



[PATCH 5/9] cpu/SMT: Create topology_smt_thread_allowed()

2023-05-24 Thread Michael Ellerman
A subsequent patch will enable partial SMT states, ie. when not all SMT
threads are brought online.

To support that, add an arch helper which checks whether a given CPU is
allowed to be brought online depending on how many SMT threads are
currently enabled.

Call the helper from cpu_smt_enable(), and cpu_smt_allowed() when SMT is
enabled, to check if the particular thread should be onlined. Notably,
also call it from cpu_smt_disable() if CPU_SMT_ENABLED, to allow
offlining some threads to move from a higher to lower number of threads
online.

Signed-off-by: Michael Ellerman 
---
 arch/x86/include/asm/topology.h |  2 ++
 arch/x86/kernel/smpboot.c   | 15 +++
 kernel/cpu.c| 10 +-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 197ec2589f5d..c6245ea6e589 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -145,6 +145,7 @@ int topology_phys_to_logical_die(unsigned int die, unsigned 
int cpu);
 bool topology_is_primary_thread(unsigned int cpu);
 bool topology_smt_supported(void);
 bool topology_smt_threads_supported(unsigned int threads);
+bool topology_smt_thread_allowed(unsigned int cpu);
 #else
 #define topology_max_packages()(1)
 static inline int
@@ -159,6 +160,7 @@ static inline int topology_max_smt_threads(void) { return 
1; }
 static inline bool topology_is_primary_thread(unsigned int cpu) { return true; 
}
 static inline bool topology_smt_supported(void) { return false; }
 static inline bool topology_smt_threads_supported(unsigned int threads) { 
return false; }
+static inline bool topology_smt_thread_allowed(unsigned int cpu) { return 
false; }
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c7ba62beae3e..244b4df40600 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -298,6 +298,21 @@ bool topology_smt_threads_supported(unsigned int threads)
return threads == 1 || threads == smp_num_siblings;
 }
 
+/**
+ * topology_smt_thread_allowed - When enabling SMT check whether this 
particular
+ *  CPU thread is allowed to be brought online.
+ * @cpu:   CPU to check
+ */
+bool topology_smt_thread_allowed(unsigned int cpu)
+{
+   /*
+* No extra logic is required here to support different thread values
+* because threads will always == 1 or smp_num_siblings because of
+* topology_smt_threads_supported().
+*/
+   return true;
+}
+
 /**
  * topology_phys_to_logical_pkg - Map a physical package id to a logical
  *
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a08dd8f93675..72b9a03a4fef 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -458,7 +458,7 @@ early_param("nosmt", smt_cmdline_disable);
 
 static inline bool cpu_smt_allowed(unsigned int cpu)
 {
-   if (cpu_smt_control == CPU_SMT_ENABLED)
+   if (cpu_smt_control == CPU_SMT_ENABLED && 
topology_smt_thread_allowed(cpu))
return true;
 
if (topology_is_primary_thread(cpu))
@@ -2273,6 +2273,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
for_each_online_cpu(cpu) {
if (topology_is_primary_thread(cpu))
continue;
+   /*
+* Disable can be called with CPU_SMT_ENABLED when changing
+* from a higher to lower number of SMT threads per core.
+*/
+   if (ctrlval == CPU_SMT_ENABLED && 
topology_smt_thread_allowed(cpu))
+   continue;
ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
if (ret)
break;
@@ -2307,6 +2313,8 @@ int cpuhp_smt_enable(void)
/* Skip online CPUs and CPUs on offline nodes */
if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
continue;
+   if (!topology_smt_thread_allowed(cpu))
+   continue;
ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
if (ret)
break;
-- 
2.40.1



[PATCH 2/9] cpu/SMT: Move smt/control simple exit cases earlier

2023-05-24 Thread Michael Ellerman
Move the simple exit cases, ie. which don't depend on the value written,
earlier in the function. That makes it clearer that regardless of the
input those states can not be transitioned out of.

That does have a user-visible effect, in that the error returned will
now always be EPERM/ENODEV for those states, regardless of the value
written. Previously writing an invalid value would return EINVAL even
when in those states.

Signed-off-by: Michael Ellerman 
---
 kernel/cpu.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index f4a2c5845bcb..01398ce3e131 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2481,6 +2481,12 @@ __store_smt_control(struct device *dev, struct 
device_attribute *attr,
 {
int ctrlval, ret;
 
+   if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
+   return -EPERM;
+
+   if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+   return -ENODEV;
+
if (sysfs_streq(buf, "on"))
ctrlval = CPU_SMT_ENABLED;
else if (sysfs_streq(buf, "off"))
@@ -2490,12 +2496,6 @@ __store_smt_control(struct device *dev, struct 
device_attribute *attr,
else
return -EINVAL;
 
-   if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
-   return -EPERM;
-
-   if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
-   return -ENODEV;
-
ret = lock_device_hotplug_sysfs();
if (ret)
return ret;
-- 
2.40.1



[PATCH 4/9] cpu/SMT: Create topology_smt_threads_supported()

2023-05-24 Thread Michael Ellerman
A subsequent patch will enable partial SMT states, ie. when not all SMT
threads are brought online.

To support that, add an arch helper to check how many SMT threads are
supported.

To retain existing behaviour, the x86 implementation only allows a
single thread or all threads to be online.

Signed-off-by: Michael Ellerman 
---
 arch/x86/include/asm/topology.h |  2 ++
 arch/x86/kernel/smpboot.c   | 12 
 2 files changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 66927a59e822..197ec2589f5d 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -144,6 +144,7 @@ int topology_phys_to_logical_pkg(unsigned int pkg);
 int topology_phys_to_logical_die(unsigned int die, unsigned int cpu);
 bool topology_is_primary_thread(unsigned int cpu);
 bool topology_smt_supported(void);
+bool topology_smt_threads_supported(unsigned int threads);
 #else
 #define topology_max_packages()(1)
 static inline int
@@ -157,6 +158,7 @@ static inline int topology_max_die_per_package(void) { 
return 1; }
 static inline int topology_max_smt_threads(void) { return 1; }
 static inline bool topology_is_primary_thread(unsigned int cpu) { return true; 
}
 static inline bool topology_smt_supported(void) { return false; }
+static inline bool topology_smt_threads_supported(unsigned int threads) { 
return false; }
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 352f0ce1ece4..c7ba62beae3e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -286,6 +286,18 @@ bool topology_smt_supported(void)
return smp_num_siblings > 1;
 }
 
+/**
+ * topology_smt_threads_supported - Check if the given number of SMT threads
+ * is supported.
+ *
+ * @threads:   The number of SMT threads.
+ */
+bool topology_smt_threads_supported(unsigned int threads)
+{
+   // Only support a single thread or all threads.
+   return threads == 1 || threads == smp_num_siblings;
+}
+
 /**
  * topology_phys_to_logical_pkg - Map a physical package id to a logical
  *
-- 
2.40.1



[PATCH 3/9] cpu/SMT: Store the current/max number of threads

2023-05-24 Thread Michael Ellerman
A subsequent patch will enable partial SMT states, ie. when not all SMT
threads are brought online.

To support that the SMT code needs to know the maximum number of SMT
threads, and also the currently configured number.

The arch code knows the max number of threads, so have the arch code
pass that value to cpu_smt_check_topology(). Note that although
topology_max_smt_threads() exists, it is not configured early enough to
be used here.

Signed-off-by: Michael Ellerman 
---
 arch/x86/kernel/cpu/bugs.c |  3 ++-
 include/linux/cpu_smt.h|  6 --
 kernel/cpu.c   | 12 +++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 182af64387d0..3406799c1e9d 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -133,7 +134,7 @@ void __init check_bugs(void)
 * identify_boot_cpu() initialized SMT support information, let the
 * core code know.
 */
-   cpu_smt_check_topology();
+   cpu_smt_check_topology(smp_num_siblings);
 
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
diff --git a/include/linux/cpu_smt.h b/include/linux/cpu_smt.h
index 17e105b52d85..8d4ae26047c9 100644
--- a/include/linux/cpu_smt.h
+++ b/include/linux/cpu_smt.h
@@ -12,15 +12,17 @@ enum cpuhp_smt_control {
 
 #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
 extern enum cpuhp_smt_control cpu_smt_control;
+extern unsigned int cpu_smt_num_threads;
 extern void cpu_smt_disable(bool force);
-extern void cpu_smt_check_topology(void);
+extern void cpu_smt_check_topology(unsigned int num_threads);
 extern bool cpu_smt_possible(void);
 extern int cpuhp_smt_enable(void);
 extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
 #else
 # define cpu_smt_control   (CPU_SMT_NOT_IMPLEMENTED)
+# define cpu_smt_num_threads 1
 static inline void cpu_smt_disable(bool force) { }
-static inline void cpu_smt_check_topology(void) { }
+static inline void cpu_smt_check_topology(unsigned int num_threads) { }
 static inline bool cpu_smt_possible(void) { return false; }
 static inline int cpuhp_smt_enable(void) { return 0; }
 static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 
0; }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 01398ce3e131..a08dd8f93675 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -414,6 +414,8 @@ void __weak arch_smt_update(void) { }
 
 #ifdef CONFIG_HOTPLUG_SMT
 enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
+static unsigned int cpu_smt_max_threads __ro_after_init;
+unsigned int cpu_smt_num_threads;
 
 void __init cpu_smt_disable(bool force)
 {
@@ -433,10 +435,18 @@ void __init cpu_smt_disable(bool force)
  * The decision whether SMT is supported can only be done after the full
  * CPU identification. Called from architecture code.
  */
-void __init cpu_smt_check_topology(void)
+void __init cpu_smt_check_topology(unsigned int num_threads)
 {
if (!topology_smt_supported())
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
+
+   cpu_smt_max_threads = num_threads;
+
+   // May already be disabled by nosmt command line parameter
+   if (cpu_smt_control != CPU_SMT_ENABLED)
+   cpu_smt_num_threads = 1;
+   else
+   cpu_smt_num_threads = num_threads;
 }
 
 static int __init smt_cmdline_disable(char *str)
-- 
2.40.1



[PATCH 1/9] cpu/SMT: Move SMT prototypes into cpu_smt.h

2023-05-24 Thread Michael Ellerman
A subsequent patch would like to use the cpuhp_smt_control enum as part
of the interface between generic and arch code.

Currently that leads to circular header dependencies. So split the enum
and related declarations into a separate header.

Signed-off-by: Michael Ellerman 
---
 arch/x86/include/asm/topology.h |  2 ++
 include/linux/cpu.h | 25 +
 include/linux/cpu_smt.h | 29 +
 3 files changed, 32 insertions(+), 24 deletions(-)
 create mode 100644 include/linux/cpu_smt.h

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 458c891a8273..66927a59e822 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -136,6 +136,8 @@ static inline int topology_max_smt_threads(void)
return __max_smt_threads;
 }
 
+#include 
+
 int topology_update_package_map(unsigned int apicid, unsigned int cpu);
 int topology_update_die_map(unsigned int dieid, unsigned int cpu);
 int topology_phys_to_logical_pkg(unsigned int pkg);
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 8582a7142623..40548f3c201c 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct device;
 struct device_node;
@@ -202,30 +203,6 @@ void cpuhp_report_idle_dead(void);
 static inline void cpuhp_report_idle_dead(void) { }
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 
-enum cpuhp_smt_control {
-   CPU_SMT_ENABLED,
-   CPU_SMT_DISABLED,
-   CPU_SMT_FORCE_DISABLED,
-   CPU_SMT_NOT_SUPPORTED,
-   CPU_SMT_NOT_IMPLEMENTED,
-};
-
-#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
-extern enum cpuhp_smt_control cpu_smt_control;
-extern void cpu_smt_disable(bool force);
-extern void cpu_smt_check_topology(void);
-extern bool cpu_smt_possible(void);
-extern int cpuhp_smt_enable(void);
-extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
-#else
-# define cpu_smt_control   (CPU_SMT_NOT_IMPLEMENTED)
-static inline void cpu_smt_disable(bool force) { }
-static inline void cpu_smt_check_topology(void) { }
-static inline bool cpu_smt_possible(void) { return false; }
-static inline int cpuhp_smt_enable(void) { return 0; }
-static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 
0; }
-#endif
-
 extern bool cpu_mitigations_off(void);
 extern bool cpu_mitigations_auto_nosmt(void);
 
diff --git a/include/linux/cpu_smt.h b/include/linux/cpu_smt.h
new file mode 100644
index ..17e105b52d85
--- /dev/null
+++ b/include/linux/cpu_smt.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CPU_SMT_H_
+#define _LINUX_CPU_SMT_H_
+
+enum cpuhp_smt_control {
+   CPU_SMT_ENABLED,
+   CPU_SMT_DISABLED,
+   CPU_SMT_FORCE_DISABLED,
+   CPU_SMT_NOT_SUPPORTED,
+   CPU_SMT_NOT_IMPLEMENTED,
+};
+
+#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
+extern enum cpuhp_smt_control cpu_smt_control;
+extern void cpu_smt_disable(bool force);
+extern void cpu_smt_check_topology(void);
+extern bool cpu_smt_possible(void);
+extern int cpuhp_smt_enable(void);
+extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval);
+#else
+# define cpu_smt_control   (CPU_SMT_NOT_IMPLEMENTED)
+static inline void cpu_smt_disable(bool force) { }
+static inline void cpu_smt_check_topology(void) { }
+static inline bool cpu_smt_possible(void) { return false; }
+static inline int cpuhp_smt_enable(void) { return 0; }
+static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 
0; }
+#endif
+
+#endif /* _LINUX_CPU_SMT_H_ */
-- 
2.40.1



Re: [PATCH] powerpc/iommu: limit number of TCEs to 512 for H_STUFF_TCE hcall

2023-05-24 Thread Michael Ellerman
Gaurav Batra  writes:
> On 5/17/23 7:19 AM, Michael Ellerman wrote:
>> Gaurav Batra  writes:
>>> Hello Michael,
>>>
>>> System test hit the crash. I believe, it was PHYP that resulted in it
>>> due to number of TCEs passed in to be >512.
>> OK. It's always good to spell out in the change log whether it's a
>> theoretical/unlikely bug, or one that's actually been hit in testing or
>> the field.

> I will submit another version of the patch with some changes in the log 
> once I figure out how to Tag it for stable (as mentioned below).
> 
>>> I was wondering about the Fixes tag as well. But, this interface, in
>>> it's current form, is there from the day the file was created. So, in
>>> this case, should I mention the first commit which created this source file?
>> If it really goes back to the origin commit, then it's probably better
>> to just say so and tag it for stable, rather than pointing to 1da177e4.

> How to do I tag it for stable? Will it be part of the "Fixes:" tag or 
> some other tag?

A stable tag is just adding in the change log:

Cc: sta...@vger.kernel.org

Note *not* in the email headers, in the change log.

That asks the stable kernel folks to backport the commit to all
currently active stable trees. Whereas when you use a Fixes: tag it will
be only backported to stable trees that contain the offending commit.
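
For illustration, the tag just goes with the other trailers at the end of
the commit message, e.g.:

    Cc: sta...@vger.kernel.org
    Signed-off-by: <patch author>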

>> I wonder though is there something else that changed that means this bug
>> is now being hit but wasn't before? Or maybe it's just that we are
>> testing on systems with large enough amounts of memory to hit this but
>> which aren't using a direct mapping?
>
>  From the details in Bugzilla, it does seem like the HCALL was 
> previously taking long as well but PHYP was more relaxed about it. Now, 
> PHYP is limiting on how long can an HCALL take.
>
> Below are some excerpts from the Bug: 202349
>
> Linux is passing too many counts in H_STUFF_TCE. The higher the counts, 
> the longer the HCALL takes. From a Hypervisor perspective, we cannot 
> stop Linux from doing this or it will violate the rules in the PAPR 
> (which then would cause Linux to crash). The dispatcher team has 
> "tightened the screws" on long running HCALLs by causing this trap to 
> fire. From our discussions, they will not put the limits back where they 
> were before.

OK, that explains why it's only been noticed recently, so that's good
info. In the change log you can say something like "newer firmware has
started enforcing the limits".
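
For anyone following along, the general shape of such a fix is simply to
cap each H_STUFF_TCE invocation at 512 TCEs and loop. A rough sketch (the
function and field names below are assumptions based on the pseries iommu
code, not the actual patch):

/* Sketch: never ask the hypervisor to stuff more than 512 TCEs per hcall */
#define H_STUFF_TCE_MAX	512

static void tce_free_range(struct iommu_table *tbl, long tcenum, long npages)
{
	long rc = 0;

	while (npages > 0 && rc == 0) {
		long limit = min_t(long, npages, H_STUFF_TCE_MAX);

		/* plpar_tce_stuff() wraps the H_STUFF_TCE hcall */
		rc = plpar_tce_stuff((u64)tbl->it_index,
				     (u64)tcenum << tbl->it_page_shift,
				     0, limit);
		tcenum += limit;
		npages -= limit;
	}
}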

cheers


Re: Fwd: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Michael Ellerman
Bagas Sanjaya  writes:
> On 5/24/23 17:58, Bagas Sanjaya wrote:
>> Anyway, I'm adding it to regzbot:
>> 
>> #regzbot introduced: 23baf831a32c04f 
>> https://bugzilla.kernel.org/show_bug.cgi?id=217477
>> #regzbot title: Allocator MAX_ORDER exceeds SECTION_SIZE caused by MAX_ORDER 
>> redefinition
>> 
>
> From bugzilla [1], the reporter had successfully tried the proposed
> kernel config fix, so:
>
> #regzbot resolve: reducing CONFIG_ARCH_FORCE_MAX_ORDER to 8 resolves the 
> regression

Should be fixed properly by:

https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20230519113806.370635-1-...@ellerman.id.au/

Which is in powerpc-fixes as 358e526a1648.

cheers


Re: [PATCH] mm/slab: rename CONFIG_SLAB to CONFIG_SLAB_DEPRECATED

2023-05-24 Thread Vlastimil Babka
On 5/24/23 02:29, David Rientjes wrote:
> On Tue, 23 May 2023, Vlastimil Babka wrote:
> 
>> As discussed at LSF/MM [1] [2] and with no objections raised there,
>> deprecate the SLAB allocator. Rename the user-visible option so that
>> users with CONFIG_SLAB=y get a new prompt with explanation during make
>> oldconfig, while make olddefconfig will just switch to SLUB.
>> 
>> In all defconfigs with CONFIG_SLAB=y remove the line so those also
>> switch to SLUB. Regressions due to the switch should be reported to
>> linux-mm and slab maintainers.
>> 
>> [1] https://lore.kernel.org/all/4b9fc9c6-b48c-198f-5f80-811a44737...@suse.cz/
>> [2] https://lwn.net/Articles/932201/
>> 
>> Signed-off-by: Vlastimil Babka 
> 
> Acked-by: David Rientjes 

Thanks.

> The Kconfig option says that SLAB will be removed in a few cycles.  I 
> think we should wait until at least the next LTS kernel is forked at the 
> end of the year so that users who upgrade to only the LTS releases can be 
> prompted for this change and surface any concerns.  Slab allocation is a 
> critical subsystem, so I presume this is the safest and most responsible 
> way to do the SLAB deprecation.  Hopefully that timeline works for 
> everybody.

Sure, and in fact looking at predicted release dates [1], if the deprecation
goes into 6.5 then 6.7 ("few" == 2) is already end of January 2024, anyway.

[1] https://hansen.beer/~dave/phb/


Re: [PATCH v5 13/18] watchdog/hardlockup: Have the perf hardlockup use __weak functions more cleanly

2023-05-24 Thread Petr Mladek
On Fri 2023-05-19 10:18:37, Douglas Anderson wrote:
> The fact that there watchdog_hardlockup_enable(),
> watchdog_hardlockup_disable(), and watchdog_hardlockup_probe() are
> declared __weak means that the configured hardlockup detector can
> define non-weak versions of those functions if it needs to. Instead of
> doing this, the perf hardlockup detector hooked itself into the
> default __weak implementation, which was a bit awkward. Clean this up.
> 
> From comments, it looks as if the original design was done because the
> __weak function were expected to implemented by the architecture and
> not by the configured hardlockup detector. This got awkward when we
> tried to add the buddy lockup detector which was not arch-specific but
> wanted to hook into those same functions.
> 
> This is not expected to have any functional impact.
>
> @@ -187,27 +187,33 @@ static inline void watchdog_hardlockup_kick(void) { }
>  #endif /* !CONFIG_HARDLOCKUP_DETECTOR_PERF */
>  
>  /*
> - * These functions can be overridden if an architecture implements its
> - * own hardlockup detector.
> + * These functions can be overridden based on the configured hardlockup 
> detector.
>   *
>   * watchdog_hardlockup_enable/disable can be implemented to start and stop 
> when
> - * softlockup watchdog start and stop. The arch must select the
> + * softlockup watchdog start and stop. The detector must select the
>   * SOFTLOCKUP_DETECTOR Kconfig.
>   */
> -void __weak watchdog_hardlockup_enable(unsigned int cpu)
> -{
> - hardlockup_detector_perf_enable();
> -}
> +void __weak watchdog_hardlockup_enable(unsigned int cpu) { }
>  
> -void __weak watchdog_hardlockup_disable(unsigned int cpu)
> -{
> - hardlockup_detector_perf_disable();
> -}
> +void __weak watchdog_hardlockup_disable(unsigned int cpu) { }
>  
>  /* Return 0, if a hardlockup watchdog is available. Error code otherwise */
>  int __weak __init watchdog_hardlockup_probe(void)
>  {
> - return hardlockup_detector_perf_init();
> + /*
> +  * If CONFIG_HAVE_NMI_WATCHDOG is defined then an architecture
> +  * is assumed to have the hard watchdog available and we return 0.
> +  */
> + if (IS_ENABLED(CONFIG_HAVE_NMI_WATCHDOG))
> + return 0;
> +
> + /*
> +  * Hardlockup detectors other than those using CONFIG_HAVE_NMI_WATCHDOG
> +  * are required to implement a non-weak version of this probe function
> +  * to tell whether they are available. If they don't override then
> +  * we'll return -ENODEV.
> +  */
> + return -ENODEV;
>  }

When thinking more about it, it is weird that we need to handle
CONFIG_HAVE_NMI_WATCHDOG in this default weak function.

It should be handled in watchdog_hardlockup_probe() implemented
in kernel/watchdog_perf.c.

IMHO, the default __weak function could always return -ENODEV;

Would it make sense, please?
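
For illustration, a minimal sketch of that idea (untested):

/* kernel/watchdog.c: the default means "no hardlockup detector available" */
int __weak __init watchdog_hardlockup_probe(void)
{
	return -ENODEV;
}

The perf detector (and the CONFIG_HAVE_NMI_WATCHDOG architectures, if they
still need special casing) would then provide a non-weak override in their
own code and report there whether the detector can actually be used.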

Best Regards,
Petr


Re: Fwd: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Bagas Sanjaya
On 5/24/23 20:11, Doru Iorgulescu wrote:
> Glad to hear it! 
> Thank you

tl;dr:

> A: http://en.wikipedia.org/wiki/Top_post
> Q: Were do I find info about this thing called top-posting?
> A: Because it messes up the order in which people normally read text.
> Q: Why is top-posting such a bad thing?
> A: Top-posting.
> Q: What is the most annoying thing in e-mail?
> 
> A: No.
> Q: Should I include quotations after my reply?
> 
> http://daringfireball.net/2007/07/on_top

(IOW, I have no idea what you're referring to. Please stop top-posting;
reply inline with appropriate context instead.)

Also, don't send HTML emails as many mailing lists (including LKML)
don't like it.

Thanks.

-- 
An old man doll... just what I always wanted! - Clara



Re: [PATCH v5 12/18] watchdog/hardlockup: Rename some "NMI watchdog" constants/function

2023-05-24 Thread Petr Mladek
On Fri 2023-05-19 10:18:36, Douglas Anderson wrote:
> Do a search and replace of:
> - NMI_WATCHDOG_ENABLED => WATCHDOG_HARDLOCKUP_ENABLED
> - SOFT_WATCHDOG_ENABLED => WATCHDOG_SOFTOCKUP_ENABLED
> - watchdog_nmi_ => watchdog_hardlockup_
> - nmi_watchdog_available => watchdog_hardlockup_available
> - nmi_watchdog_user_enabled => watchdog_hardlockup_user_enabled
> - soft_watchdog_user_enabled => watchdog_softlockup_user_enabled
> - NMI_WATCHDOG_DEFAULT => WATCHDOG_HARDLOCKUP_DEFAULT
> 
> Then update a few comments near where names were changed.
> 
> This is specifically to make it less confusing when we want to
> introduce the buddy hardlockup detector, which isn't using NMIs. As
> part of this, we sanitized a few names for consistency.
> 
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -30,17 +30,17 @@
>  static DEFINE_MUTEX(watchdog_mutex);
>  
>  #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
> -# define NMI_WATCHDOG_DEFAULT1
> +# define WATCHDOG_HARDLOCKUP_DEFAULT 1
>  #else
> -# define NMI_WATCHDOG_DEFAULT0
> +# define WATCHDOG_HARDLOCKUP_DEFAULT 0
>  #endif
>  
>  unsigned long __read_mostly watchdog_enabled;
>  int __read_mostly watchdog_user_enabled = 1;
> -int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
> -int __read_mostly soft_watchdog_user_enabled = 1;
> +int __read_mostly watchdog_hardlockup_user_enabled = 
> WATCHDOG_HARDLOCKUP_DEFAULT;
> +int __read_mostly watchdog_softlockup_user_enabled = 1;

I still see nmi_watchdog_user_enabled and soft_watchdog_user_enabled
in include/linux/nmi.h. They are declared there and also mentioned
in a comment.

It seems that they do not actually need to be declared there.
I guess that it was needed for the /proc stuff. But it was
moved into kernel/watchdog.c by commit dd0693fdf054f2ed37
("watchdog: move watchdog sysctl interface to watchdog.c").

>  int __read_mostly watchdog_thresh = 10;
> -static int __read_mostly nmi_watchdog_available;
> +static int __read_mostly watchdog_hardlockup_available;
>  
>  struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);

Otherwise, I like the changes.

With the following:

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 83076bf70ce8..d14fe345eae9 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,8 +17,6 @@ void lockup_detector_soft_poweroff(void);
 void lockup_detector_cleanup(void);
 
 extern int watchdog_user_enabled;
-extern int nmi_watchdog_user_enabled;
-extern int soft_watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 
@@ -68,8 +66,8 @@ static inline void reset_hung_task_detector(void) { }
  * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
  * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
  *
- * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and
- * 'soft_watchdog_user_enabled' are variables that are only used as an
+ * 'watchdog_user_enabled', 'watchdog_hardlockup_user_enabled' and
+ * 'watchdog_softlockup_user_enabled' are variables that are only used as an
  * 'interface' between the parameters in /proc/sys/kernel and the internal
  * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is
  * handled differently because its value is not boolean, and the lockup

Reviewed-by: Petr Mladek 

Even better might be to remove the unused declaration in a separate
patch. I think that more declarations are not needed after moving
the /proc stuff into kernel/watchdog.c.

But it might also be fixed later.

Best Regards,
Petr


Re: [PATCH v5 11/18] watchdog/hardlockup: Move perf hardlockup watchdog petting to watchdog.c

2023-05-24 Thread Petr Mladek
On Fri 2023-05-19 10:18:35, Douglas Anderson wrote:
> In preparation for the buddy hardlockup detector, which wants the same
> petting logic as the current perf hardlockup detector, move the code
> to watchdog.c. While doing this, rename the global variable to match
> others nearby. As part of this change we have to change the code to
> account for the fact that the CPU we're running on might be different
> than the one we're checking.
> 
> Currently the code in watchdog.c is guarded by
> CONFIG_HARDLOCKUP_DETECTOR_PERF, which makes this change seem
> silly. However, a future patch will change this.
> 
> Signed-off-by: Douglas Anderson 

Reviewed-by: Petr Mladek 

Best Regards,
Petr


Re: Fwd: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Bagas Sanjaya
On 5/24/23 17:58, Bagas Sanjaya wrote:
> Anyway, I'm adding it to regzbot:
> 
> #regzbot introduced: 23baf831a32c04f 
> https://bugzilla.kernel.org/show_bug.cgi?id=217477
> #regzbot title: Allocator MAX_ORDER exceeds SECTION_SIZE caused by MAX_ORDER 
> redefinition
> 

From bugzilla [1], the reporter had successfully tried the proposed
kernel config fix, so:

#regzbot resolve: reducing CONFIG_ARCH_FORCE_MAX_ORDER to 8 resolves the 
regression

Thanks for all who participates in this regression report!

[1]: https://bugzilla.kernel.org/show_bug.cgi?id=217477#c8

-- 
An old man doll... just what I always wanted! - Clara



Re: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Zi Yan
On 24 May 2023, at 6:58, Bagas Sanjaya wrote:

> Hi,
>
> I notice a powerpc[64?] build regression on Bugzilla [1]. Quoting from it:
>
>>  CC  arch/powerpc/kernel/asm-offsets.s
>> In file included from ./include/linux/gfp.h:7,
>> from ./include/linux/xarray.h:15,
>> from ./include/linux/list_lru.h:14,
>> from ./include/linux/fs.h:13,
>> from ./include/linux/compat.h:17,
>> from arch/powerpc/kernel/asm-offsets.c:12:
>> ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds 
>> SECTION_SIZE
>> 1735 | #error Allocator MAX_ORDER exceeds SECTION_SIZE
>>  |  ^
>> make[5]: *** [scripts/Makefile.build:114: arch/powerpc/kernel/asm-offsets.s] 
>> Error 1

By checking the config file from the bugzilla, ARCH_FORCE_MAX_ORDER is set to 9,
(SECTION_SIZE is 24 and 64KB page is used, so 9+16=25>24) but it should be 8
after recent MAX_ORDER changes. I guess the user was using an old config file.

Changing ARCH_FORCE_MAX_ORDER to 8 in the config should fix the issue.
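
That is, the one-line change in the reported .config is:

    CONFIG_ARCH_FORCE_MAX_ORDER=8

which with 64KB pages gives 8+16=24, no longer exceeding SECTION_SIZE (24).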

>
> Apparently removing the errored line solves the problem for the reporter
> (the attached dmesg on [2] looks fine at a glance).
>
> Anyway, I'm adding it to regzbot:
>
> #regzbot introduced: 23baf831a32c04f 
> https://bugzilla.kernel.org/show_bug.cgi?id=217477
> #regzbot title: Allocator MAX_ORDER exceeds SECTION_SIZE caused by MAX_ORDER 
> redefinition
>
> Thanks.
>
> [1]: https://bugzilla.kernel.org/show_bug.cgi?id=217477
> [2]: https://bugzilla.kernel.org/show_bug.cgi?id=217477#c1
>
> -- 
> An old man doll... just what I always wanted! - Clara

--
Best Regards,
Yan, Zi




Re: [PATCH v5 10/18] watchdog/hardlockup: Add a "cpu" param to watchdog_hardlockup_check()

2023-05-24 Thread Petr Mladek
On Tue 2023-05-23 09:34:37, Doug Anderson wrote:
> Hi,
> 
> On Tue, May 23, 2023 at 9:02 AM Petr Mladek  wrote:
> >
> > On Fri 2023-05-19 10:18:34, Douglas Anderson wrote:
> > > In preparation for the buddy hardlockup detector where the CPU
> > > checking for lockup might not be the currently running CPU, add a
> > > "cpu" parameter to watchdog_hardlockup_check().
> > >
> > > As part of this change, make hrtimer_interrupts an atomic_t since now
> > > the CPU incrementing the value and the CPU reading the value might be
> > > different. Technially this could also be done with just READ_ONCE and
> > > WRITE_ONCE, but atomic_t feels a little cleaner in this case.
> > >
> > > While hrtimer_interrupts is made atomic_t, we change
> > > hrtimer_interrupts_saved from "unsigned long" to "int". The "int" is
> > > needed to match the data type backing atomic_t for hrtimer_interrupts.
> > > Even if this changes us from 64-bits to 32-bits (which I don't think
> > > is true for most compilers), it doesn't really matter. All we ever do
> > > is increment it every few seconds and compare it to an old value so
> > > 32-bits is fine (even 16-bits would be). The "signed" vs "unsigned"
> > > also doesn't matter for simple equality comparisons.
> > >
> > > hrtimer_interrupts_saved is _not_ switched to atomic_t nor even
> > > accessed with READ_ONCE / WRITE_ONCE. The hrtimer_interrupts_saved is
> > > always consistently accessed with the same CPU. NOTE: with the
> > > upcoming "buddy" detector there is one special case. When a CPU goes
> > > offline/online then we can change which CPU is the one to consistently
> > > access a given instance of hrtimer_interrupts_saved. We still can't
> > > end up with a partially updated hrtimer_interrupts_saved, however,
> > > because we end up petting all affected CPUs to make sure the new and
> > > old CPU can't end up somehow read/write hrtimer_interrupts_saved at
> > > the same time.
> > >
> > > --- a/kernel/watchdog.c
> > > +++ b/kernel/watchdog.c
> > > @@ -87,29 +87,34 @@ __setup("nmi_watchdog=", hardlockup_panic_setup);
> > >
> > >  #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
> > >
> > > -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
> > > -static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
> > > +static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts);
> > > +static DEFINE_PER_CPU(int, hrtimer_interrupts_saved);
> > >  static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned);
> > >  static unsigned long watchdog_hardlockup_all_cpu_dumped;
> > >
> > > -static bool is_hardlockup(void)
> > > +static bool is_hardlockup(unsigned int cpu)
> > >  {
> > > - unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
> > > + int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu));
> > >
> > > - if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
> > > + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
> > >   return true;
> > >
> > > - __this_cpu_write(hrtimer_interrupts_saved, hrint);
> > > + /*
> > > +  * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE
> > > +  * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is
> > > +  * written/read by a single CPU.
> > > +  */
> > > + per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
> > >
> > >   return false;
> > >  }
> > >
> > >  static void watchdog_hardlockup_kick(void)
> > >  {
> > > - __this_cpu_inc(hrtimer_interrupts);
> > > + atomic_inc(raw_cpu_ptr(&hrtimer_interrupts));
> >
> > Is there any particular reason why raw_*() is needed, please?
> >
> > My expectation is that the raw_ API should be used only when
> > there is a good reason for it. Where a good reason might be
> > when the checks might fail but the consistency is guaranteed
> > another way.
> >
> > IMHO, we should use:
> >
> > atomic_inc(this_cpu_ptr(&hrtimer_interrupts));
> >
> > To be honest, I am a bit lost in the per_cpu API definitions.
> >
> > But this_cpu_ptr() seems to be implemented the same way as
> > per_cpu_ptr() when CONFIG_DEBUG_PREEMPT is enabled.
> > And we use per_cpu_ptr() in is_hardlockup().
> >
> > Also this_cpu_ptr() is used more commonly:
> >
> > $> git grep this_cpu_ptr | wc -l
> > 1385
> > $> git grep raw_cpu_ptr | wc -l
> > 114
> 
> Hmmm, I think maybe I confused myself. The old code purposely used the
> double-underscore prefixed version of this_cpu_inc(). I couldn't find
> a double-underscore version of this_cpu_ptr() and I somehow convinced
> myself that the raw() version was the right equivalent version.
> 
> You're right that this_cpu_ptr() is a better choice here and I don't
> see any reason why we'd need the raw version.

I was confused too. Honestly, it looks a bit messy to me.

My understanding is that this_cpu*() API has the following semantic:

+ this_cpu_*() actively disables interrupts/preemption

+ __this_cpu_*() just warns when the task could migrate
between CPUs.

+ raw_cpu_*() can be used in preemptible context

Fwd: ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE (v6.4-rc3 build regression)

2023-05-24 Thread Bagas Sanjaya
Hi,

I notice a powerpc[64?] build regression on Bugzilla [1]. Quoting from it:

>  CC  arch/powerpc/kernel/asm-offsets.s
> In file included from ./include/linux/gfp.h:7,
> from ./include/linux/xarray.h:15,
> from ./include/linux/list_lru.h:14,
> from ./include/linux/fs.h:13,
> from ./include/linux/compat.h:17,
> from arch/powerpc/kernel/asm-offsets.c:12:
> ./include/linux/mmzone.h:1735:2: error: #error Allocator MAX_ORDER exceeds 
> SECTION_SIZE
> 1735 | #error Allocator MAX_ORDER exceeds SECTION_SIZE
>  |  ^
> make[5]: *** [scripts/Makefile.build:114: arch/powerpc/kernel/asm-offsets.s] 
> Error 1

Apparently removing the errored line solves the problem for the reporter
(the attached dmesg on [2] looks fine at a glance).

Anyway, I'm adding it to regzbot:

#regzbot introduced: 23baf831a32c04f 
https://bugzilla.kernel.org/show_bug.cgi?id=217477
#regzbot title: Allocator MAX_ORDER exceeds SECTION_SIZE caused by MAX_ORDER 
redefinition

Thanks.

[1]: https://bugzilla.kernel.org/show_bug.cgi?id=217477
[2]: https://bugzilla.kernel.org/show_bug.cgi?id=217477#c1

-- 
An old man doll... just what I always wanted! - Clara


Re: [PATCH] powerpc/crypto: fix build warnings when DEBUG_FS is not enabled

2023-05-24 Thread Herbert Xu
On Fri, May 19, 2023 at 03:33:34PM -0700, Randy Dunlap wrote:
> Fix build warnings when DEBUG_FS is not enabled by using an empty
> do-while loop instead of a value:
> 
> In file included from ../drivers/crypto/nx/nx.c:27:
> ../drivers/crypto/nx/nx.c: In function 'nx_register_algs':
> ../drivers/crypto/nx/nx.h:173:33: warning: statement with no effect 
> [-Wunused-value]
>   173 | #define NX_DEBUGFS_INIT(drv)(0)
> ../drivers/crypto/nx/nx.c:573:9: note: in expansion of macro 'NX_DEBUGFS_INIT'
>   573 | NX_DEBUGFS_INIT(_driver);
> ../drivers/crypto/nx/nx.c: In function 'nx_remove':
> ../drivers/crypto/nx/nx.h:174:33: warning: statement with no effect 
> [-Wunused-value]
>   174 | #define NX_DEBUGFS_FINI(drv)(0)
> ../drivers/crypto/nx/nx.c:793:17: note: in expansion of macro 
> 'NX_DEBUGFS_FINI'
>   793 | NX_DEBUGFS_FINI(_driver);
> 
> Also, there is no need to build nx_debugfs.o when DEBUG_FS is not
> enabled, so change the Makefile to accommodate that.
> 
> Fixes: ae0222b7289d ("powerpc/crypto: nx driver code supporting nx 
> encryption")
> Fixes: aef7b31c8833 ("powerpc/crypto: Build files for the nx device driver")
> Signed-off-by: Randy Dunlap 
> Cc: Breno Leitão 
> Cc: Nayna Jain 
> Cc: Paulo Flabiano Smorigo 
> Cc: Herbert Xu 
> Cc: "David S. Miller" 
> Cc: linux-cry...@vger.kernel.org
> Cc: Michael Ellerman 
> Cc: Nicholas Piggin 
> Cc: Christophe Leroy 
> Cc: linuxppc-dev@lists.ozlabs.org
> ---
>  drivers/crypto/nx/Makefile |2 +-
>  drivers/crypto/nx/nx.h |4 ++--
>  2 files changed, 3 insertions(+), 3 deletions(-)
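
(For reference, the usual idiom for such no-op stubs is an empty do-while so
they still parse as statements, e.g.:

#define NX_DEBUGFS_INIT(drv)	do {} while (0)
#define NX_DEBUGFS_FINI(drv)	do {} while (0)

a sketch of the pattern described above, not necessarily the exact hunk
applied.)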

Patch applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: linux-next: boot failure after merge of the tip tree

2023-05-24 Thread Peter Zijlstra
On Wed, May 24, 2023 at 03:44:59PM +1000, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the tip tree, today's linux-next build (powerpc
> pseries_le_defconfig) failed to boot like this:
> 
>  Built 1 zonelists, mobility grouping on.  Total pages: 32736
>  Policy zone: Normal
>  mem auto-init: stack:all(zero), heap alloc:off, heap free:off
>  Memory: 2027392K/2097152K available (17984K kernel code, 3328K rwdata, 
> 14784K rodata, 6080K init, 1671K bss, 69760K reserved, 0K cma-reserved)
> 
> *crickets*
> 
> Bisected to commit
> 
>   f4ab23558310 ("slub: Replace cmpxchg_double()")
> 
> I have reverted that commit (and the following one) for today.

Sorry about that; turns out I forgot to test the case where cmpxchg128
wasn't available.

Please see:

  
https://lkml.kernel.org/r/20230524093246.gp83...@hirez.programming.kicks-ass.net

As stated there, I'm going to zap tip/locking/core for a few days and
let this fixed version run through the robots again before re-instating
it.


[PATCH 4/4] powerpc/64s/radix: combine final TLB flush and lazy tlb mm shootdown IPIs

2023-05-24 Thread Nicholas Piggin
This performs lazy tlb mm shootdown when doing the exit TLB flush when
all mm users go away and user mappings are removed, which avoids having
to do the lazy tlb mm shootdown IPIs on the final mmput when all kernel
references disappear.

powerpc/64s uses a broadcast TLBIE for the exit TLB flush if remote CPUs
need to be invalidated (unless TLBIE is disabled), so this doesn't
necessarily save IPIs but it does avoid a broadcast TLBIE which is quite
expensive.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index 8160c1630c3d..e2aaee6df1f6 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -1301,7 +1301,31 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 * See the comment for radix in arch_exit_mmap().
 */
if (tlb->fullmm) {
-   __flush_all_mm(mm, true);
+   if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+   /*
+* Shootdown based lazy tlb mm refcounting means we
+* have to IPI everyone in the mm_cpumask anyway soon
+* when the mm goes away, so might as well do it as
+* part of the final flush now.
+*
+* If lazy shootdown was improved to reduce IPIs (e.g.,
+* by batching), then it may end up being better to use
+* tlbies here instead.
+*/
+   smp_mb(); /* see radix__flush_tlb_mm */
+   exit_flush_lazy_tlbs(mm);
+   _tlbiel_pid(mm->context.id, RIC_FLUSH_ALL);
+
+   /*
+* It should not be possible to have coprocessors still
+* attached here.
+*/
+   if (WARN_ON_ONCE(atomic_read(&mm->context.copros) > 0))
+   __flush_all_mm(mm, true);
+   } else {
+   __flush_all_mm(mm, true);
+   }
+
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->freed_tables)
radix__flush_tlb_mm(mm);
-- 
2.40.1



[PATCH 3/4] powerpc: Add mm_cpumask warning when context switching

2023-05-24 Thread Nicholas Piggin
When context switching away from an mm, add a CONFIG_DEBUG_VM warning
check to ensure this CPU is still set in the mask. This could catch
bugs where the mask is improperly trimmed while the CPU is still using
the mm.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/mmu_context.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 894468975a44..b24c19078eb1 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -43,12 +43,13 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
 {
+   int cpu = smp_processor_id();
bool new_on_cpu = false;
 
/* Mark this context has been used on the new CPU */
-   if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
+   if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
VM_WARN_ON_ONCE(next == &init_mm);
-   cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+   cpumask_set_cpu(cpu, mm_cpumask(next));
inc_mm_active_cpus(next);
 
/*
@@ -101,6 +102,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 * sub architectures. Out of line for now
 */
switch_mmu_context(prev, next, tsk);
+
+   VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(prev)));
 }
 
 #ifndef CONFIG_PPC_BOOK3S_64
-- 
2.40.1



[PATCH 2/4] powerpc/64s: Use dec_mm_active_cpus helper

2023-05-24 Thread Nicholas Piggin
Avoid open-coded atomic_dec on mm->context.active_cpus and use the
function made for it. Add CONFIG_DEBUG_VM underflow checking on the
counter.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 2 +-
 arch/powerpc/include/asm/mmu_context.h   | 1 +
 arch/powerpc/mm/book3s64/radix_tlb.c | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 570a4960cf17..5cf0e9c953b3 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -261,7 +261,7 @@ static inline void radix_init_pseries(void) { }
 #define arch_clear_mm_cpumask_cpu(cpu, mm) \
do {\
if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {\
-   atomic_dec(&(mm)->context.active_cpus); \
+   dec_mm_active_cpus(mm); \
cpumask_clear_cpu(cpu, mm_cpumask(mm)); \
}   \
} while (0)
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 57f5017111f4..37bffa0f7918 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -127,6 +127,7 @@ static inline void inc_mm_active_cpus(struct mm_struct *mm)
 
 static inline void dec_mm_active_cpus(struct mm_struct *mm)
 {
+   VM_WARN_ON_ONCE(atomic_read(&mm->context.active_cpus) <= 0);
atomic_dec(&mm->context.active_cpus);
 }
 
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index 90953cf9f648..8160c1630c3d 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -808,7 +808,7 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool 
always_flush)
 * that's what the caller expects.
 */
if (cpumask_test_cpu(cpu, mm_cpumask(mm))) {
-   atomic_dec(&mm->context.active_cpus);
+   dec_mm_active_cpus(mm);
cpumask_clear_cpu(cpu, mm_cpumask(mm));
always_flush = true;
}
-- 
2.40.1



[PATCH 1/4] powerpc: Account mm_cpumask and active_cpus in init_mm

2023-05-24 Thread Nicholas Piggin
init_mm mm_cpumask and context.active_cpus are not maintained at boot
and hotplug. This seems to be harmless because init_mm does not have a
userspace and so never gets user TLBs flushed, but it looks odd and it
prevents some sanity checks being added.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/setup-common.c |  6 +-
 arch/powerpc/kernel/smp.c  | 12 
 arch/powerpc/mm/mmu_context.c  |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index d2a446216444..16843294d978 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -969,8 +969,12 @@ void __init setup_arch(char **cmdline_p)
klp_init_thread_info(_task);
 
setup_initial_init_mm(_stext, _etext, _edata, _end);
-
+   /* sched_init() does the mmgrab(&init_mm) for the primary CPU */
+   VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm)));
+   cpumask_set_cpu(smp_processor_id(), mm_cpumask(&init_mm));
+   inc_mm_active_cpus(&init_mm);
mm_iommu_init(&init_mm);
+
irqstack_early_init();
exc_lvl_early_init();
emergency_stack_init();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 265801a3e94c..76a57dc753c8 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1615,6 +1616,9 @@ void start_secondary(void *unused)
 
mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
+   VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm)));
+   cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
+   inc_mm_active_cpus(&init_mm);
 
smp_store_cpu_info(cpu);
set_dec(tb_ticks_per_jiffy);
@@ -1750,6 +1754,14 @@ int __cpu_disable(void)
 
 void __cpu_die(unsigned int cpu)
 {
+   /*
+* This could perhaps be a generic call in idlea_task_dead(), but
+* that requires testing from all archs, so first put it here to
+*/
+   VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(&init_mm)));
+   dec_mm_active_cpus(&init_mm);
+   cpumask_clear_cpu(cpu, mm_cpumask(&init_mm));
+
if (smp_ops->cpu_die)
smp_ops->cpu_die(cpu);
 }
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 1fb9c99f8679..894468975a44 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -47,6 +47,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 
/* Mark this context has been used on the new CPU */
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
+   VM_WARN_ON_ONCE(next == &init_mm);
cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
inc_mm_active_cpus(next);
 
-- 
2.40.1



[PATCH 0/4] powerpc: mm_cpumask cleanups and lazy tlb mm

2023-05-24 Thread Nicholas Piggin
In the process of doing patch 4, I found a few things we could improve
and tighten up with mm_cpumask handling, so added those first. They're
mostly just debugging, no real fixes or dependency on patch 4 there.

Thanks,
Nick

Nicholas Piggin (4):
  powerpc: Account mm_cpumask and active_cpus in init_mm
  powerpc/64s: Use dec_mm_active_cpus helper
  powerpc: Add mm_cpumask warning when context switching
  powerpc/64s/radix: combine final TLB flush and lazy tlb mm shootdown
IPIs

 arch/powerpc/include/asm/book3s/64/mmu.h |  2 +-
 arch/powerpc/include/asm/mmu_context.h   |  1 +
 arch/powerpc/kernel/setup-common.c   |  6 -
 arch/powerpc/kernel/smp.c| 12 ++
 arch/powerpc/mm/book3s64/radix_tlb.c | 28 ++--
 arch/powerpc/mm/mmu_context.c|  8 +--
 6 files changed, 51 insertions(+), 6 deletions(-)

-- 
2.40.1



[PATCH 2/2] lazy tlb: consolidate lazy tlb mm switching

2023-05-24 Thread Nicholas Piggin
Switching a kernel thread using a "lazy tlb mm" to init_mm is a
relatively common sequence that is not quite trivial. Consolidate this
into a function.

This fixes a bug in do_shoot_lazy_tlb() for any arch that implements
finish_arch_post_lock_switch(). None select MMU_LAZY_TLB_SHOOTDOWN at
the moment.

Fixes: 2655421ae69fa ("lazy tlb: shoot lazies, non-refcounting lazy tlb mm 
reference handling scheme")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/book3s64/radix_tlb.c |  6 +
 include/linux/sched/task.h   |  2 ++
 kernel/fork.c|  7 ++
 kernel/sched/core.c  | 34 
 4 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index ce804b7bf84e..90953cf9f648 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -795,12 +795,8 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool 
always_flush)
goto out;
 
if (current->active_mm == mm) {
-   WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
-   mmgrab_lazy_tlb(&init_mm);
-   current->active_mm = &init_mm;
-   switch_mm_irqs_off(mm, &init_mm, current);
-   mmdrop_lazy_tlb(mm);
+   kthread_end_lazy_tlb_mm();
}
 
/*
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 537cbf9a2ade..23693b94a09b 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -61,6 +61,8 @@ extern int lockdep_tasklist_lock_is_held(void);
 extern asmlinkage void schedule_tail(struct task_struct *prev);
 extern void init_idle(struct task_struct *idle, int cpu);
 
+extern void kthread_end_lazy_tlb_mm(void);
+
 extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
 extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args 
*kargs);
 extern void sched_post_fork(struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..8b005c2c7c3c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -854,11 +854,8 @@ static void do_shoot_lazy_tlb(void *arg)
 {
struct mm_struct *mm = arg;
 
-   if (current->active_mm == mm) {
-   WARN_ON_ONCE(current->mm);
-   current->active_mm = &init_mm;
-   switch_mm(mm, &init_mm, current);
-   }
+   if (current->active_mm == mm)
+   kthread_end_lazy_tlb_mm();
 }
 
 static void cleanup_lazy_tlbs(struct mm_struct *mm)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc4ef1f3394b..71706df22b41 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5346,6 +5346,29 @@ context_switch(struct rq *rq, struct task_struct *prev,
return finish_task_switch(prev);
 }
 
+/*
+ * If this kthread has a user process's mm for its active_mm (aka lazy tlb mm)
+ * then switch away from it, to init_mm. Must not be called while using an
+ * mm with kthread_use_mm().
+ */
+void kthread_end_lazy_tlb_mm(void)
+{
+   struct mm_struct *mm = current->active_mm;
+
+   WARN_ON_ONCE(!irqs_disabled());
+
+   if (WARN_ON_ONCE(current->mm))
+   return; /* Not a kthread or doing kthread_use_mm */
+
+   if (mm != &init_mm) {
+   mmgrab_lazy_tlb(&init_mm);
+   current->active_mm = &init_mm;
+   switch_mm_irqs_off(mm, &init_mm, current);
+   finish_arch_post_lock_switch();
+   mmdrop_lazy_tlb(mm);
+   }
+}
+
 /*
  * nr_running and nr_context_switches:
  *
@@ -9375,17 +9398,8 @@ void sched_setnuma(struct task_struct *p, int nid)
  */
 void idle_task_prepare_exit(void)
 {
-   struct mm_struct *mm = current->active_mm;
-
WARN_ON(!irqs_disabled());
-
-   if (mm != &init_mm) {
-   mmgrab_lazy_tlb(&init_mm);
-   current->active_mm = &init_mm;
-   switch_mm_irqs_off(mm, &init_mm, current);
-   finish_arch_post_lock_switch();
-   mmdrop_lazy_tlb(mm);
-   }
+   kthread_end_lazy_tlb_mm();
/* finish_cpu() will mmdrop the init_mm ref after this CPU stops */
 }
 
-- 
2.40.1



[PATCH 1/2] lazy tlb: fix hotplug exit race with MMU_LAZY_TLB_SHOOTDOWN

2023-05-24 Thread Nicholas Piggin
CPU unplug first calls __cpu_disable(), and that's where powerpc calls
cleanup_cpu_mmu_context(), which clears this CPU from mm_cpumask() of
all mms in the system.

However this CPU may still be using a lazy tlb mm, and its mm_cpumask
bit will be cleared from it. The CPU does not switch away from the lazy
tlb mm until arch_cpu_idle_dead() calls idle_task_exit().

If that user mm exits in this window, it will not be subject to the lazy
tlb mm shootdown and may be freed while in use as a lazy mm by the CPU
that is being unplugged.

cleanup_cpu_mmu_context() could be moved later, but it looks better to
move the lazy tlb mm switching earlier. The problem with doing the lazy
mm switching in idle_task_exit() is explained in commit bf2c59fce4074
("sched/core: Fix illegal RCU from offline CPUs"), which added a wart to
switch away from the mm but leave it set in active_mm to be cleaned up
later.

So instead, switch away from the lazy tlb mm on the stopper kthread
before the CPU is taken down. This CPU will never switch to a user
thread from this point, so it has no chance to pick up a new lazy tlb
mm. This removes the lazy tlb mm handling wart in CPU unplug.

idle_task_exit() remains to reduce churn in the patch. It could be
removed entirely after this because finish_cpu() makes a similar check.
finish_cpu() itself is not strictly needed because init_mm will never
have its refcount drop to zero.  But it is conceptually nicer to keep it
rather than have the idle thread drop the reference on the mm it is
using.

Fixes: 2655421ae69fa ("lazy tlb: shoot lazies, non-refcounting lazy tlb mm 
reference handling scheme")
Signed-off-by: Nicholas Piggin 
---
 include/linux/sched/hotplug.h |  2 ++
 kernel/cpu.c  | 11 +++
 kernel/sched/core.c   | 24 +++-
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index 412cdaba33eb..cb447d8e3f9a 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -19,8 +19,10 @@ extern int sched_cpu_dying(unsigned int cpu);
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern void idle_task_prepare_exit(void);
 extern void idle_task_exit(void);
 #else
+static inline void idle_task_prepare_exit(void) {}
 static inline void idle_task_exit(void) {}
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f4a2c5845bcb..584def27ff24 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -618,12 +618,13 @@ static int finish_cpu(unsigned int cpu)
struct mm_struct *mm = idle->active_mm;
 
/*
-* idle_task_exit() will have switched to &init_mm, now
-* clean up any remaining active_mm state.
+* idle_task_prepare_exit() ensured the idle task was using
+* &init_mm. Now that the CPU has stopped, drop that refcount.
*/
-   if (mm != &init_mm)
-   idle->active_mm = &init_mm;
+   WARN_ON(mm != &init_mm);
+   idle->active_mm = NULL;
mmdrop_lazy_tlb(mm);
+
return 0;
 }
 
@@ -1030,6 +1031,8 @@ static int take_cpu_down(void *_param)
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
 
+   idle_task_prepare_exit();
+
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..bc4ef1f3394b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9373,19 +9373,33 @@ void sched_setnuma(struct task_struct *p, int nid)
  * Ensure that the idle task is using init_mm right before its CPU goes
  * offline.
  */
-void idle_task_exit(void)
+void idle_task_prepare_exit(void)
 {
struct mm_struct *mm = current->active_mm;
 
-   BUG_ON(cpu_online(smp_processor_id()));
-   BUG_ON(current != this_rq()->idle);
+   WARN_ON(!irqs_disabled());
 
if (mm != &init_mm) {
-   switch_mm(mm, &init_mm, current);
+   mmgrab_lazy_tlb(&init_mm);
+   current->active_mm = &init_mm;
+   switch_mm_irqs_off(mm, &init_mm, current);
finish_arch_post_lock_switch();
+   mmdrop_lazy_tlb(mm);
}
+   /* finish_cpu() will mmdrop the init_mm ref after this CPU stops */
+}
+
+/*
+ * After the CPU is offline, double check that it was previously switched to
+ * init_mm. This call can be removed because the condition is caught in
+ * finish_cpu() as well.
+ */
+void idle_task_exit(void)
+{
+   BUG_ON(cpu_online(smp_processor_id()));
+   BUG_ON(current != this_rq()->idle);
 
-   /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
+   WARN_ON_ONCE(current->active_mm != &init_mm);
 }
 
 static int __balance_push_cpu_stop(void *arg)
-- 
2.40.1