[PATCH] Add static_key_feature_checks_initialized flag

2024-03-26 Thread Nicholas Miehlbradt
JUMP_LABEL_FEATURE_CHECK_DEBUG used static_key_initialized to determine
whether {cpu,mmu}_has_feature() was used before static keys were
initialized. However, {cpu,mmu}_has_feature() should not be used before
setup_feature_keys() is called. As static_key_initialized is set much
earlier during boot, there is a window in which JUMP_LABEL_FEATURE_CHECK_DEBUG
will not report errors. Add a flag specifically to indicate when
{cpu,mmu}_has_feature() is safe to use.

Signed-off-by: Nicholas Miehlbradt 
---
 arch/powerpc/include/asm/cpu_has_feature.h | 2 +-
 arch/powerpc/include/asm/feature-fixups.h  | 2 ++
 arch/powerpc/include/asm/mmu.h | 2 +-
 arch/powerpc/lib/feature-fixups.c  | 8 
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/cpu_has_feature.h 
b/arch/powerpc/include/asm/cpu_has_feature.h
index 727d4b321937..0efabccd820c 100644
--- a/arch/powerpc/include/asm/cpu_has_feature.h
+++ b/arch/powerpc/include/asm/cpu_has_feature.h
@@ -29,7 +29,7 @@ static __always_inline bool cpu_has_feature(unsigned long 
feature)
 #endif
 
 #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG
-   if (!static_key_initialized) {
+   if (!static_key_feature_checks_initialized) {
printk("Warning! cpu_has_feature() used prior to jump label 
init!\n");
dump_stack();
return early_cpu_has_feature(feature);
diff --git a/arch/powerpc/include/asm/feature-fixups.h 
b/arch/powerpc/include/asm/feature-fixups.h
index 77824bd289a3..17d168dd8b49 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -291,6 +291,8 @@ extern long __start___rfi_flush_fixup, 
__stop___rfi_flush_fixup;
 extern long __start___barrier_nospec_fixup, __stop___barrier_nospec_fixup;
 extern long __start__btb_flush_fixup, __stop__btb_flush_fixup;
 
+extern bool static_key_feature_checks_initialized;
+
 void apply_feature_fixups(void);
 void update_mmu_feature_fixups(unsigned long mask);
 void setup_feature_keys(void);
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 3b72c7ed24cf..24f830cf9bb4 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -251,7 +251,7 @@ static __always_inline bool mmu_has_feature(unsigned long 
feature)
 #endif
 
 #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG
-   if (!static_key_initialized) {
+   if (!static_key_feature_checks_initialized) {
printk("Warning! mmu_has_feature() used prior to jump label 
init!\n");
dump_stack();
return early_mmu_has_feature(feature);
diff --git a/arch/powerpc/lib/feature-fixups.c 
b/arch/powerpc/lib/feature-fixups.c
index 4f82581ca203..b7201ba50b2e 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -25,6 +25,13 @@
 #include 
 #include 
 
+/*
+ * Used to generate warnings if mmu or cpu feature check functions that
+ * use static keys are called before those keys are initialized.
+ */
+bool static_key_feature_checks_initialized __read_mostly;
+EXPORT_SYMBOL_GPL(static_key_feature_checks_initialized);
+
 struct fixup_entry {
unsigned long   mask;
unsigned long   value;
@@ -679,6 +686,7 @@ void __init setup_feature_keys(void)
jump_label_init();
cpu_feature_keys_init();
mmu_feature_keys_init();
+   static_key_feature_checks_initialized = true;
 }
 
 static int __init check_features(void)
-- 
2.40.1



Re: [PATCHv3 pci-next 1/2] PCI/AER: correctable error message as KERN_INFO

2024-03-26 Thread Ethan Zhao

On 3/27/2024 5:17 AM, Bjorn Helgaas wrote:

On Tue, Mar 26, 2024 at 09:39:54AM +0800, Ethan Zhao wrote:

On 3/25/2024 6:15 PM, Xi Ruoyao wrote:

On Mon, 2024-03-25 at 16:45 +0800, Ethan Zhao wrote:

On 3/25/2024 1:19 AM, Xi Ruoyao wrote:

On Mon, 2023-09-18 at 14:39 -0500, Bjorn Helgaas wrote:

On Mon, Sep 18, 2023 at 07:42:30PM +0800, Xi Ruoyao wrote:

...
My workstation suffers from too much correctable AER reporting as well
(related to Intel's errata "RPL013: Incorrectly Formed PCIe Packets May
Generate Correctable Errors" and/or the motherboard design, I guess).

We should rate-limit correctable error reporting so it's not
overwhelming.

At the same time, I'm *also* interested in the cause of these errors,
in case there's a Linux defect or a hardware erratum that we can work
around.  Do you have a bug report with any more details, e.g., a dmesg
log and "sudo lspci -vv" output?

Hi Bjorn,

Sorry for the *very* late reply (somehow I didn't see the reply at all
before it was removed by my cron job, and now I just savaged it from
lore.kernel.org...)

The dmesg is like:

[  882.456994] pcieport :00:1c.1: AER: Multiple Correctable error message 
received from :00:1c.1
[  882.457002] pcieport :00:1c.1: AER: found no error details for 
:00:1c.1
[  882.457003] pcieport :00:1c.1: AER: Multiple Correctable error message 
received from :06:00.0
[  883.545763] pcieport :00:1c.1: AER: Multiple Correctable error message 
received from :00:1c.1
[  883.545789] pcieport :00:1c.1: PCIe Bus Error: severity=Correctable, 
type=Physical Layer, (Receiver ID)
[  883.545790] pcieport :00:1c.1:   device [8086:7a39] error 
status/mask=0001/2000
[  883.545792] pcieport :00:1c.1:    [ 0] RxErr  (First)
[  883.545794] pcieport :00:1c.1: AER:   Error of this Agent is reported 
first
[  883.545798] r8169 :06:00.0: PCIe Bus Error: severity=Correctable, 
type=Physical Layer, (Transmitter ID)
[  883.545799] r8169 :06:00.0:   device [10ec:8125] error 
status/mask=1101/e000
[  883.545800] r8169 :06:00.0:    [ 0] RxErr  (First)
[  883.545801] r8169 :06:00.0:    [ 8] Rollover
[  883.545802] r8169 :06:00.0:    [12] Timeout
[  883.545815] pcieport :00:1c.1: AER: Correctable error message received 
from :00:1c.1
[  883.545823] pcieport :00:1c.1: AER: found no error details for 
:00:1c.1
[  883.545824] pcieport :00:1c.1: AER: Multiple Correctable error message 
received from :06:00.0

lspci output attached.

Intel has issued an errata "RPL013" saying:

"Under complex microarchitectural conditions, the PCIe controller may
transmit an incorrectly formed Transaction Layer Packet (TLP), which
will fail CRC checks. When this erratum occurs, the PCIe end point may
record correctable errors resulting in either a NAK or link recovery.
Intel® has not observed any functional impact due to this erratum."

But I'm really unsure if it describes my issue.

Do you think I have some broken hardware and I should replace the CPU
and/or the motherboard (where the r8169 is soldered)?  I've noticed that
my 13900K is almost impossible to overclock (despite it's a K), but I've
not encountered any issue other than these AER reporting so far after I
gave up overclocking.

Seems there are two r8169 nics on your board, only :06:00.0 reports
aer errors, how about another one the :07:00.0 nic ?

It never happens to :07:00.0, even if I plug the ethernet cable into
it instead of :06:00.0.

So something is wrong with the physical layer, I guess.


Maybe I should just use :07:00.0 and blacklist :06:00.0 as I
don't need two NICs?

Yup,
rate-limiting the AER warning is another choice instead of changing WARN to INFO.
If a flood of corrected errors happens, even while the function still works, it
suggests something is already wrong and will likely get worse; that is the
meaning of WARN, I think.

We should fix this.  IMHO Correctable Errors should be "info" level,
non-alarming, and rate-limited.  They're basically hints about link
integrity.


This case hit the following errors:

[  883.545800] r8169 :06:00.0:    [ 0] RxErr
[  883.545801] r8169 :06:00.0:    [ 8] Rollover
[  883.545802] r8169 :06:00.0:    [12] Timeout

#1 Timeout -- replay timer timeout: the endpoint didn't respond with an ACK or
NAK DLLP in time, so the replay timer expired and the sender will re-send the
packet.

#2 Rollover -- the retransmission counter rolled over to 0 (from 11b to 00b),
meaning the sender had already tried 3 times; that triggers link retraining to
recover.

#1 & #2 happened together, but no uncorrected errors were reported, so the link
was recovered; the issue is most likely caused by improper TxEQ, receiver
equalization, or bad signal integrity.

#3 RxErr -- bad DLLP, bad TLP, clock issue, signal integrity issue, etc.

So, yup, basically the signal integrity is not good enough.
Though the function could work, its performance will be impacted.

If we change 

Re: [PATCH v11 0/5]arm64: add ARCH_HAS_COPY_MC support

2024-03-26 Thread Tong Tiangen

Hi Mark:

Kindly ping...

Thanks,
Tong.

在 2024/2/7 21:21, Tong Tiangen 写道:

With the increase of memory capacity and density, the probability of memory
error also increases. The increasing size and density of server RAM in data
centers and clouds have shown increased uncorrectable memory errors.

Currently, more and more scenarios can tolerate memory errors, such as
CoW[1,2], KSM copy[3], coredump copy[4], khugepaged[5,6], uaccess copy[7],
etc.

This patchset introduces a new processing framework on ARM64, which enables
ARM64 to support error recovery in the above scenarios, and more scenarios
can be expanded based on this in the future.

On arm64, memory errors are handled in do_sea(), which is divided into two cases:
  1. If user state consumed the memory error, the solution is to kill
     the user process and isolate the error page.
  2. If kernel state consumed the memory error, the solution is to
     panic.

For case 2, an undifferentiated panic may not be the optimal choice, as it can
be handled better. In some scenarios we can avoid the panic, such as uaccess:
if a uaccess fails due to a memory error, only the user process is affected,
so killing the user process and isolating the page with hardware memory
errors is a better choice.

[1] commit d302c2398ba2 ("mm, hwpoison: when copy-on-write hits poison, take page 
offline")
[2] commit 1cb9dc4b475c ("mm: hwpoison: support recovery from HugePage copy-on-write 
faults")
[3] commit 6b970599e807 ("mm: hwpoison: support recovery from 
ksm_might_need_to_copy()")
[4] commit 245f09226893 ("mm: hwpoison: coredump: support recovery from 
dump_user_range()")
[5] commit 98c76c9f1ef7 ("mm/khugepaged: recover from poisoned anonymous 
memory")
[6] commit 12904d953364 ("mm/khugepaged: recover from poisoned file-backed 
memory")
[7] commit 278b917f8cb9 ("x86/mce: Add _ASM_EXTABLE_CPY for copy user access")

--
Test result:

1. copy_page(), copy_mc_page() basic function test pass, and the disassembly
contents remain the same before and after the refactor.

2. copy_to/from_user() accessing a kernel NULL pointer raises a translation fault
and dumps an error message, then die()s; test passes.

3. Test following scenarios: copy_from_user(), get_user(), COW.

Before the patch: triggering a hardware memory error ends in a panic.
After the patch:  triggering a hardware memory error does not panic.

Testing step:
step1: start a user process.
step2: poison (einj) the user process's page (a sample EINJ flow is sketched below).
step3: the user process accesses the poisoned page in kernel mode, then triggers SEA.
step4: the kernel does not panic; only the user process is killed and the poisoned
       page is isolated. (before the patch, the kernel would panic in do_sea())
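
For reference, step2's poisoning can typically be driven through the ACPI EINJ
debugfs interface (Documentation/firmware-guide/acpi/apei/einj.rst); this is only
an illustrative sketch, <phys_addr> is a placeholder, and availability depends on
platform/firmware support:

  cd /sys/kernel/debug/apei/einj
  echo 0x10 > error_type              # memory uncorrectable non-fatal
  echo <phys_addr> > param1           # physical address of the page to poison
  echo 0xfffffffffffff000 > param2    # address mask covering one 4K page
  echo 1 > error_inject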

--

Since V10:
  According to Mark's suggestion:
  1. Merge V10's patch2 and patch3 into V11's patch2.
  2. Patch2(V11): use new fixup_type for ld* in copy_to_user(), fix fatal
     issues (NULL kernel pointer access) being fixed up incorrectly.
  3. Patch2(V11): refactor the logic of do_sea().
  4. Patch4(V11): Remove duplicate assembly logic and remove do_mte().

  Besides:
  1. Patch2(V11): remove st* insn's fixup; st* generally does not trigger
     memory errors.
  2. Split a part of the logic of patch2(V11) to patch5(V11), for detail,
 see patch5(V11)'s commit msg.
  3. Remove patch6(v10) “arm64: introduce copy_mc_to_kernel() implementation”.
 During modification, some problems that cannot be solved in a short
     period were found. The patch will be released after the problems are
 solved.
  4. Add test result in this patch.
  5. Modify patchset title, do not use machine check and remove "-next".

Since V9:
  1. Rebase to latest kernel version 6.8-rc2.
  2. Add patch 6/6 to support copy_mc_to_kernel().

Since V8:
  1. Rebase to the latest kernel version and fix typos in some of the patches.
  2. According to the suggestion of Catalin, I attempted to modify the
 return value of function copy_mc_[user]_highpage() to bytes not copied.
 During the modification process, I found that it would be more
 reasonable to return -EFAULT when copy error occurs (referring to the
 newly added patch 4).

 For ARM64, the implementation of copy_mc_[user]_highpage() needs to
 consider MTE. Considering the scenario where data copying is successful
 but the MTE tag copying fails, it is also not reasonable to return
 bytes not copied.
  3. Considering the recent addition of machine check safe support for
 multiple scenarios, modify commit message for patch 5 (patch 4 for V8).

Since V7:
  Currently, there are patches supporting recovery from poison
  consumption for the CoW scenario[1]. Therefore, supporting the CoW
  scenario under the arm64 architecture only needs to modify the relevant
  code under arch/.
  [1]https://lore.kernel.org/lkml/20221031201029.102123-1-tony.l...@intel.com/

Since V6:
  Resend patches that are not merged into the mainline in V6.

Since V5:
  1. Add patch2/3 to add 

Re: [PATCH 0/9] enabled -Wformat-truncation for clang

2024-03-26 Thread Jakub Kicinski
On Tue, 26 Mar 2024 23:37:59 +0100 Arnd Bergmann wrote:
> I hope that the patches can get picked up by platform maintainers
> directly, so the final patch can go in later on.

platform == subsystem? :)


[PATCH 8/9] ALSA: aoa: avoid false-positive format truncation warning

2024-03-26 Thread Arnd Bergmann
From: Arnd Bergmann 

clang warns about what it interprets as a truncated snprintf:

sound/aoa/soundbus/i2sbus/core.c:171:6: error: 'snprintf' will always be 
truncated; specified size is 6, but format string expands to at least 7 
[-Werror,-Wformat-truncation-non-kprintf]

The actual problem here is that it does not understand the special
%pOFn format string and assumes that it is a pointer followed by
the string "OFn", which would indeed not fit.

Slightly increasing the size of the buffer to its natural alignment
avoids the warning, as it is now long enough for the correct and
the incorrect interpretations.
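
As an illustration of the warning class (not the exact i2sbus code): clang sees
"%pOFn" as "%p" plus a literal "OFn" and concludes a 6-byte buffer must be
truncated, even though the kernel expands %pOFn to the device node name.

  char node_name[6];                                       /* old size */
  snprintf(node_name, sizeof(node_name), "%pOFn", child);  /* warns with clang */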

Fixes: b917d58dcfaa ("ALSA: aoa: Convert to using %pOFn instead of 
device_node.name")
Signed-off-by: Arnd Bergmann 
---
 sound/aoa/soundbus/i2sbus/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/aoa/soundbus/i2sbus/core.c b/sound/aoa/soundbus/i2sbus/core.c
index b8ff5cccd0c8..5431d2c49421 100644
--- a/sound/aoa/soundbus/i2sbus/core.c
+++ b/sound/aoa/soundbus/i2sbus/core.c
@@ -158,7 +158,7 @@ static int i2sbus_add_dev(struct macio_dev *macio,
struct device_node *child, *sound = NULL;
struct resource *r;
int i, layout = 0, rlen, ok = force;
-   char node_name[6];
+   char node_name[8];
static const char *rnames[] = { "i2sbus: %pOFn (control)",
"i2sbus: %pOFn (tx)",
"i2sbus: %pOFn (rx)" };
-- 
2.39.2



[PATCH 0/9] enabled -Wformat-truncation for clang

2024-03-26 Thread Arnd Bergmann
From: Arnd Bergmann 

With randconfig build testing, I found only eight files that produce
warnings with clang when -Wformat-truncation is enabled. This means
we can just turn it on by default rather than only enabling it for
"make W=1".

Unfortunately, gcc produces a lot more warnings when the option
is enabled, so it's not yet possible to turn it on both both
compilers.

I hope that the patches can get picked up by platform maintainers
directly, so the final patch can go in later on.

 Arnd

Arnd Bergmann (9):
  fbdev: shmobile: fix snprintf truncation
  enetc: avoid truncating error message
  qed: avoid truncating work queue length
  mlx5: avoid truncating error message
  surface3_power: avoid format string truncation warning
  Input: IMS: fix printf string overflow
  scsi: mylex: fix sysfs buffer lengths
  ALSA: aoa: avoid false-positive format truncation warning
  kbuild: enable -Wformat-truncation on clang

 drivers/input/misc/ims-pcu.c  |  4 ++--
 drivers/net/ethernet/freescale/enetc/enetc.c  |  2 +-
 .../ethernet/mellanox/mlx5/core/esw/bridge.c  |  2 +-
 drivers/net/ethernet/qlogic/qed/qed_main.c|  9 ---
 drivers/platform/surface/surface3_power.c |  2 +-
 drivers/scsi/myrb.c   | 20 
 drivers/scsi/myrs.c   | 24 +--
 drivers/video/fbdev/sh_mobile_lcdcfb.c|  2 +-
 scripts/Makefile.extrawarn|  2 ++
 sound/aoa/soundbus/i2sbus/core.c  |  2 +-
 10 files changed, 35 insertions(+), 34 deletions(-)

-- 
2.39.2

Cc: Dmitry Torokhov 
Cc: Claudiu Manoil 
Cc: Vladimir Oltean 
Cc: Jakub Kicinski 
Cc: Saeed Mahameed 
Cc: Leon Romanovsky 
Cc: Ariel Elior 
Cc: Manish Chopra 
Cc: Hans de Goede 
Cc: "Ilpo Järvinen" 
Cc: Maximilian Luz 
Cc: Hannes Reinecke 
Cc: "Martin K. Petersen" 
Cc: Helge Deller 
Cc: Masahiro Yamada 
Cc: Nathan Chancellor 
Cc: Nicolas Schier 
Cc: Johannes Berg 
Cc: Jaroslav Kysela 
Cc: Takashi Iwai 
Cc: Nick Desaulniers 
Cc: Bill Wendling 
Cc: Justin Stitt 
Cc: linux-in...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: net...@vger.kernel.org
Cc: linux-r...@vger.kernel.org
Cc: platform-driver-...@vger.kernel.org
Cc: linux-s...@vger.kernel.org
Cc: linux-fb...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linux-kbu...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: alsa-de...@alsa-project.org
Cc: linux-so...@vger.kernel.org
Cc: l...@lists.linux.dev



Re: [PATCHv3 pci-next 1/2] PCI/AER: correctable error message as KERN_INFO

2024-03-26 Thread Bjorn Helgaas
On Tue, Mar 26, 2024 at 09:39:54AM +0800, Ethan Zhao wrote:
> On 3/25/2024 6:15 PM, Xi Ruoyao wrote:
> > On Mon, 2024-03-25 at 16:45 +0800, Ethan Zhao wrote:
> > > On 3/25/2024 1:19 AM, Xi Ruoyao wrote:
> > > > On Mon, 2023-09-18 at 14:39 -0500, Bjorn Helgaas wrote:
> > > > > On Mon, Sep 18, 2023 at 07:42:30PM +0800, Xi Ruoyao wrote:
> > > > > > ...
> > > > > > My workstation suffers from too much correctable AER reporting as 
> > > > > > well
> > > > > > (related to Intel's errata "RPL013: Incorrectly Formed PCIe Packets 
> > > > > > May
> > > > > > Generate Correctable Errors" and/or the motherboard design, I 
> > > > > > guess).
> > > > > We should rate-limit correctable error reporting so it's not
> > > > > overwhelming.
> > > > > 
> > > > > At the same time, I'm *also* interested in the cause of these errors,
> > > > > in case there's a Linux defect or a hardware erratum that we can work
> > > > > around.  Do you have a bug report with any more details, e.g., a dmesg
> > > > > log and "sudo lspci -vv" output?
> > > > Hi Bjorn,
> > > > 
> > > > Sorry for the *very* late reply (somehow I didn't see the reply at all
> > > > before it was removed by my cron job, and now I just savaged it from
> > > > lore.kernel.org...)
> > > > 
> > > > The dmesg is like:
> > > > 
> > > > [  882.456994] pcieport :00:1c.1: AER: Multiple Correctable error 
> > > > message received from :00:1c.1
> > > > [  882.457002] pcieport :00:1c.1: AER: found no error details for 
> > > > :00:1c.1
> > > > [  882.457003] pcieport :00:1c.1: AER: Multiple Correctable error 
> > > > message received from :06:00.0
> > > > [  883.545763] pcieport :00:1c.1: AER: Multiple Correctable error 
> > > > message received from :00:1c.1
> > > > [  883.545789] pcieport :00:1c.1: PCIe Bus Error: 
> > > > severity=Correctable, type=Physical Layer, (Receiver ID)
> > > > [  883.545790] pcieport :00:1c.1:   device [8086:7a39] error 
> > > > status/mask=0001/2000
> > > > [  883.545792] pcieport :00:1c.1:    [ 0] RxErr  
> > > > (First)
> > > > [  883.545794] pcieport :00:1c.1: AER:   Error of this Agent is 
> > > > reported first
> > > > [  883.545798] r8169 :06:00.0: PCIe Bus Error: 
> > > > severity=Correctable, type=Physical Layer, (Transmitter ID)
> > > > [  883.545799] r8169 :06:00.0:   device [10ec:8125] error 
> > > > status/mask=1101/e000
> > > > [  883.545800] r8169 :06:00.0:    [ 0] RxErr  
> > > > (First)
> > > > [  883.545801] r8169 :06:00.0:    [ 8] Rollover
> > > > [  883.545802] r8169 :06:00.0:    [12] Timeout
> > > > [  883.545815] pcieport :00:1c.1: AER: Correctable error message 
> > > > received from :00:1c.1
> > > > [  883.545823] pcieport :00:1c.1: AER: found no error details for 
> > > > :00:1c.1
> > > > [  883.545824] pcieport :00:1c.1: AER: Multiple Correctable error 
> > > > message received from :06:00.0
> > > > 
> > > > lspci output attached.
> > > > 
> > > > Intel has issued an errata "RPL013" saying:
> > > > 
> > > > "Under complex microarchitectural conditions, the PCIe controller may
> > > > transmit an incorrectly formed Transaction Layer Packet (TLP), which
> > > > will fail CRC checks. When this erratum occurs, the PCIe end point may
> > > > record correctable errors resulting in either a NAK or link recovery.
> > > > Intel® has not observed any functional impact due to this erratum."
> > > > 
> > > > But I'm really unsure if it describes my issue.
> > > > 
> > > > Do you think I have some broken hardware and I should replace the CPU
> > > > and/or the motherboard (where the r8169 is soldered)?  I've noticed that
> > > > my 13900K is almost impossible to overclock (despite it's a K), but I've
> > > > not encountered any issue other than these AER reporting so far after I
> > > > gave up overclocking.
> > > Seems there are two r8169 nics on your board, only :06:00.0 reports
> > > aer errors, how about another one the :07:00.0 nic ?
> > It never happens to :07:00.0, even if I plug the ethernet cable into
> > it instead of :06:00.0.
> 
> So something is wrong with the physical layer, I guess.
> 
> > Maybe I should just use :07:00.0 and blacklist :06:00.0 as I
> > don't need two NICs?
> 
> Yup,
> rate-limiting the AER warning is another choice instead of changing WARN to INFO.
> If a flood of corrected errors happens, even while the function still works, it
> suggests something is already wrong and will likely get worse; that is the
> meaning of WARN, I think.

We should fix this.  IMHO Correctable Errors should be "info" level,
non-alarming, and rate-limited.  They're basically hints about link
integrity.
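
A minimal sketch of that direction (illustrative only, not the patch under
discussion; it assumes the existing struct aer_err_info and the AER_CORRECTABLE
severity constant, and uses a made-up helper name):

	static void aer_report_severity(struct pci_dev *dev, struct aer_err_info *info)
	{
		/* Correctable errors: rate-limited, informational only. */
		if (info->severity == AER_CORRECTABLE)
			dev_info_ratelimited(&dev->dev,
					     "AER: Correctable error (link integrity hint)\n");
		else
			dev_err(&dev->dev, "AER: Uncorrectable error, severity=%d\n",
				info->severity);
	}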

Bjorn


Re: [PATCH 1/1] x86/elf: Add a new .note section containing Xfeatures information to x86 core files

2024-03-26 Thread Balasubrmanian, Vignesh


On 3/14/2024 10:09 PM, Dave Hansen wrote:


On 3/14/24 09:29, Borislav Petkov wrote:

That argument breaks down a bit on the flags though:

  xc.xfeat_flags = xstate_flags[i];

Because it comes _directly_ from CPUID with zero filtering:

  cpuid_count(XSTATE_CPUID, i, , , , );
  ...
  xstate_flags[i] = ecx;

So this layout is quite dependent on what's in x86's CPUID.

Yeah, no, this should not be copying CPUID flags - those flags should be
*translated* to independently defined flags which describe those
buffers.

Ditto for:

 xc.xfeat_type = i;

Right now, that's bound to CPUID and XSAVE.  "feat_type==10" can only
ever be PKRU and that's derived from the XSAVE architecture.

If you want this to be extensible to things outside of the XSAVE
architecture, it needs to be something actually extensible and not
entangled with XSAVE.

In other words "xc.xfeat_type" can enumerate XSAVE state components
being in the dump, but it should not be limited to XSAVE.  Just as an
example:

enum feat_type {
 FEATURE_XSAVE_PKRU,
 FEATURE_XSAVE__YMM,
 FEATURE_XSAVE_BNDREGS,
 FEATURE_XSAVE_BNDCSR,
 ...
 RANDOM_STATE_NOT_XSAVE
};

See how feat_type==1 is PKRU and *NOT* feat_type==10?  That opens the
door to RANDOM_STATE_NOT_XSAVE or anything else you want.  This would be
_actually_ extensible.



Thanks for the review.
I will add a new enum instead of using "enum xfeature".
Currently we are retaining the flags field. The value will be set to 
zero at this point, and the field will be reserved for future use.
GDB / LLDB would not require this field at this point. Do let us know if 
this is not OK.


-thanks,
vigneshbalu.


Re: [PATCH 14/64] i2c: cpm: reword according to newest specification

2024-03-26 Thread Andi Shyti
Hi Wolfram,

...

> @@ -570,7 +570,7 @@ static int cpm_i2c_setup(struct cpm_i2c *cpm)
>   out_8(>i2c_reg->i2brg, brg);
>  
>   out_8(>i2c_reg->i2mod, 0x00);
> - out_8(>i2c_reg->i2com, I2COM_MASTER);  /* Master mode */
> + out_8(>i2c_reg->i2com, I2COM_MASTER);  /* Host mode */

I2COM_MASTER might be coming from the datasheet.

Reviewed-by: Andi Shyti 

Thanks,


Re: Appropriate liburcu cache line size for Power

2024-03-26 Thread Segher Boessenkool
On Tue, Mar 26, 2024 at 06:19:38PM +1100, Michael Ellerman wrote:
> Mathieu Desnoyers  writes:
> The ISA doesn't specify the cache line size, other than it is smaller
> than a page.

It also says it is "aligned".  Nowhere is it said what an aligned size
is, but it seems clear it has to be a power of two.

> In practice all the 64-bit IBM server CPUs I'm aware of have used 128
> bytes.

Yup.  It is 128B on p3 already.

> It is possible to discover at runtime via AUXV headers. But that's no
> use if you want a compile-time constant.

The architecture does not require the data block size to be equal to the
instruction block size, already.  But many programs subscribe to an
overly simplified worldview, which is a big reason everything is 128B on
all modern PowerPC.

It is quite a nice tradeoff size, there has to be a huge change in the
world for this to ever change :-)


Segher


Re: [PATCH v3 0/5] ASoC: fsl: Support register and unregister rpmsg sound card through remoteproc

2024-03-26 Thread Mark Brown
On Mon, 11 Mar 2024 20:13:44 +0900, Chancel Liu wrote:
>   echo /lib/firmware/fw.elf > /sys/class/remoteproc/remoteproc0/firmware
> (A)   echo start > /sys/class/remoteproc/remoteproc0/state
> (B)   echo stop > /sys/class/remoteproc/remoteproc0/state
> 
> The rpmsg sound card is registered in (A) and unregistered in (B).
> After "start", imx-audio-rpmsg registers devices for ASoC platform driver
> and machine driver. Then sound card is registered. After "stop",
> imx-audio-rpmsg unregisters devices for ASoC platform driver and machine
> driver. Then sound card is unregistered.
> 
> [...]

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next

Thanks!

[1/5] ASoC: fsl: imx-pcm-rpmsg: Register component with rpmsg channel name
  commit: 41f96cd53f2838ac4894491ac5eadb06f1e5b858
[2/5] ASoC: fsl: imx-audio-rpmsg: Register device with rpmsg channel name
  commit: dacc7459745df168152b5cceba34efade9b5e063
[3/5] ASoC: fsl: Let imx-audio-rpmsg register platform device for card
  commit: c73524768e9e1a7ac9eb3a4d36a1ac0d34f22644
[4/5] ASoC: fsl: fsl_rpmsg: Register CPU DAI with name of rpmsg channel
  commit: 0aa7f5406afa828a93d84d75c9b9ac991cd75367
[5/5] ASoC: fsl: imx-rpmsg: Update to correct DT node
  commit: c14445bdcb98feddf9afaeb05e6193cc1f8eec1a

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark



Re: [RFC PATCH 1/8] mm: Provide pagesize to pmd_populate()

2024-03-26 Thread Jason Gunthorpe
On Mon, Mar 25, 2024 at 07:05:01PM +, Christophe Leroy wrote:

> Not looked into details yet, but I guess so.
> 
> By the way there is a wiki dedicated to huge pages on powerpc, you can 
> have a look at it here : 
> https://github.com/linuxppc/wiki/wiki/Huge-pages , maybe you'll find 
> good ideas there to help me.

There sure are a lot of page table types here

I'm a bit wondering about terminology, eg on the first diagram "huge
pte entry" means a PUD entry that is a leaf? Which ones are contiguous
replications?

Just general remarks on the ones with huge pages:

 hash 64k and hugepage 16M/16G
 radix 64k/radix hugepage 2M/1G
 radix 4k/radix hugepage 2M/1G
 nohash 32
  - I think this is just a normal x86 like scheme? PMD/PUD can be a
leaf with the same size as a next level table. 

Do any of these cases need to know the higher level to parse the
lower? eg is there a 2M bit in the PUD indicating that the PMD 
is a table of 2M leafs or does each PMD entry have a bit
indicating it is a leaf?

 hash 4k and hugepage 16M/16G
 nohash 64
  - How does this work? I guess since 8xx explicitly calls out
consecutive this is actually the pgd can point to 512 256M
entries or 8 16G entries? Ie the table size at each level is
varable? Or is it the same and the table size is still 512 and
each 16G entry is replicated 64 times?

Do the offset accessors already abstract this enough?

 8xx 4K
 8xx 16K
   - As this series does?

Jason


Re: Appropriate liburcu cache line size for Power

2024-03-26 Thread Mathieu Desnoyers

On 2024-03-26 03:19, Michael Ellerman wrote:

Mathieu Desnoyers  writes:

Hi,


Hi Mathieu,


In the powerpc architecture support within the liburcu project [1]
we have a cache line size defined as 256 bytes with the following
comment:

/* Include size of POWER5+ L3 cache lines: 256 bytes */
#define CAA_CACHE_LINE_SIZE 256

I recently received a pull request on github [2] asking to
change this to 128 bytes. All the material provided supports
that the cache line sizes on powerpc are 128 bytes or less (even
L3 on POWER7, POWER8, and POWER9) [3].

I wonder where the 256 bytes L3 cache line size for POWER5+
we have in liburcu comes from, and I wonder if it's the right choice
for a cache line size on all powerpc, considering that the Linux
kernel cache line size appear to use 128 bytes on recent Power
architectures. I recall some benchmark experiments Paul and I did
on a 64-core 1.9GHz POWER5+ machine that benefited from a 256 bytes
cache line size, and I suppose this is why we came up with this
value, but I don't have the detailed specs of that machine.

Any feedback on this matter would be appreciated.


The ISA doesn't specify the cache line size, other than it is smaller
than a page.

In practice all the 64-bit IBM server CPUs I'm aware of have used 128
bytes. There are some 64-bit CPUs that use 64 bytes, eg. pasemi PA6T and
Freescale e6500.

It is possible to discover at runtime via AUXV headers. But that's no
use if you want a compile-time constant.


Indeed, and this CAA_CACHE_LINE_SIZE is part of the liburcu powerpc ABI,
so changing this would require a soname bump, which I don't want to do
without really good reasons.



I'm happy to run some benchmarks if you can point me at what to run. I
had a poke around the repository and found short_bench, but it seemed to
run for a very long time.


I've created a dedicated test program for this, see:

https://github.com/compudj/userspace-rcu-dev/tree/false-sharing

example use:

On a AMD Ryzen 7 PRO 6850U with Radeon Graphics:

for a in 8 16 32 64 128 256 512; do tests/unit/test_false_sharing -s $a; done
ok 1 - Stride 8 bytes, increments per ms per thread: 21320
1..1
ok 1 - Stride 16 bytes, increments per ms per thread: 22657
1..1
ok 1 - Stride 32 bytes, increments per ms per thread: 47599
1..1
ok 1 - Stride 64 bytes, increments per ms per thread: 531364
1..1
ok 1 - Stride 128 bytes, increments per ms per thread: 523634
1..1
ok 1 - Stride 256 bytes, increments per ms per thread: 519402
1..1
ok 1 - Stride 512 bytes, increments per ms per thread: 520651
1..1

Would point to false-sharing starting with cache lines smaller than
64 bytes. I get similar results (false-sharing under 64 bytes) with a
AMD EPYC 9654 96-Core Processor.

The test program runs 4 threads by default, which can be overridden
with "-t N". This may be needed if you want this to use all cores from
a larger machine. See "-h" for options.
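
For reference, the core idea of such a probe can be written as a small standalone
pthread program (an illustrative sketch only -- the real tests/unit/test_false_sharing.c
has proper option handling and TAP reporting):

	/* Each thread hammers its own counter placed "stride" bytes apart.
	 * When the stride is smaller than the true cache line size the
	 * counters share a line and per-thread throughput collapses. */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>

	#define NR_THREADS	4
	#define ITERS		50000000UL

	static char *buf;
	static size_t stride;

	static void *worker(void *arg)
	{
		_Atomic unsigned long *c = (void *)(buf + (size_t)arg * stride);
		unsigned long i;

		for (i = 0; i < ITERS; i++)
			atomic_fetch_add_explicit(c, 1, memory_order_relaxed);
		return NULL;
	}

	int main(int argc, char **argv)
	{
		pthread_t tid[NR_THREADS];
		struct timespec t0, t1;
		double ms;
		long i;

		stride = argc > 1 ? strtoul(argv[1], NULL, 0) : 64;
		if (stride < sizeof(unsigned long))
			stride = sizeof(unsigned long);
		if (posix_memalign((void **)&buf, 4096, NR_THREADS * stride))
			return 1;
		memset(buf, 0, NR_THREADS * stride);

		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (i = 0; i < NR_THREADS; i++)
			pthread_create(&tid[i], NULL, worker, (void *)i);
		for (i = 0; i < NR_THREADS; i++)
			pthread_join(tid[i], NULL);
		clock_gettime(CLOCK_MONOTONIC, &t1);

		ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6;
		printf("stride %zu: %.0f increments/ms/thread\n", stride, ITERS / ms);
		return 0;
	}

Build with "gcc -O2 -pthread false_sharing.c" and pass the stride as the first
argument; per-thread throughput should jump once the stride reaches the real
cache line size, which is what the numbers below show.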

On a POWER9 (architected), altivec supported:

for a in 8 16 32 64 128 256 512; do tests/unit/test_false_sharing -s $a; done
ok 1 - Stride 8 bytes, increments per ms per thread: 12264
1..1
ok 1 - Stride 16 bytes, increments per ms per thread: 12276
1..1
ok 1 - Stride 32 bytes, increments per ms per thread: 25638
1..1
ok 1 - Stride 64 bytes, increments per ms per thread: 39934
1..1
ok 1 - Stride 128 bytes, increments per ms per thread: 53971
1..1
ok 1 - Stride 256 bytes, increments per ms per thread: 53599
1..1
ok 1 - Stride 512 bytes, increments per ms per thread: 53962
1..1

This points at false-sharing below 128 bytes stride.

On a e6500, altivec supported, Model 2.0 (pvr 8040 0120)

for a in 8 16 32 64 128 256 512; do tests/unit/test_false_sharing -s $a; done
ok 1 - Stride 8 bytes, increments per ms per thread: 9049
1..1
ok 1 - Stride 16 bytes, increments per ms per thread: 9054
1..1
ok 1 - Stride 32 bytes, increments per ms per thread: 18643
1..1
ok 1 - Stride 64 bytes, increments per ms per thread: 37417
1..1
ok 1 - Stride 128 bytes, increments per ms per thread: 37906
1..1
ok 1 - Stride 256 bytes, increments per ms per thread: 37870
1..1
ok 1 - Stride 512 bytes, increments per ms per thread: 37899
1..1

Which points at false-sharing below 64 bytes.

I prefer to be cautious about this cache line size value and aim for
a value which takes into account the largest known cache line size
for an architecture rather than one that is too small, given the large
overhead caused by false-sharing.

Feedback is welcome.

Thanks,

Mathieu

--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com



Re: [PATCH v3 00/12] mm/gup: Unify hugetlb, part 2

2024-03-26 Thread Jason Gunthorpe
On Mon, Mar 25, 2024 at 02:58:48PM -0400, Peter Xu wrote:

> > This remark would be a little easier to understand if you refer to
> > hugetlb_walk() not huge_pte_offset() - the latter is only used to
> > implement hugetlb_walk() and isn't removed by this series, while a
> > single hugetlb_walk() was removed.
> 
> Right.  Here huge_pte_offset() is the arch API that I hope we can remove,
> the hugetlb_walk() is simply the wrapper.

But arguably hugetlb_walk is the thing that should go..

In the generic code we should really try to get away from the weird
hugetlb abuse of treating every level as a pte_t. That is not how the
generic page table API works and that weirdness is part of what
motivates the arch API to supply special functions for reading. Not
every arch can just cast every level to a pte_t and still work.

But that weirdness is also saving a lot of code so something else needs
to be thought up..

> > Regardless, I think the point is spot on, the direction should be to
> > make the page table reading generic with minimal/no interaction with
> > hugetlbfs in the generic code.
> 
> Yes, and I also like your terms on calling them "pgtable readers".  It's a
> better way to describe the difference in that regard between
> huge_pte_offset() v.s. huge_pte_alloc().  Exactly that's my goal, that we
> should start with the "readers".

Yeah, it makes a lot of sense to tackle the readers first - we are
pretty close now to having enough done to have generic readers. I
would imagine tackling everything outside mm/huge*.c to use the normal
page table API for reading.

Then consider what to do with the reading in mm/huge*.c
 
> The writters might change semantics when merge, and can be more
> challenging, I'll need to look into details of each one, like page fault
> handlers.  Such work may need to be analyzed case by case, and this GUP
> part is definitely the low hanging fruit comparing to the rest.

The write side is tricky, I think if the read side is sorted out then
it will be easier to reason about the write side. Today the write side
is paired with the special read side and it is extra hard to
understand if there is something weird hidden in the arch.
 
> measurements too when getting there.  And btw, IIUC the major challenge of
> pagewalk.c is not the removal of walk_hugetlb_range() alone - that may not
> be that hard if that's the sole purpose.  The better way to go is to remove
> mm_walk_ops.hugetlb_entry() altogether, which will cause a change in all
> callers; that's "the challenge".. pretty much labor works, not a major
> technical challenge it seems.  Not sure if it's a good news or bad..

Ugh, that is a big pain. It is relying on that hugetlbfs trick of
passing in a pte that is not a pte to make the API generic..

The more I look at this the more I think we need to get to Matthew's
idea of having some kind of generic page table API that is not tightly
tied to level. Replacing the hugetlb trick of 'everything is a PTE'
with 5 special cases in every place seems just horrible.

   struct mm_walk_ops {
   int (*leaf_entry)(struct mm_walk_state *state, struct mm_walk *walk);
   }

And many cases really want something like:
   struct mm_walk_state state;

   if (!mm_walk_seek_leaf(state, mm, address))
  goto no_present
   if (mm_walk_is_write(state)) ..

And detailed walking:
   for_each_pt_leaf(state, mm, address) {
   if (mm_walk_is_write(state)) ..
   }

Replacing it with a mm_walk_state that retains the level or otherwise
to allow decoding any entry composes a lot better. Forced loop
unrolling can get back to the current code gen in a lot of places.

It also makes the power stuff a bit nicer as the mm_walk_state could
automatically retain back pointers to the higher levels in the state
struct too...

The puzzle is how to do it and still get reasonable efficient codegen,
many operations are going to end up switching on some state->level to
know how to decode the entry.
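
As a rough illustration of that shape (not an existing kernel API, just a sketch
of a level-agnostic walk state and the switch-on-level decode cost mentioned
above):

	struct mm_walk_state {
		struct mm_struct *mm;
		unsigned long addr;
		int level;		/* 0 = pte, 1 = pmd, 2 = pud, ... */
		void *entryp;		/* points into the table at that level */
	};

	/* Every accessor ends up dispatching on the level. */
	static inline unsigned long mm_walk_state_pfn(const struct mm_walk_state *st)
	{
		switch (st->level) {
		case 0: return pte_pfn(*(pte_t *)st->entryp);
		case 1: return pmd_pfn(*(pmd_t *)st->entryp);
		case 2: return pud_pfn(*(pud_t *)st->entryp);
		default: return 0;
		}
	}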

> One thing I'll soon look into is to allow huge mappings for PFNMAP; there's
> one request from VFIO side for MMIO. Dropping mm_walk_ops.hugetlb_entry()
> seems to be good for all such purposes; well, I may need to bite the bullet
> here.. for either of the purposes to move on.

That would be a nice feature!

> > If, or how much, the hugepd write side remains special is the open
> > question, I think.
> 
> It seems balls are rolling in that aspect, I haven't looked in depth there
> in Christophe's series but it's great to have started!

Yes!

Jason


Re: [PATCH v2 3/6] mm/mm_init.c: add new function calc_nr_all_pages()

2024-03-26 Thread Baoquan He
On 03/26/24 at 08:57am, Mike Rapoport wrote:
> Hi Baoquan,
> 
> On Mon, Mar 25, 2024 at 10:56:43PM +0800, Baoquan He wrote:
> > This is a preparation to calculate nr_kernel_pages and nr_all_pages,
> > both of which will be used later in alloc_large_system_hash().
> > 
> > nr_all_pages counts up all free but not reserved memory in memblock
> > allocator, including HIGHMEM memory. While nr_kernel_pages counts up
> > all free but not reserved low memory in memblock allocator, excluding
> > HIGHMEM memory.
> 
> Sorry I've missed this in the previous review, but I think this patch and
> the patch "remove unneeded calc_memmap_size()" can be merged into "remove
> meaningless calculation of zone->managed_pages in free_area_init_core()"
> with an appropriate update of the commit message.
> 
> With the current patch splitting there will be compilation warning about 
> unused
> function for this and the next patch.

Thanks for careful checking.

We need to make patches bisectable and not break compiling so that people can
spot the criminal commit, that's for sure. Do we need to care about a
compile warning from an intermediate patch in one series? Not sure about
it. I always suggest people separate out this kind of newly added
function into a standalone patch for better reviewing and later checking,
and I saw a lot of commits like this by searching with
'git log --oneline | grep helper'

>  
> > Signed-off-by: Baoquan He 
> > ---
> >  mm/mm_init.c | 24 
> >  1 file changed, 24 insertions(+)
> > 
> > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > index 153fb2dc666f..c57a7fc97a16 100644
> > --- a/mm/mm_init.c
> > +++ b/mm/mm_init.c
> > @@ -1264,6 +1264,30 @@ static void __init 
> > reset_memoryless_node_totalpages(struct pglist_data *pgdat)
> > pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
> >  }
> >  
> > +static void __init calc_nr_kernel_pages(void)
> > +{
> > +   unsigned long start_pfn, end_pfn;
> > +   phys_addr_t start_addr, end_addr;
> > +   u64 u;
> > +#ifdef CONFIG_HIGHMEM
> > +   unsigned long high_zone_low = 
> > arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
> > +#endif
> > +
> > +   for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, _addr, 
> > _addr, NULL) {
> > +   start_pfn = PFN_UP(start_addr);
> > +   end_pfn   = PFN_DOWN(end_addr);
> > +
> > +   if (start_pfn < end_pfn) {
> > +   nr_all_pages += end_pfn - start_pfn;
> > +#ifdef CONFIG_HIGHMEM
> > +   start_pfn = clamp(start_pfn, 0, high_zone_low);
> > +   end_pfn = clamp(end_pfn, 0, high_zone_low);
> > +#endif
> > +   nr_kernel_pages += end_pfn - start_pfn;
> > +   }
> > +   }
> > +}
> > +
> >  static void __init calculate_node_totalpages(struct pglist_data *pgdat,
> > unsigned long node_start_pfn,
> > unsigned long node_end_pfn)
> > -- 
> > 2.41.0
> > 
> 
> -- 
> Sincerely yours,
> Mike.
> 



Re: [PATCH v1 1/1] ASoC: fsl: imx-es8328: Remove leftover gpio initialisation

2024-03-26 Thread Mark Brown
On Mon, 25 Mar 2024 21:13:41 +0200, Andy Shevchenko wrote:
> The gpio field is not used anymore, remove the leftover.
> This also fixes the compilation error after the ...
> 
> 

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next

Thanks!

[1/1] ASoC: fsl: imx-es8328: Remove leftover gpio initialisation
  commit: 6a92834166b16babd70e99c3e0ce9262893ad6ae

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark



Re: [PATCH 1/1] x86/elf: Add a new .note section containing Xfeatures information to x86 core files

2024-03-26 Thread Balasubrmanian, Vignesh




IMHO you should split the changes to replace ARCH_HAVE_EXTRA_ELF_NOTES
with CONFIG_ARCH_HAVE_EXTRA_ELF_NOTES into a lead-up patch.

cheers

Thanks for the input and I will take care of it in the next version.

regards,
vigneshbalu.


Re: [PATCH 1/1] x86/elf: Add a new .note section containing Xfeatures information to x86 core files

2024-03-26 Thread Balasubrmanian, Vignesh




Otherwise looks reasonable, though I see Dave has feedback to address
too. :)

Thanks for working on this!

-Kees

Thank you for the review.
I will address all of this in the next version.

thanks,
vigneshbalu.


--
Kees Cook


[PATCH v4] arch/powerpc/kvm: Add support for reading VPA counters for pseries guests

2024-03-26 Thread Gautam Menghani
The PAPR hypervisor has introduced three new counters in the VPA area of
LPAR CPUs for KVM L2 guest (see [1] for terminology) observability - 2
for context switches from host to guest and vice versa, and 1 counter
for the total time spent inside the KVM guest. Add a tracepoint
that enables reading the counters for use by ftrace/perf. Note that this
tracepoint is only available for the nestedv2 API (i.e., KVM on PowerVM).
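
As a usage illustration (assuming the tracepoint lands under the kvm_hv trace
system with the event name used below; adjust the path to your tracefs mount):

  echo 1 > /sys/kernel/debug/tracing/events/kvm_hv/kvmppc_vcpu_exit_cs_time/enable
  cat /sys/kernel/debug/tracing/trace_pipe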

Also maintain an aggregation of the context switch times in vcpu->arch.
This will be useful in getting the aggregate times with a pmu driver
which will be upstreamed in the near future.

[1] Terminology:
a. L1 refers to the VM (LPAR) booted on top of PAPR hypervisor
b. L2 refers to the KVM guest booted on top of L1.

Signed-off-by: Vaibhav Jain 
Signed-off-by: Gautam Menghani 
---
v1 -> v2:
1. Fix the build error due to invalid struct member reference.

v2 -> v3:
1. Move the counter disabling and zeroing code to a different function.
2. Move the get_lppaca() inside the tracepoint_enabled() branch.
3. Add the aggregation logic to maintain total context switch time.

v3 -> v4:
1. After vcpu_run, check the VPA flag instead of checking for tracepoint
being enabled for disabling the cs time accumulation.

 arch/powerpc/include/asm/kvm_host.h |  5 +
 arch/powerpc/include/asm/lppaca.h   | 11 ---
 arch/powerpc/kvm/book3s_hv.c| 30 +
 arch/powerpc/kvm/trace_hv.h | 25 
 4 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 8abac5321..d953b32dd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -847,6 +847,11 @@ struct kvm_vcpu_arch {
gpa_t nested_io_gpr;
/* For nested APIv2 guests*/
struct kvmhv_nestedv2_io nestedv2_io;
+
+   /* Aggregate context switch and guest run time info (in ns) */
+   u64 l1_to_l2_cs_agg;
+   u64 l2_to_l1_cs_agg;
+   u64 l2_runtime_agg;
 #endif
 
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
diff --git a/arch/powerpc/include/asm/lppaca.h 
b/arch/powerpc/include/asm/lppaca.h
index 61ec2447d..bda6b86b9 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -62,7 +62,8 @@ struct lppaca {
u8  donate_dedicated_cpu;   /* Donate dedicated CPU cycles */
u8  fpregs_in_use;
u8  pmcregs_in_use;
-   u8  reserved8[28];
+   u8  l2_accumul_cntrs_enable;  /* Enable usage of counters for KVM 
guest */
+   u8  reserved8[27];
__be64  wait_state_cycles;  /* Wait cycles for this proc */
u8  reserved9[28];
__be16  slb_count;  /* # of SLBs to maintain */
@@ -92,9 +93,13 @@ struct lppaca {
/* cacheline 4-5 */
 
__be32  page_ins;   /* CMO Hint - # page ins by OS */
-   u8  reserved12[148];
+   u8  reserved12[28];
+   volatile __be64 l1_to_l2_cs_tb;
+   volatile __be64 l2_to_l1_cs_tb;
+   volatile __be64 l2_runtime_tb;
+   u8 reserved13[96];
volatile __be64 dtl_idx;/* Dispatch Trace Log head index */
-   u8  reserved13[96];
+   u8  reserved14[96];
 } cacheline_aligned;
 
 #define lppaca_of(cpu) (*paca_ptrs[cpu]->lppaca_ptr)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8e86eb577..dcd6edd3b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4108,6 +4108,27 @@ static void vcpu_vpa_increment_dispatch(struct kvm_vcpu 
*vcpu)
}
 }
 
+static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu)
+{
+   struct lppaca *lp = get_lppaca();
+   u64 l1_to_l2_ns, l2_to_l1_ns, l2_runtime_ns;
+
+   l1_to_l2_ns = tb_to_ns(be64_to_cpu(lp->l1_to_l2_cs_tb));
+   l2_to_l1_ns = tb_to_ns(be64_to_cpu(lp->l2_to_l1_cs_tb));
+   l2_runtime_ns = tb_to_ns(be64_to_cpu(lp->l2_runtime_tb));
+   trace_kvmppc_vcpu_exit_cs_time(vcpu, l1_to_l2_ns, l2_to_l1_ns,
+   l2_runtime_ns);
+   lp->l1_to_l2_cs_tb = 0;
+   lp->l2_to_l1_cs_tb = 0;
+   lp->l2_runtime_tb = 0;
+   lp->l2_accumul_cntrs_enable = 0;
+
+   // Maintain an aggregate of context switch times
+   vcpu->arch.l1_to_l2_cs_agg += l1_to_l2_ns;
+   vcpu->arch.l2_to_l1_cs_agg += l2_to_l1_ns;
+   vcpu->arch.l2_runtime_agg += l2_runtime_ns;
+}
+
 static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu *vcpu, u64 time_limit,
 unsigned long lpcr, u64 *tb)
 {
@@ -4130,6 +4151,11 @@ static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu 
*vcpu, u64 time_limit,
kvmppc_gse_put_u64(io->vcpu_run_input, KVMPPC_GSID_LPCR, lpcr);
 
accumulate_time(vcpu, >arch.in_guest);
+
+   /* Enable the guest host context switch time tracking */
+   if (unlikely(trace_kvmppc_vcpu_exit_cs_time_enabled()))
+   

Re: [PATCH 3/3] tools/perf/arch/powerc: Add get_arch_regnum for powerpc

2024-03-26 Thread Tiezhu Yang

Hi Athira and Namhyung,

On 03/09/2024 03:25 PM, Athira Rajeev wrote:

The function get_dwarf_regnum() returns a DWARF register number
from a register name string. This calls arch specific function
get_arch_regnum to return register number for corresponding arch.
Add mappings for register name to register number in powerpc code:
arch/powerpc/util/dwarf-regs.c

Signed-off-by: Athira Rajeev 
---
 tools/perf/arch/powerpc/util/dwarf-regs.c | 29 +++
 1 file changed, 29 insertions(+)


I found commit 3eee606757ad ("perf dwarf-regs: Add get_dwarf_regnum()")
for x86, would you be able to share how to test these changes? What is
the difference with and without the patches?

Thanks,
Tiezhu



Re: Appropriate liburcu cache line size for Power

2024-03-26 Thread Michael Ellerman
Mathieu Desnoyers  writes:
> Hi,

Hi Mathieu,

> In the powerpc architecture support within the liburcu project [1]
> we have a cache line size defined as 256 bytes with the following
> comment:
>
> /* Include size of POWER5+ L3 cache lines: 256 bytes */
> #define CAA_CACHE_LINE_SIZE 256
>
> I recently received a pull request on github [2] asking to
> change this to 128 bytes. All the material provided supports
> that the cache line sizes on powerpc are 128 bytes or less (even
> L3 on POWER7, POWER8, and POWER9) [3].
>
> I wonder where the 256 bytes L3 cache line size for POWER5+
> we have in liburcu comes from, and I wonder if it's the right choice
> for a cache line size on all powerpc, considering that the Linux
> kernel cache line size appear to use 128 bytes on recent Power
> architectures. I recall some benchmark experiments Paul and I did
> on a 64-core 1.9GHz POWER5+ machine that benefited from a 256 bytes
> cache line size, and I suppose this is why we came up with this
> value, but I don't have the detailed specs of that machine.
>
> Any feedback on this matter would be appreciated.

The ISA doesn't specify the cache line size, other than it is smaller
than a page.

In practice all the 64-bit IBM server CPUs I'm aware of have used 128
bytes. There are some 64-bit CPUs that use 64 bytes, eg. pasemi PA6T and
Freescale e6500.

It is possible to discover at runtime via AUXV headers. But that's no
use if you want a compile-time constant.
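
For completeness, a userspace sketch of the runtime lookup (assuming glibc's
getauxval() and the powerpc AT_DCACHEBSIZE aux vector entry):

  #include <stdio.h>
  #include <sys/auxv.h>

  int main(void)
  {
          /* Data cache block size as reported by the kernel via the aux vector. */
          unsigned long dcache_bsize = getauxval(AT_DCACHEBSIZE);

          printf("dcache block size: %lu bytes\n", dcache_bsize);
          return 0;
  }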

I'm happy to run some benchmarks if you can point me at what to run. I
had a poke around the repository and found short_bench, but it seemed to
run for a very long time.

cheers


Re: [PATCH v2 3/3] powerpc/code-patching: Restore 32-bit patching performance

2024-03-26 Thread Christophe Leroy


Le 25/03/2024 à 23:48, Benjamin Gray a écrit :
> The new open/close abstraction makes it more difficult for a
> compiler to optimise. This causes 10% worse performance on
> ppc32 as in [1]. Restoring the page alignment mask and inlining
> the helpers allows the compiler to better reason about the address
> alignment, allowing more optimised cache flushing selection.

This should be squashed into patch 1. There is no point in having that 
as a separate patch when in the same series.

> 
> [1]: 
> https://lore.kernel.org/all/77fdcdeb-4af5-4ad0-a4c6-57bf0762d...@csgroup.eu/
> 
> Suggested-by: Christophe Leroy 
> Signed-off-by: Benjamin Gray 
> 
> ---
> 
> v2: * New in v2
> 
> I think Suggested-by is an appropriate tag. The patch is Christophe's
> from the link, I just added the commit description, so it could well
> be better to change the author to Christophe completely.
> ---
>   arch/powerpc/lib/code-patching.c | 6 +++---
>   1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/lib/code-patching.c 
> b/arch/powerpc/lib/code-patching.c
> index b3a644290369..d089da115987 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -282,13 +282,13 @@ struct patch_window {
>* Interrupts must be disabled for the entire duration of the patching. The 
> PIDR
>* is potentially changed during this time.
>*/
> -static int open_patch_window(void *addr, struct patch_window *ctx)
> +static __always_inline int open_patch_window(void *addr, struct patch_window 
> *ctx)
>   {
>   unsigned long pfn = get_patch_pfn(addr);
>   
>   lockdep_assert_irqs_disabled();
>   
> - ctx->text_poke_addr = (unsigned 
> long)__this_cpu_read(cpu_patching_context.addr);
> + ctx->text_poke_addr = (unsigned 
> long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK;
>   
>   if (!mm_patch_enabled()) {
>   ctx->ptep = __this_cpu_read(cpu_patching_context.pte);
> @@ -331,7 +331,7 @@ static int open_patch_window(void *addr, struct 
> patch_window *ctx)
>   return 0;
>   }
>   
> -static void close_patch_window(struct patch_window *ctx)
> +static __always_inline void close_patch_window(struct patch_window *ctx)
>   {
>   lockdep_assert_irqs_disabled();
>   


Re: [PATCH v2 6/6] mm/mm_init.c: remove arch_reserved_kernel_pages()

2024-03-26 Thread Mike Rapoport
On Mon, Mar 25, 2024 at 10:56:46PM +0800, Baoquan He wrote:
> Since the current calculation of calc_nr_kernel_pages() has taken into
> consideration of kernel reserved memory, no need to have
> arch_reserved_kernel_pages() any more.
> 
> Signed-off-by: Baoquan He 

Reviewed-by: Mike Rapoport (IBM) 

> ---
>  arch/powerpc/include/asm/mmu.h |  4 
>  arch/powerpc/kernel/fadump.c   |  5 -
>  include/linux/mm.h |  3 ---
>  mm/mm_init.c   | 12 
>  4 files changed, 24 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
> index 3b72c7ed24cf..aa5c0fd5edb1 100644
> --- a/arch/powerpc/include/asm/mmu.h
> +++ b/arch/powerpc/include/asm/mmu.h
> @@ -406,9 +406,5 @@ extern void *abatron_pteptrs[2];
>  #include 
>  #endif
>  
> -#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
> -#define __HAVE_ARCH_RESERVED_KERNEL_PAGES
> -#endif
> -
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_POWERPC_MMU_H_ */
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index d14eda1e8589..ae8c7619e597 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -1735,8 +1735,3 @@ static void __init fadump_reserve_crash_area(u64 base)
>   memblock_reserve(mstart, msize);
>   }
>  }
> -
> -unsigned long __init arch_reserved_kernel_pages(void)
> -{
> - return memblock_reserved_size() / PAGE_SIZE;
> -}
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ad19350e1538..ab1ba0a31429 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3221,9 +3221,6 @@ static inline void show_mem(void)
>  extern long si_mem_available(void);
>  extern void si_meminfo(struct sysinfo * val);
>  extern void si_meminfo_node(struct sysinfo *val, int nid);
> -#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
> -extern unsigned long arch_reserved_kernel_pages(void);
> -#endif
>  
>  extern __printf(3, 4)
>  void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index e269a724f70e..089dc60159e9 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -2373,17 +2373,6 @@ void __init page_alloc_init_late(void)
>   page_alloc_sysctl_init();
>  }
>  
> -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
> -/*
> - * Returns the number of pages that arch has reserved but
> - * is not known to alloc_large_system_hash().
> - */
> -static unsigned long __init arch_reserved_kernel_pages(void)
> -{
> - return 0;
> -}
> -#endif
> -
>  /*
>   * Adaptive scale is meant to reduce sizes of hash tables on large memory
>   * machines. As memory size is increased the scale is also increased but at
> @@ -2426,7 +2415,6 @@ void *__init alloc_large_system_hash(const char 
> *tablename,
>   if (!numentries) {
>   /* round applicable memory size up to nearest megabyte */
>   numentries = nr_kernel_pages;
> - numentries -= arch_reserved_kernel_pages();
>  
>   /* It isn't necessary when PAGE_SIZE >= 1MB */
>   if (PAGE_SIZE < SZ_1M)
> -- 
> 2.41.0
> 

-- 
Sincerely yours,
Mike.


Re: [PATCH v2 3/6] mm/mm_init.c: add new function calc_nr_all_pages()

2024-03-26 Thread Mike Rapoport
Hi Baoquan,

On Mon, Mar 25, 2024 at 10:56:43PM +0800, Baoquan He wrote:
> This is a preparation to calculate nr_kernel_pages and nr_all_pages,
> both of which will be used later in alloc_large_system_hash().
> 
> nr_all_pages counts up all free but not reserved memory in memblock
> allocator, including HIGHMEM memory. While nr_kernel_pages counts up
> all free but not reserved low memory in memblock allocator, excluding
> HIGHMEM memory.

Sorry I've missed this in the previous review, but I think this patch and
the patch "remove unneeded calc_memmap_size()" can be merged into "remove
meaningless calculation of zone->managed_pages in free_area_init_core()"
with an appropriate update of the commit message.

With the current patch splitting there will be compilation warning about unused
function for this and the next patch.
 
> Signed-off-by: Baoquan He 
> ---
>  mm/mm_init.c | 24 
>  1 file changed, 24 insertions(+)
> 
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 153fb2dc666f..c57a7fc97a16 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1264,6 +1264,30 @@ static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
>   pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
>  }
>  
> +static void __init calc_nr_kernel_pages(void)
> +{
> + unsigned long start_pfn, end_pfn;
> + phys_addr_t start_addr, end_addr;
> + u64 u;
> +#ifdef CONFIG_HIGHMEM
> + unsigned long high_zone_low = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
> +#endif
> +
> + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
> + start_pfn = PFN_UP(start_addr);
> + end_pfn   = PFN_DOWN(end_addr);
> +
> + if (start_pfn < end_pfn) {
> + nr_all_pages += end_pfn - start_pfn;
> +#ifdef CONFIG_HIGHMEM
> + start_pfn = clamp(start_pfn, 0, high_zone_low);
> + end_pfn = clamp(end_pfn, 0, high_zone_low);
> +#endif
> + nr_kernel_pages += end_pfn - start_pfn;
> + }
> + }
> +}
> +
>  static void __init calculate_node_totalpages(struct pglist_data *pgdat,
>   unsigned long node_start_pfn,
>   unsigned long node_end_pfn)
> -- 
> 2.41.0
> 

-- 
Sincerely yours,
Mike.


Re: [PATCH v2 2/6] mm/mm_init.c: remove the useless dma_reserve

2024-03-26 Thread Mike Rapoport
On Mon, Mar 25, 2024 at 10:56:42PM +0800, Baoquan He wrote:
> Now that nobody calls set_dma_reserve() to set a value for dma_reserve, remove
> set_dma_reserve(), the global variable dma_reserve, and the code using it.
> 
> Signed-off-by: Baoquan He 

Reviewed-by: Mike Rapoport (IBM) 

> ---
>  include/linux/mm.h |  1 -
>  mm/mm_init.c   | 23 ---
>  2 files changed, 24 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0436b919f1c7..ad19350e1538 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3210,7 +3210,6 @@ static inline int early_pfn_to_nid(unsigned long pfn)
>  extern int __meminit early_pfn_to_nid(unsigned long pfn);
>  #endif
>  
> -extern void set_dma_reserve(unsigned long new_dma_reserve);
>  extern void mem_init(void);
>  extern void __init mmap_init(void);
>  
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 549e76af8f82..153fb2dc666f 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -226,7 +226,6 @@ static unsigned long required_movablecore_percent __initdata;
>  
>  static unsigned long nr_kernel_pages __initdata;
>  static unsigned long nr_all_pages __initdata;
> -static unsigned long dma_reserve __initdata;
>  
>  static bool deferred_struct_pages __meminitdata;
>  
> @@ -1583,12 +1582,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
>   zone_names[j], memmap_pages, freesize);
>   }
>  
> - /* Account for reserved pages */
> - if (j == 0 && freesize > dma_reserve) {
> - freesize -= dma_reserve;
> - pr_debug("  %s zone: %lu pages reserved\n", 
> zone_names[0], dma_reserve);
> - }
> -
>   if (!is_highmem_idx(j))
>   nr_kernel_pages += freesize;
>   /* Charge for highmem memmap if there are enough kernel pages */
> @@ -2547,22 +2540,6 @@ void *__init alloc_large_system_hash(const char *tablename,
>   return table;
>  }
>  
> -/**
> - * set_dma_reserve - set the specified number of pages reserved in the first zone
> - * @new_dma_reserve: The number of pages to mark reserved
> - *
> - * The per-cpu batchsize and zone watermarks are determined by managed_pages.
> - * In the DMA zone, a significant percentage may be consumed by kernel image
> - * and other unfreeable allocations which can skew the watermarks badly. This
> - * function may optionally be used to account for unfreeable pages in the
> - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
> - * smaller per-cpu batchsize.
> - */
> -void __init set_dma_reserve(unsigned long new_dma_reserve)
> -{
> - dma_reserve = new_dma_reserve;
> -}
> -
>  void __init memblock_free_pages(struct page *page, unsigned long pfn,
>   unsigned int order)
>  {
> -- 
> 2.41.0
> 
> 

-- 
Sincerely yours,
Mike.


Re: [PATCH v2 1/6] x86: remove unneeded memblock_find_dma_reserve()

2024-03-26 Thread Mike Rapoport
On Mon, Mar 25, 2024 at 10:56:41PM +0800, Baoquan He wrote:
> The variable dma_reserve and its usage were introduced in commit 0e0b864e069c
> ("[PATCH] Account for memmap and optionally the kernel image as holes").
> Its original purpose was to account for the reserved pages in the DMA
> zone to make the DMA zone's watermark calculation more accurate on x86.
> 
> However, there is now zone->managed_pages to account for all pages
> available to the buddy allocator, and zone->present_pages to account for
> all present physical pages in a zone. More importantly, on x86,
> calculating and setting zone->managed_pages at this point is only a
> temporary step: each zone's managed_pages is zeroed out and reset to the
> actual value according to how many pages are added to the buddy
> allocator in mem_init(). Before mem_init(), no buddy allocation is
> requested, and the zones' pcp and watermark settings are all done after
> mem_init(). So there is no need to worry about the accuracy of the DMA
> zone's setting during free_area_init().
> 
> Hence, remove memblock_find_dma_reserve() to stop calculating and
> setting dma_reserve.
> 
> Signed-off-by: Baoquan He 

Reviewed-by: Mike Rapoport (IBM) 

> ---
>  arch/x86/include/asm/pgtable.h |  1 -
>  arch/x86/kernel/setup.c|  2 --
>  arch/x86/mm/init.c | 47 --
>  3 files changed, 50 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 315535ffb258..cefc7a84f7a4 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -1200,7 +1200,6 @@ static inline int pgd_none(pgd_t pgd)
>  extern int direct_gbpages;
>  void init_mem_mapping(void);
>  void early_alloc_pgt_buf(void);
> -extern void memblock_find_dma_reserve(void);
>  void __init poking_init(void);
>  unsigned long init_memory_mapping(unsigned long start,
> unsigned long end, pgprot_t prot);
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index ef206500ed6f..74873bd04ad1 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -1106,8 +1106,6 @@ void __init setup_arch(char **cmdline_p)
>*/
>   arch_reserve_crashkernel();
>  
> - memblock_find_dma_reserve();
> -
>   if (!early_xdbc_setup_hardware())
>   early_xdbc_register_console();
>  
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index 679893ea5e68..615f0bf4bda6 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -990,53 +990,6 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
>  }
>  #endif
>  
> -/*
> - * Calculate the precise size of the DMA zone (first 16 MB of RAM),
> - * and pass it to the MM layer - to help it set zone watermarks more
> - * accurately.
> - *
> - * Done on 64-bit systems only for the time being, although 32-bit systems
> - * might benefit from this as well.
> - */
> -void __init memblock_find_dma_reserve(void)
> -{
> -#ifdef CONFIG_X86_64
> - u64 nr_pages = 0, nr_free_pages = 0;
> - unsigned long start_pfn, end_pfn;
> - phys_addr_t start_addr, end_addr;
> - int i;
> - u64 u;
> -
> - /*
> -  * Iterate over all memory ranges (free and reserved ones alike),
> -  * to calculate the total number of pages in the first 16 MB of RAM:
> -  */
> - nr_pages = 0;
> - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> - start_pfn = min(start_pfn, MAX_DMA_PFN);
> - end_pfn   = min(end_pfn,   MAX_DMA_PFN);
> -
> - nr_pages += end_pfn - start_pfn;
> - }
> -
> - /*
> -  * Iterate over free memory ranges to calculate the number of free
> -  * pages in the DMA zone, while not counting potential partial
> -  * pages at the beginning or the end of the range:
> -  */
> - nr_free_pages = 0;
> - for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) {
> - start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN);
> - end_pfn   = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN);
> -
> - if (start_pfn < end_pfn)
> - nr_free_pages += end_pfn - start_pfn;
> - }
> -
> - set_dma_reserve(nr_pages - nr_free_pages);
> -#endif
> -}
> -
>  void __init zone_sizes_init(void)
>  {
>   unsigned long max_zone_pfns[MAX_NR_ZONES];
> -- 
> 2.41.0
> 

-- 
Sincerely yours,
Mike.


[PATCH v18 6/6] powerpc/crash: add crash memory hotplug support

2024-03-26 Thread Sourabh Jain
Extend the arch crash hotplug handler, introduced by the patch titled
"powerpc: add crash CPU hotplug support", to also support memory
add/remove events.

The elfcorehdr describes the memory of the running kernel to be captured
by the kdump kernel; hence, it needs to be updated whenever memory
resources change due to memory add/remove events. Therefore,
arch_crash_handle_hotplug_event() is updated to recreate the elfcorehdr
and replace the previous one with it on memory add/remove events.

The memblock list is used to prepare the elfcorehdr. In the case of
memory hot remove, the memblock list is updated after the arch crash
hotplug handler is triggered, as depicted in Figure 1. Thus, the
hot-removed memory is explicitly removed from the crash memory ranges
to ensure that the memory ranges added to the elfcorehdr do not include
the hot-removed memory.

Memory remove
  |
  v
Offline pages
  |
  v
 Initiate memory notify call <-----> crash hotplug handler
 chain for MEM_OFFLINE event
  |
  v
 Update memblock list

Figure 1
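
For illustration, the range-removal step can be sketched in a few lines of
user-space C (this is not the kernel code added by this patch; the struct
and function names are made up for the example). Removing a hot-removed
block from a half-open [base, end) range either keeps the range, trims one
side, or splits it in two:

  #include <stdint.h>
  #include <stdio.h>

  struct range { uint64_t base, end; };   /* half-open [base, end) */

  /* Remove [rm_base, rm_end) from r; write the 0..2 surviving pieces to out. */
  static int remove_from_range(struct range r, uint64_t rm_base,
                               uint64_t rm_end, struct range out[2])
  {
          int n = 0;

          if (rm_end <= r.base || rm_base >= r.end) {     /* no overlap */
                  out[n++] = r;
                  return n;
          }
          if (r.base < rm_base)                           /* keep the left part */
                  out[n++] = (struct range){ r.base, rm_base };
          if (rm_end < r.end)                             /* keep the right part */
                  out[n++] = (struct range){ rm_end, r.end };
          return n;
  }

  int main(void)
  {
          /* crash memory 1 GiB..2 GiB, hot remove 0x50000000..0x60000000 */
          struct range r = { 0x40000000, 0x80000000 };
          struct range out[2];
          int n = remove_from_range(r, 0x50000000, 0x60000000, out);

          for (int i = 0; i < n; i++)
                  printf("[%#llx, %#llx)\n",
                         (unsigned long long)out[i].base,
                         (unsigned long long)out[i].end);
          return 0;
  }

Building the elfcorehdr from ranges filtered this way keeps the hot-removed
block out of the recorded memory ranges even though the memblock list still
contains it when the handler runs.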

There are two system calls, `kexec_file_load` and `kexec_load`, used to
load the kdump image. A few changes have been made to ensure that the
kernel can safely update the elfcorehdr component of the kdump image for
both system calls.

For the kexec_file_load syscall, the kdump image is prepared in the
kernel. To support an increasing number of memory regions, the
elfcorehdr is built with extra buffer space to ensure that it can
accommodate additional memory ranges in the future.
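
As a rough illustration of that headroom (an assumption about the shape of
the computation, not the exact code in this series), the elfcorehdr size
can be derived from the ELF header plus one program header per note and
memory range, with spare slots reserved for ranges that may be hot-added
later:

  #include <elf.h>
  #include <stddef.h>

  /*
   * One PT_NOTE per CPU, one note for vmcoreinfo, one PT_LOAD per memory
   * range, plus spare_ranges unused slots so later hot-add events fit
   * without reallocating the elfcorehdr segment. Illustrative only.
   */
  static size_t elfcorehdr_size(size_t nr_cpus, size_t nr_mem_ranges,
                                size_t spare_ranges)
  {
          size_t phdrs = nr_cpus + 1 + nr_mem_ranges + spare_ranges;

          return sizeof(Elf64_Ehdr) + phdrs * sizeof(Elf64_Phdr);
  }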

For the kexec_load syscall, the elfcorehdr is updated only if the
KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by the
kexec tool. Passing this flag to the kernel indicates that the
elfcorehdr is built to accommodate additional memory ranges and the
elfcorehdr segment is not considered for SHA calculation, making it safe
to update.
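
A minimal sketch of the resulting policy (illustrative only, not the exact
kernel implementation; the flag value below is made up for the example):
images loaded via kexec_file_load are always safe to update, while
kexec_load images are updatable only when the kexec tool opted in:

  #include <stdbool.h>

  #define KEXEC_CRASH_HOTPLUG_SUPPORT  (1UL << 4)  /* illustrative bit only */

  static bool elfcorehdr_update_allowed(bool file_mode,
                                        unsigned long kexec_flags)
  {
          /* kexec_file_load: elfcorehdr was built in-kernel with headroom */
          if (file_mode)
                  return true;
          /* kexec_load: only safe if the kexec tool passed the opt-in flag */
          return kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT;
  }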

The changes related to this feature are kept under the CRASH_HOTPLUG
config option, which is enabled by default.

Signed-off-by: Sourabh Jain 
Acked-by: Hari Bathini 
Cc: Akhil Raj 
Cc: Andrew Morton 
Cc: Aneesh Kumar K.V 
Cc: Baoquan He 
Cc: Borislav Petkov (AMD) 
Cc: Boris Ostrovsky 
Cc: Christophe Leroy 
Cc: Dave Hansen 
Cc: Dave Young 
Cc: David Hildenbrand 
Cc: Greg Kroah-Hartman 
Cc: Laurent Dufour 
Cc: Mahesh Salgaonkar 
Cc: Michael Ellerman 
Cc: Mimi Zohar 
Cc: Naveen N Rao 
Cc: Oscar Salvador 
Cc: Thomas Gleixner 
Cc: Valentin Schneider 
Cc: Vivek Goyal 
Cc: ke...@lists.infradead.org
Cc: x...@kernel.org
---
* No changes in v18.

 arch/powerpc/include/asm/kexec.h|  3 +
 arch/powerpc/include/asm/kexec_ranges.h |  1 +
 arch/powerpc/kexec/crash.c  | 95 -
 arch/powerpc/kexec/file_load_64.c   | 20 +-
 arch/powerpc/kexec/ranges.c | 85 ++
 5 files changed, 202 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index e75970351bcd..95a98b390d62 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -141,6 +141,9 @@ void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
 
 int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags);
 #define arch_crash_hotplug_support arch_crash_hotplug_support
+
+unsigned int arch_crash_get_elfcorehdr_size(void);
+#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
 #endif /* CONFIG_CRASH_HOTPLUG */
 
 extern int crashing_cpu;
diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h
index 8489e844b447..14055896cbcb 100644
--- a/arch/powerpc/include/asm/kexec_ranges.h
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -7,6 +7,7 @@
 void sort_memory_ranges(struct crash_mem *mrngs, bool merge);
 struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
 int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
+int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
 int get_exclude_memory_ranges(struct crash_mem **mem_ranges);
 int get_reserved_memory_ranges(struct crash_mem **mem_ranges);
 int get_crash_memory_ranges(struct crash_mem **mem_ranges);
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index 8938a19af12f..21b193e938a3 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -25,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * The primary CPU waits a while for all secondary CPUs to enter. This is to
@@ -398,6 +400,94 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
 #undef pr_fmt
 #define pr_fmt(fmt) "crash hp: " fmt
 
+/*
+ * Advertise preferred elfcorehdr size to userspace via
+ * /sys/kernel/crash_elfcorehdr_size sysfs interface.
+ */
+unsigned int arch_crash_get_elfcorehdr_size(void)
+{
+   unsigned long phdr_cnt;
+
+   /* A program header