[PATCH] Add static_key_feature_checks_initialized flag
JUMP_LABEL_FEATURE_CHECK_DEBUG used static_key_initialized to determine whether {cpu,mmu}_has_feature() was used before static keys were initialized. However, {cpu,mmu}_has_feature() should not be used before setup_feature_keys() is called. As static_key_initalized is set much earlier during boot there is a window in which JUMP_LABEL_FEATURE_CHECK_DEBUG will not report errors. Add a flag specifically to indicate when {cpu,mmu}_has_feature() is safe to use. Signed-off-by: Nicholas Miehlbradt --- arch/powerpc/include/asm/cpu_has_feature.h | 2 +- arch/powerpc/include/asm/feature-fixups.h | 2 ++ arch/powerpc/include/asm/mmu.h | 2 +- arch/powerpc/lib/feature-fixups.c | 8 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h index 727d4b321937..0efabccd820c 100644 --- a/arch/powerpc/include/asm/cpu_has_feature.h +++ b/arch/powerpc/include/asm/cpu_has_feature.h @@ -29,7 +29,7 @@ static __always_inline bool cpu_has_feature(unsigned long feature) #endif #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG - if (!static_key_initialized) { + if (!static_key_feature_checks_initialized) { printk("Warning! cpu_has_feature() used prior to jump label init!\n"); dump_stack(); return early_cpu_has_feature(feature); diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 77824bd289a3..17d168dd8b49 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -291,6 +291,8 @@ extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; extern long __start___barrier_nospec_fixup, __stop___barrier_nospec_fixup; extern long __start__btb_flush_fixup, __stop__btb_flush_fixup; +extern bool static_key_feature_checks_initialized; + void apply_feature_fixups(void); void update_mmu_feature_fixups(unsigned long mask); void setup_feature_keys(void); diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 3b72c7ed24cf..24f830cf9bb4 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -251,7 +251,7 @@ static __always_inline bool mmu_has_feature(unsigned long feature) #endif #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG - if (!static_key_initialized) { + if (!static_key_feature_checks_initialized) { printk("Warning! mmu_has_feature() used prior to jump label init!\n"); dump_stack(); return early_mmu_has_feature(feature); diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 4f82581ca203..b7201ba50b2e 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -25,6 +25,13 @@ #include #include +/* + * Used to generate warnings if mmu or cpu feature check functions that + * use static keys before they are initialized. + */ +bool static_key_feature_checks_initialized __read_mostly; +EXPORT_SYMBOL_GPL(static_key_feature_checks_initialized); + struct fixup_entry { unsigned long mask; unsigned long value; @@ -679,6 +686,7 @@ void __init setup_feature_keys(void) jump_label_init(); cpu_feature_keys_init(); mmu_feature_keys_init(); + static_key_feature_checks_initialized = true; } static int __init check_features(void) -- 2.40.1
Re: [PATCHv3 pci-next 1/2] PCI/AER: correctable error message as KERN_INFO
On 3/27/2024 5:17 AM, Bjorn Helgaas wrote: On Tue, Mar 26, 2024 at 09:39:54AM +0800, Ethan Zhao wrote: On 3/25/2024 6:15 PM, Xi Ruoyao wrote: On Mon, 2024-03-25 at 16:45 +0800, Ethan Zhao wrote: On 3/25/2024 1:19 AM, Xi Ruoyao wrote: On Mon, 2023-09-18 at 14:39 -0500, Bjorn Helgaas wrote: On Mon, Sep 18, 2023 at 07:42:30PM +0800, Xi Ruoyao wrote: ... My workstation suffers from too much correctable AER reporting as well (related to Intel's errata "RPL013: Incorrectly Formed PCIe Packets May Generate Correctable Errors" and/or the motherboard design, I guess). We should rate-limit correctable error reporting so it's not overwhelming. At the same time, I'm *also* interested in the cause of these errors, in case there's a Linux defect or a hardware erratum that we can work around. Do you have a bug report with any more details, e.g., a dmesg log and "sudo lspci -vv" output? Hi Bjorn, Sorry for the *very* late reply (somehow I didn't see the reply at all before it was removed by my cron job, and now I just savaged it from lore.kernel.org...) The dmesg is like: [ 882.456994] pcieport :00:1c.1: AER: Multiple Correctable error message received from :00:1c.1 [ 882.457002] pcieport :00:1c.1: AER: found no error details for :00:1c.1 [ 882.457003] pcieport :00:1c.1: AER: Multiple Correctable error message received from :06:00.0 [ 883.545763] pcieport :00:1c.1: AER: Multiple Correctable error message received from :00:1c.1 [ 883.545789] pcieport :00:1c.1: PCIe Bus Error: severity=Correctable, type=Physical Layer, (Receiver ID) [ 883.545790] pcieport :00:1c.1: device [8086:7a39] error status/mask=0001/2000 [ 883.545792] pcieport :00:1c.1: [ 0] RxErr (First) [ 883.545794] pcieport :00:1c.1: AER: Error of this Agent is reported first [ 883.545798] r8169 :06:00.0: PCIe Bus Error: severity=Correctable, type=Physical Layer, (Transmitter ID) [ 883.545799] r8169 :06:00.0: device [10ec:8125] error status/mask=1101/e000 [ 883.545800] r8169 :06:00.0: [ 0] RxErr (First) [ 883.545801] r8169 :06:00.0: [ 8] Rollover [ 883.545802] r8169 :06:00.0: [12] Timeout [ 883.545815] pcieport :00:1c.1: AER: Correctable error message received from :00:1c.1 [ 883.545823] pcieport :00:1c.1: AER: found no error details for :00:1c.1 [ 883.545824] pcieport :00:1c.1: AER: Multiple Correctable error message received from :06:00.0 lspci output attached. Intel has issued an errata "RPL013" saying: "Under complex microarchitectural conditions, the PCIe controller may transmit an incorrectly formed Transaction Layer Packet (TLP), which will fail CRC checks. When this erratum occurs, the PCIe end point may record correctable errors resulting in either a NAK or link recovery. Intel® has not observed any functional impact due to this erratum." But I'm really unsure if it describes my issue. Do you think I have some broken hardware and I should replace the CPU and/or the motherboard (where the r8169 is soldered)? I've noticed that my 13900K is almost impossible to overclock (despite it's a K), but I've not encountered any issue other than these AER reporting so far after I gave up overclocking. Seems there are two r8169 nics on your board, only :06:00.0 reports aer errors, how about another one the :07:00.0 nic ? It never happens to :07:00.0, even if I plug the ethernet cable into it instead of :06:00.0. So something is wrong with the physical layer, I guess. Maybe I should just use :07:00.0 and blacklist :06:00.0 as I don't need two NICs? Yup, ratelimit the AER warning is another choice instead of change WARN to INFO. if corrected error flood happens, even the function is working, suggests something was already wrong, likely will be worse, that is the meaning of WARN I think. We should fix this. IMHO Correctable Errors should be "info" level, non-alarming, and rate-limited. They're basically hints about link integrity. This case, hit following errors: [ 883.545800] r8169 :06:00.0: [ 0] RxErr [ 883.545801] r8169 :06:00.0: [ 8] Rollover [ 883.545802] r8169 :06:00.0: [12] Timeout #1 Timeout -- replay timer timeout, means endpoint didn't response with ACK DLLP or NACK in time, that caused the re-send timer timeout, the sender will re-send the packet. #2 Rollover -- the counter of re-transmission reaches 0 (from 11b ->00b), means the sender had tried 3 times. that would trigger link retraining to recover. #1 & #2 happened together, but no uncorrected errors reported, means the link was recovered, the issue mostly caused by improper TxEQ, receiver equalization, bad signal integrity. #3 RxErr -- bad DLLP, bad TLP, clock issue, signal integrity issue etc. so, yup, basically, the signal integrity is not good enough. Though the function could work, its performance will be impacted. If we change
Re: [PATCH v11 0/5]arm64: add ARCH_HAS_COPY_MC support
Hi Mark: Kindly ping... Thanks, Tong. 在 2024/2/7 21:21, Tong Tiangen 写道: With the increase of memory capacity and density, the probability of memory error also increases. The increasing size and density of server RAM in data centers and clouds have shown increased uncorrectable memory errors. Currently, more and more scenarios that can tolerate memory errors,such as CoW[1,2], KSM copy[3], coredump copy[4], khugepaged[5,6], uaccess copy[7], etc. This patchset introduces a new processing framework on ARM64, which enables ARM64 to support error recovery in the above scenarios, and more scenarios can be expanded based on this in the future. In arm64, memory error handling in do_sea(), which is divided into two cases: 1. If the user state consumed the memory errors, the solution is to kill the user process and isolate the error page. 2. If the kernel state consumed the memory errors, the solution is to panic. For case 2, Undifferentiated panic may not be the optimal choice, as it can be handled better. In some scenarios, we can avoid panic, such as uaccess, if the uaccess fails due to memory error, only the user process will be affected, killing the user process and isolating the user page with hardware memory errors is a better choice. [1] commit d302c2398ba2 ("mm, hwpoison: when copy-on-write hits poison, take page offline") [2] commit 1cb9dc4b475c ("mm: hwpoison: support recovery from HugePage copy-on-write faults") [3] commit 6b970599e807 ("mm: hwpoison: support recovery from ksm_might_need_to_copy()") [4] commit 245f09226893 ("mm: hwpoison: coredump: support recovery from dump_user_range()") [5] commit 98c76c9f1ef7 ("mm/khugepaged: recover from poisoned anonymous memory") [6] commit 12904d953364 ("mm/khugepaged: recover from poisoned file-backed memory") [7] commit 278b917f8cb9 ("x86/mce: Add _ASM_EXTABLE_CPY for copy user access") -- Test result: 1. copy_page(), copy_mc_page() basic function test pass, and the disassembly contents remains the same before and after refactor. 2. copy_to/from_user() access kernel NULL pointer raise translation fault and dump error message then die(), test pass. 3. Test following scenarios: copy_from_user(), get_user(), COW. Before patched: trigger a hardware memory error then panic. After patched: trigger a hardware memory error without panic. Testing step: step1. start an user-process. step2. poison(einj) the user-process's page. step3: user-process access the poison page in kernel mode, then trigger SEA. step4: the kernel will not panic, only the user process is killed, the poison page is isolated. (before patched, the kernel will panic in do_sea()) -- Since V10: Accroding Mark's suggestion: 1. Merge V10's patch2 and patch3 to V11's patch2. 2. Patch2(V11): use new fixup_type for ld* in copy_to_user(), fix fatal issues (NULL kernel pointeraccess) been fixup incorrectly. 3. Patch2(V11): refactoring the logic of do_sea(). 4. Patch4(V11): Remove duplicate assembly logic and remove do_mte(). Besides: 1. Patch2(V11): remove st* insn's fixup, st* generally not trigger memory error. 2. Split a part of the logic of patch2(V11) to patch5(V11), for detail, see patch5(V11)'s commit msg. 3. Remove patch6(v10) “arm64: introduce copy_mc_to_kernel() implementation”. During modification, some problems that cannot be solved in a short period are found. The patch will be released after the problems are solved. 4. Add test result in this patch. 5. Modify patchset title, do not use machine check and remove "-next". Since V9: 1. Rebase to latest kernel version 6.8-rc2. 2. Add patch 6/6 to support copy_mc_to_kernel(). Since V8: 1. Rebase to latest kernel version and fix topo in some of the patches. 2. According to the suggestion of Catalin, I attempted to modify the return value of function copy_mc_[user]_highpage() to bytes not copied. During the modification process, I found that it would be more reasonable to return -EFAULT when copy error occurs (referring to the newly added patch 4). For ARM64, the implementation of copy_mc_[user]_highpage() needs to consider MTE. Considering the scenario where data copying is successful but the MTE tag copying fails, it is also not reasonable to return bytes not copied. 3. Considering the recent addition of machine check safe support for multiple scenarios, modify commit message for patch 5 (patch 4 for V8). Since V7: Currently, there are patches supporting recover from poison consumption for the cow scenario[1]. Therefore, Supporting cow scenario under the arm64 architecture only needs to modify the relevant code under the arch/. [1]https://lore.kernel.org/lkml/20221031201029.102123-1-tony.l...@intel.com/ Since V6: Resend patches that are not merged into the mainline in V6. Since V5: 1. Add patch2/3 to add
Re: [PATCH 0/9] enabled -Wformat-truncation for clang
On Tue, 26 Mar 2024 23:37:59 +0100 Arnd Bergmann wrote: > I hope that the patches can get picked up by platform maintainers > directly, so the final patch can go in later on. platform == subsystem? :)
[PATCH 8/9] ALSA: aoa: avoid false-positive format truncation warning
From: Arnd Bergmann clang warns about what it interprets as a truncated snprintf: sound/aoa/soundbus/i2sbus/core.c:171:6: error: 'snprintf' will always be truncated; specified size is 6, but format string expands to at least 7 [-Werror,-Wformat-truncation-non-kprintf] The actual problem here is that it does not understand the special %pOFn format string and assumes that it is a pointer followed by the string "OFn", which would indeed not fit. Slightly increasing the size of the buffer to its natural alignment avoids the warning, as it is now long enough for the correct and the incorrect interprations. Fixes: b917d58dcfaa ("ALSA: aoa: Convert to using %pOFn instead of device_node.name") Signed-off-by: Arnd Bergmann --- sound/aoa/soundbus/i2sbus/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/aoa/soundbus/i2sbus/core.c b/sound/aoa/soundbus/i2sbus/core.c index b8ff5cccd0c8..5431d2c49421 100644 --- a/sound/aoa/soundbus/i2sbus/core.c +++ b/sound/aoa/soundbus/i2sbus/core.c @@ -158,7 +158,7 @@ static int i2sbus_add_dev(struct macio_dev *macio, struct device_node *child, *sound = NULL; struct resource *r; int i, layout = 0, rlen, ok = force; - char node_name[6]; + char node_name[8]; static const char *rnames[] = { "i2sbus: %pOFn (control)", "i2sbus: %pOFn (tx)", "i2sbus: %pOFn (rx)" }; -- 2.39.2
[PATCH 0/9] enabled -Wformat-truncation for clang
From: Arnd Bergmann With randconfig build testing, I found only eight files that produce warnings with clang when -Wformat-truncation is enabled. This means we can just turn it on by default rather than only enabling it for "make W=1". Unfortunately, gcc produces a lot more warnings when the option is enabled, so it's not yet possible to turn it on both both compilers. I hope that the patches can get picked up by platform maintainers directly, so the final patch can go in later on. Arnd Arnd Bergmann (9): fbdev: shmobile: fix snprintf truncation enetc: avoid truncating error message qed: avoid truncating work queue length mlx5: avoid truncating error message surface3_power: avoid format string truncation warning Input: IMS: fix printf string overflow scsi: mylex: fix sysfs buffer lengths ALSA: aoa: avoid false-positive format truncation warning kbuild: enable -Wformat-truncation on clang drivers/input/misc/ims-pcu.c | 4 ++-- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- .../ethernet/mellanox/mlx5/core/esw/bridge.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_main.c| 9 --- drivers/platform/surface/surface3_power.c | 2 +- drivers/scsi/myrb.c | 20 drivers/scsi/myrs.c | 24 +-- drivers/video/fbdev/sh_mobile_lcdcfb.c| 2 +- scripts/Makefile.extrawarn| 2 ++ sound/aoa/soundbus/i2sbus/core.c | 2 +- 10 files changed, 35 insertions(+), 34 deletions(-) -- 2.39.2 Cc: Dmitry Torokhov Cc: Claudiu Manoil Cc: Vladimir Oltean Cc: Jakub Kicinski Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: Ariel Elior Cc: Manish Chopra Cc: Hans de Goede Cc: "Ilpo Järvinen" Cc: Maximilian Luz Cc: Hannes Reinecke Cc: "Martin K. Petersen" Cc: Helge Deller Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Johannes Berg Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Cc: linux-in...@vger.kernel.org Cc: linux-ker...@vger.kernel.org Cc: net...@vger.kernel.org Cc: linux-r...@vger.kernel.org Cc: platform-driver-...@vger.kernel.org Cc: linux-s...@vger.kernel.org Cc: linux-fb...@vger.kernel.org Cc: dri-de...@lists.freedesktop.org Cc: linux-kbu...@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: alsa-de...@alsa-project.org Cc: linux-so...@vger.kernel.org Cc: l...@lists.linux.dev
Re: [PATCHv3 pci-next 1/2] PCI/AER: correctable error message as KERN_INFO
On Tue, Mar 26, 2024 at 09:39:54AM +0800, Ethan Zhao wrote: > On 3/25/2024 6:15 PM, Xi Ruoyao wrote: > > On Mon, 2024-03-25 at 16:45 +0800, Ethan Zhao wrote: > > > On 3/25/2024 1:19 AM, Xi Ruoyao wrote: > > > > On Mon, 2023-09-18 at 14:39 -0500, Bjorn Helgaas wrote: > > > > > On Mon, Sep 18, 2023 at 07:42:30PM +0800, Xi Ruoyao wrote: > > > > > > ... > > > > > > My workstation suffers from too much correctable AER reporting as > > > > > > well > > > > > > (related to Intel's errata "RPL013: Incorrectly Formed PCIe Packets > > > > > > May > > > > > > Generate Correctable Errors" and/or the motherboard design, I > > > > > > guess). > > > > > We should rate-limit correctable error reporting so it's not > > > > > overwhelming. > > > > > > > > > > At the same time, I'm *also* interested in the cause of these errors, > > > > > in case there's a Linux defect or a hardware erratum that we can work > > > > > around. Do you have a bug report with any more details, e.g., a dmesg > > > > > log and "sudo lspci -vv" output? > > > > Hi Bjorn, > > > > > > > > Sorry for the *very* late reply (somehow I didn't see the reply at all > > > > before it was removed by my cron job, and now I just savaged it from > > > > lore.kernel.org...) > > > > > > > > The dmesg is like: > > > > > > > > [ 882.456994] pcieport :00:1c.1: AER: Multiple Correctable error > > > > message received from :00:1c.1 > > > > [ 882.457002] pcieport :00:1c.1: AER: found no error details for > > > > :00:1c.1 > > > > [ 882.457003] pcieport :00:1c.1: AER: Multiple Correctable error > > > > message received from :06:00.0 > > > > [ 883.545763] pcieport :00:1c.1: AER: Multiple Correctable error > > > > message received from :00:1c.1 > > > > [ 883.545789] pcieport :00:1c.1: PCIe Bus Error: > > > > severity=Correctable, type=Physical Layer, (Receiver ID) > > > > [ 883.545790] pcieport :00:1c.1: device [8086:7a39] error > > > > status/mask=0001/2000 > > > > [ 883.545792] pcieport :00:1c.1: [ 0] RxErr > > > > (First) > > > > [ 883.545794] pcieport :00:1c.1: AER: Error of this Agent is > > > > reported first > > > > [ 883.545798] r8169 :06:00.0: PCIe Bus Error: > > > > severity=Correctable, type=Physical Layer, (Transmitter ID) > > > > [ 883.545799] r8169 :06:00.0: device [10ec:8125] error > > > > status/mask=1101/e000 > > > > [ 883.545800] r8169 :06:00.0: [ 0] RxErr > > > > (First) > > > > [ 883.545801] r8169 :06:00.0: [ 8] Rollover > > > > [ 883.545802] r8169 :06:00.0: [12] Timeout > > > > [ 883.545815] pcieport :00:1c.1: AER: Correctable error message > > > > received from :00:1c.1 > > > > [ 883.545823] pcieport :00:1c.1: AER: found no error details for > > > > :00:1c.1 > > > > [ 883.545824] pcieport :00:1c.1: AER: Multiple Correctable error > > > > message received from :06:00.0 > > > > > > > > lspci output attached. > > > > > > > > Intel has issued an errata "RPL013" saying: > > > > > > > > "Under complex microarchitectural conditions, the PCIe controller may > > > > transmit an incorrectly formed Transaction Layer Packet (TLP), which > > > > will fail CRC checks. When this erratum occurs, the PCIe end point may > > > > record correctable errors resulting in either a NAK or link recovery. > > > > Intel® has not observed any functional impact due to this erratum." > > > > > > > > But I'm really unsure if it describes my issue. > > > > > > > > Do you think I have some broken hardware and I should replace the CPU > > > > and/or the motherboard (where the r8169 is soldered)? I've noticed that > > > > my 13900K is almost impossible to overclock (despite it's a K), but I've > > > > not encountered any issue other than these AER reporting so far after I > > > > gave up overclocking. > > > Seems there are two r8169 nics on your board, only :06:00.0 reports > > > aer errors, how about another one the :07:00.0 nic ? > > It never happens to :07:00.0, even if I plug the ethernet cable into > > it instead of :06:00.0. > > So something is wrong with the physical layer, I guess. > > > Maybe I should just use :07:00.0 and blacklist :06:00.0 as I > > don't need two NICs? > > Yup, > ratelimit the AER warning is another choice instead of change WARN to INFO. > if corrected error flood happens, even the function is working, suggests > something was already wrong, likely will be worse, that is the meaning of > WARN I think. We should fix this. IMHO Correctable Errors should be "info" level, non-alarming, and rate-limited. They're basically hints about link integrity. Bjorn
Re: [PATCH 1/1] x86/elf: Add a new .note section containing Xfeatures information to x86 core files
On 3/14/2024 10:09 PM, Dave Hansen wrote: Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. On 3/14/24 09:29, Borislav Petkov wrote: That argument breaks down a bit on the flags though: xc.xfeat_flags = xstate_flags[i]; Because it comes _directly_ from CPUID with zero filtering: cpuid_count(XSTATE_CPUID, i, , , , ); ... xstate_flags[i] = ecx; So this layout is quite dependent on what's in x86's CPUID. Yeah, no, this should not be copying CPUID flags - those flags should be *translated* to independently defined flags which describe those buffers. Ditto for: xc.xfeat_type = i; Right now, that's bound to CPUID and XSAVE. "feat_type==10" can only ever be PKRU and that's derived from the XSAVE architecture. If you want this to be extensible to things outside of the XSAVE architecture, it needs to be something actually extensible and not entangled with XSAVE. In other words "xc.xfeat_type" can enumerate XSAVE state components being in the dump, but it should not be limited to XSAVE. Just as an example: enum feat_type { FEATURE_XSAVE_PKRU, FEATURE_XSAVE__YMM, FEATURE_XSAVE_BNDREGS, FEATURE_XSAVE_BNDCSR, ... RANDOM_STATE_NOT_XSAVE }; See how feat_type==1 is PKRU and *NOT* feat_type==10? That opens the door to RANDOM_STATE_NOT_XSAVE or anything else you want. This would be _actually_ extensible. Thanks for the review. I will add new enum, instead of using "enum xfeature". Currently we are retaining the flags field. The value will be set to zero at this point, and the field will be reserved for future use. GDB / LLDB would not require this field at this point. Do let us know if this is not OK. -thanks, vigneshbalu.
Re: [PATCH 14/64] i2c: cpm: reword according to newest specification
Hi Wolfram, ... > @@ -570,7 +570,7 @@ static int cpm_i2c_setup(struct cpm_i2c *cpm) > out_8(>i2c_reg->i2brg, brg); > > out_8(>i2c_reg->i2mod, 0x00); > - out_8(>i2c_reg->i2com, I2COM_MASTER); /* Master mode */ > + out_8(>i2c_reg->i2com, I2COM_MASTER); /* Host mode */ I2COM_MASTER might be coming from the datasheet. Reviewed-by: Andi Shyti Thanks,
Re: Appropriate liburcu cache line size for Power
On Tue, Mar 26, 2024 at 06:19:38PM +1100, Michael Ellerman wrote: > Mathieu Desnoyers writes: > The ISA doesn't specify the cache line size, other than it is smaller > than a page. It also says it is "aligned". Nowhere is it said what an aligned size is, but it seems clear it has to be a power of two. > In practice all the 64-bit IBM server CPUs I'm aware of have used 128 > bytes. Yup. It is 128B on p3 already. > It is possible to discover at runtime via AUXV headers. But that's no > use if you want a compile-time constant. The architecture does not require the data block size to be equal to the instruction block size, already. But many programs subscribe to an overly simplified worldview, which is a big reason everything is 128B on all modern PowerPC. It is quite a nice tradeoff size, there has to be a huge change in the world for this to ever change :-) Segher
Re: [PATCH v3 0/5] ASoC: fsl: Support register and unregister rpmsg sound card through remoteproc
On Mon, 11 Mar 2024 20:13:44 +0900, Chancel Liu wrote: > echo /lib/firmware/fw.elf > /sys/class/remoteproc/remoteproc0/firmware > (A) echo start > /sys/class/remoteproc/remoteproc0/state > (B) echo stop > /sys/class/remoteproc/remoteproc0/state > > The rpmsg sound card is registered in (A) and unregistered in (B). > After "start", imx-audio-rpmsg registers devices for ASoC platform driver > and machine driver. Then sound card is registered. After "stop", > imx-audio-rpmsg unregisters devices for ASoC platform driver and machine > driver. Then sound card is unregistered. > > [...] Applied to https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next Thanks! [1/5] ASoC: fsl: imx-pcm-rpmsg: Register component with rpmsg channel name commit: 41f96cd53f2838ac4894491ac5eadb06f1e5b858 [2/5] ASoC: fsl: imx-audio-rpmsg: Register device with rpmsg channel name commit: dacc7459745df168152b5cceba34efade9b5e063 [3/5] ASoC: fsl: Let imx-audio-rpmsg register platform device for card commit: c73524768e9e1a7ac9eb3a4d36a1ac0d34f22644 [4/5] ASoC: fsl: fsl_rpmsg: Register CPU DAI with name of rpmsg channel commit: 0aa7f5406afa828a93d84d75c9b9ac991cd75367 [5/5] ASoC: fsl: imx-rpmsg: Update to correct DT node commit: c14445bdcb98feddf9afaeb05e6193cc1f8eec1a All being well this means that it will be integrated into the linux-next tree (usually sometime in the next 24 hours) and sent to Linus during the next merge window (or sooner if it is a bug fix), however if problems are discovered then the patch may be dropped or reverted. You may get further e-mails resulting from automated or manual testing and review of the tree, please engage with people reporting problems and send followup patches addressing any issues that are reported if needed. If any updates are required or you are submitting further changes they should be sent as incremental updates against current git, existing patches will not be replaced. Please add any relevant lists and maintainers to the CCs when replying to this mail. Thanks, Mark
Re: [RFC PATCH 1/8] mm: Provide pagesize to pmd_populate()
On Mon, Mar 25, 2024 at 07:05:01PM +, Christophe Leroy wrote: > Not looked into details yet, but I guess so. > > By the way there is a wiki dedicated to huge pages on powerpc, you can > have a look at it here : > https://github.com/linuxppc/wiki/wiki/Huge-pages , maybe you'll find > good ideas there to help me. There sure are alot of page tables types here I'm a bit wondering about terminology, eg on the first diagram "huge pte entry" means a PUD entry that is a leaf? Which ones are contiguous replications? Just general remarks on the ones with huge pages: hash 64k and hugepage 16M/16G radix 64k/radix hugepage 2M/1G radix 4k/radix hugepage 2M/1G nohash 32 - I think this is just a normal x86 like scheme? PMD/PUD can be a leaf with the same size as a next level table. Do any of these cases need to know the higher level to parse the lower? eg is there a 2M bit in the PUD indicating that the PMD is a table of 2M leafs or does each PMD entry have a bit indicating it is a leaf? hash 4k and hugepage 16M/16G nohash 64 - How does this work? I guess since 8xx explicitly calls out consecutive this is actually the pgd can point to 512 256M entries or 8 16G entries? Ie the table size at each level is varable? Or is it the same and the table size is still 512 and each 16G entry is replicated 64 times? Do the offset accessors already abstract this enough? 8xx 4K 8xx 16K - As this series does? Jason
Re: Appropriate liburcu cache line size for Power
On 2024-03-26 03:19, Michael Ellerman wrote: Mathieu Desnoyers writes: Hi, Hi Mathieu, In the powerpc architecture support within the liburcu project [1] we have a cache line size defined as 256 bytes with the following comment: /* Include size of POWER5+ L3 cache lines: 256 bytes */ #define CAA_CACHE_LINE_SIZE 256 I recently received a pull request on github [2] asking to change this to 128 bytes. All the material provided supports that the cache line sizes on powerpc are 128 bytes or less (even L3 on POWER7, POWER8, and POWER9) [3]. I wonder where the 256 bytes L3 cache line size for POWER5+ we have in liburcu comes from, and I wonder if it's the right choice for a cache line size on all powerpc, considering that the Linux kernel cache line size appear to use 128 bytes on recent Power architectures. I recall some benchmark experiments Paul and I did on a 64-core 1.9GHz POWER5+ machine that benefited from a 256 bytes cache line size, and I suppose this is why we came up with this value, but I don't have the detailed specs of that machine. Any feedback on this matter would be appreciated. The ISA doesn't specify the cache line size, other than it is smaller than a page. In practice all the 64-bit IBM server CPUs I'm aware of have used 128 bytes. There are some 64-bit CPUs that use 64 bytes, eg. pasemi PA6T and Freescale e6500. It is possible to discover at runtime via AUXV headers. But that's no use if you want a compile-time constant. Indeed, and this CAA_CACHE_LINE_SIZE is part of the liburcu powerpc ABI, so changing this would require a soname bump, which I don't want to do without really good reasons. I'm happy to run some benchmarks if you can point me at what to run. I had a poke around the repository and found short_bench, but it seemed to run for a very long time. I've created a dedicated test program for this, see: https://github.com/compudj/userspace-rcu-dev/tree/false-sharing example use: On a AMD Ryzen 7 PRO 6850U with Radeon Graphics: for a in 8 16 32 64 128 256 512; do tests/unit/test_false_sharing -s $a; done ok 1 - Stride 8 bytes, increments per ms per thread: 21320 1..1 ok 1 - Stride 16 bytes, increments per ms per thread: 22657 1..1 ok 1 - Stride 32 bytes, increments per ms per thread: 47599 1..1 ok 1 - Stride 64 bytes, increments per ms per thread: 531364 1..1 ok 1 - Stride 128 bytes, increments per ms per thread: 523634 1..1 ok 1 - Stride 256 bytes, increments per ms per thread: 519402 1..1 ok 1 - Stride 512 bytes, increments per ms per thread: 520651 1..1 Would point to false-sharing starting with cache lines smaller than 64 bytes. I get similar results (false-sharing under 64 bytes) with a AMD EPYC 9654 96-Core Processor. The test programs runs 4 threads by default, which can be overridden with "-t N". This may be needed if you want this to use all cores from a larger machine. See "-h" for options. On a POWER9 (architected), altivec supported: for a in 8 16 32 64 128 256 512; do tests/unit/test_false_sharing -s $a; done ok 1 - Stride 8 bytes, increments per ms per thread: 12264 1..1 ok 1 - Stride 16 bytes, increments per ms per thread: 12276 1..1 ok 1 - Stride 32 bytes, increments per ms per thread: 25638 1..1 ok 1 - Stride 64 bytes, increments per ms per thread: 39934 1..1 ok 1 - Stride 128 bytes, increments per ms per thread: 53971 1..1 ok 1 - Stride 256 bytes, increments per ms per thread: 53599 1..1 ok 1 - Stride 512 bytes, increments per ms per thread: 53962 1..1 This points at false-sharing below 128 bytes stride. On a e6500, altivec supported, Model 2.0 (pvr 8040 0120) for a in 8 16 32 64 128 256 512; do tests/unit/test_false_sharing -s $a; done ok 1 - Stride 8 bytes, increments per ms per thread: 9049 1..1 ok 1 - Stride 16 bytes, increments per ms per thread: 9054 1..1 ok 1 - Stride 32 bytes, increments per ms per thread: 18643 1..1 ok 1 - Stride 64 bytes, increments per ms per thread: 37417 1..1 ok 1 - Stride 128 bytes, increments per ms per thread: 37906 1..1 ok 1 - Stride 256 bytes, increments per ms per thread: 37870 1..1 ok 1 - Stride 512 bytes, increments per ms per thread: 37899 1..1 Which points at false-sharing below 64 bytes. I prefer to be cautious about this cache line size value and aim for a value which takes into account the largest known cache line size for an architecture rather than use a too small due to the large overhead caused by false-sharing. Feedback is welcome. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. https://www.efficios.com
Re: [PATCH v3 00/12] mm/gup: Unify hugetlb, part 2
On Mon, Mar 25, 2024 at 02:58:48PM -0400, Peter Xu wrote: > > This remark would be a little easier to understand if you refer to > > hugetlb_walk() not huge_pte_offset() - the latter is only used to > > implement hugetlb_walk() and isn't removed by this series, while a > > single hugetlb_walk() was removed. > > Right. Here huge_pte_offset() is the arch API that I hope we can remove, > the hugetlb_walk() is simply the wrapper. But arguably hugetlb_walk is the thing that should go.. In the generic code we should really try to get away from the weird hugetlb abuse of treating every level as a pte_t. That is not how the generic page table API works and that weirdness is part of what motivates the arch API to supply special functions for reading. Not every arch can just cast every level to a pte_t and still work. But that weirdness is also saving alot of code so something else needs to be though up.. > > Regardless, I think the point is spot on, the direction should be to > > make the page table reading generic with minimal/no interaction with > > hugetlbfs in the generic code. > > Yes, and I also like your terms on calling them "pgtable readers". It's a > better way to describe the difference in that regard between > huge_pte_offset() v.s. huge_pte_alloc(). Exactly that's my goal, that we > should start with the "readers". Yeah, it makes alot of sense to tackle the readers first - we are pretty close now to having enough done to have generic readers. I would imagine tackling everything outside mm/huge*.c to use the normal page table API for reading. Then consider what to do with the reading in mm/huge*.c > The writters might change semantics when merge, and can be more > challenging, I'll need to look into details of each one, like page fault > handlers. Such work may need to be analyzed case by case, and this GUP > part is definitely the low hanging fruit comparing to the rest. The write side is tricky, I think if the read side is sorted out then it will be easer to reason about the write side. Today the write side is paired with the special read side and it is extra hard to understand if there is something weird hidden in the arch. > measurements too when getting there. And btw, IIUC the major challenge of > pagewalk.c is not the removal of walk_hugetlb_range() alone - that may not > be that hard if that's the solo purpose. The better way to go is to remove > mm_walk_ops.hugetlb_entry() altogether, which will cause a change in all > callers; that's "the challenge".. pretty much labor works, not a major > technical challenge it seems. Not sure if it's a good news or bad.. Ugh, that is a big pain. It is relying on that hugetlbfs trick of passing in a pte that is not a pte to make the API generic.. The more I look at this the more I think we need to get to Matthew's idea of having some kind of generic page table API that is not tightly tied to level. Replacing the hugetlb trick of 'everything is a PTE' with 5 special cases in every place seems just horrible. struct mm_walk_ops { int (*leaf_entry)(struct mm_walk_state *state, struct mm_walk *walk); } And many cases really want something like: struct mm_walk_state state; if (!mm_walk_seek_leaf(state, mm, address)) goto no_present if (mm_walk_is_write(state)) .. And detailed walking: for_each_pt_leaf(state, mm, address) { if (mm_walk_is_write(state)) .. } Replacing it with a mm_walk_state that retains the level or otherwise to allow decoding any entry composes a lot better. Forced Loop unrolling can get back to the current code gen in alot of places. It also makes the power stuff a bit nicer as the mm_walk_state could automatically retain back pointers to the higher levels in the state struct too... The puzzle is how to do it and still get reasonable efficient codegen, many operations are going to end up switching on some state->level to know how to decode the entry. > One thing I'll soon look into is to allow huge mappings for PFNMAP; there's > one request from VFIO side for MMIO. Dropping mm_walk_ops.hugetlb_entry() > seems to be good for all such purposes; well, I may need to bite the bullet > here.. for either of the purposes to move on. That would be a nice feature! > > If, or how much, the hugepd write side remains special is the open > > question, I think. > > It seems balls are rolling in that aspect, I haven't looked in depth there > in Christophe's series but it's great to have started! Yes! Jason
Re: [PATCH v2 3/6] mm/mm_init.c: add new function calc_nr_all_pages()
On 03/26/24 at 08:57am, Mike Rapoport wrote: > Hi Baoquan, > > On Mon, Mar 25, 2024 at 10:56:43PM +0800, Baoquan He wrote: > > This is a preparation to calculate nr_kernel_pages and nr_all_pages, > > both of which will be used later in alloc_large_system_hash(). > > > > nr_all_pages counts up all free but not reserved memory in memblock > > allocator, including HIGHMEM memory. While nr_kernel_pages counts up > > all free but not reserved low memory in memblock allocator, excluding > > HIGHMEM memory. > > Sorry I've missed this in the previous review, but I think this patch and > the patch "remove unneeded calc_memmap_size()" can be merged into "remove > meaningless calculation of zone->managed_pages in free_area_init_core()" > with an appropriate update of the commit message. > > With the current patch splitting there will be compilation warning about > unused > function for this and the next patch. Thanks for careful checking. We need to make patch bisect-able to not break compiling so that people can spot the cirminal commit, that's for sure. Do we need care about the compiling warning from intermediate patch in one series? Not sure about it. I always suggest people to seperate out this kind of newly added function to a standalone patch for better reviewing and later checking, and I saw a lot of commits like this by searching with 'git log --oneline | grep helper' > > > Signed-off-by: Baoquan He > > --- > > mm/mm_init.c | 24 > > 1 file changed, 24 insertions(+) > > > > diff --git a/mm/mm_init.c b/mm/mm_init.c > > index 153fb2dc666f..c57a7fc97a16 100644 > > --- a/mm/mm_init.c > > +++ b/mm/mm_init.c > > @@ -1264,6 +1264,30 @@ static void __init > > reset_memoryless_node_totalpages(struct pglist_data *pgdat) > > pr_debug("On node %d totalpages: 0\n", pgdat->node_id); > > } > > > > +static void __init calc_nr_kernel_pages(void) > > +{ > > + unsigned long start_pfn, end_pfn; > > + phys_addr_t start_addr, end_addr; > > + u64 u; > > +#ifdef CONFIG_HIGHMEM > > + unsigned long high_zone_low = > > arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; > > +#endif > > + > > + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, _addr, > > _addr, NULL) { > > + start_pfn = PFN_UP(start_addr); > > + end_pfn = PFN_DOWN(end_addr); > > + > > + if (start_pfn < end_pfn) { > > + nr_all_pages += end_pfn - start_pfn; > > +#ifdef CONFIG_HIGHMEM > > + start_pfn = clamp(start_pfn, 0, high_zone_low); > > + end_pfn = clamp(end_pfn, 0, high_zone_low); > > +#endif > > + nr_kernel_pages += end_pfn - start_pfn; > > + } > > + } > > +} > > + > > static void __init calculate_node_totalpages(struct pglist_data *pgdat, > > unsigned long node_start_pfn, > > unsigned long node_end_pfn) > > -- > > 2.41.0 > > > > -- > Sincerely yours, > Mike. >
Re: [PATCH v1 1/1] ASoC: fsl: imx-es8328: Remove leftover gpio initialisation
On Mon, 25 Mar 2024 21:13:41 +0200, Andy Shevchenko wrote: > The gpio field is not used anymore, remove the leftover. > This also fixes the compilation error after the ... > > Applied to https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next Thanks! [1/1] ASoC: fsl: imx-es8328: Remove leftover gpio initialisation commit: 6a92834166b16babd70e99c3e0ce9262893ad6ae All being well this means that it will be integrated into the linux-next tree (usually sometime in the next 24 hours) and sent to Linus during the next merge window (or sooner if it is a bug fix), however if problems are discovered then the patch may be dropped or reverted. You may get further e-mails resulting from automated or manual testing and review of the tree, please engage with people reporting problems and send followup patches addressing any issues that are reported if needed. If any updates are required or you are submitting further changes they should be sent as incremental updates against current git, existing patches will not be replaced. Please add any relevant lists and maintainers to the CCs when replying to this mail. Thanks, Mark
Re: [PATCH 1/1] x86/elf: Add a new .note section containing Xfeatures information to x86 core files
IMHO you should split the changes to replace ARCH_HAVE_EXTRA_ELF_NOTES with CONFIG_ARCH_HAVE_EXTRA_ELF_NOTES into a lead-up patch. cheers Thanks for the input and i will take care in next version. regards, vigneshbalu.
Re: [PATCH 1/1] x86/elf: Add a new .note section containing Xfeatures information to x86 core files
Otherwise looks reasonable, though I see Dave has feedback to address too. :) Thanks for working on this! -Kees Thank you for the review. I will address all this on next version. thanks, vigneshbalu. -- Kees Cook
[PATCH v4] arch/powerpc/kvm: Add support for reading VPA counters for pseries guests
PAPR hypervisor has introduced three new counters in the VPA area of LPAR CPUs for KVM L2 guest (see [1] for terminology) observability - 2 for context switches from host to guest and vice versa, and 1 counter for getting the total time spent inside the KVM guest. Add a tracepoint that enables reading the counters for use by ftrace/perf. Note that this tracepoint is only available for nestedv2 API (i.e, KVM on PowerVM). Also maintain an aggregation of the context switch times in vcpu->arch. This will be useful in getting the aggregate times with a pmu driver which will be upstreamed in the near future. [1] Terminology: a. L1 refers to the VM (LPAR) booted on top of PAPR hypervisor b. L2 refers to the KVM guest booted on top of L1. Signed-off-by: Vaibhav Jain Signed-off-by: Gautam Menghani --- v1 -> v2: 1. Fix the build error due to invalid struct member reference. v2 -> v3: 1. Move the counter disabling and zeroing code to a different function. 2. Move the get_lppaca() inside the tracepoint_enabled() branch. 3. Add the aggregation logic to maintain total context switch time. v3 -> v4: 1. After vcpu_run, check the VPA flag instead of checking for tracepoint being enabled for disabling the cs time accumulation. arch/powerpc/include/asm/kvm_host.h | 5 + arch/powerpc/include/asm/lppaca.h | 11 --- arch/powerpc/kvm/book3s_hv.c| 30 + arch/powerpc/kvm/trace_hv.h | 25 4 files changed, 68 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8abac5321..d953b32dd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -847,6 +847,11 @@ struct kvm_vcpu_arch { gpa_t nested_io_gpr; /* For nested APIv2 guests*/ struct kvmhv_nestedv2_io nestedv2_io; + + /* Aggregate context switch and guest run time info (in ns) */ + u64 l1_to_l2_cs_agg; + u64 l2_to_l1_cs_agg; + u64 l2_runtime_agg; #endif #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h index 61ec2447d..bda6b86b9 100644 --- a/arch/powerpc/include/asm/lppaca.h +++ b/arch/powerpc/include/asm/lppaca.h @@ -62,7 +62,8 @@ struct lppaca { u8 donate_dedicated_cpu; /* Donate dedicated CPU cycles */ u8 fpregs_in_use; u8 pmcregs_in_use; - u8 reserved8[28]; + u8 l2_accumul_cntrs_enable; /* Enable usage of counters for KVM guest */ + u8 reserved8[27]; __be64 wait_state_cycles; /* Wait cycles for this proc */ u8 reserved9[28]; __be16 slb_count; /* # of SLBs to maintain */ @@ -92,9 +93,13 @@ struct lppaca { /* cacheline 4-5 */ __be32 page_ins; /* CMO Hint - # page ins by OS */ - u8 reserved12[148]; + u8 reserved12[28]; + volatile __be64 l1_to_l2_cs_tb; + volatile __be64 l2_to_l1_cs_tb; + volatile __be64 l2_runtime_tb; + u8 reserved13[96]; volatile __be64 dtl_idx;/* Dispatch Trace Log head index */ - u8 reserved13[96]; + u8 reserved14[96]; } cacheline_aligned; #define lppaca_of(cpu) (*paca_ptrs[cpu]->lppaca_ptr) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8e86eb577..dcd6edd3b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4108,6 +4108,27 @@ static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu) } } +static void do_trace_nested_cs_time(struct kvm_vcpu *vcpu) +{ + struct lppaca *lp = get_lppaca(); + u64 l1_to_l2_ns, l2_to_l1_ns, l2_runtime_ns; + + l1_to_l2_ns = tb_to_ns(be64_to_cpu(lp->l1_to_l2_cs_tb)); + l2_to_l1_ns = tb_to_ns(be64_to_cpu(lp->l2_to_l1_cs_tb)); + l2_runtime_ns = tb_to_ns(be64_to_cpu(lp->l2_runtime_tb)); + trace_kvmppc_vcpu_exit_cs_time(vcpu, l1_to_l2_ns, l2_to_l1_ns, + l2_runtime_ns); + lp->l1_to_l2_cs_tb = 0; + lp->l2_to_l1_cs_tb = 0; + lp->l2_runtime_tb = 0; + lp->l2_accumul_cntrs_enable = 0; + + // Maintain an aggregate of context switch times + vcpu->arch.l1_to_l2_cs_agg += l1_to_l2_ns; + vcpu->arch.l2_to_l1_cs_agg += l2_to_l1_ns; + vcpu->arch.l2_runtime_agg += l2_runtime_ns; +} + static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { @@ -4130,6 +4151,11 @@ static int kvmhv_vcpu_entry_nestedv2(struct kvm_vcpu *vcpu, u64 time_limit, kvmppc_gse_put_u64(io->vcpu_run_input, KVMPPC_GSID_LPCR, lpcr); accumulate_time(vcpu, >arch.in_guest); + + /* Enable the guest host context switch time tracking */ + if (unlikely(trace_kvmppc_vcpu_exit_cs_time_enabled())) +
Re: [PATCH 3/3] tools/perf/arch/powerc: Add get_arch_regnum for powerpc
Hi Athira and Namhyung, On 03/09/2024 03:25 PM, Athira Rajeev wrote: The function get_dwarf_regnum() returns a DWARF register number from a register name string. This calls arch specific function get_arch_regnum to return register number for corresponding arch. Add mappings for register name to register number in powerpc code: arch/powerpc/util/dwarf-regs.c Signed-off-by: Athira Rajeev --- tools/perf/arch/powerpc/util/dwarf-regs.c | 29 +++ 1 file changed, 29 insertions(+) I found commit 3eee606757ad ("perf dwarf-regs: Add get_dwarf_regnum()") for x86, would you be able to share how to test these changes? What is the difference with and without the patches? Thanks, Tiezhu
Re: Appropriate liburcu cache line size for Power
Mathieu Desnoyers writes: > Hi, Hi Mathieu, > In the powerpc architecture support within the liburcu project [1] > we have a cache line size defined as 256 bytes with the following > comment: > > /* Include size of POWER5+ L3 cache lines: 256 bytes */ > #define CAA_CACHE_LINE_SIZE 256 > > I recently received a pull request on github [2] asking to > change this to 128 bytes. All the material provided supports > that the cache line sizes on powerpc are 128 bytes or less (even > L3 on POWER7, POWER8, and POWER9) [3]. > > I wonder where the 256 bytes L3 cache line size for POWER5+ > we have in liburcu comes from, and I wonder if it's the right choice > for a cache line size on all powerpc, considering that the Linux > kernel cache line size appear to use 128 bytes on recent Power > architectures. I recall some benchmark experiments Paul and I did > on a 64-core 1.9GHz POWER5+ machine that benefited from a 256 bytes > cache line size, and I suppose this is why we came up with this > value, but I don't have the detailed specs of that machine. > > Any feedback on this matter would be appreciated. The ISA doesn't specify the cache line size, other than it is smaller than a page. In practice all the 64-bit IBM server CPUs I'm aware of have used 128 bytes. There are some 64-bit CPUs that use 64 bytes, eg. pasemi PA6T and Freescale e6500. It is possible to discover at runtime via AUXV headers. But that's no use if you want a compile-time constant. I'm happy to run some benchmarks if you can point me at what to run. I had a poke around the repository and found short_bench, but it seemed to run for a very long time. cheers
Re: [PATCH v2 3/3] powerpc/code-patching: Restore 32-bit patching performance
Le 25/03/2024 à 23:48, Benjamin Gray a écrit : > The new open/close abstraction makes it more difficult for a > compiler to optimise. This causes 10% worse performance on > ppc32 as in [1]. Restoring the page alignment mask and inlining > the helpers allows the compiler to better reason about the address > alignment, allowing more optimised cache flushing selection. This should be squashed into patch 1. There is no point in having that as a separate patch when in the same series. > > [1]: > https://lore.kernel.org/all/77fdcdeb-4af5-4ad0-a4c6-57bf0762d...@csgroup.eu/ > > Suggested-by: Christophe Leroy > Signed-off-by: Benjamin Gray > > --- > > v2: * New in v2 > > I think Suggested-by is an appropriate tag. The patch is Christophe's > from the link, I just added the commit description, so it could well > be better to change the author to Christophe completely. > --- > arch/powerpc/lib/code-patching.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/lib/code-patching.c > b/arch/powerpc/lib/code-patching.c > index b3a644290369..d089da115987 100644 > --- a/arch/powerpc/lib/code-patching.c > +++ b/arch/powerpc/lib/code-patching.c > @@ -282,13 +282,13 @@ struct patch_window { >* Interrupts must be disabled for the entire duration of the patching. The > PIDR >* is potentially changed during this time. >*/ > -static int open_patch_window(void *addr, struct patch_window *ctx) > +static __always_inline int open_patch_window(void *addr, struct patch_window > *ctx) > { > unsigned long pfn = get_patch_pfn(addr); > > lockdep_assert_irqs_disabled(); > > - ctx->text_poke_addr = (unsigned > long)__this_cpu_read(cpu_patching_context.addr); > + ctx->text_poke_addr = (unsigned > long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK; > > if (!mm_patch_enabled()) { > ctx->ptep = __this_cpu_read(cpu_patching_context.pte); > @@ -331,7 +331,7 @@ static int open_patch_window(void *addr, struct > patch_window *ctx) > return 0; > } > > -static void close_patch_window(struct patch_window *ctx) > +static __always_inline void close_patch_window(struct patch_window *ctx) > { > lockdep_assert_irqs_disabled(); >
Re: [PATCH v2 6/6] mm/mm_init.c: remove arch_reserved_kernel_pages()
On Mon, Mar 25, 2024 at 10:56:46PM +0800, Baoquan He wrote: > Since the current calculation of calc_nr_kernel_pages() has taken into > consideration of kernel reserved memory, no need to have > arch_reserved_kernel_pages() any more. > > Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) > --- > arch/powerpc/include/asm/mmu.h | 4 > arch/powerpc/kernel/fadump.c | 5 - > include/linux/mm.h | 3 --- > mm/mm_init.c | 12 > 4 files changed, 24 deletions(-) > > diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h > index 3b72c7ed24cf..aa5c0fd5edb1 100644 > --- a/arch/powerpc/include/asm/mmu.h > +++ b/arch/powerpc/include/asm/mmu.h > @@ -406,9 +406,5 @@ extern void *abatron_pteptrs[2]; > #include > #endif > > -#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) > -#define __HAVE_ARCH_RESERVED_KERNEL_PAGES > -#endif > - > #endif /* __KERNEL__ */ > #endif /* _ASM_POWERPC_MMU_H_ */ > diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c > index d14eda1e8589..ae8c7619e597 100644 > --- a/arch/powerpc/kernel/fadump.c > +++ b/arch/powerpc/kernel/fadump.c > @@ -1735,8 +1735,3 @@ static void __init fadump_reserve_crash_area(u64 base) > memblock_reserve(mstart, msize); > } > } > - > -unsigned long __init arch_reserved_kernel_pages(void) > -{ > - return memblock_reserved_size() / PAGE_SIZE; > -} > diff --git a/include/linux/mm.h b/include/linux/mm.h > index ad19350e1538..ab1ba0a31429 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -3221,9 +3221,6 @@ static inline void show_mem(void) > extern long si_mem_available(void); > extern void si_meminfo(struct sysinfo * val); > extern void si_meminfo_node(struct sysinfo *val, int nid); > -#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES > -extern unsigned long arch_reserved_kernel_pages(void); > -#endif > > extern __printf(3, 4) > void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); > diff --git a/mm/mm_init.c b/mm/mm_init.c > index e269a724f70e..089dc60159e9 100644 > --- a/mm/mm_init.c > +++ b/mm/mm_init.c > @@ -2373,17 +2373,6 @@ void __init page_alloc_init_late(void) > page_alloc_sysctl_init(); > } > > -#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES > -/* > - * Returns the number of pages that arch has reserved but > - * is not known to alloc_large_system_hash(). > - */ > -static unsigned long __init arch_reserved_kernel_pages(void) > -{ > - return 0; > -} > -#endif > - > /* > * Adaptive scale is meant to reduce sizes of hash tables on large memory > * machines. As memory size is increased the scale is also increased but at > @@ -2426,7 +2415,6 @@ void *__init alloc_large_system_hash(const char > *tablename, > if (!numentries) { > /* round applicable memory size up to nearest megabyte */ > numentries = nr_kernel_pages; > - numentries -= arch_reserved_kernel_pages(); > > /* It isn't necessary when PAGE_SIZE >= 1MB */ > if (PAGE_SIZE < SZ_1M) > -- > 2.41.0 > -- Sincerely yours, Mike.
Re: [PATCH v2 3/6] mm/mm_init.c: add new function calc_nr_all_pages()
Hi Baoquan, On Mon, Mar 25, 2024 at 10:56:43PM +0800, Baoquan He wrote: > This is a preparation to calculate nr_kernel_pages and nr_all_pages, > both of which will be used later in alloc_large_system_hash(). > > nr_all_pages counts up all free but not reserved memory in memblock > allocator, including HIGHMEM memory. While nr_kernel_pages counts up > all free but not reserved low memory in memblock allocator, excluding > HIGHMEM memory. Sorry I've missed this in the previous review, but I think this patch and the patch "remove unneeded calc_memmap_size()" can be merged into "remove meaningless calculation of zone->managed_pages in free_area_init_core()" with an appropriate update of the commit message. With the current patch splitting there will be compilation warning about unused function for this and the next patch. > Signed-off-by: Baoquan He > --- > mm/mm_init.c | 24 > 1 file changed, 24 insertions(+) > > diff --git a/mm/mm_init.c b/mm/mm_init.c > index 153fb2dc666f..c57a7fc97a16 100644 > --- a/mm/mm_init.c > +++ b/mm/mm_init.c > @@ -1264,6 +1264,30 @@ static void __init > reset_memoryless_node_totalpages(struct pglist_data *pgdat) > pr_debug("On node %d totalpages: 0\n", pgdat->node_id); > } > > +static void __init calc_nr_kernel_pages(void) > +{ > + unsigned long start_pfn, end_pfn; > + phys_addr_t start_addr, end_addr; > + u64 u; > +#ifdef CONFIG_HIGHMEM > + unsigned long high_zone_low = > arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; > +#endif > + > + for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, _addr, > _addr, NULL) { > + start_pfn = PFN_UP(start_addr); > + end_pfn = PFN_DOWN(end_addr); > + > + if (start_pfn < end_pfn) { > + nr_all_pages += end_pfn - start_pfn; > +#ifdef CONFIG_HIGHMEM > + start_pfn = clamp(start_pfn, 0, high_zone_low); > + end_pfn = clamp(end_pfn, 0, high_zone_low); > +#endif > + nr_kernel_pages += end_pfn - start_pfn; > + } > + } > +} > + > static void __init calculate_node_totalpages(struct pglist_data *pgdat, > unsigned long node_start_pfn, > unsigned long node_end_pfn) > -- > 2.41.0 > -- Sincerely yours, Mike.
Re: [PATCH v2 2/6] mm/mm_init.c: remove the useless dma_reserve
On Mon, Mar 25, 2024 at 10:56:42PM +0800, Baoquan He wrote: > Now nobody calls set_dma_reserve() to set value for dma_reserve, remove > set_dma_reserve(), global variable dma_reserve and the codes using it. > > Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 1 - > mm/mm_init.c | 23 --- > 2 files changed, 24 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 0436b919f1c7..ad19350e1538 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -3210,7 +3210,6 @@ static inline int early_pfn_to_nid(unsigned long pfn) > extern int __meminit early_pfn_to_nid(unsigned long pfn); > #endif > > -extern void set_dma_reserve(unsigned long new_dma_reserve); > extern void mem_init(void); > extern void __init mmap_init(void); > > diff --git a/mm/mm_init.c b/mm/mm_init.c > index 549e76af8f82..153fb2dc666f 100644 > --- a/mm/mm_init.c > +++ b/mm/mm_init.c > @@ -226,7 +226,6 @@ static unsigned long required_movablecore_percent > __initdata; > > static unsigned long nr_kernel_pages __initdata; > static unsigned long nr_all_pages __initdata; > -static unsigned long dma_reserve __initdata; > > static bool deferred_struct_pages __meminitdata; > > @@ -1583,12 +1582,6 @@ static void __init free_area_init_core(struct > pglist_data *pgdat) > zone_names[j], memmap_pages, freesize); > } > > - /* Account for reserved pages */ > - if (j == 0 && freesize > dma_reserve) { > - freesize -= dma_reserve; > - pr_debug(" %s zone: %lu pages reserved\n", > zone_names[0], dma_reserve); > - } > - > if (!is_highmem_idx(j)) > nr_kernel_pages += freesize; > /* Charge for highmem memmap if there are enough kernel pages */ > @@ -2547,22 +2540,6 @@ void *__init alloc_large_system_hash(const char > *tablename, > return table; > } > > -/** > - * set_dma_reserve - set the specified number of pages reserved in the first > zone > - * @new_dma_reserve: The number of pages to mark reserved > - * > - * The per-cpu batchsize and zone watermarks are determined by managed_pages. > - * In the DMA zone, a significant percentage may be consumed by kernel image > - * and other unfreeable allocations which can skew the watermarks badly. This > - * function may optionally be used to account for unfreeable pages in the > - * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and > - * smaller per-cpu batchsize. > - */ > -void __init set_dma_reserve(unsigned long new_dma_reserve) > -{ > - dma_reserve = new_dma_reserve; > -} > - > void __init memblock_free_pages(struct page *page, unsigned long pfn, > unsigned int order) > { > -- > 2.41.0 > > -- Sincerely yours, Mike.
Re: [PATCH v2 1/6] x86: remove unneeded memblock_find_dma_reserve()
On Mon, Mar 25, 2024 at 10:56:41PM +0800, Baoquan He wrote: > Variable dma_reserve and its usage was introduced in commit 0e0b864e069c > ("[PATCH] Account for memmap and optionally the kernel image as holes"). > Its original purpose was to accounting for the reserved pages in DMA > zone to make DMA zone's watermarks calculation more accurate on x86. > > However, currently there's zone->managed_pages to account for all > available pages for buddy, zone->present_pages to account for all > present physical pages in zone. What is more important, on x86, > calculating and setting the zone->managed_pages is a temporary move, > all zone's managed_pages will be zeroed out and reset to the actual > value according to how many pages are added to buddy allocator in > mem_init(). Before mem_init(), no buddy alloction is requested. And > zone's pcp and watermark setting are all done after mem_init(). So, > no need to worry about the DMA zone's setting accuracy during > free_area_init(). > > Hence, remove memblock_find_dma_reserve() to stop calculating and > setting dma_reserve. > > Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport (IBM) > --- > arch/x86/include/asm/pgtable.h | 1 - > arch/x86/kernel/setup.c| 2 -- > arch/x86/mm/init.c | 47 -- > 3 files changed, 50 deletions(-) > > diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h > index 315535ffb258..cefc7a84f7a4 100644 > --- a/arch/x86/include/asm/pgtable.h > +++ b/arch/x86/include/asm/pgtable.h > @@ -1200,7 +1200,6 @@ static inline int pgd_none(pgd_t pgd) > extern int direct_gbpages; > void init_mem_mapping(void); > void early_alloc_pgt_buf(void); > -extern void memblock_find_dma_reserve(void); > void __init poking_init(void); > unsigned long init_memory_mapping(unsigned long start, > unsigned long end, pgprot_t prot); > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > index ef206500ed6f..74873bd04ad1 100644 > --- a/arch/x86/kernel/setup.c > +++ b/arch/x86/kernel/setup.c > @@ -1106,8 +1106,6 @@ void __init setup_arch(char **cmdline_p) >*/ > arch_reserve_crashkernel(); > > - memblock_find_dma_reserve(); > - > if (!early_xdbc_setup_hardware()) > early_xdbc_register_console(); > > diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c > index 679893ea5e68..615f0bf4bda6 100644 > --- a/arch/x86/mm/init.c > +++ b/arch/x86/mm/init.c > @@ -990,53 +990,6 @@ void __init free_initrd_mem(unsigned long start, > unsigned long end) > } > #endif > > -/* > - * Calculate the precise size of the DMA zone (first 16 MB of RAM), > - * and pass it to the MM layer - to help it set zone watermarks more > - * accurately. > - * > - * Done on 64-bit systems only for the time being, although 32-bit systems > - * might benefit from this as well. > - */ > -void __init memblock_find_dma_reserve(void) > -{ > -#ifdef CONFIG_X86_64 > - u64 nr_pages = 0, nr_free_pages = 0; > - unsigned long start_pfn, end_pfn; > - phys_addr_t start_addr, end_addr; > - int i; > - u64 u; > - > - /* > - * Iterate over all memory ranges (free and reserved ones alike), > - * to calculate the total number of pages in the first 16 MB of RAM: > - */ > - nr_pages = 0; > - for_each_mem_pfn_range(i, MAX_NUMNODES, _pfn, _pfn, NULL) { > - start_pfn = min(start_pfn, MAX_DMA_PFN); > - end_pfn = min(end_pfn, MAX_DMA_PFN); > - > - nr_pages += end_pfn - start_pfn; > - } > - > - /* > - * Iterate over free memory ranges to calculate the number of free > - * pages in the DMA zone, while not counting potential partial > - * pages at the beginning or the end of the range: > - */ > - nr_free_pages = 0; > - for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, _addr, > _addr, NULL) { > - start_pfn = min_t(unsigned long, PFN_UP(start_addr), > MAX_DMA_PFN); > - end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), > MAX_DMA_PFN); > - > - if (start_pfn < end_pfn) > - nr_free_pages += end_pfn - start_pfn; > - } > - > - set_dma_reserve(nr_pages - nr_free_pages); > -#endif > -} > - > void __init zone_sizes_init(void) > { > unsigned long max_zone_pfns[MAX_NR_ZONES]; > -- > 2.41.0 > -- Sincerely yours, Mike.
[PATCH v18 6/6] powerpc/crash: add crash memory hotplug support
Extend the arch crash hotplug handler, as introduced by the patch title ("powerpc: add crash CPU hotplug support"), to also support memory add/remove events. Elfcorehdr describes the memory of the crash kernel to capture the kernel; hence, it needs to be updated if memory resources change due to memory add/remove events. Therefore, arch_crash_handle_hotplug_event() is updated to recreate the elfcorehdr and replace it with the previous one on memory add/remove events. The memblock list is used to prepare the elfcorehdr. In the case of memory hot remove, the memblock list is updated after the arch crash hotplug handler is triggered, as depicted in Figure 1. Thus, the hot-removed memory is explicitly removed from the crash memory ranges to ensure that the memory ranges added to elfcorehdr do not include the hot-removed memory. Memory remove | v Offline pages | v Initiate memory notify call <> crash hotplug handler chain for MEM_OFFLINE event | v Update memblock list Figure 1 There are two system calls, `kexec_file_load` and `kexec_load`, used to load the kdump image. A few changes have been made to ensure that the kernel can safely update the elfcorehdr component of the kdump image for both system calls. For the kexec_file_load syscall, kdump image is prepared in the kernel. To support an increasing number of memory regions, the elfcorehdr is built with extra buffer space to ensure that it can accommodate additional memory ranges in future. For the kexec_load syscall, the elfcorehdr is updated only if the KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by the kexec tool. Passing this flag to the kernel indicates that the elfcorehdr is built to accommodate additional memory ranges and the elfcorehdr segment is not considered for SHA calculation, making it safe to update. The changes related to this feature are kept under the CRASH_HOTPLUG config, and it is enabled by default. Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Cc: Akhil Raj Cc: Andrew Morton Cc: Aneesh Kumar K.V Cc: Baoquan He Cc: Borislav Petkov (AMD) Cc: Boris Ostrovsky Cc: Christophe Leroy Cc: Dave Hansen Cc: Dave Young Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Laurent Dufour Cc: Mahesh Salgaonkar Cc: Michael Ellerman Cc: Mimi Zohar Cc: Naveen N Rao Cc: Oscar Salvador Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vivek Goyal Cc: ke...@lists.infradead.org Cc: x...@kernel.org --- * No changes in v18. arch/powerpc/include/asm/kexec.h| 3 + arch/powerpc/include/asm/kexec_ranges.h | 1 + arch/powerpc/kexec/crash.c | 95 - arch/powerpc/kexec/file_load_64.c | 20 +- arch/powerpc/kexec/ranges.c | 85 ++ 5 files changed, 202 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index e75970351bcd..95a98b390d62 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -141,6 +141,9 @@ void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags); #define arch_crash_hotplug_support arch_crash_hotplug_support + +unsigned int arch_crash_get_elfcorehdr_size(void); +#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size #endif /* CONFIG_CRASH_HOTPLUG */ extern int crashing_cpu; diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index 8489e844b447..14055896cbcb 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -7,6 +7,7 @@ void sort_memory_ranges(struct crash_mem *mrngs, bool merge); struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges); int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); +int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); int get_exclude_memory_ranges(struct crash_mem **mem_ranges); int get_reserved_memory_ranges(struct crash_mem **mem_ranges); int get_crash_memory_ranges(struct crash_mem **mem_ranges); diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index 8938a19af12f..21b193e938a3 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include /* * The primary CPU waits a while for all secondary CPUs to enter. This is to @@ -398,6 +400,94 @@ void default_machine_crash_shutdown(struct pt_regs *regs) #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt +/* + * Advertise preferred elfcorehdr size to userspace via + * /sys/kernel/crash_elfcorehdr_size sysfs interface. + */ +unsigned int arch_crash_get_elfcorehdr_size(void) +{ + unsigned long phdr_cnt; + + /* A program header