[PATCH v3 11/11] selftests/powerpc/dexcr: Add DEXCR status utility lsdexcr
Add a utility 'lsdexcr' to print the current DEXCR status. Useful for quickly checking the status such as when debugging test failures or verifying the new default DEXCR does what you want (for userspace at least). Example output: # ./lsdexcr uDEXCR: 0400 (NPHIE) HDEXCR: Effective: 0400 (NPHIE) SBHE (0): clear (Speculative branch hint enable) IBRTPD (3): clear (Indirect branch recurrent target ...) SRAPD (4): clear (Subroutine return address ...) NPHIE * (5): set (Non-privileged hash instruction enable) PHIE (6): clear (Privileged hash instruction enable) DEXCR[NPHIE] enabled: hashst/hashchk working Signed-off-by: Benjamin Gray --- v1: * Report if hashst/hashchk actually does something --- .../selftests/powerpc/dexcr/.gitignore| 1 + .../testing/selftests/powerpc/dexcr/Makefile | 2 + .../testing/selftests/powerpc/dexcr/lsdexcr.c | 141 ++ 3 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/powerpc/dexcr/lsdexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index d12e4560aca9..b82f45dd46b9 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1 +1,2 @@ hashchk_test +lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 16c8b489948a..76210f2bcec3 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,7 +1,9 @@ TEST_GEN_PROGS := hashchk_test +TEST_GEN_FILES := lsdexcr include ../../lib.mk $(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c +$(TEST_GEN_FILES): ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c new file mode 100644 index ..94abbfcc389e --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ 
-0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +static unsigned int dexcr; +static unsigned int hdexcr; +static unsigned int effective; + +struct dexcr_aspect { + const char *name; + const char *desc; + unsigned int index; +}; + +static const struct dexcr_aspect aspects[] = { + { + .name = "SBHE", + .desc = "Speculative branch hint enable", + .index = 0, + }, + { + .name = "IBRTPD", + .desc = "Indirect branch recurrent target prediction disable", + .index = 3, + }, + { + .name = "SRAPD", + .desc = "Subroutine return address prediction disable", + .index = 4, + }, + { + .name = "NPHIE", + .desc = "Non-privileged hash instruction enable", + .index = 5, + }, + { + .name = "PHIE", + .desc = "Privileged hash instruction enable", + .index = 6, + }, +}; + +static void print_list(const char *list[], size_t len) +{ + for (size_t i = 0; i < len; i++) { + printf("%s", list[i]); + if (i + 1 < len) + printf(", "); + } +} + +static void print_dexcr(char *name, unsigned int bits) +{ + const char *enabled_aspects[ARRAY_SIZE(aspects) + 1] = {NULL}; + size_t j = 0; + + printf("%s: %08x", name, bits); + + if (bits == 0) { + printf("\n"); + return; + } + + for (size_t i = 0; i < ARRAY_SIZE(aspects); i++) { + unsigned int mask = DEXCR_PR_BIT(aspects[i].index); + + if (bits & mask) { + enabled_aspects[j++] = aspects[i].name; + bits &= ~mask; + } + } + + if (bits) + enabled_aspects[j++] = "unknown"; + + printf(" ("); + print_list(enabled_aspects, j); + printf(")\n"); +} + +static void print_aspect(const struct dexcr_aspect *aspect) +{ + const char *attributes[8] = {NULL}; + size_t j = 0; + unsigned long mask; + + mask = DEXCR_PR_BIT(aspect->index); + if (dexcr & mask) + attributes[j++] = "set"; + if (hdexcr & mask) + attributes[j++] = "set (hypervisor)"; + if (!(effective & mask)) + attributes[j++] = "clear"; + + printf("%12s %c (%d): ", aspect->name, effective & mask ? 
'*' : ' ', aspect->index); + print_list(attributes, j); + printf(" \t(%s)\n", aspect->desc); +} + +int main(int argc, char *argv[]) +{ + if (!dexcr_exists()) { +
[PATCH v3 10/11] selftests/powerpc/dexcr: Add hashst/hashchk test
Test the kernel DEXCR[NPHIE] interface and hashchk exception handling. Introduces with it a DEXCR utils library for common DEXCR operations. Volatile is used to prevent the compiler optimising away the signal tests. Signed-off-by: Benjamin Gray --- v1: * Clean up dexcr makefile * Include kernel headers in CFLAGS * Use numeric literals for hashst/hashchk to support older toolchains * A lot of other refactoring --- tools/testing/selftests/powerpc/Makefile | 1 + .../selftests/powerpc/dexcr/.gitignore| 1 + .../testing/selftests/powerpc/dexcr/Makefile | 7 + tools/testing/selftests/powerpc/dexcr/dexcr.c | 132 ++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 49 .../selftests/powerpc/dexcr/hashchk_test.c| 227 ++ tools/testing/selftests/powerpc/include/reg.h | 4 + .../testing/selftests/powerpc/include/utils.h | 4 + tools/testing/selftests/powerpc/utils.c | 24 ++ 9 files changed, 449 insertions(+) create mode 100644 tools/testing/selftests/powerpc/dexcr/.gitignore create mode 100644 tools/testing/selftests/powerpc/dexcr/Makefile create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.c create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.h create mode 100644 tools/testing/selftests/powerpc/dexcr/hashchk_test.c diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index ae2bfc0d822f..49f2ad1793fd 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -17,6 +17,7 @@ SUB_DIRS = alignment \ benchmarks \ cache_shape \ copyloops\ + dexcr\ dscr \ mm \ nx-gzip \ diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore new file mode 100644 index ..d12e4560aca9 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -0,0 +1 @@ +hashchk_test diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile new file mode 100644 index ..16c8b489948a --- /dev/null 
+++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -0,0 +1,7 @@ +TEST_GEN_PROGS := hashchk_test + +include ../../lib.mk + +$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) + +$(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.c b/tools/testing/selftests/powerpc/dexcr/dexcr.c new file mode 100644 index ..65ec5347de98 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "reg.h" +#include "utils.h" + +static jmp_buf generic_signal_jump_buf; + +static void generic_signal_handler(int signum, siginfo_t *info, void *context) +{ + longjmp(generic_signal_jump_buf, 0); +} + +bool dexcr_exists(void) +{ + struct sigaction old; + volatile bool exists; + + old = push_signal_handler(SIGILL, generic_signal_handler); + if (setjmp(generic_signal_jump_buf)) + goto out; + + /* +* If the SPR is not recognised by the hardware it triggers +* a hypervisor emulation interrupt. If the kernel does not +* recognise/try to emulate it, we receive a SIGILL signal. +* +* If we do not receive a signal, assume we have the SPR or the +* kernel is trying to emulate it correctly. +*/ + exists = false; + mfspr(SPRN_DEXCR_RO); + exists = true; + +out: + pop_signal_handler(SIGILL, old); + return exists; +} + +/* + * Just test if a bad hashchk triggers a signal, without checking + * for support or if the NPHIE aspect is enabled. 
+ */ +bool hashchk_triggers(void) +{ + struct sigaction old; + volatile bool triggers; + + old = push_signal_handler(SIGILL, generic_signal_handler); + if (setjmp(generic_signal_jump_buf)) + goto out; + + triggers = true; + do_bad_hashchk(); + triggers = false; + +out: + pop_signal_handler(SIGILL, old); + return triggers; +} + +unsigned int get_dexcr(enum dexcr_source source) +{ + switch (source) { + case DEXCR: + return mfspr(SPRN_DEXCR_RO); + case HDEXCR: + return mfspr(SPRN_HDEXCR_RO); + case EFFECTIVE: + return mfspr(SPRN_DEXCR_RO) | mfspr(SPRN_HDEXCR_RO); + default: + FAIL_IF_EXIT_MSG(true, "bad enum dexcr_source"); + } +} + +void await_child_success(pid_t pid) +{ + int wstatus; + + FAIL_IF_EXIT_MSG(pid == -1, "fork failed"); + FAIL_IF_EXIT_MSG(waitpid(pid, , 0) == -1,
[PATCH v3 09/11] selftests/powerpc: Add more utility macros
Adds _MSG assertion variants to provide more context behind why a failure occurred. Also include unistd.h for _exit() and stdio.h for fprintf(), and move ARRAY_SIZE macro to utils.h. The _MSG variants and ARRAY_SIZE will be used by the following DEXCR selftests. Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Reword commit message * Add ruscur reviewed-by v1: * Remove the signal handler variants * Describe why headers are included --- .../testing/selftests/powerpc/include/utils.h | 27 ++- .../powerpc/pmu/sampling_tests/misc.h | 2 -- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h index 44bfd48b93d6..9dc53c4fbfe3 100644 --- a/tools/testing/selftests/powerpc/include/utils.h +++ b/tools/testing/selftests/powerpc/include/utils.h @@ -9,11 +9,17 @@ #define __cacheline_aligned __attribute__((aligned(128))) #include +#include #include #include #include #include #include "reg.h" +#include + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif /* Avoid headaches with PRI?64 - just use %ll? 
always */ typedef unsigned long long u64; @@ -67,7 +73,6 @@ struct perf_event_read { }; #if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 30) -#include #include static inline pid_t gettid(void) @@ -116,6 +121,16 @@ do { \ } \ } while (0) +#define FAIL_IF_MSG(x, msg)\ +do { \ + if ((x)) { \ + fprintf(stderr, \ + "[FAIL] Test FAILED on line %d: %s\n", \ + __LINE__, msg); \ + return 1; \ + } \ +} while (0) + #define FAIL_IF_EXIT(x)\ do { \ if ((x)) { \ @@ -125,6 +140,16 @@ do { \ } \ } while (0) +#define FAIL_IF_EXIT_MSG(x, msg) \ +do { \ + if ((x)) { \ + fprintf(stderr, \ + "[FAIL] Test FAILED on line %d: %s\n", \ + __LINE__, msg); \ + _exit(1); \ + } \ +} while (0) + /* The test harness uses this, yes it's gross */ #define MAGIC_SKIP_RETURN_VALUE99 diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h index 4181755cf5a0..64e25cce1435 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h @@ -18,8 +18,6 @@ #define MMCR1_RSQ 0x2000ULL /* radix scope qual field */ #define BHRB_DISABLE0x20ULL /* MMCRA BHRB DISABLE bit */ -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - extern int ev_mask_pmcxsel, ev_shift_pmcxsel; extern int ev_mask_marked, ev_shift_marked; extern int ev_mask_comb, ev_shift_comb; -- 2.40.1
[PATCH v3 08/11] Documentation: Document PowerPC kernel DEXCR interface
Describe the DEXCR and document how to configure it. Signed-off-by: Benjamin Gray --- v3: * Revise (H)DEXCR width to 64 bits * Revise configuration section v2: * Document coredump & ptrace support v1: * Remove the dynamic control docs, describe the static config option This documentation is a little bare for now, but will be expanded on when dynamic DEXCR control is added. --- Documentation/powerpc/dexcr.rst | 58 + Documentation/powerpc/index.rst | 1 + 2 files changed, 59 insertions(+) create mode 100644 Documentation/powerpc/dexcr.rst diff --git a/Documentation/powerpc/dexcr.rst b/Documentation/powerpc/dexcr.rst new file mode 100644 index ..615a631f51fa --- /dev/null +++ b/Documentation/powerpc/dexcr.rst @@ -0,0 +1,58 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +== +DEXCR (Dynamic Execution Control Register) +== + +Overview + + +The DEXCR is a privileged special purpose register (SPR) introduced in +PowerPC ISA 3.1B (Power10) that allows per-cpu control over several dynamic +execution behaviours. These behaviours include speculation (e.g., indirect +branch target prediction) and enabling return-oriented programming (ROP) +protection instructions. + +The execution control is exposed in hardware as up to 32 bits ('aspects') in +the DEXCR. Each aspect controls a certain behaviour, and can be set or cleared +to enable/disable the aspect. There are several variants of the DEXCR for +different purposes: + +DEXCR +A privileged SPR that can control aspects for userspace and kernel space +HDEXCR +A hypervisor-privileged SPR that can control aspects for the hypervisor and +enforce aspects for the kernel and userspace. +UDEXCR +An optional ultravisor-privileged SPR that can control aspects for the ultravisor. + +Userspace can examine the current DEXCR state using a dedicated SPR that +provides a non-privileged read-only view of the userspace DEXCR aspects. 
+There is also an SPR that provides a read-only view of the hypervisor enforced +aspects, which ORed with the userspace DEXCR view gives the effective DEXCR +state for a process. + + +Configuration += + +The DEXCR is currently unconfigurable. All threads are run with the +NPHIE aspect enabled. + + +coredump and ptrace +=== + +The userspace values of the DEXCR and HDEXCR (in this order) are exposed under +``NT_PPC_DEXCR``. These are each 64 bits and readonly, and are intended to +assist with core dumps. The DEXCR may be made writable in future. The top 32 +bits of both registers (corresponding to the non-userspace bits) are masked off. + +If the kernel config ``CONFIG_CHECKPOINT_RESTORE`` is enabled, then +``NT_PPC_HASHKEYR`` is available and exposes the HASHKEYR value of the process +for reading and writing. This is a tradeoff between increased security and +checkpoint/restore support: a process should normally have no need to know its +secret key, but restoring a process requires setting its original key. The key +therefore appears in core dumps, and an attacker may be able to retrieve it from +a coredump and effectively bypass ROP protection on any threads that share this +key (potentially all threads from the same parent that have not run ``exec()``). diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index 85e80e30160b..d33b554ca7ba 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -15,6 +15,7 @@ powerpc cxl cxlflash dawr-power9 +dexcr dscr eeh-pci-error-recovery elf_hwcaps -- 2.40.1
[PATCH v3 07/11] powerpc/ptrace: Expose HASHKEYR register to ptrace
The HASHKEYR register contains a secret per-process key to enable unique hashes per process. In general it should not be exposed to userspace at all and a regular process has no need to know its key. However, checkpoint restore in userspace (CRIU) functionality requires that a process be able to set the HASHKEYR of another process, otherwise existing hashes on the stack would be invalidated by a new random key. Exposing HASHKEYR in this way also makes it appear in core dumps, which is a security concern. Multiple threads may share a key, for example just after a fork() call, where the kernel cannot know if the child is going to return back along the parent's stack. If such a thread is coerced into making a core dump, then the HASHKEYR value will be readable and able to be used against all other threads sharing that key, effectively undoing any protection offered by hashst/hashchk. Therefore we expose HASHKEYR to ptrace when CONFIG_CHECKPOINT_RESTORE is enabled, providing a choice of increased security or migratable ROP protected processes. This is similar to how ARM exposes its PAC keys. 
Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by v2: * New in v2 --- arch/powerpc/include/uapi/asm/elf.h | 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 3 ++ arch/powerpc/kernel/ptrace/ptrace-view.c | 36 include/uapi/linux/elf.h | 1 + 4 files changed, 41 insertions(+) diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index e0d323c808dd..a5377f494fa3 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -99,6 +99,7 @@ #define ELF_NPMU 5 /* includes siar, sdar, sier, mmcr2, mmcr0 */ #define ELF_NPKEY 3 /* includes amr, iamr, uamor */ #define ELF_NDEXCR 2 /* includes dexcr, hdexcr */ +#define ELF_NHASHKEYR 1 /* includes hashkeyr */ typedef unsigned long elf_greg_t64; typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG]; diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 998a84f64804..4171a5727197 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -58,6 +58,9 @@ enum powerpc_regset { REGSET_EBB, /* EBB registers */ REGSET_PMR, /* Performance Monitor Registers */ REGSET_DEXCR, /* DEXCR registers */ +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_HASHKEYR,/* HASHKEYR register */ +#endif #endif #ifdef CONFIG_PPC_MEM_KEYS REGSET_PKEY,/* AMR register */ diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 87a46edb3efb..600fa72c2934 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -483,6 +483,35 @@ static int dexcr_get(struct task_struct *target, const struct user_regset *regse return membuf_store(, (unsigned long)(mfspr(SPRN_HDEXCR_RO) & 0x)); } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int hashkeyr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + 
+static int hashkeyr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return membuf_store(, target->thread.hashkeyr); +} + +static int hashkeyr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return user_regset_copyin(, , , , >thread.hashkeyr, + 0, sizeof(unsigned long)); +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_MEM_KEYS @@ -649,6 +678,13 @@ static const struct user_regset native_regsets[] = { .size = sizeof(u64), .align = sizeof(u64), .active = dexcr_active, .regset_get = dexcr_get }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_HASHKEYR] = { + .core_note_type = NT_PPC_HASHKEYR, .n = ELF_NHASHKEYR, + .size = sizeof(u64), .align = sizeof(u64), + .active = hashkeyr_active, .regset_get = hashkeyr_get, .set = hashkeyr_set + }, +#endif #endif #ifdef CONFIG_PPC_MEM_KEYS [REGSET_PKEY] = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cfa31f1eb5d7..b705b301d88f 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -404,6 +404,7 @@ typedef struct elf64_shdr { #define NT_PPC_TM_CDSCR0x10f
[PATCH v3 06/11] powerpc/ptrace: Expose DEXCR and HDEXCR registers to ptrace
The DEXCR register is of interest when ptracing processes. Currently it is static, but eventually will be dynamically controllable by a process. If a process can control its own, then it is useful for it to be ptrace-able to (e.g., for checkpoint-restore functionality). It is also relevant to core dumps (the NPHIE aspect in particular), which use the ptrace mechanism (or is it the other way around?) to decide what to dump. The HDEXCR is useful here too, as the NPHIE aspect may be set in the HDEXCR without being set in the DEXCR. Although the HDEXCR is per-cpu and we don't track it in the task struct (it's useless in normal operation), it would be difficult to imagine why a hypervisor would set it to different values within a guest. A hypervisor cannot safely set NPHIE differently at least, as that would break programs. Expose a read-only view of the userspace DEXCR and HDEXCR to ptrace. The HDEXCR is always readonly, and is useful for diagnosing the core dumps (as the HDEXCR may set NPHIE without the DEXCR setting it). 
Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by * Expose values as 64 bits * DEXCR is no longer a config v2: * New in v2 --- arch/powerpc/include/uapi/asm/elf.h | 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 1 + arch/powerpc/kernel/ptrace/ptrace-view.c | 36 +++- include/uapi/linux/elf.h | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index dbc4a5b8d02d..e0d323c808dd 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -98,6 +98,7 @@ #define ELF_NEBB 3 /* includes ebbrr, ebbhr, bescr */ #define ELF_NPMU 5 /* includes siar, sdar, sier, mmcr2, mmcr0 */ #define ELF_NPKEY 3 /* includes amr, iamr, uamor */ +#define ELF_NDEXCR 2 /* includes dexcr, hdexcr */ typedef unsigned long elf_greg_t64; typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG]; diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 463a63eb8cc7..998a84f64804 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -57,6 +57,7 @@ enum powerpc_regset { REGSET_TAR, /* TAR register */ REGSET_EBB, /* EBB registers */ REGSET_PMR, /* Performance Monitor Registers */ + REGSET_DEXCR, /* DEXCR registers */ #endif #ifdef CONFIG_PPC_MEM_KEYS REGSET_PKEY,/* AMR register */ diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 5fff0d04b23f..87a46edb3efb 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -454,7 +454,36 @@ static int pmu_set(struct task_struct *target, const struct user_regset *regset, 5 * sizeof(unsigned long)); return ret; } -#endif + +static int dexcr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + +static int dexcr_get(struct 
task_struct *target, const struct user_regset *regset, +struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + /* +* The DEXCR is currently static across all CPUs, so we don't +* store the target's value anywhere, but the static value +* will also be correct. +*/ + membuf_store(, (unsigned long)(DEXCR_INIT & 0x)); + + /* +* Technically the HDEXCR is per-cpu, but a hypervisor can't reasonably +* change it between CPUs of the same guest. +*/ + return membuf_store(, (unsigned long)(mfspr(SPRN_HDEXCR_RO) & 0x)); +} + +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_MEM_KEYS static int pkey_active(struct task_struct *target, const struct user_regset *regset) @@ -615,6 +644,11 @@ static const struct user_regset native_regsets[] = { .size = sizeof(u64), .align = sizeof(u64), .active = pmu_active, .regset_get = pmu_get, .set = pmu_set }, + [REGSET_DEXCR] = { + .core_note_type = NT_PPC_DEXCR, .n = ELF_NDEXCR, + .size = sizeof(u64), .align = sizeof(u64), + .active = dexcr_active, .regset_get = dexcr_get + }, #endif #ifdef CONFIG_PPC_MEM_KEYS [REGSET_PKEY] = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index ac3da855fb19..cfa31f1eb5d7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -403,6 +403,7 @@ typedef struct elf64_shdr { #define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */
[PATCH v3 05/11] powerpc/dexcr: Support userspace ROP protection
The ISA 3.1B hashst and hashchk instructions use a per-cpu SPR HASHKEYR to hold a key used in the hash calculation. This key should be different for each process to make it harder for a malicious process to recreate valid hash values for a victim process. Add support for storing a per-thread hash key, and setting/clearing HASHKEYR appropriately. Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by v1: * Guard HASHKEYR update behind change check * HASHKEYR reset moved earlier to patch 2 --- arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/kernel/process.c| 17 + 2 files changed, 18 insertions(+) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index e96c9b8c2a60..8a6754ffdc7e 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -264,6 +264,7 @@ struct thread_struct { unsigned long mmcr3; unsigned long sier2; unsigned long sier3; + unsigned long hashkeyr; #endif }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 1fefafb2b29b..b68898ac07e1 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1182,6 +1182,9 @@ static inline void save_sprs(struct thread_struct *t) */ t->tar = mfspr(SPRN_TAR); } + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) + t->hashkeyr = mfspr(SPRN_HASHKEYR); #endif } @@ -1260,6 +1263,10 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (cpu_has_feature(CPU_FTR_P9_TIDR) && old_thread->tidr != new_thread->tidr) mtspr(SPRN_TIDR, new_thread->tidr); + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE) && + old_thread->hashkeyr != new_thread->hashkeyr) + mtspr(SPRN_HASHKEYR, new_thread->hashkeyr); #endif } @@ -1867,6 +1874,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) } p->thread.tidr = 0; +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) + p->thread.hashkeyr = current->thread.hashkeyr; 
#endif return 0; } @@ -1984,6 +1995,12 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.tm_tfiar = 0; current->thread.load_tm = 0; #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) { + current->thread.hashkeyr = get_random_long(); + mtspr(SPRN_HASHKEYR, current->thread.hashkeyr); + } +#endif /* CONFIG_PPC_BOOK3S_64 */ } EXPORT_SYMBOL(start_thread); -- 2.40.1
[PATCH v3 04/11] powerpc/dexcr: Handle hashchk exception
Recognise and pass the appropriate signal to the user program when a hashchk instruction triggers. This is independent of allowing configuration of DEXCR[NPHIE], as a hypervisor can enforce this aspect regardless of the kernel. The signal mirrors how ARM reports their similar check failure. For example, their FPAC handler in arch/arm64/kernel/traps.c do_el0_fpac() does this. When we fail to read the instruction that caused the fault we send a segfault, similar to how emulate_math() does it. Signed-off-by: Benjamin Gray --- v3: * Inline hashchk detection, remove dexcr.c and associated files v1: * Refactor the hashchk check to return 0 on success, an error code on failure. Determine what to do based on specific error code. * Motivate signal and code --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/kernel/traps.c | 16 2 files changed, 17 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index ca5a0da7df4e..ef6972aa33b9 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -222,6 +222,7 @@ #define OP_31_XOP_STFSX663 #define OP_31_XOP_STFSUX695 #define OP_31_XOP_STFDX 727 +#define OP_31_XOP_HASHCHK 754 #define OP_31_XOP_STFDUX759 #define OP_31_XOP_LHBRX 790 #define OP_31_XOP_LFIWAX855 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 9bdd79aa51cf..e59ec6d32d37 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1516,6 +1516,22 @@ static void do_program_check(struct pt_regs *regs) return; } } + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE) && user_mode(regs)) { + ppc_inst_t insn; + + if (get_user_instr(insn, (void __user *)regs->nip)) { + _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); + return; + } + + if (ppc_inst_primary_opcode(insn) == 31 && + get_xop(ppc_inst_val(insn)) == OP_31_XOP_HASHCHK) { + _exception(SIGILL, regs, ILL_ILLOPN, regs->nip); + return; + } + } + _exception(SIGTRAP, regs, TRAP_BRKPT, 
regs->nip); return; } -- 2.40.1
[PATCH v3 03/11] powerpc/dexcr: Add initial Dynamic Execution Control Register (DEXCR) support
ISA 3.1B introduces the Dynamic Execution Control Register (DEXCR). It is a per-cpu register that allows control over various CPU behaviours including branch hint usage, indirect branch speculation, and hashst/hashchk support. Add some definitions and basic support for the DEXCR in the kernel. Right now it just * Initialises the DEXCR and HASHKEYR to a fixed value when a CPU onlines. * Clears them in reset_sprs(). * Detects when the NPHIE aspect is supported (the others don't get looked at in this series, so there's no need to waste a CPU_FTR on them). We initialise the HASHKEYR to ensure that all cores have the same key, so an HV enforced NPHIE + swapping cores doesn't randomly crash a process using hash instructions. The stores to HASHKEYR are unconditional because the ISA makes no mention of the SPR being missing if support for doing the hashes isn't present. So all that would happen is the HASHKEYR value gets ignored. This helps slightly if NPHIE detection fails; e.g., we currently only detect it on pseries. Signed-off-by: Benjamin Gray --- v3: * Initialise DEXCR with NPHIE enabled v2: * More definitions for ptrace v1: * Only make a CPU feature for NPHIE. We only need to know if the hashst/hashchk functionality is supported for a static DEXCR. * Initialise the DEXCR to 0 when each CPU comes online. Remove the dexcr_init() and get_thread_dexcr() functions. * No longer track the DEXCR in a per-thread field. 
* Remove the made-up Opal features --- arch/powerpc/include/asm/book3s/64/kexec.h | 5 + arch/powerpc/include/asm/cputable.h| 4 +++- arch/powerpc/include/asm/reg.h | 11 +++ arch/powerpc/kernel/cpu_setup_power.c | 8 arch/powerpc/kernel/prom.c | 1 + 5 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/kexec.h b/arch/powerpc/include/asm/book3s/64/kexec.h index d4b9d476ecba..df37a76c1e9f 100644 --- a/arch/powerpc/include/asm/book3s/64/kexec.h +++ b/arch/powerpc/include/asm/book3s/64/kexec.h @@ -21,6 +21,11 @@ static inline void reset_sprs(void) plpar_set_ciabr(0); } + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + mtspr(SPRN_DEXCR, 0); + mtspr(SPRN_HASHKEYR, 0); + } + /* Do we need isync()? We are going via a kexec reset */ isync(); } diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 757dbded11dc..443a9d482b15 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -192,6 +192,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_P9_RADIX_PREFETCH_BUG LONG_ASM_CONST(0x0002) #define CPU_FTR_ARCH_31 LONG_ASM_CONST(0x0004) #define CPU_FTR_DAWR1 LONG_ASM_CONST(0x0008) +#define CPU_FTR_DEXCR_NPHIELONG_ASM_CONST(0x0010) #ifndef __ASSEMBLY__ @@ -451,7 +452,8 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_ARCH_300 | CPU_FTR_ARCH_31 | \ - CPU_FTR_DAWR | CPU_FTR_DAWR1) + CPU_FTR_DAWR | CPU_FTR_DAWR1 | \ + CPU_FTR_DEXCR_NPHIE) #define CPU_FTRS_CELL (CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 6372e5f55ef0..ed6acb0f17e7 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -382,7 +382,18 @@ #define SPRN_HIOR 0x137 /* 970 Hypervisor interrupt 
offset */ #define SPRN_RMOR 0x138 /* Real mode offset register */ #define SPRN_HRMOR 0x139 /* Real mode offset register */ +#define SPRN_HDEXCR_RO 0x1C7 /* Hypervisor DEXCR (non-privileged, readonly) */ +#define SPRN_HASHKEYR 0x1D4 /* Non-privileged hashst/hashchk key register */ +#define SPRN_HDEXCR0x1D7 /* Hypervisor dynamic execution control register */ +#define SPRN_DEXCR_RO 0x32C /* DEXCR (non-privileged, readonly) */ #define SPRN_ASDR 0x330 /* Access segment descriptor register */ +#define SPRN_DEXCR 0x33C /* Dynamic execution control register */ +#define DEXCR_PR_BIT(aspect) PPC_BIT(32 + (aspect)) +#define DEXCR_PR_SBHEDEXCR_PR_BIT(0) /* Speculative Branch Hint Enable */ +#define DEXCR_PR_IBRTPD DEXCR_PR_BIT(3) /* Indirect Branch Recurrent Target Prediction Disable */ +#define DEXCR_PR_SRAPD DEXCR_PR_BIT(4) /* Subroutine Return Address Prediction Disable */ +#define DEXCR_PR_NPHIE DEXCR_PR_BIT(5) /* Non-Privileged Hash Instruction Enable */ +#define DEXCR_INIT DEXCR_PR_NPHIE /* Fixed
[PATCH v3 02/11] powerpc/ptrace: Add missing include
ptrace-decl.h uses user_regset_get2_fn (among other things) from regset.h. While all current users of ptrace-decl.h include regset.h before it anyway, it adds an implicit ordering dependency and breaks source tooling that tries to inspect ptrace-decl.h by itself. Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by v2: * New in v2 --- arch/powerpc/kernel/ptrace/ptrace-decl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index eafe5f0f6289..463a63eb8cc7 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +#include <linux/regset.h> + /* * Set of msr bits that gdb can change on behalf of a process. */ -- 2.40.1
[PATCH v3 01/11] powerpc/book3s: Add missing include
The functions here use struct task_struct fields, so need to import the full definition from <linux/sched.h>. The header that defines current only forward declares struct task_struct. Failing to include this header leads to a compilation error when a translation unit does not also include <linux/sched.h> indirectly. Signed-off-by: Benjamin Gray Reviewed-by: Nicholas Piggin Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by * thread_struct -> task_struct v1: * Add npiggin reviewed-by --- arch/powerpc/include/asm/book3s/64/kup.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 54cf46808157..84c09e546115 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -194,6 +194,7 @@ #else /* !__ASSEMBLY__ */ #include <linux/jump_label.h> +#include <linux/sched.h> DECLARE_STATIC_KEY_FALSE(uaccess_flush_key); -- 2.40.1
[PATCH v3 00/11] Add static DEXCR support
v3: * Expose (H)DEXCR in ptrace as 64 bits * Remove build config for DEXCR, always enable NPHIE * Fix up documentation to reflect this * Some commit message fixes Previous versions: v2: https://lore.kernel.org/all/20230330055040.434133-1-bg...@linux.ibm.com/ v1: https://lore.kernel.org/all/20230322054612.1340573-1-bg...@linux.ibm.com/ RFC: https://lore.kernel.org/all/20221128024458.46121-1-bg...@linux.ibm.com/ Benjamin Gray (11): powerpc/book3s: Add missing include powerpc/ptrace: Add missing include powerpc/dexcr: Add initial Dynamic Execution Control Register (DEXCR) support powerpc/dexcr: Handle hashchk exception powerpc/dexcr: Support userspace ROP protection powerpc/ptrace: Expose DEXCR and HDEXCR registers to ptrace powerpc/ptrace: Expose HASHKEYR register to ptrace Documentation: Document PowerPC kernel DEXCR interface selftests/powerpc: Add more utility macros selftests/powerpc/dexcr: Add hashst/hashchk test selftests/powerpc/dexcr: Add DEXCR status utility lsdexcr Documentation/powerpc/dexcr.rst | 58 + Documentation/powerpc/index.rst | 1 + arch/powerpc/include/asm/book3s/64/kexec.h| 5 + arch/powerpc/include/asm/book3s/64/kup.h | 1 + arch/powerpc/include/asm/cputable.h | 4 +- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/include/asm/reg.h| 11 + arch/powerpc/include/uapi/asm/elf.h | 2 + arch/powerpc/kernel/cpu_setup_power.c | 8 + arch/powerpc/kernel/process.c | 17 ++ arch/powerpc/kernel/prom.c| 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 6 + arch/powerpc/kernel/ptrace/ptrace-view.c | 72 +- arch/powerpc/kernel/traps.c | 16 ++ include/uapi/linux/elf.h | 2 + tools/testing/selftests/powerpc/Makefile | 1 + .../selftests/powerpc/dexcr/.gitignore| 2 + .../testing/selftests/powerpc/dexcr/Makefile | 9 + tools/testing/selftests/powerpc/dexcr/dexcr.c | 132 ++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 49 .../selftests/powerpc/dexcr/hashchk_test.c| 227 ++ .../testing/selftests/powerpc/dexcr/lsdexcr.c | 141 
+++ tools/testing/selftests/powerpc/include/reg.h | 4 + .../testing/selftests/powerpc/include/utils.h | 31 ++- .../powerpc/pmu/sampling_tests/misc.h | 2 - tools/testing/selftests/powerpc/utils.c | 24 ++ 27 files changed, 823 insertions(+), 5 deletions(-) create mode 100644 Documentation/powerpc/dexcr.rst create mode 100644 tools/testing/selftests/powerpc/dexcr/.gitignore create mode 100644 tools/testing/selftests/powerpc/dexcr/Makefile create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.c create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.h create mode 100644 tools/testing/selftests/powerpc/dexcr/hashchk_test.c create mode 100644 tools/testing/selftests/powerpc/dexcr/lsdexcr.c -- 2.40.1
[powerpc:next] BUILD SUCCESS 4adc95a3edc152ffc3b0c88117767a31c9c01a1e
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next branch HEAD: 4adc95a3edc152ffc3b0c88117767a31c9c01a1e powerpc: update ppc_save_regs to save current r1 in pt_regs elapsed time: 726m configs tested: 179 configs skipped: 13 The following configs have been built successfully. More configs may be tested in the coming days. tested configs: alphaallyesconfig gcc alphabuildonly-randconfig-r004-20230615 gcc alpha defconfig gcc alpharandconfig-r002-20230615 gcc alpharandconfig-r011-20230614 gcc alpharandconfig-r013-20230614 gcc alpharandconfig-r025-20230615 gcc alpharandconfig-r026-20230615 gcc arc allyesconfig gcc arc defconfig gcc arc randconfig-r003-20230615 gcc arc randconfig-r034-20230615 gcc arc randconfig-r043-20230614 gcc arc randconfig-r043-20230615 gcc arm allmodconfig gcc arm allyesconfig gcc arm axm55xx_defconfig gcc arm defconfig gcc arm footbridge_defconfig gcc arm randconfig-r046-20230614 clang arm randconfig-r046-20230615 gcc arm64allyesconfig gcc arm64 defconfig gcc arm64randconfig-r033-20230615 gcc arm64randconfig-r036-20230615 gcc csky buildonly-randconfig-r003-20230614 gcc cskydefconfig gcc csky randconfig-r003-20230615 gcc csky randconfig-r034-20230615 gcc hexagon randconfig-r015-20230614 clang hexagon randconfig-r025-20230615 clang hexagon randconfig-r035-20230615 clang hexagon randconfig-r041-20230614 clang hexagon randconfig-r041-20230615 clang hexagon randconfig-r045-20230614 clang hexagon randconfig-r045-20230615 clang i386 allyesconfig gcc i386 debian-10.3 gcc i386defconfig gcc i386 randconfig-i001-20230614 clang i386 randconfig-i001-20230615 gcc i386 randconfig-i002-20230614 clang i386 randconfig-i002-20230615 gcc i386 randconfig-i003-20230614 clang i386 randconfig-i003-20230615 gcc i386 randconfig-i004-20230614 clang i386 randconfig-i004-20230615 gcc i386 randconfig-i005-20230614 clang i386 randconfig-i005-20230615 gcc i386 randconfig-i006-20230614 clang i386 randconfig-i006-20230615 gcc i386 
randconfig-i011-20230614 gcc i386 randconfig-i011-20230615 clang i386 randconfig-i012-20230614 gcc i386 randconfig-i012-20230615 clang i386 randconfig-i013-20230614 gcc i386 randconfig-i013-20230615 clang i386 randconfig-i014-20230614 gcc i386 randconfig-i014-20230615 clang i386 randconfig-i015-20230614 gcc i386 randconfig-i015-20230615 clang i386 randconfig-i016-20230614 gcc i386 randconfig-i016-20230615 clang i386 randconfig-r014-20230614 gcc i386 randconfig-r021-20230615 clang i386 randconfig-r032-20230615 gcc loongarchallmodconfig gcc loongarch allnoconfig gcc loongarchbuildonly-randconfig-r005-20230614 gcc loongarch defconfig gcc loongarchrandconfig-r004-20230615 gcc loongarchrandconfig-r024-20230615 gcc m68k allmodconfig gcc m68k allyesconfig gcc m68kdefconfig gcc m68k randconfig-r002-20230615 gcc m68k randconfig-r013-20230614 gcc m68k randconfig-r023-20230615 gcc m68k randconfig-r033-20230615 gcc microblaze buildonly-randconfig-r002-20230614 gcc microblaze randconfig-r005-20230615 gcc microblaze randconfig-r023-20230615 gcc microblaze randconfig-r025-20230615 gcc microblaze randconfig-r033-20230615 gcc mips allmodconfig gcc mips
[PATCH v2 07/23 replacement] mips: add pte_unmap() to balance pte_offset_map()
To keep balance in future, __update_tlb() remember to pte_unmap() after pte_offset_map(). This is an odd case, since the caller has already done pte_offset_map_lock(), then mips forgets the address and recalculates it; but my two naive attempts to clean that up did more harm than good. Tested-by: Nathan Chancellor Signed-off-by: Hugh Dickins --- Andrew, please replace my mips patch, and its build warning fix patch, in mm-unstable by this less ambitious but working replacement - thanks. arch/mips/mm/tlb-r4k.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index 1b939abbe4ca..93c2d695588a 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -297,7 +297,7 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte) p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; - pte_t *ptep; + pte_t *ptep, *ptemap = NULL; int idx, pid; /* @@ -344,7 +344,12 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte) } else #endif { - ptep = pte_offset_map(pmdp, address); + ptemap = ptep = pte_offset_map(pmdp, address); + /* +* update_mmu_cache() is called between pte_offset_map_lock() +* and pte_unmap_unlock(), so we can assume that ptep is not +* NULL here: and what should be done below if it were NULL? +*/ #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) #ifdef CONFIG_XPA @@ -373,6 +378,9 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte) tlbw_use_hazard(); htw_start(); flush_micro_tlb_vm(vma); + + if (ptemap) + pte_unmap(ptemap); local_irq_restore(flags); } -- 2.35.3
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
Le 15/06/2023 à 18:34, Randy Dunlap a écrit : > > > On 6/15/23 09:13, Randy Dunlap wrote: >> >> >> On 6/15/23 09:05, Timothy Pearson wrote: >>> >>> >>> - Original Message - From: "Randy Dunlap" To: "Timothy Pearson" , "Michael Ellerman" Cc: "Stephen Rothwell" , "Linux Next Mailing List" , "linux-kernel" , "linuxppc-dev" , "Alexey Kardashevskiy" Sent: Thursday, June 15, 2023 11:00:08 AM Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >>> Hi Timothy, On 6/3/23 20:57, Timothy Pearson wrote: > > > - Original Message - >> From: "Michael Ellerman" >> To: "Randy Dunlap" , "Stephen Rothwell" >> , "Linux Next Mailing List" >> >> Cc: "linux-kernel" , "linuxppc-dev" >> , "Alexey >> Kardashevskiy" , "Timothy Pearson" >> >> Sent: Saturday, June 3, 2023 7:22:51 PM >> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > >> Randy Dunlap writes: >>> On 6/1/23 21:01, Stephen Rothwell wrote: Hi all, Changes since 20230601: >>> >>> On powerpc64, a randconfig failed with: >>> >>> In file included from ../include/linux/list.h:5, >>> from ../include/linux/preempt.h:11, >>> from ../include/linux/spinlock.h:56, >>> from ../include/linux/mmzone.h:8, >>> from ../include/linux/gfp.h:7, >>> from ../include/linux/slab.h:15, >>> from ../arch/powerpc/kernel/iommu.c:15: >>> ../arch/powerpc/kernel/iommu.c: In function >>> 'spapr_tce_setup_phb_iommus_initcall': >>> ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared >>> (first use >>> in this function); did you mean 'zonelist'? >>> 1391 | list_for_each_entry(hose, _list, list_node) { >>>|^ >> ... >> >> hose_list is in pci-common.c which is built when PCI=y. >> >> PSERIES and POWERNV force PCI=y. 
>> >> But this config has neither: >> >> # CONFIG_PPC_POWERNV is not set >> # CONFIG_PPC_PSERIES is not set >> CONFIG_HAVE_PCI=y >> # CONFIG_PCI is not set >> # CONFIG_COMMON_CLK_RS9_PCIE is not set >> >> >> Probably the spapr_tce code should be wrapped in an #ifdef that is only >> enabled when POWERNV || PSERIES is enabled. >> >> cheers > > Sounds reasonable, I was going to look into this further over the > weekend. I > can put together a patch for Monday if that works? Did you prepare a patch for this? I am still seeing this build error. thanks. -- ~Randy >>> >>> Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not >>> arrive? >> >> I don't know - I'm not subscribed to that list. >> >> It's probably still in the patchworks review cycle >> so it hasn't been applied anywhere that gets into linux-next. > > OK, it's here, mark with Success: > http://patchwork.ozlabs.org/project/linuxppc-dev/patch/2015925968.3546872.1685990936823.javamail.zim...@raptorengineeringinc.com/ > > I don't know what happens to it next or when. Is that supposed to fix anything ? I doesn't have any Fixes: tag Christophe
Re: [PATCH v2 07/23] mips: update_mmu_cache() can replace __update_tlb()
On Wed, Jun 14, 2023 at 04:17:58PM -0700, Nathan Chancellor wrote: > Hi Hugh, > > On Thu, Jun 08, 2023 at 12:17:24PM -0700, Hugh Dickins wrote: > > Don't make update_mmu_cache() a wrapper around __update_tlb(): call it > > directly, and use the ptep (or pmdp) provided by the caller, instead of > > re-calling pte_offset_map() - which would raise a question of whether a > > pte_unmap() is needed to balance it. > > > > Check whether the "ptep" provided by the caller is actually the pmdp, > > instead of testing pmd_huge(): or test pmd_huge() too and warn if it > > disagrees? This is "hazardous" territory: needs review and testing. > > > > Signed-off-by: Hugh Dickins > > --- > > arch/mips/include/asm/pgtable.h | 15 +++ > > arch/mips/mm/tlb-r3k.c | 5 +++-- > > arch/mips/mm/tlb-r4k.c | 9 +++-- > > 3 files changed, 9 insertions(+), 20 deletions(-) > > > > I just bisected a crash while powering down a MIPS machine in QEMU to > this change as commit 8044511d3893 ("mips: update_mmu_cache() can > replace __update_tlb()") in linux-next. Unfortunately, I can still > reproduce it with the existing fix you have for this change on the > mailing list, which is present in next-20230614. > > I can reproduce it with the GCC 13.1.0 on kernel.org [1]. > > $ make -skj"$(nproc)" ARCH=mips CROSS_COMPILE=mips-linux- mrproper > malta_defconfig vmlinux > > $ qemu-system-mipsel \ > -display none \ > -nodefaults \ > -cpu 24Kf \ > -machine malta \ > -kernel vmlinux \ > -initrd rootfs.cpio \ > -m 512m \ > -serial mon:stdio > ... > Linux version 6.4.0-rc6-next-20230614 (nathan@dev-arch.thelio-3990X) > (mips-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jun 14 > 16:13:02 MST 2023 > ... 
> Run /init as init process > process '/bin/busybox' started with executable stack > do_page_fault(): sending SIGSEGV to init for invalid read access from > 003c > epc = 77b893dc in ld-uClibc-1.0.39.so[77b84000+8000] > ra = 77b8930c in ld-uClibc-1.0.39.so[77b84000+8000] > Kernel panic - not syncing: Attempted to kill init! exitcode=0x000b > ---[ end Kernel panic - not syncing: Attempted to kill init! > exitcode=0x000b ]--- > > The rootfs is available at [2] if it is needed. I am more than happy to > provide additional information or test patches if necessary. > > [1]: https://mirrors.edge.kernel.org/pub/tools/crosstool/ > [2]: > https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230609-194440/mipsel-rootfs.cpio.zst Seeing this on real h/w as well (just to confirm). Linux version 6.4.0-rc4-00437-g4bab5c42a698 (r...@yuzhao.bld.corp.google.com) (mips64el-linux-gnuabi64-gcc (Debian 12.2.0-14) 12.2.0, GNU ld (GNU Binutils for Debian) 2.40) #3 SMP PREEMPT Thu Jun 15 01:05:20 MDT 2023 Skipping L2 locking due to reduced L2 cache size CVMSEG size: 2 cache lines (256 bytes) printk: bootconsole [early0] enabled CPU0 revision is: 000d9602 (Cavium Octeon III) FPU revision is: 00739600 Kernel sections are not in the memory maps Wasting 243712 bytes for tracking 4352 unused pages Initrd not found or empty - disabling initrd Using passed Device Tree. software IO TLB: SWIOTLB bounce buffer size adjusted to 0MB software IO TLB: area num 1. software IO TLB: mapped [mem 0x0370d000-0x0374d000] (0MB) Primary instruction cache 78kB, virtually tagged, 39 way, 16 sets, linesize 128 bytes. Primary data cache 32kB, 32-way, 8 sets, linesize 128 bytes. 
Zone ranges: DMA32[mem 0x0110-0xefff] Normal empty Movable zone start for each node Early memory node ranges node 0: [mem 0x0110-0x03646fff] node 0: [mem 0x0370-0x0faf] node 0: [mem 0x2000-0x4ebf] Initmem setup node 0 [mem 0x0110-0x4ebf] On node 0, zone DMA32: 4352 pages in unavailable ranges On node 0, zone DMA32: 185 pages in unavailable ranges On node 0, zone DMA32: 1280 pages in unavailable ranges On node 0, zone DMA32: 5120 pages in unavailable ranges percpu: Embedded 15 pages/cpu s24368 r8192 d28880 u61440 pcpu-alloc: s24368 r8192 d28880 u61440 alloc=15*4096 pcpu-alloc: [0] 0 [0] 1 [0] 2 [0] 3 Kernel command line: loglevel=8 console=ttyS0,115200 printk: log_buf_len individual max cpu contribution: 4096 bytes printk: log_buf_len total cpu_extra contributions: 12288 bytes printk: log_buf_len min size: 16384 bytes printk: log_buf_len: 32768 bytes printk: early log buf free: 14184(86%) Dentry cache hash table entries: 131072 (order: 8, 1048576 bytes, linear) Inode-cache hash table entries: 65536 (order: 7, 524288 bytes, linear) Built 1 zonelists, mobility grouping on. Total pages: 247772 mem auto-init: stack:all(zero), heap alloc:off, heap free:off Memory: 950032K/1004828K available (8058K kernel code, 575K rwdata, 1880K rodata, 27488K init, 158K bss, 54796K reserved, 0K cma-reserved) rcu: Preemptible
Re: [PATCH v2 07/23] mips: update_mmu_cache() can replace __update_tlb()
On Thu, 15 Jun 2023, Nathan Chancellor wrote: > On Wed, Jun 14, 2023 at 10:43:30PM -0700, Hugh Dickins wrote: > > > > I do hope that you find the first fixes the breakage; but if not, then > > I hate to be the bearer of bad news but the first patch did not fix the > breakage, I see the same issue. Boo! > > > I even more fervently hope that the second will, despite my hating it. > > Touch wood for the first, fingers crossed for the second, thanks, > > Thankfully, the second one does. Thanks for the quick and thoughtful > responses! Hurrah! Thanks a lot, Nathan. I'll set aside my disappointment and curiosity, clearly I'm not going to have much of a future as a MIPS programmer. I must take a break, then rush Andrew the second patch, well, not exactly that second patch, since most of that is revert: I'll just send the few lines of replacement patch (with a new Subject line, as update_mmu_cache() goes back to being separate from __update_tlb()). Unless you object, I'll include a Tested-by: you. I realize that your testing is limited to seeing it running; but that's true of most of the testing at this stage - it gets to be more interesting when the patch that adds the rcu_read_lock() and rcu_read_unlock() is added on top later. Thanks again, Hugh
Re: [PATCH v9 00/14] pci: Work around ASMedia ASM2824 PCIe link training failures
On Thu, Jun 15, 2023 at 01:41:10AM +0100, Maciej W. Rozycki wrote: > On Wed, 14 Jun 2023, Bjorn Helgaas wrote: > > > > This is v9 of the change to work around a PCIe link training phenomenon > > > where a pair of devices both capable of operating at a link speed above > > > 2.5GT/s seems unable to negotiate the link speed and continues training > > > indefinitely with the Link Training bit switching on and off repeatedly > > > and the data link layer never reaching the active state. > > > > > > With several requests addressed and a few extra issues spotted this > > > version has now grown to 14 patches. It has been verified for device > > > enumeration with and without PCI_QUIRKS enabled, using the same piece of > > > RISC-V hardware as previously. Hot plug or reset events have not been > > > verified, as this is difficult if at all feasible with hardware in > > > question. > > static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) > > { > > - bool retrain = true; > > int delay = 1; > > + bool retrain = false; > > + struct pci_dev *bridge; > > + > > + if (pci_is_pcie(dev)) { > > + retrain = true; > > + bridge = pci_upstream_bridge(dev); > > + } > > If doing it this way, which I actually like, I think it would be a little > bit better performance- and style-wise if this was written as: > > if (pci_is_pcie(dev)) { > bridge = pci_upstream_bridge(dev); > retrain = !!bridge; > } > > (or "retrain = bridge != NULL" if you prefer this style), and then we > don't have to repeatedly check two variables iff (pcie && !bridge) in the > loop below: Done, thanks, I do like that better. I did: bridge = pci_upstream_bridge(dev); if (bridge) retrain = true; because it seems like it flows more naturally when reading. Bjorn
Re: [PATCH v1 10/21] powerpc/kexec: refactor for kernel/Kconfig.kexec
On Thu, Jun 15, 2023 at 01:34:25PM +1000, Michael Ellerman wrote: > Eric DeVolder writes: > > -config KEXEC_FILE > > - bool "kexec file based system call" > > - select KEXEC_CORE > > - select HAVE_IMA_KEXEC if IMA > > - select KEXEC_ELF > > - depends on PPC64 > > - depends on CRYPTO=y > > - depends on CRYPTO_SHA256=y > ... > > + > > +config ARCH_HAS_KEXEC_FILE > > + def_bool PPC64 && CRYPTO && CRYPTO_SHA256 > > The =y's got lost here. > > I think they were both meaningful, because both options are tristate. So > this previously required them to be built-in (=y), whereas after your > patch it will allow them to be modules. > > I don't know for sure that those options need to be built-in, but that's > what the code does now, so this patch shouldn't change it, at least > without an explanation. This patch shouldn't change it at all, period. If you want to change it (and that sounds like a good idea, if it is possible anyway), that should be a separate patch. Segher
Re: [PATCH v1 00/21] refactor Kconfig to consolidate KEXEC and CRASH options
On 6/14/23 22:26, Michael Ellerman wrote: Eric DeVolder writes: On 6/13/23 15:21, Kees Cook wrote: On Mon, Jun 12, 2023 at 01:27:52PM -0400, Eric DeVolder wrote: The Kconfig is refactored to consolidate KEXEC and CRASH options from various arch//Kconfig files into new file kernel/Kconfig.kexec. This looks very nice! Thank you Kees! [...] - The boolean ARCH_HAS_ in effect allows the arch to determine when the feature is allowed. Archs which don't have the feature simply do not provide the corresponding ARCH_HAS_. For each arch, where there previously were KEXEC and/or CRASH options, these have been replaced with the corresponding boolean ARCH_HAS_, and an appropriate def_bool statement. For example, if the arch supports KEXEC_FILE, then the ARCH_HAS_KEXEC_FILE simply has a 'def_bool y'. This permits the KEXEC_FILE option to be available. If the arch has a 'depends on' statement in its original coding of the option, then that expression becomes part of the def_bool expression. For example, arm64 had: config KEXEC depends on PM_SLEEP_SMP and in this solution, this converts to: config ARCH_HAS_KEXEC def_bool PM_SLEEP_SMP - In order to account for the differences in the config coding for the three common options, the ARCH_SUPPORTS_ is used. This options has a 'depends on ' statement to couple it to the main option, and from there can insert the differences from the common option and the arch original coding of that option. For example, a few archs enable CRYPTO and CRYTPO_SHA256 for KEXEC_FILE. These require a ARCH_SUPPORTS_KEXEC_FILE and 'select CRYPTO' and 'select CRYPTO_SHA256' statements. Naming nit: "HAS" and "SUPPORTS" feel very similar, and looking at existing configs, "ARCH_SUPPORTS_..." is already used for doing this kind of bare "bool" management. e.g. see ARCH_SUPPORTS_INT128 It looks like you need to split "depends" and "select" so the options can be chosen separately from the "selectable" configs. 
How about naming this ARCH_SELECTS_, since that's what it's there for? I'm OK with this. Let's see if others agree? Yeah please rename one or both of them. At a glance the difference between HAS and SUPPORTS is very non-obvious. I like Kees' suggestion to use ARCH_SUPPORTS and ARCH_SELECTS. cheers Michael, ok thanks! eric
Re: [PATCH v1 10/21] powerpc/kexec: refactor for kernel/Kconfig.kexec
On 6/14/23 22:34, Michael Ellerman wrote: Eric DeVolder writes: The kexec and crash kernel options are provided in the common kernel/Kconfig.kexec. Utilize the common options and provide the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the equivalent set of KEXEC and CRASH options. Signed-off-by: Eric DeVolder Reviewed-by: Sourabh Jain --- arch/powerpc/Kconfig | 55 ++-- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bff5820b7cda..36f2fe0cc8a5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -588,41 +588,21 @@ config PPC64_SUPPORTS_MEMORY_FAILURE default "y" if PPC_POWERNV select ARCH_SUPPORTS_MEMORY_FAILURE -config KEXEC - bool "kexec system call" - depends on PPC_BOOK3S || PPC_E500 || (44x && !SMP) - select KEXEC_CORE - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot - but it is independent of the system firmware. And like a reboot - you can start any kernel with it, not just Linux. - - The name comes from the similarity to the exec system call. - - It is an ongoing process to be certain the hardware in a machine - is properly shutdown, so do not be surprised if this code does not - initially work for you. As of this writing the exact hardware - interface is strongly in flux, so no good recommendation can be - made. - -config KEXEC_FILE - bool "kexec file based system call" - select KEXEC_CORE - select HAVE_IMA_KEXEC if IMA - select KEXEC_ELF - depends on PPC64 - depends on CRYPTO=y - depends on CRYPTO_SHA256=y ... + +config ARCH_HAS_KEXEC_FILE + def_bool PPC64 && CRYPTO && CRYPTO_SHA256 The =y's got lost here. I think they were both meaningful, because both options are tristate. So this previously required them to be built-in (=y), whereas after your patch it will allow them to be modules. 
I don't know for sure that those options need to be built-in, but that's what the code does now, so this patch shouldn't change it, at least without an explanation. cheers Thanks Michael, I've applied =y's. Good catch! eric
Re: [PATCH mm-unstable v2 08/10] kvm/x86: move tdp_mmu_enabled and shadow_accessed_mask
On Fri, May 26, 2023, Yu Zhao wrote: > tdp_mmu_enabled and shadow_accessed_mask are needed to implement > kvm_arch_has_test_clear_young(). > > Signed-off-by: Yu Zhao > --- > arch/x86/include/asm/kvm_host.h | 6 ++ > arch/x86/kvm/mmu.h | 6 -- > arch/x86/kvm/mmu/spte.h | 1 - > 3 files changed, 6 insertions(+), 7 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index fb9d1f2d6136..753c67072c47 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1772,6 +1772,7 @@ struct kvm_arch_async_pf { > > extern u32 __read_mostly kvm_nr_uret_msrs; > extern u64 __read_mostly host_efer; > +extern u64 __read_mostly shadow_accessed_mask; > extern bool __read_mostly allow_smaller_maxphyaddr; > extern bool __read_mostly enable_apicv; > extern struct kvm_x86_ops kvm_x86_ops; > @@ -1855,6 +1856,11 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned > irqchip, unsigned pin, >bool mask); > > extern bool tdp_enabled; > +#ifdef CONFIG_X86_64 > +extern bool tdp_mmu_enabled; > +#else > +#define tdp_mmu_enabled false > +#endif I would much prefer that these be kept in kvm/mmu.h. And looking at all the arch code, there's no reason to make kvm_arch_has_test_clear_young() a runtime callback, all of the logic is constant relative to when KVM is loaded. So rather than have generic KVM pull from arch code, what if we have arch code push info to generic KVM? We could even avoid #ifdefs if arch code passed in its handler. That might result in an extra indirect branch though, so it might be better to just use a flag? E.g. the x86 conversion would be something like this. 
--- arch/x86/kvm/mmu/mmu.c | 5 + arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.h | 1 + include/linux/kvm_host.h | 24 virt/kvm/kvm_main.c| 14 ++ 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8ebe542c565..84a4a83540f0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5809,6 +5809,11 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, max_huge_page_level = PG_LEVEL_1G; else max_huge_page_level = PG_LEVEL_2M; + + if (tdp_mmu_enabled && kvm_ad_enabled()) + kvm_init_test_clear_young(kvm_tdp_mmu_test_clear_young); + else + kvm_init_test_clear_young(NULL); } EXPORT_SYMBOL_GPL(kvm_configure_mmu); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index f463d54228f8..e878c88f0e02 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1308,7 +1308,7 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); } -bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_tdp_mmu_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range) { struct kvm_mmu_page *root; int offset = ffs(shadow_accessed_mask) - 1; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 0a63b1afabd3..aaa0b75b3896 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -34,6 +34,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); diff --git 
a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1714f82a0c47..7a0922cbc36f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -264,31 +264,15 @@ struct kvm_gfn_range { pte_t pte; bool may_block; }; + +typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); + bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn); -bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range); -#endif - -/* - * Architectures that implement kvm_arch_test_clear_young() should override - * kvm_arch_has_test_clear_young(). - * - * kvm_arch_has_test_clear_young() is allowed to return false positive, i.e., it - * can return true if kvm_arch_test_clear_young() is supported but disabled due - * to some
[PATCH 2/2] PCI: layerscape: Add the workaround for lost link capablities during reset
From: Xiaowei Bao A workaround for the issue where the PCI Express Endpoint (EP) controller loses the values of the Maximum Link Width and Supported Link Speed from the Link Capabilities Register, which initially configured by the Reset Configuration Word (RCW) during a link-down or hot reset event. Signed-off-by: Xiaowei Bao Signed-off-by: Hou Zhiqiang Signed-off-by: Frank Li --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index 4e4fdd1dfea7..2ef02d827eeb 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -45,6 +45,7 @@ struct ls_pcie_ep { struct pci_epc_features *ls_epc; const struct ls_pcie_ep_drvdata *drvdata; int irq; + u32 lnkcap; bool big_endian; }; @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) struct ls_pcie_ep *pcie = dev_id; struct dw_pcie *pci = pcie->pci; u32 val, cfg; + u8 offset; val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) return IRQ_NONE; if (val & PEX_PF0_PME_MES_DR_LUD) { + + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap); + dw_pcie_dbi_ro_wr_dis(pci); + cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); cfg |= PEX_PF0_CFG_READY; ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) struct ls_pcie_ep *pcie; struct pci_epc_features *ls_epc; struct resource *dbi_base; + u8 offset; int ret; pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pcie); + offset = dw_pcie_find_capability(pci,
PCI_CAP_ID_EXP); + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); + ret = dw_pcie_ep_init(>ep); if (ret) return ret; -- 2.34.1
Re: [PATCH mm-unstable v2 01/10] mm/kvm: add mmu_notifier_ops->test_clear_young()
On Fri, May 26, 2023, Yu Zhao wrote: > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 0e571e973bc2..374262545f96 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -258,6 +258,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); > #ifdef KVM_ARCH_WANT_MMU_NOTIFIER > struct kvm_gfn_range { > struct kvm_memory_slot *slot; > + void *args; There's no reason to make this "void *", just declare "struct test_clear_young_args" in the header. Arch code won't be able to use it regardless. And I vote for something more like "test_clear_young_metadata", as there's far more information in there than just function arguments. And to stave off the argument that "void *" would allow reuse, take this opportunity to unionize the test_clear_young field with the change_pte field, e.g. /* comment about these fields being callback specific. */ union { struct test_clear_young_metadata *metadata; pte_t pte; unsigned long callback_arg; /* needs a better name */ }; > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 51e4882d0873..31ee58754b19 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -541,6 +541,7 @@ typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned > long start, > typedef void (*on_unlock_fn_t)(struct kvm *kvm); > > struct kvm_hva_range { > + void *args; Same feedback as kvm_gfn_range. > unsigned long start; > unsigned long end; > pte_t pte; > @@ -549,6 +550,7 @@ struct kvm_hva_range { > on_unlock_fn_t on_unlock; > bool flush_on_ret; > bool may_block; > + bool lockless; > }; > > /* > @@ -602,6 +604,8 @@ static __always_inline int __kvm_handle_hva_range(struct > kvm *kvm, > hva_end = min(range->end, slot->userspace_addr + > (slot->npages << PAGE_SHIFT)); > > + gfn_range.args = range->args; And this goes away because the generic callback_arg is what gets propagated. 
> + > /* >* To optimize for the likely case where the address >* range is covered by zero or one memslots, don't > @@ -619,7 +623,7 @@ static __always_inline int __kvm_handle_hva_range(struct > kvm *kvm, > gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE > - 1, slot); > gfn_range.slot = slot; > > - if (!locked) { > + if (!range->lockless && !locked) { > locked = true; > KVM_MMU_LOCK(kvm); > if (!IS_KVM_NULL_FN(range->on_lock)) > @@ -628,6 +632,9 @@ static __always_inline int __kvm_handle_hva_range(struct > kvm *kvm, > break; > } > ret |= range->handler(kvm, _range); > + > + if (range->lockless && ret) I don't like overloading "lockless" to also mean "stop on ret". Just add another flag, there's literally no cost for most callbacks as everything is constant at compile time and gets optimized away. > + range.args = > + range.lockless = true; The lockless and stop_on_ret behavior needs comments. > + range.handler = kvm_arch_test_clear_young; > + > + if (!__kvm_handle_hva_range(kvm, )) > + return args.young ? MMU_NOTIFIER_RANGE_LOCKLESS : 0; > + } > + > + if (bitmap) > + return 0; > + > + range.args = NULL; > + range.lockless = false; No need to manually clear these, they'll be zeroed by the initialization code. E.g. 
all in all, something like so --- include/linux/kvm_host.h | 9 +++-- virt/kvm/kvm_main.c | 29 + 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7a0922cbc36f..e04605061f5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -256,12 +256,17 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef KVM_ARCH_WANT_MMU_NOTIFIER +struct test_clear_young_metadata; + struct kvm_gfn_range { struct kvm_memory_slot *slot; - void *args; gfn_t start; gfn_t end; - pte_t pte; + union { + struct test_clear_young_metadata *metadata; + pte_t pte; + unsigned long callback_arg; + }; bool may_block; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ac83cfb30771..8cf4fee9cd8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -536,16 +536,20 @@ typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, typedef void (*on_unlock_fn_t)(struct kvm *kvm); struct kvm_hva_range { - void *args; unsigned
Re: [PATCH mm-unstable v2 09/10] kvm/x86: add kvm_arch_test_clear_young()
On Fri, May 26, 2023, Yu Zhao wrote: > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > index 08340219c35a..6875a819e007 100644 > --- a/arch/x86/kvm/mmu/tdp_mmu.c > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > @@ -1232,6 +1232,40 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct > kvm_gfn_range *range) > return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); > } > > +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range) > +{ > + struct kvm_mmu_page *root; > + int offset = ffs(shadow_accessed_mask) - 1; > + > + if (kvm_shadow_root_allocated(kvm)) This needs a comment. > + return true; > + > + rcu_read_lock(); > + > + list_for_each_entry_rcu(root, >arch.tdp_mmu_roots, link) { As requested in v1[1], please add a macro for a lockless walk. [1] https://lkml.kernel.org/r/Y%2Fed0XYAPx%2B7pukA%40google.com > + struct tdp_iter iter; > + > + if (kvm_mmu_page_as_id(root) != range->slot->as_id) > + continue; > + > + tdp_root_for_each_leaf_pte(iter, root, range->start, > range->end) { > + u64 *sptep = rcu_dereference(iter.sptep); > + > + VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); Hrm, I don't like adding this in KVM. The primary MMU might guarantee that this callback is invoked if and only if the SPTE is backed by struct page memory, but there's no reason to assume that's true in KVM. If we want the sanity check, then this needs to use kvm_pfn_to_refcounted_page(). And it should use KVM's MMU_WARN_ON(), which is a mess and effectively dead code, but I'm working on changing that[*], i.e. by the time this gets to Linus' tree, the sanity check should have a much cleaner implementation. 
[2] https://lore.kernel.org/all/20230511235917.639770-8-sea...@google.com > + > + if (!(iter.old_spte & shadow_accessed_mask)) > + continue; > + > + if (kvm_should_clear_young(range, iter.gfn)) > + clear_bit(offset, (unsigned long *)sptep); If/when you rebase on https://github.com/kvm-x86/linux/tree/next, can you pull out the atomic bits of tdp_mmu_clear_spte_bits() and use that new helper? E.g. diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index fae559559a80..914c34518829 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -58,15 +58,18 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, return old_spte; } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level) { - atomic64_t *sptep_atomic; - - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { - sptep_atomic = (atomic64_t *)rcu_dereference(sptep); - return (u64)atomic64_fetch_and(~mask, sptep_atomic); - } + if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); return old_spte;
[PATCH 1/2] PCI: layerscape: Add support for Link down notification
Add support to pass Link down notification to Endpoint function driver so that the LINK_DOWN event can be processed by the function. Signed-off-by: Frank Li --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index de4c1758a6c3..4e4fdd1dfea7 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -88,6 +88,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) dev_dbg(pci->dev, "Link up\n"); } else if (val & PEX_PF0_PME_MES_DR_LDD) { + pci_epc_linkdown(pci->ep.epc); dev_dbg(pci->dev, "Link down\n"); } else if (val & PEX_PF0_PME_MES_DR_HRD) { dev_dbg(pci->dev, "Hot reset\n"); -- 2.34.1
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
On 6/15/23 09:13, Randy Dunlap wrote: > > > On 6/15/23 09:05, Timothy Pearson wrote: >> >> >> - Original Message - >>> From: "Randy Dunlap" >>> To: "Timothy Pearson" , "Michael Ellerman" >>> >>> Cc: "Stephen Rothwell" , "Linux Next Mailing List" >>> , "linux-kernel" >>> , "linuxppc-dev" >>> , "Alexey Kardashevskiy" >>> Sent: Thursday, June 15, 2023 11:00:08 AM >>> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >> >>> Hi Timothy, >>> >>> On 6/3/23 20:57, Timothy Pearson wrote: - Original Message - > From: "Michael Ellerman" > To: "Randy Dunlap" , "Stephen Rothwell" > , "Linux Next Mailing List" > > Cc: "linux-kernel" , "linuxppc-dev" > , "Alexey > Kardashevskiy" , "Timothy Pearson" > > Sent: Saturday, June 3, 2023 7:22:51 PM > Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > Randy Dunlap writes: >> On 6/1/23 21:01, Stephen Rothwell wrote: >>> Hi all, >>> >>> Changes since 20230601: >>> >> >> On powerpc64, a randconfig failed with: >> >> In file included from ../include/linux/list.h:5, >> from ../include/linux/preempt.h:11, >> from ../include/linux/spinlock.h:56, >> from ../include/linux/mmzone.h:8, >> from ../include/linux/gfp.h:7, >> from ../include/linux/slab.h:15, >> from ../arch/powerpc/kernel/iommu.c:15: >> ../arch/powerpc/kernel/iommu.c: In function >> 'spapr_tce_setup_phb_iommus_initcall': >> ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared >> (first use >> in this function); did you mean 'zonelist'? >> 1391 | list_for_each_entry(hose, _list, list_node) { >> |^ > ... > > hose_list is in pci-common.c which is built when PCI=y. > > PSERIES and POWERNV force PCI=y. > > But this config has neither: > > # CONFIG_PPC_POWERNV is not set > # CONFIG_PPC_PSERIES is not set > CONFIG_HAVE_PCI=y > # CONFIG_PCI is not set > # CONFIG_COMMON_CLK_RS9_PCIE is not set > > > Probably the spapr_tce code should be wrapped in an #ifdef that is only > enabled when POWERNV || PSERIES is enabled. 
> > cheers Sounds reasonable, I was going to look into this further over the weekend. I can put together a patch for Monday if that works? >>> >>> Did you prepare a patch for this? I am still seeing this build error. >>> >>> thanks. >>> -- >>> ~Randy >> >> Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not >> arrive? > > I don't know - I'm not subscribed to that list. > > It's probably still in the patchworks review cycle > so it hasn't been applied anywhere that gets into linux-next. OK, it's here, mark with Success: http://patchwork.ozlabs.org/project/linuxppc-dev/patch/2015925968.3546872.1685990936823.javamail.zim...@raptorengineeringinc.com/ I don't know what happens to it next or when. thanks. -- ~Randy
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
On 6/15/23 09:05, Timothy Pearson wrote: > > > - Original Message - >> From: "Randy Dunlap" >> To: "Timothy Pearson" , "Michael Ellerman" >> >> Cc: "Stephen Rothwell" , "Linux Next Mailing List" >> , "linux-kernel" >> , "linuxppc-dev" >> , "Alexey Kardashevskiy" >> Sent: Thursday, June 15, 2023 11:00:08 AM >> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > >> Hi Timothy, >> >> On 6/3/23 20:57, Timothy Pearson wrote: >>> >>> >>> - Original Message - From: "Michael Ellerman" To: "Randy Dunlap" , "Stephen Rothwell" , "Linux Next Mailing List" Cc: "linux-kernel" , "linuxppc-dev" , "Alexey Kardashevskiy" , "Timothy Pearson" Sent: Saturday, June 3, 2023 7:22:51 PM Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >>> Randy Dunlap writes: > On 6/1/23 21:01, Stephen Rothwell wrote: >> Hi all, >> >> Changes since 20230601: >> > > On powerpc64, a randconfig failed with: > > In file included from ../include/linux/list.h:5, > from ../include/linux/preempt.h:11, > from ../include/linux/spinlock.h:56, > from ../include/linux/mmzone.h:8, > from ../include/linux/gfp.h:7, > from ../include/linux/slab.h:15, > from ../arch/powerpc/kernel/iommu.c:15: > ../arch/powerpc/kernel/iommu.c: In function > 'spapr_tce_setup_phb_iommus_initcall': > ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared > (first use > in this function); did you mean 'zonelist'? > 1391 | list_for_each_entry(hose, _list, list_node) { > |^ ... hose_list is in pci-common.c which is built when PCI=y. PSERIES and POWERNV force PCI=y. But this config has neither: # CONFIG_PPC_POWERNV is not set # CONFIG_PPC_PSERIES is not set CONFIG_HAVE_PCI=y # CONFIG_PCI is not set # CONFIG_COMMON_CLK_RS9_PCIE is not set Probably the spapr_tce code should be wrapped in an #ifdef that is only enabled when POWERNV || PSERIES is enabled. cheers >>> >>> Sounds reasonable, I was going to look into this further over the weekend. 
>>> I >>> can put together a patch for Monday if that works? >> >> Did you prepare a patch for this? I am still seeing this build error. >> >> thanks. >> -- >> ~Randy > > Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not > arrive? I don't know - I'm not subscribed to that list. It's probably still in the patchworks review cycle so it hasn't been applied anywhere that gets into linux-next. thanks. -- ~Randy
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
- Original Message - > From: "Randy Dunlap" > To: "Timothy Pearson" , "Michael Ellerman" > > Cc: "Stephen Rothwell" , "Linux Next Mailing List" > , "linux-kernel" > , "linuxppc-dev" > , "Alexey Kardashevskiy" > Sent: Thursday, June 15, 2023 11:00:08 AM > Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > Hi Timothy, > > On 6/3/23 20:57, Timothy Pearson wrote: >> >> >> - Original Message - >>> From: "Michael Ellerman" >>> To: "Randy Dunlap" , "Stephen Rothwell" >>> , "Linux Next Mailing List" >>> >>> Cc: "linux-kernel" , "linuxppc-dev" >>> , "Alexey >>> Kardashevskiy" , "Timothy Pearson" >>> >>> Sent: Saturday, June 3, 2023 7:22:51 PM >>> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >> >>> Randy Dunlap writes: On 6/1/23 21:01, Stephen Rothwell wrote: > Hi all, > > Changes since 20230601: > On powerpc64, a randconfig failed with: In file included from ../include/linux/list.h:5, from ../include/linux/preempt.h:11, from ../include/linux/spinlock.h:56, from ../include/linux/mmzone.h:8, from ../include/linux/gfp.h:7, from ../include/linux/slab.h:15, from ../arch/powerpc/kernel/iommu.c:15: ../arch/powerpc/kernel/iommu.c: In function 'spapr_tce_setup_phb_iommus_initcall': ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared (first use in this function); did you mean 'zonelist'? 1391 | list_for_each_entry(hose, _list, list_node) { |^ >>> ... >>> >>> hose_list is in pci-common.c which is built when PCI=y. >>> >>> PSERIES and POWERNV force PCI=y. >>> >>> But this config has neither: >>> >>> # CONFIG_PPC_POWERNV is not set >>> # CONFIG_PPC_PSERIES is not set >>> CONFIG_HAVE_PCI=y >>> # CONFIG_PCI is not set >>> # CONFIG_COMMON_CLK_RS9_PCIE is not set >>> >>> >>> Probably the spapr_tce code should be wrapped in an #ifdef that is only >>> enabled when POWERNV || PSERIES is enabled. >>> >>> cheers >> >> Sounds reasonable, I was going to look into this further over the weekend. 
I >> can put together a patch for Monday if that works? > > Did you prepare a patch for this? I am still seeing this build error. > > thanks. > -- > ~Randy Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not arrive?
file removal issue in tools/testing/selftests/powerpc/mm/tlbie_test.c
Hi, Static analysis with cppcheck has found an issue in the following commit: commit 047e6575aec71d75b765c22111820c4776cd1c43 Author: Aneesh Kumar K.V Date: Tue Sep 24 09:22:53 2019 +0530 powerpc/mm: Fixup tlbie vs mtpidr/mtlpidr ordering issue on POWER9 The issue in tools/testing/selftests/powerpc/mm/tlbie_test.c in end_verification_log() is as follows: static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies) { FILE *f = fp[tid]; char logfile[30]; char path[LOGDIR_NAME_SIZE + 30]; char separator[] = "/"; fclose(f); if (nr_anamolies == 0) { remove(path); return; } etc in the case where nr_anamolies is zero the remove(path) call is using an uninitialized path, this potentially could contain uninitialized garbage on the stack (and if one is unlucky enough it may be a valid filename that one does not want to be removed). Not sure what the original intention was, but this code looks incorrect to me. Colin
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
Hi Timothy, On 6/3/23 20:57, Timothy Pearson wrote: > > > - Original Message - >> From: "Michael Ellerman" >> To: "Randy Dunlap" , "Stephen Rothwell" >> , "Linux Next Mailing List" >> >> Cc: "linux-kernel" , "linuxppc-dev" >> , "Alexey >> Kardashevskiy" , "Timothy Pearson" >> >> Sent: Saturday, June 3, 2023 7:22:51 PM >> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > >> Randy Dunlap writes: >>> On 6/1/23 21:01, Stephen Rothwell wrote: Hi all, Changes since 20230601: >>> >>> On powerpc64, a randconfig failed with: >>> >>> In file included from ../include/linux/list.h:5, >>> from ../include/linux/preempt.h:11, >>> from ../include/linux/spinlock.h:56, >>> from ../include/linux/mmzone.h:8, >>> from ../include/linux/gfp.h:7, >>> from ../include/linux/slab.h:15, >>> from ../arch/powerpc/kernel/iommu.c:15: >>> ../arch/powerpc/kernel/iommu.c: In function >>> 'spapr_tce_setup_phb_iommus_initcall': >>> ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared >>> (first use >>> in this function); did you mean 'zonelist'? >>> 1391 | list_for_each_entry(hose, _list, list_node) { >>> |^ >> ... >> >> hose_list is in pci-common.c which is built when PCI=y. >> >> PSERIES and POWERNV force PCI=y. >> >> But this config has neither: >> >> # CONFIG_PPC_POWERNV is not set >> # CONFIG_PPC_PSERIES is not set >> CONFIG_HAVE_PCI=y >> # CONFIG_PCI is not set >> # CONFIG_COMMON_CLK_RS9_PCIE is not set >> >> >> Probably the spapr_tce code should be wrapped in an #ifdef that is only >> enabled when POWERNV || PSERIES is enabled. >> >> cheers > > Sounds reasonable, I was going to look into this further over the weekend. I > can put together a patch for Monday if that works? Did you prepare a patch for this? I am still seeing this build error. thanks. -- ~Randy
Re: [PATCH v2 07/23] mips: update_mmu_cache() can replace __update_tlb()
On Wed, Jun 14, 2023 at 10:43:30PM -0700, Hugh Dickins wrote: > On Wed, 14 Jun 2023, Hugh Dickins wrote: > > On Wed, 14 Jun 2023, Nathan Chancellor wrote: > > > > > > I just bisected a crash while powering down a MIPS machine in QEMU to > > > this change as commit 8044511d3893 ("mips: update_mmu_cache() can > > > replace __update_tlb()") in linux-next. > > > > Thank you, Nathan, that's very helpful indeed. This patch certainly knew > > that it wanted testing, and I'm glad to hear that it is now seeing some. > > > > While powering down? The messages below look like it was just coming up, > > but no doubt that's because you were bisecting (or because I'm unfamiliar > > with what messages to expect there). It's probably irrelevant information, > > but I wonder whether the (V)machine worked well enough for a while before > > you first powered down and spotted the problem, or whether it's never got > > much further than trying to run init (busybox)? I'm trying to get a feel > > for whether the problem occurs under common or uncommon conditions. Ugh sorry, I have been looking into too many bugs lately and got my wires crossed :) this is indeed a problem when running init (which is busybox, this is a simple Buildroot file system). > > > Unfortunately, I can still > > > reproduce it with the existing fix you have for this change on the > > > mailing list, which is present in next-20230614. > > > > Right, that later fix was only for a build warning, nothing functional > > (or at least I hoped that it wasn't making any functional difference). > > > > Thanks a lot for the detailed instructions below: unfortunately, those > > would draw me into a realm of testing I've never needed to enter before, > > so a lot of time spent on setup and learning. Usually, I just stare at > > the source. > > > > What this probably says is that I should revert most my cleanup there, > > and keep as close to the existing code as possible. 
But some change is > > needed, and I may need to understand (or have a good guess at) what was > > going wrong, to decide what kind of retreat will be successful. > > > > Back to the source for a while: I hope I'll find examples in nearby MIPS > > kernel source (and git history), which will hint at the right way forward. > > Then send you a patch against next-20230614 to try, when I'm reasonably > > confident that it's enough to satisfy my purpose, but likely not to waste > > your time. > > I'm going to take advantage of your good nature by attaching > two alternative patches, either to go on top of next-20230614. > > mips1.patch, > arch/mips/mm/tlb-r4k.c | 12 +--- > 1 file changed, 1 insertion(+), 11 deletions(-) > > is by far my favourite. I couldn't see anything wrong with what's > already there for mips, but it seems possible that (though I didn't > find it) somewhere calls update_mmu_cache_pmd() on a page table. So > mips1.patch restores the pmd_huge() check, and cleans up further by > removing the silly pgdp, p4dp, pudp, pmdp stuff: the pointer has now > been passed in by the caller, why walk the tree again? I should have > done it this way before. > > But if that doesn't work, then I'm afraid it will have to be > mips2.patch, > arch/mips/include/asm/pgtable.h | 15 --- > arch/mips/mm/tlb-r3k.c |5 ++--- > arch/mips/mm/tlb-r4k.c | 27 ++- > 3 files changed, 32 insertions(+), 15 deletions(-) > > which reverts all of the original patch and its build warning fix, > and does a pte_unmap() to balance the silly pte_offset_map() there; > with an apologetic comment for this being about the only place in > the tree where I have no idea what to do if ptep were NULL. > > I do hope that you find the first fixes the breakage; but if not, then I hate to be the bearer of bad news but the first patch did not fix the breakage, I see the same issue. > I even more fervently hope that the second will, despite my hating it. 
> Touch wood for the first, fingers crossed for the second, thanks, Thankfully, the second one does. Thanks for the quick and thoughtful responses! Cheers, Nathan
[PATCH 04/10] cpu/SMT: Remove topology_smt_supported()
Since the maximum number of threads is now passed to cpu_smt_set_num_threads(), checking that value is enough to know if SMT is supported. Cc: Michael Ellerman Suggested-by: Thomas Gleixner Signed-off-by: Laurent Dufour --- arch/x86/include/asm/topology.h | 2 -- arch/x86/kernel/smpboot.c | 8 kernel/cpu.c| 2 +- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 66927a59e822..87358a8fe843 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,7 +143,6 @@ int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); -bool topology_smt_supported(void); #else #define topology_max_packages()(1) static inline int @@ -156,7 +155,6 @@ static inline int topology_phys_to_logical_die(unsigned int die, static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } -static inline bool topology_smt_supported(void) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 352f0ce1ece4..3052c171668d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -278,14 +278,6 @@ bool topology_is_primary_thread(unsigned int cpu) return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); } -/** - * topology_smt_supported - Check whether SMT is supported by the CPUs - */ -bool topology_smt_supported(void) -{ - return smp_num_siblings > 1; -} - /** * topology_phys_to_logical_pkg - Map a physical package id to a logical * diff --git a/kernel/cpu.c b/kernel/cpu.c index edca8b7bd400..e354af92b2b8 100644 --- a/kernel/cpu.c +++ 
b/kernel/cpu.c @@ -442,7 +442,7 @@ void __init cpu_smt_set_num_threads(unsigned int num_threads, { WARN_ON(!num_threads || (num_threads > max_threads)); - if (!topology_smt_supported()) + if (max_threads == 1) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; cpu_smt_max_threads = max_threads; -- 2.41.0
[PATCH 08/10] powerpc/pseries: Initialise CPU hotplug callbacks earlier
From: Michael Ellerman As part of the generic HOTPLUG_SMT code, there is support for disabling secondary SMT threads at boot time, by passing "nosmt" on the kernel command line. The way that is implemented is the secondary threads are brought partly online, and then taken back offline again. That is done to support x86 CPUs needing certain initialisation done on all threads. However powerpc has similar needs, see commit d70a54e2d085 ("powerpc/powernv: Ignore smt-enabled on Power8 and later"). For that to work the powerpc CPU hotplug callbacks need to be registered before secondary CPUs are brought online, otherwise __cpu_disable() fails due to smp_ops->cpu_disable being NULL. So split the basic initialisation into pseries_cpu_hotplug_init() which can be called early from setup_arch(). The DLPAR related initialisation can still be done later, because it needs to do allocations. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 22 arch/powerpc/platforms/pseries/pseries.h | 2 ++ arch/powerpc/platforms/pseries/setup.c | 2 ++ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 1a3cb313976a..61fb7cb00880 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -845,15 +845,9 @@ static struct notifier_block pseries_smp_nb = { .notifier_call = pseries_smp_notifier, }; -static int __init pseries_cpu_hotplug_init(void) +void __init pseries_cpu_hotplug_init(void) { int qcss_tok; - unsigned int node; - -#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE - ppc_md.cpu_probe = dlpar_cpu_probe; - ppc_md.cpu_release = dlpar_cpu_release; -#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ rtas_stop_self_token = rtas_function_token(RTAS_FN_STOP_SELF); qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE); @@ -862,12 +856,22 @@ static int __init pseries_cpu_hotplug_init(void) qcss_tok == 
RTAS_UNKNOWN_SERVICE) { printk(KERN_INFO "CPU Hotplug not supported by firmware " "- disabling.\n"); - return 0; + return; } smp_ops->cpu_offline_self = pseries_cpu_offline_self; smp_ops->cpu_disable = pseries_cpu_disable; smp_ops->cpu_die = pseries_cpu_die; +} + +static int __init pseries_dlpar_init(void) +{ + unsigned int node; + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ /* Processors can be added/removed only on LPAR */ if (firmware_has_feature(FW_FEATURE_LPAR)) { @@ -886,4 +890,4 @@ static int __init pseries_cpu_hotplug_init(void) return 0; } -machine_arch_initcall(pseries, pseries_cpu_hotplug_init); +machine_arch_initcall(pseries, pseries_dlpar_init); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index f8bce40ebd0c..f8893ba46e83 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -75,11 +75,13 @@ static inline int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) #ifdef CONFIG_HOTPLUG_CPU int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); +void pseries_cpu_hotplug_init(void); #else static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { return -EOPNOTSUPP; } +static inline void pseries_cpu_hotplug_init(void) { } #endif /* PCI root bridge prepare function override for pseries */ diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index e2a57cfa6c83..41451b76c6e5 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -816,6 +816,8 @@ static void __init pSeries_setup_arch(void) /* Discover PIC type and setup ppc_md accordingly */ smp_init_pseries(); + // Setup CPU hotplug callbacks + pseries_cpu_hotplug_init(); if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE)) if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) -- 2.41.0
[PATCH 01/10] cpu/SMT: Move SMT prototypes into cpu_smt.h
From: Michael Ellerman A subsequent patch would like to use the cpuhp_smt_control enum as part of the interface between generic and arch code. Currently that leads to circular header dependencies. So split the enum and related declarations into a separate header. Signed-off-by: Michael Ellerman --- arch/x86/include/asm/topology.h | 2 ++ include/linux/cpu.h | 25 + include/linux/cpu_smt.h | 29 + kernel/cpu.c| 1 + 4 files changed, 33 insertions(+), 24 deletions(-) create mode 100644 include/linux/cpu_smt.h diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 458c891a8273..66927a59e822 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -136,6 +136,8 @@ static inline int topology_max_smt_threads(void) return __max_smt_threads; } +#include + int topology_update_package_map(unsigned int apicid, unsigned int cpu); int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 8582a7142623..40548f3c201c 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -18,6 +18,7 @@ #include #include #include +#include struct device; struct device_node; @@ -202,30 +203,6 @@ void cpuhp_report_idle_dead(void); static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -enum cpuhp_smt_control { - CPU_SMT_ENABLED, - CPU_SMT_DISABLED, - CPU_SMT_FORCE_DISABLED, - CPU_SMT_NOT_SUPPORTED, - CPU_SMT_NOT_IMPLEMENTED, -}; - -#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) -extern enum cpuhp_smt_control cpu_smt_control; -extern void cpu_smt_disable(bool force); -extern void cpu_smt_check_topology(void); -extern bool cpu_smt_possible(void); -extern int cpuhp_smt_enable(void); -extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); -#else -# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) -static inline void cpu_smt_disable(bool force) { } -static inline void 
cpu_smt_check_topology(void) { } -static inline bool cpu_smt_possible(void) { return false; } -static inline int cpuhp_smt_enable(void) { return 0; } -static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } -#endif - extern bool cpu_mitigations_off(void); extern bool cpu_mitigations_auto_nosmt(void); diff --git a/include/linux/cpu_smt.h b/include/linux/cpu_smt.h new file mode 100644 index ..722c2e306fef --- /dev/null +++ b/include/linux/cpu_smt.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CPU_SMT_H_ +#define _LINUX_CPU_SMT_H_ + +enum cpuhp_smt_control { + CPU_SMT_ENABLED, + CPU_SMT_DISABLED, + CPU_SMT_FORCE_DISABLED, + CPU_SMT_NOT_SUPPORTED, + CPU_SMT_NOT_IMPLEMENTED, +}; + +#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) +extern enum cpuhp_smt_control cpu_smt_control; +extern void cpu_smt_disable(bool force); +extern void cpu_smt_check_topology(void); +extern bool cpu_smt_possible(void); +extern int cpuhp_smt_enable(void); +extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); +#else +# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) +static inline void cpu_smt_disable(bool force) { } +static inline void cpu_smt_check_topology(void) { } +static inline bool cpu_smt_possible(void) { return false; } +static inline int cpuhp_smt_enable(void) { return 0; } +static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } +#endif + +#endif /* _LINUX_CPU_SMT_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index f4a2c5845bcb..237394e0574a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,7 @@ static void lockdep_release_cpus_lock(void) void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT + enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; void __init cpu_smt_disable(bool force) -- 2.41.0
[PATCH 03/10] cpu/SMT: Store the current/max number of threads
From: Michael Ellerman Some architectures (ppc64) allows partial SMT states at boot time, ie. when not all SMT threads are brought online. To support that the SMT code needs to know the maximum number of SMT threads, and also the currently configured number. The architecture code knows the max number of threads, so have the architecture code pass that value to cpu_smt_set_num_threads(). Note that although topology_max_smt_threads() exists, it is not configured early enough to be used here. As architecture, like PowerPC, allows the threads number to be set through the kernel command line, also pass that value. Signed-off-by: Michael Ellerman [ldufour: slightly reword the commit message] [ldufour: rename cpu_smt_check_topology and add a num_threads argument] Signed-off-by: Laurent Dufour --- arch/x86/kernel/cpu/bugs.c | 3 ++- include/linux/cpu_smt.h| 8 ++-- kernel/cpu.c | 21 - 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 182af64387d0..ed71ad385ea7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "cpu.h" @@ -133,7 +134,7 @@ void __init check_bugs(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. 
*/ - cpu_smt_check_topology(); + cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/include/linux/cpu_smt.h b/include/linux/cpu_smt.h index 722c2e306fef..0c1664294b57 100644 --- a/include/linux/cpu_smt.h +++ b/include/linux/cpu_smt.h @@ -12,15 +12,19 @@ enum cpuhp_smt_control { #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; +extern unsigned int cpu_smt_num_threads; extern void cpu_smt_disable(bool force); -extern void cpu_smt_check_topology(void); +extern void cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads); extern bool cpu_smt_possible(void); extern int cpuhp_smt_enable(void); extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); #else # define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) +# define cpu_smt_num_threads 1 static inline void cpu_smt_disable(bool force) { } -static inline void cpu_smt_check_topology(void) { } +static inline void cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads) { } static inline bool cpu_smt_possible(void) { return false; } static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index c67049bb3fc8..edca8b7bd400 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -415,6 +415,8 @@ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; +static unsigned int cpu_smt_max_threads __ro_after_init; +unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX; void __init cpu_smt_disable(bool force) { @@ -428,16 +430,33 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } + cpu_smt_num_threads = 1; } /* * The decision whether SMT is supported can only be done after the full * CPU identification. 
Called from architecture code. */ -void __init cpu_smt_check_topology(void) +void __init cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads) { + WARN_ON(!num_threads || (num_threads > max_threads)); + if (!topology_smt_supported()) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + + cpu_smt_max_threads = max_threads; + + /* +* If SMT has been disabled via the kernel command line or SMT is +* not supported, set cpu_smt_num_threads to 1 for consistency. +* If enabled, take the architecture requested number of threads +* to bring up into account. +*/ + if (cpu_smt_control != CPU_SMT_ENABLED) + cpu_smt_num_threads = 1; + else if (num_threads < cpu_smt_num_threads) + cpu_smt_num_threads = num_threads; } static int __init smt_cmdline_disable(char *str) -- 2.41.0
[PATCH 09/10] powerpc: Add HOTPLUG_SMT support
From: Michael Ellerman Add support for HOTPLUG_SMT, which enables the generic sysfs SMT support files in /sys/devices/system/cpu/smt, as well as the "nosmt" boot parameter. Implement the recently added hooks to allow partial SMT states, allow any number of threads per core. Tie the config symbol to HOTPLUG_CPU, which enables it on the major platforms that support SMT. If there are other platforms that want the SMT support that can be tweaked in future. Signed-off-by: Michael Ellerman [ldufour: pass current SMT level to cpu_smt_set_num_threads] [ldufour: remove topology_smt_supported] Signed-off-by: Laurent Dufour --- arch/powerpc/Kconfig| 1 + arch/powerpc/include/asm/topology.h | 20 arch/powerpc/kernel/smp.c | 8 +++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 539d1f03ff42..5cf87ca10a9c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -273,6 +273,7 @@ config PPC select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_VIRT_CPU_ACCOUNTING_GEN + select HOTPLUG_SMT if HOTPLUG_CPU select HUGETLB_PAGE_SIZE_VARIABLE if PPC_BOOK3S_64 && HUGETLB_PAGE select IOMMU_HELPER if PPC64 select IRQ_DOMAIN diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 8a4d4f4d9749..7602f17d688a 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -143,5 +143,25 @@ static inline int cpu_to_coregroup_id(int cpu) #endif #endif +#ifdef CONFIG_HOTPLUG_SMT +#include +#include + +static inline bool topology_smt_threads_supported(unsigned int num_threads) +{ + return num_threads <= threads_per_core; +} + +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + return cpu == cpu_first_thread_sibling(cpu); +} + +static inline bool topology_smt_thread_allowed(unsigned int cpu) +{ + return cpu_thread_in_core(cpu) < cpu_smt_num_threads; +} +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TOPOLOGY_H */ 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 265801a3e94c..cdb77d36cdd0 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1087,7 +1087,7 @@ static int __init init_big_cores(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - unsigned int cpu; + unsigned int cpu, num_threads; DBG("smp_prepare_cpus\n"); @@ -1154,6 +1154,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (smp_ops && smp_ops->probe) smp_ops->probe(); + + // Initialise the generic SMT topology support + num_threads = 1; + if (smt_enabled_at_boot) + num_threads = smt_enabled_at_boot; + cpu_smt_set_num_threads(num_threads, threads_per_core); } void smp_prepare_boot_cpu(void) -- 2.41.0
[PATCH 07/10] cpu/SMT: Allow enabling partial SMT states via sysfs
From: Michael Ellerman Add support to the /sys/devices/system/cpu/smt/control interface for enabling a specified number of SMT threads per core, including partial SMT states where not all threads are brought online. The current interface accepts "on" and "off", to enable either 1 or all SMT threads per core. This commit allows writing an integer, between 1 and the number of SMT threads supported by the machine. Writing 1 is a synonym for "off", 2 or more enables SMT with the specified number of threads. When reading the file, if all threads are online "on" is returned, to avoid changing behaviour for existing users. If some other number of threads is online then the integer value is returned. There is a hook which allows arch code to control how many threads per core are supported. To retain the existing behaviour, the x86 hook only supports 1 thread or all threads. Signed-off-by: Michael Ellerman --- .../ABI/testing/sysfs-devices-system-cpu | 1 + kernel/cpu.c | 39 --- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index f54867cadb0f..3c4cfb59d495 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -555,6 +555,7 @@ Description:Control Symmetric Multi Threading (SMT) = "on" SMT is enabled "off"SMT is disabled +""SMT is enabled with N threads per core. "forceoff" SMT is force disabled. Cannot be changed. 
"notsupported" SMT is not supported by the CPU "notimplemented" SMT runtime toggling is not diff --git a/kernel/cpu.c b/kernel/cpu.c index ae2fa26a5b63..248f0734098a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2507,7 +2507,7 @@ static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int ctrlval, ret; + int ctrlval, ret, num_threads, orig_threads; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) return -EPERM; @@ -2515,20 +2515,38 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return -ENODEV; - if (sysfs_streq(buf, "on")) + if (sysfs_streq(buf, "on")) { ctrlval = CPU_SMT_ENABLED; - else if (sysfs_streq(buf, "off")) + num_threads = cpu_smt_max_threads; + } else if (sysfs_streq(buf, "off")) { ctrlval = CPU_SMT_DISABLED; - else if (sysfs_streq(buf, "forceoff")) + num_threads = 1; + } else if (sysfs_streq(buf, "forceoff")) { ctrlval = CPU_SMT_FORCE_DISABLED; - else + num_threads = 1; + } else if (kstrtoint(buf, 10, _threads) == 0) { + if (num_threads == 1) + ctrlval = CPU_SMT_DISABLED; + else if (num_threads > 1 && topology_smt_threads_supported(num_threads)) + ctrlval = CPU_SMT_ENABLED; + else + return -EINVAL; + } else { return -EINVAL; + } ret = lock_device_hotplug_sysfs(); if (ret) return ret; - if (ctrlval != cpu_smt_control) { + orig_threads = cpu_smt_num_threads; + cpu_smt_num_threads = num_threads; + + if (num_threads > orig_threads) { + ret = cpuhp_smt_enable(); + } else if (num_threads < orig_threads) { + ret = cpuhp_smt_disable(ctrlval); + } else if (ctrlval != cpu_smt_control) { switch (ctrlval) { case CPU_SMT_ENABLED: ret = cpuhp_smt_enable(); @@ -2566,6 +2584,15 @@ static ssize_t control_show(struct device *dev, { const char *state = smt_states[cpu_smt_control]; + /* +* If SMT is enabled but not all threads are enabled then show the +* number of threads. If all threads are enabled show "on". 
Otherwise +* show the state name. +*/ + if (cpu_smt_control == CPU_SMT_ENABLED && + cpu_smt_num_threads != cpu_smt_max_threads) + return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); } -- 2.41.0
[PATCH 02/10] cpu/SMT: Move smt/control simple exit cases earlier
From: Michael Ellerman Move the simple exit cases, ie. which don't depend on the value written, earlier in the function. That makes it clearer that regardless of the input those states can not be transitioned out of. That does have a user-visible effect, in that the error returned will now always be EPERM/ENODEV for those states, regardless of the value written. Previously writing an invalid value would return EINVAL even when in those states. Signed-off-by: Michael Ellerman --- kernel/cpu.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 237394e0574a..c67049bb3fc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2482,6 +2482,12 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, { int ctrlval, ret; + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) + return -EPERM; + + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return -ENODEV; + if (sysfs_streq(buf, "on")) ctrlval = CPU_SMT_ENABLED; else if (sysfs_streq(buf, "off")) @@ -2491,12 +2497,6 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, else return -EINVAL; - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) - return -EPERM; - - if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) - return -ENODEV; - ret = lock_device_hotplug_sysfs(); if (ret) return ret; -- 2.41.0
[PATCH 00/10] Introduce SMT level and add PowerPC support
I'm taking over the series Michael sent previously [1] which is smartly reviewing the initial series I sent [2]. This series is addressing the comments sent by Thomas and me on Michael's one. Here is a short introduction to the issue this series is addressing: When a new CPU is added, the kernel is activating all its threads. This leads to weird, but functional, results when adding a CPU on an SMT 4 system for instance. Here the newly added CPU 1 has 8 threads while the other one has 4 threads active (system has been booted with the 'smt-enabled=4' kernel option): ltcden3-lp12:~ # ppc64_cpu --info Core 0:0*1*2*3*4 5 6 7 Core 1:8*9* 10* 11* 12* 13* 14* 15* This mixed SMT level may confuse end users and/or some applications. There is no SMT level recorded in the kernel (common code), nor in user space, as far as I know. Such a level is helpful when adding a new CPU or when optimizing the energy efficiency (when reactivating CPUs). When SMP and HOTPLUG_SMT are defined, this series is adding a new SMT level (cpu_smt_num_threads) and a few callbacks allowing the architecture code to fine control this value, setting a max and an "at boot" level, and controlling whether a thread should be onlined or not.
[1] https://lore.kernel.org/linuxppc-dev/20230524155630.794584-1-...@ellerman.id.au/ [2] https://lore.kernel.org/linuxppc-dev/20230331153905.31698-1-lduf...@linux.ibm.com/ Laurent Dufour (1): cpu/SMT: Remove topology_smt_supported() Michael Ellerman (9): cpu/SMT: Move SMT prototypes into cpu_smt.h cpu/SMT: Move smt/control simple exit cases earlier cpu/SMT: Store the current/max number of threads cpu/SMT: Create topology_smt_threads_supported() cpu/SMT: Create topology_smt_thread_allowed() cpu/SMT: Allow enabling partial SMT states via sysfs powerpc/pseries: Initialise CPU hotplug callbacks earlier powerpc: Add HOTPLUG_SMT support powerpc/pseries: Honour current SMT state when DLPAR onlining CPUs .../ABI/testing/sysfs-devices-system-cpu | 1 + arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/topology.h | 20 + arch/powerpc/kernel/smp.c | 8 +- arch/powerpc/platforms/pseries/hotplug-cpu.c | 30 +-- arch/powerpc/platforms/pseries/pseries.h | 2 + arch/powerpc/platforms/pseries/setup.c| 2 + arch/x86/include/asm/topology.h | 8 +- arch/x86/kernel/cpu/bugs.c| 3 +- arch/x86/kernel/smpboot.c | 25 +- include/linux/cpu.h | 25 +- include/linux/cpu_smt.h | 33 kernel/cpu.c | 83 +++ 13 files changed, 187 insertions(+), 54 deletions(-) create mode 100644 include/linux/cpu_smt.h -- 2.41.0
[PATCH 06/10] cpu/SMT: Create topology_smt_thread_allowed()
From: Michael Ellerman A subsequent patch will enable partial SMT states, ie. when not all SMT threads are brought online. To support that, add an arch helper which checks whether a given CPU is allowed to be brought online depending on how many SMT threads are currently enabled. Call the helper from cpu_smt_enable(), and cpu_smt_allowed() when SMT is enabled, to check if the particular thread should be onlined. Notably, also call it from cpu_smt_disable() if CPU_SMT_ENABLED, to allow offlining some threads to move from a higher to lower number of threads online. Signed-off-by: Michael Ellerman --- arch/x86/include/asm/topology.h | 2 ++ arch/x86/kernel/smpboot.c | 15 +++ kernel/cpu.c| 10 +- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 232df5ffab34..4696d4566cb5 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -144,6 +144,7 @@ int topology_phys_to_logical_pkg(unsigned int pkg); int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); bool topology_smt_threads_supported(unsigned int threads); +bool topology_smt_thread_allowed(unsigned int cpu); #else #define topology_max_packages()(1) static inline int @@ -157,6 +158,7 @@ static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline bool topology_smt_threads_supported(unsigned int threads) { return false; } +static inline bool topology_smt_thread_allowed(unsigned int cpu) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d163ef55577b..cfae55c2d1b0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -290,6 +290,21 @@ bool 
topology_smt_threads_supported(unsigned int threads) return threads == 1 || threads == smp_num_siblings; } +/** + * topology_smt_thread_allowed - When enabling SMT check whether this particular + * CPU thread is allowed to be brought online. + * @cpu: CPU to check + */ +bool topology_smt_thread_allowed(unsigned int cpu) +{ + /* +* No extra logic is required here to support different thread values +* because threads will always == 1 or smp_num_siblings because of +* topology_smt_threads_supported(). +*/ + return true; +} + /** * topology_phys_to_logical_pkg - Map a physical package id to a logical * diff --git a/kernel/cpu.c b/kernel/cpu.c index e354af92b2b8..ae2fa26a5b63 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -468,7 +468,7 @@ early_param("nosmt", smt_cmdline_disable); static inline bool cpu_smt_allowed(unsigned int cpu) { - if (cpu_smt_control == CPU_SMT_ENABLED) + if (cpu_smt_control == CPU_SMT_ENABLED && topology_smt_thread_allowed(cpu)) return true; if (topology_is_primary_thread(cpu)) @@ -2283,6 +2283,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; + /* +* Disable can be called with CPU_SMT_ENABLED when changing +* from a higher to lower number of SMT threads per core. +*/ + if (ctrlval == CPU_SMT_ENABLED && topology_smt_thread_allowed(cpu)) + continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; @@ -2317,6 +2323,8 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; + if (!topology_smt_thread_allowed(cpu)) + continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; -- 2.41.0
[PATCH 10/10] powerpc/pseries: Honour current SMT state when DLPAR onlining CPUs
From: Michael Ellerman Integrate with the generic SMT support, so that when a CPU is DLPAR onlined it is brought up with the correct SMT mode. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 61fb7cb00880..e62835a12d73 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -398,6 +398,14 @@ static int dlpar_online_cpu(struct device_node *dn) for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != thread) continue; + + if (!topology_is_primary_thread(cpu)) { + if (cpu_smt_control != CPU_SMT_ENABLED) + break; + if (!topology_smt_thread_allowed(cpu)) + break; + } + cpu_maps_update_done(); find_and_update_cpu_nid(cpu); rc = device_online(get_cpu_device(cpu)); -- 2.41.0
[PATCH 05/10] cpu/SMT: Create topology_smt_threads_supported()
From: Michael Ellerman A subsequent patch will enable partial SMT states, ie. when not all SMT threads are brought online. To support that, add an arch helper to check how many SMT threads are supported. To retain existing behaviour, the x86 implementation only allows a single thread or all threads to be online. Signed-off-by: Michael Ellerman --- arch/x86/include/asm/topology.h | 2 ++ arch/x86/kernel/smpboot.c | 12 2 files changed, 14 insertions(+) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 87358a8fe843..232df5ffab34 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,6 +143,7 @@ int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); +bool topology_smt_threads_supported(unsigned int threads); #else #define topology_max_packages()(1) static inline int @@ -155,6 +156,7 @@ static inline int topology_phys_to_logical_die(unsigned int die, static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline bool topology_smt_threads_supported(unsigned int threads) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3052c171668d..d163ef55577b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -278,6 +278,18 @@ bool topology_is_primary_thread(unsigned int cpu) return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); } +/** + * topology_smt_threads_supported - Check if the given number of SMT threads + * is supported. + * + * @threads: The number of SMT threads. 
+ */ +bool topology_smt_threads_supported(unsigned int threads) +{ + // Only support a single thread or all threads. + return threads == 1 || threads == smp_num_siblings; +} + /** * topology_phys_to_logical_pkg - Map a physical package id to a logical * -- 2.41.0
[RFC PATCH] powerpc/ftrace: Create a dummy stackframe to fix stack unwind
With ppc64 -mprofile-kernel and ppc32 -pg, profiling instructions to call into ftrace are emitted right at function entry. The instruction sequence used is minimal to reduce overhead. Crucially, a stackframe is not created for the function being traced. This breaks stack unwinding since the function being traced does not have a stackframe for itself. As such, it never shows up in the backtrace: /sys/kernel/debug/tracing # echo 1 > /proc/sys/kernel/stack_tracer_enabled /sys/kernel/debug/tracing # cat stack_trace DepthSize Location(17 entries) - 0) 4144 32 ftrace_call+0x4/0x44 1) 4112 432 get_page_from_freelist+0x26c/0x1ad0 2) 3680 496 __alloc_pages+0x290/0x1280 3) 3184 336 __folio_alloc+0x34/0x90 4) 2848 176 vma_alloc_folio+0xd8/0x540 5) 2672 272 __handle_mm_fault+0x700/0x1cc0 6) 2400 208 handle_mm_fault+0xf0/0x3f0 7) 2192 80 ___do_page_fault+0x3e4/0xbe0 8) 2112 160 do_page_fault+0x30/0xc0 9) 1952 256 data_access_common_virt+0x210/0x220 10) 1696 400 0xcf16b100 11) 1296 384 load_elf_binary+0x804/0x1b80 12) 912 208 bprm_execve+0x2d8/0x7e0 13) 704 64 do_execveat_common+0x1d0/0x2f0 14) 640 160 sys_execve+0x54/0x70 15) 480 64 system_call_exception+0x138/0x350 16) 416 416 system_call_common+0x160/0x2c4 Fix this by having ftrace create a dummy stackframe for the function being traced. Since this is only relevant when ftrace is active, we nop out the instruction to store LR in the LR save area in the profiling instruction sequence on ppc32 (and in ppc64 with older gcc versions). Instead, in ftrace, we store LR in the LR save area of the previous stackframe, and create a minimal stackframe to represent the function being traced. 
With this, backtraces now capture the function being traced: /sys/kernel/debug/tracing # cat stack_trace DepthSize Location(17 entries) - 0) 3888 32 _raw_spin_trylock+0x8/0x70 1) 3856 576 get_page_from_freelist+0x26c/0x1ad0 2) 3280 64 __alloc_pages+0x290/0x1280 3) 3216 336 __folio_alloc+0x34/0x90 4) 2880 176 vma_alloc_folio+0xd8/0x540 5) 2704 416 __handle_mm_fault+0x700/0x1cc0 6) 2288 96 handle_mm_fault+0xf0/0x3f0 7) 2192 48 ___do_page_fault+0x3e4/0xbe0 8) 2144 192 do_page_fault+0x30/0xc0 9) 1952 608 data_access_common_virt+0x210/0x220 10) 1344 16 0xc000334bbb50 11) 1328 416 load_elf_binary+0x804/0x1b80 12) 912 64 bprm_execve+0x2d8/0x7e0 13) 848 176 do_execveat_common+0x1d0/0x2f0 14) 672 192 sys_execve+0x54/0x70 15) 480 64 system_call_exception+0x138/0x350 16) 416 416 system_call_common+0x160/0x2c4 This results in two additional stores in the ftrace entry code, but produces reliable backtraces. Note that this change now aligns with other architectures (arm64, s390, x86). Signed-off-by: Naveen N Rao --- This applies atop the below RFC patchset: http://lore.kernel.org/cover.1686151854.git.nav...@kernel.org - Naveen arch/powerpc/kernel/trace/ftrace.c | 8 +--- arch/powerpc/kernel/trace/ftrace_entry.S | 11 --- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index a03aa4fd7a08ba..3f090f037d8ef5 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -229,13 +229,15 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) /* Expected sequence: 'mflr r0', 'stw r0,4(r1)', 'bl _mcount' */ ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); if (!ret) - ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4))); + ret = ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4)), +ppc_inst(PPC_RAW_NOP())); } else if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) { /* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl _mcount' 
*/ ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_MFLR(_R0))); if (ret) { - ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16))); - ret |= ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); + ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); + ret |= ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16)), + ppc_inst(PPC_RAW_NOP())); } } else { return -EINVAL;
[PATCH] powerpc/ftrace: Fix dropping weak symbols with older toolchains
The minimum level of gcc supported for building the kernel is v5.1. v5.x releases of gcc emitted a three instruction sequence for -mprofile-kernel: mflrr0 std r0, 16(r1) bl _mcount It is only with the v6.x releases that gcc started emitting the two instruction sequence for -mprofile-kernel, omitting the second store instruction. With the older three instruction sequence, the actual ftrace location can be the 5th instruction into a function. Update the allowed offset for ftrace location from 12 to 16 to accommodate the same. Cc: sta...@vger.kernel.org Fixes: 7af82ff90a2b06 ("powerpc/ftrace: Ignore weak functions") Signed-off-by: Naveen N Rao --- arch/powerpc/include/asm/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 91c049d51d0e10..2edc6269b1a357 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -12,7 +12,7 @@ /* Ignore unused weak functions which will have larger offsets */ #ifdef CONFIG_MPROFILE_KERNEL -#define FTRACE_MCOUNT_MAX_OFFSET 12 +#define FTRACE_MCOUNT_MAX_OFFSET 16 #elif defined(CONFIG_PPC32) #define FTRACE_MCOUNT_MAX_OFFSET 8 #endif base-commit: bd517a8442b6c6646a136421cd4c1b95bf4ce32b -- 2.40.1
[PATCH v2 2/2] powerpc/tpm: Reserve SML log when kexec'ing with kexec_file_load()
The TPM code in prom_init.c creates a small buffer of memory to store the TPM's SML (Stored Measurement Log). It's communicated to Linux via the linux,sml-base/size device tree properties of the TPM node. When kexec'ing that buffer can be overwritten, or when kdump'ing it may not be mapped by the second kernel. The latter can lead to a crash when booting the second kernel such as: tpm_ibmvtpm 7103: CRQ initialization completed BUG: Unable to handle kernel data access on read at 0xc0002ffb Faulting instruction address: 0xc000200a70e0 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc2-00134-g9307ce092f5d #314 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1200 0xf05 of:SLOF,git-5b4c5a pSeries NIP: c000200a70e0 LR: c000203dd5dc CTR: 0800 REGS: c00024543280 TRAP: 0300 Not tainted (6.2.0-rc2-00134-g9307ce092f5d) MSR: 82009033 CR: 24002280 XER: 0006 CFAR: c000200a70c8 DAR: c0002ffb DSISR: 4000 IRQMASK: 0 ... NIP memcpy_power7+0x400/0x7d0 LR kmemdup+0x5c/0x80 Call Trace: memcpy_power7+0x274/0x7d0 (unreliable) kmemdup+0x5c/0x80 tpm_read_log_of+0xe8/0x1b0 tpm_bios_log_setup+0x60/0x210 tpm_chip_register+0x134/0x320 tpm_ibmvtpm_probe+0x520/0x7d0 vio_bus_probe+0x9c/0x460 really_probe+0x104/0x420 __driver_probe_device+0xb0/0x170 driver_probe_device+0x58/0x180 __driver_attach+0xd8/0x250 bus_for_each_dev+0xb4/0x140 driver_attach+0x34/0x50 bus_add_driver+0x1e8/0x2d0 driver_register+0xb4/0x1c0 __vio_register_driver+0x74/0x9c ibmvtpm_module_init+0x34/0x48 do_one_initcall+0x80/0x320 kernel_init_freeable+0x304/0x3ac kernel_init+0x30/0x1a0 ret_from_kernel_thread+0x5c/0x64 To fix the crash, add the SML region to the usable memory areas for the kdump kernel, so that the second kernel will map the region. To avoid corruption of the region, add the region to the reserved memory areas, so that the second kernel does not use the memory for something else. 
Note that when loading a kdump kernel with the regular kexec_load() syscall the SML may be overwritten by the kdump kernel, depending on where the SML is in memory in relation to the crashkernel region. That is a separate problem that is not solved by this patch. Fixes: a0458284f062 ("powerpc: Add support code for kexec_file_load()") Reported-by: Stefan Berger Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kexec_ranges.h | 1 + arch/powerpc/kexec/file_load_64.c | 12 arch/powerpc/kexec/ranges.c | 20 3 files changed, 33 insertions(+) v2: Add fixes tag as suggested by Jarkko. Make change log clearer that this only fixes kexec_file_load(). diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index f83866a19e87..cf6a97f9113d 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -21,5 +21,6 @@ int add_kernel_mem_range(struct crash_mem **mem_ranges); int add_rtas_mem_range(struct crash_mem **mem_ranges); int add_opal_mem_range(struct crash_mem **mem_ranges); int add_reserved_mem_ranges(struct crash_mem **mem_ranges); +int add_sml_mem_range(struct crash_mem **mem_ranges); #endif /* _ASM_POWERPC_KEXEC_RANGES_H */ diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 110d28bede2a..90c10a89fcbc 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -79,6 +79,10 @@ static int get_exclude_memory_ranges(struct crash_mem **mem_ranges) if (ret) goto out; + ret = add_sml_mem_range(mem_ranges); + if (ret) + goto out; + ret = add_opal_mem_range(mem_ranges); if (ret) goto out; @@ -122,6 +126,10 @@ static int get_usable_memory_ranges(struct crash_mem **mem_ranges) if (ret) goto out; + ret = add_sml_mem_range(mem_ranges); + if (ret) + goto out; + ret = add_opal_mem_range(mem_ranges); if (ret) goto out; @@ -225,6 +233,10 @@ static int get_reserved_memory_ranges(struct crash_mem **mem_ranges) if (ret) goto out; + ret = 
add_sml_mem_range(mem_ranges); + if (ret) + goto out; + ret = add_tce_mem_ranges(mem_ranges); if (ret) goto out; diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c index 5fc53a5fcfdf..8b01655ceb5e 100644 --- a/arch/powerpc/kexec/ranges.c +++ b/arch/powerpc/kexec/ranges.c @@ -350,6 +350,26 @@ int add_rtas_mem_range(struct crash_mem **mem_ranges) return ret; } +int add_sml_mem_range(struct crash_mem **mem_ranges) +{ + struct
[PATCH v2 1/2] powerpc/tpm: Create linux,sml-base/size as big endian
There's code in prom_instantiate_sml() to do a "SML handover" (Stored Measurement Log) from OF to Linux, before Linux shuts down Open Firmware. This involves creating a buffer to hold the SML, and creating two device tree properties to record its base address and size. The kernel then later reads those properties from the device tree to find the SML. When the code was initially added in commit 4a727429abec ("PPC64: Add support for instantiating SML from Open Firmware") the powerpc kernel was always built big endian, so the properties were created big endian by default. However since then little endian support was added to powerpc, and now the code lacks conversions to big endian when creating the properties. This means on little endian kernels the device tree properties are little endian, which is contrary to the device tree spec, and in contrast to all other device tree properties. To cope with that a workaround was added in tpm_read_log_of() to skip the endian conversion if the properties were created via the SML handover. A better solution is to encode the properties as big endian as they should be, and remove the workaround. Typically changing the encoding of a property like this would present problems for kexec. However the SML is not propagated across kexec, so changing the encoding of the properties is a non-issue. Fixes: e46e22f12b19 ("tpm: enhance read_log_of() to support Physical TPM event log") Signed-off-by: Michael Ellerman Reviewed-by: Stefan Berger --- arch/powerpc/kernel/prom_init.c | 8 ++-- drivers/char/tpm/eventlog/of.c | 23 --- 2 files changed, 10 insertions(+), 21 deletions(-) v2: Add Stefan's reviewed-by. 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index d464ba412084..72fe306b6820 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1900,6 +1900,7 @@ static void __init prom_instantiate_sml(void) u32 entry = 0, size = 0, succ = 0; u64 base; __be32 val; + __be64 val64; prom_debug("prom_instantiate_sml: start...\n"); @@ -1956,10 +1957,13 @@ static void __init prom_instantiate_sml(void) reserve_mem(base, size); + val64 = cpu_to_be64(base); prom_setprop(ibmvtpm_node, "/vdevice/vtpm", "linux,sml-base", -, sizeof(base)); +, sizeof(val64)); + + val = cpu_to_be32(size); prom_setprop(ibmvtpm_node, "/vdevice/vtpm", "linux,sml-size", -, sizeof(size)); +, sizeof(val)); prom_debug("sml base = 0x%llx\n", base); prom_debug("sml size = 0x%x\n", size); diff --git a/drivers/char/tpm/eventlog/of.c b/drivers/char/tpm/eventlog/of.c index 930fe43d5daf..0bc0cb6333c6 100644 --- a/drivers/char/tpm/eventlog/of.c +++ b/drivers/char/tpm/eventlog/of.c @@ -51,8 +51,8 @@ static int tpm_read_log_memory_region(struct tpm_chip *chip) int tpm_read_log_of(struct tpm_chip *chip) { struct device_node *np; - const u32 *sizep; - const u64 *basep; + const __be32 *sizep; + const __be64 *basep; struct tpm_bios_log *log; u32 size; u64 base; @@ -73,23 +73,8 @@ int tpm_read_log_of(struct tpm_chip *chip) if (sizep == NULL || basep == NULL) return -EIO; - /* -* For both vtpm/tpm, firmware has log addr and log size in big -* endian format. But in case of vtpm, there is a method called -* sml-handover which is run during kernel init even before -* device tree is setup. This sml-handover function takes care -* of endianness and writes to sml-base and sml-size in little -* endian format. For this reason, vtpm doesn't need conversion -* but physical tpm needs the conversion. 
-*/ - if (of_property_match_string(np, "compatible", "IBM,vtpm") < 0 && - of_property_match_string(np, "compatible", "IBM,vtpm20") < 0) { - size = be32_to_cpup((__force __be32 *)sizep); - base = be64_to_cpup((__force __be64 *)basep); - } else { - size = *sizep; - base = *basep; - } + size = be32_to_cpup(sizep); + base = be64_to_cpup(basep); if (size == 0) { dev_warn(>dev, "%s: Event log area empty\n", __func__); -- 2.40.1
Re: [6.4-rc6] Crash during a kexec operation (tpm_amd_is_rng_defective)
[CCing the regression list, as it should be in the loop for regressions: https://docs.kernel.org/admin-guide/reporting-regressions.html] [TLDR: I'm adding this report to the list of tracked Linux kernel regressions; the text you find below is based on a few templates paragraphs you might have encountered already in similar form. See link in footer if these mails annoy you.] On 14.06.23 17:12, Sachin Sant wrote: > Following crash is observed during a kexec operation on > IBM Power10 server: > > [ 34.381548] Kernel attempted to read user page (50) - exploit attempt? (uid: > 0) > [ 34.381562] BUG: Kernel NULL pointer dereference on read at 0x0050 > [ 34.381565] Faulting instruction address: 0xc09db1e4 > [ 34.381569] Oops: Kernel access of bad area, sig: 11 [#1] > [ 34.381572] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries > [ 34.381576] Modules linked in: dm_mod(E) nft_fib_inet(E) nft_fib_ipv4(E) > nft_fib_ipv6(E) nft_fib(E) nft_reject_inet(E) nf_reject_ipv4(E) > nf_reject_ipv6(E) nft_reject(E) nft_ct(E) nft_chain_nat(E) nf_nat(E) > nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) bonding(E) tls(E) > rfkill(E) ip_set(E) sunrpc(E) nf_tables(E) nfnetlink(E) pseries_rng(E) > aes_gcm_p10_crypto(E) drm(E) drm_panel_orientation_quirks(E) xfs(E) > libcrc32c(E) sd_mod(E) sr_mod(E) t10_pi(E) crc64_rocksoft_generic(E) cdrom(E) > crc64_rocksoft(E) crc64(E) sg(E) ibmvscsi(E) scsi_transport_srp(E) ibmveth(E) > vmx_crypto(E) fuse(E) > [ 34.381613] CPU: 18 PID: 5918 Comm: kexec Kdump: loaded Tainted: G E > 6.4.0-rc6-00037-gb6dad5178cea #3 > [ 34.381618] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf06 > of:IBM,FW1030.20 (NH1030_058) hv:phyp pSeries > [ 34.381621] NIP: c09db1e4 LR: c09db928 CTR: c09eab60 > [ 34.381625] REGS: c0009742f780 TRAP: 0300 Tainted: G E > (6.4.0-rc6-00037-gb6dad5178cea) > [ 34.381628] MSR: 8280b033 CR: > 4444 XER: 0001 > [ 34.381638] CFAR: c09db19c DAR: 0050 DSISR: 4000 > IRQMASK: 0 > [ 34.381638] GPR00: c09db928 c0009742fa20 c14a1500 > 
c81d > [ 34.381638] GPR04: cd842c50 cd842c50 0025 > fffe > [ 34.381638] GPR08: 0009 > c00800785280 > [ 34.381638] GPR12: c09eab60 c0135fab7f00 > > [ 34.381638] GPR16: > > [ 34.381638] GPR20: > > [ 34.381638] GPR24: > c2e21e08 > [ 34.381638] GPR28: cd842c48 c2a02208 c321c0c0 > c81d > [ 34.381674] NIP [c09db1e4] tpm_amd_is_rng_defective+0x74/0x240 > [ 34.381681] LR [c09db928] tpm_chip_unregister+0x138/0x160 > [ 34.381685] Call Trace: > [ 34.381686] [c0009742faa0] [c09db928] > tpm_chip_unregister+0x138/0x160 > [ 34.381690] [c0009742fae0] [c09eab94] > tpm_ibmvtpm_remove+0x34/0x130 > [ 34.381695] [c0009742fb50] [c0115738] vio_bus_remove+0x58/0xd0 > [ 34.381701] [c0009742fb90] [c0a01ecc] device_shutdown+0x21c/0x39c > [ 34.381705] [c0009742fc20] [c01a2684] > kernel_restart_prepare+0x54/0x70 > [ 34.381710] [c0009742fc40] [c0292c48] kernel_kexec+0xa8/0x100 > [ 34.381714] [c0009742fcb0] [c01a2cd4] __do_sys_reboot+0x214/0x2c0 > [ 34.381718] [c0009742fe10] [c0034adc] > system_call_exception+0x13c/0x340 > [ 34.381723] [c0009742fe50] [c000d05c] > system_call_vectored_common+0x15c/0x2ec > [ 34.381729] --- interrupt: 3000 at 0x7fff9c5459f0 > [ 34.381732] NIP: 7fff9c5459f0 LR: CTR: > [ 34.381735] REGS: c0009742fe80 TRAP: 3000 Tainted: G E > (6.4.0-rc6-00037-gb6dad5178cea) > [ 34.381738] MSR: 8280f033 CR: > 42422884 XER: > [ 34.381747] IRQMASK: 0 > [ 34.381747] GPR00: 0058 7ad83d70 00012fc47f00 > fee1dead > [ 34.381747] GPR04: 28121969 45584543 > 0003 > [ 34.381747] GPR08: 0010 > > [ 34.381747] GPR12: 7fff9c7bb2c0 00012fc3f598 > > [ 34.381747] GPR16: 00012fc1fcc0 > > [ 34.381747] GPR20: 8913 8914 00014b891020 > 0003 > [ 34.381747] GPR24: 0001 0003 > 7ad83ef0 > [ 34.381747] GPR28: 00012fc19f10 7fff9c6419c0 00014b891080 > 00014b891040 > [ 34.381781] NIP [7fff9c5459f0] 0x7fff9c5459f0 > [ 34.381784] LR [] 0x0 > [ 34.381786] --- interrupt: 3000 > [ 34.381788] Code: 5463063e 408201c8 38210080 4e800020
Re: [RFC PATCH v2 1/1] powerpc: update ppc_save_regs to save current r1 in pt_regs
On Thu Jun 15, 2023 at 7:10 PM AEST, Aditya Gupta wrote: > ppc_save_regs() skips one stack frame while saving the CPU register states. > Instead of saving current R1, it pulls the previous stack frame pointer. > > When vmcores caused by direct panic call (such as `echo c > > /proc/sysrq-trigger`), are debugged with gdb, gdb fails to show the > backtrace correctly. On further analysis, it was found that it was because > of mismatch between r1 and NIP. > > GDB uses NIP to get current function symbol and uses corresponding debug > info of that function to unwind previous frames, but due to the > mismatching r1 and NIP, the unwinding does not work, and it fails to > unwind to the 2nd frame and hence does not show the backtrace. > > GDB backtrace with vmcore of kernel without this patch: > > - > (gdb) bt > #0 0xc02a53e8 in crash_setup_regs (oldregs=, > newregs=0xc4f8f8d8) at ./arch/powerpc/include/asm/kexec.h:69 > #1 __crash_kexec (regs=) at kernel/kexec_core.c:974 > #2 0x0063 in ?? () > #3 0xc3579320 in ?? () > - > > Further analysis revealed that the mismatch occurred because > "ppc_save_regs" was saving the previous stack's SP instead of the current > r1. This patch fixes this by storing current r1 in the saved pt_regs. 
> > GDB backtrace with vmcore of patched kernel: > > > (gdb) bt > #0 0xc02a53e8 in crash_setup_regs (oldregs=0x0, > newregs=0xc670b8d8) > at ./arch/powerpc/include/asm/kexec.h:69 > #1 __crash_kexec (regs=regs@entry=0x0) at kernel/kexec_core.c:974 > #2 0xc0168918 in panic (fmt=fmt@entry=0xc1654a60 "sysrq > triggered crash\n") > at kernel/panic.c:358 > #3 0xc0b735f8 in sysrq_handle_crash (key=) at > drivers/tty/sysrq.c:155 > #4 0xc0b742cc in __handle_sysrq (key=key@entry=99, > check_mask=check_mask@entry=false) > at drivers/tty/sysrq.c:602 > #5 0xc0b7506c in write_sysrq_trigger (file=, > buf=, > count=2, ppos=) at drivers/tty/sysrq.c:1163 > #6 0xc069a7bc in pde_write (ppos=, count= out>, > buf=, file=, pde=0xc362cb40) at > fs/proc/inode.c:340 > #7 proc_reg_write (file=, buf=, > count=, > ppos=) at fs/proc/inode.c:352 > #8 0xc05b3bbc in vfs_write (file=file@entry=0xc6aa6b00, > buf=buf@entry=0x61f498b4f60 0x61f498b4f60>, > count=count@entry=2, pos=pos@entry=0xc670bda0) at > fs/read_write.c:582 > #9 0xc05b4264 in ksys_write (fd=, > buf=0x61f498b4f60 , > count=2) > at fs/read_write.c:637 > #10 0xc002ea2c in system_call_exception (regs=0xc670be80, > r0=) > at arch/powerpc/kernel/syscall.c:171 > #11 0xc000c270 in system_call_vectored_common () > at arch/powerpc/kernel/interrupt_64.S:192 > > > Signed-off-by: Aditya Gupta So this now saves regs as though it was an interrupt taken in the caller, at the instruction after the call to ppc_save_regs, whereas previously the NIP was there, but R1 came from the caller's caller and that mismatch is what causes gdb's dwarf unwinder to go haywire. Nice catch, and I think I follow the fix and I think I agree with it. Before the bug was introduced, NIP also came from the grandparent. Which is what xmon presumably wanted, but since then I think maybe it makes more sense to just have the parent caller. Reviewed-by: Nicholas Piggin Fixes: d16a58f8854b1 ("powerpc: Improve ppc_save_regs()") Thanks, Nick
[RFC PATCH v2 1/1] powerpc: update ppc_save_regs to save current r1 in pt_regs
ppc_save_regs() skips one stack frame while saving the CPU register states. Instead of saving current R1, it pulls the previous stack frame pointer. When vmcores caused by direct panic call (such as `echo c > /proc/sysrq-trigger`), are debugged with gdb, gdb fails to show the backtrace correctly. On further analysis, it was found that it was because of mismatch between r1 and NIP. GDB uses NIP to get current function symbol and uses corresponding debug info of that function to unwind previous frames, but due to the mismatching r1 and NIP, the unwinding does not work, and it fails to unwind to the 2nd frame and hence does not show the backtrace. GDB backtrace with vmcore of kernel without this patch: - (gdb) bt #0 0xc02a53e8 in crash_setup_regs (oldregs=, newregs=0xc4f8f8d8) at ./arch/powerpc/include/asm/kexec.h:69 #1 __crash_kexec (regs=) at kernel/kexec_core.c:974 #2 0x0063 in ?? () #3 0xc3579320 in ?? () - Further analysis revealed that the mismatch occurred because "ppc_save_regs" was saving the previous stack's SP instead of the current r1. This patch fixes this by storing current r1 in the saved pt_regs. 
GDB backtrace with vmcore of patched kernel: (gdb) bt #0 0xc02a53e8 in crash_setup_regs (oldregs=0x0, newregs=0xc670b8d8) at ./arch/powerpc/include/asm/kexec.h:69 #1 __crash_kexec (regs=regs@entry=0x0) at kernel/kexec_core.c:974 #2 0xc0168918 in panic (fmt=fmt@entry=0xc1654a60 "sysrq triggered crash\n") at kernel/panic.c:358 #3 0xc0b735f8 in sysrq_handle_crash (key=) at drivers/tty/sysrq.c:155 #4 0xc0b742cc in __handle_sysrq (key=key@entry=99, check_mask=check_mask@entry=false) at drivers/tty/sysrq.c:602 #5 0xc0b7506c in write_sysrq_trigger (file=, buf=, count=2, ppos=) at drivers/tty/sysrq.c:1163 #6 0xc069a7bc in pde_write (ppos=, count=, buf=, file=, pde=0xc362cb40) at fs/proc/inode.c:340 #7 proc_reg_write (file=, buf=, count=, ppos=) at fs/proc/inode.c:352 #8 0xc05b3bbc in vfs_write (file=file@entry=0xc6aa6b00, buf=buf@entry=0x61f498b4f60 , count=count@entry=2, pos=pos@entry=0xc670bda0) at fs/read_write.c:582 #9 0xc05b4264 in ksys_write (fd=, buf=0x61f498b4f60 , count=2) at fs/read_write.c:637 #10 0xc002ea2c in system_call_exception (regs=0xc670be80, r0=) at arch/powerpc/kernel/syscall.c:171 #11 0xc000c270 in system_call_vectored_common () at arch/powerpc/kernel/interrupt_64.S:192 Signed-off-by: Aditya Gupta --- More information: This problem with gdb backtrace was discovered while working on a crash tool enhancement to improve crash analysis using gdb passthrough to be able print function arguments and local variables inside crash tool. gdb passthrough simply asks gdb to handle the backtrace printing, where it was noticed that it could not print correct backtrace in some vmcores. The changes introduced here has an implication on xmon, that it might show one extra `xmon` frame in backtrace. By looking at older commits it seems that originally the ppc_save_regs function was introduced as xmon_save_regs(). But now the same function has been renamed to ppc_save_regs() and been used in few other places as well. Tested this patch with multiple ways of crashing: 1. 
direct panic call (`echo c > /proc/sysrq-trigger`) 2. null dereference/oops path (the earlier implementation of `sysrq_handle_crash`) 3. sys reset 4. sys reset inside qemu (to test for any regressions, that were fixed by commit d16a58f8854b19) Changelog - V2: - fixed bogus LR by storing caller's LR area as pointed out by Naveen and Nick --- arch/powerpc/kernel/ppc_save_regs.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index 49813f982468..a9b9c32d0c1f 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -31,10 +31,10 @@ _GLOBAL(ppc_save_regs) lbz r0,PACAIRQSOFTMASK(r13) PPC_STL r0,SOFTE(r3) #endif - /* go up one stack frame for SP */ - PPC_LL r4,0(r1) - PPC_STL r4,GPR1(r3) + /* store current SP */ + PPC_STL r1,GPR1(r3) /* get caller's LR */ + PPC_LL r4,0(r1) PPC_LL r0,LRSAVE(r4) PPC_STL r0,_LINK(r3) mflr r0 -- 2.40.1
Re: [PATCH v4 1/1] PCI: layerscape: Add the endpoint linkup notifier support
On Mon, 15 May 2023 11:10:49 -0400, Frank Li wrote: > Layerscape has PME interrupt, which can be used as linkup notifier. > Set CFG_READY bit of PEX_PF0_CONFIG to enable accesses from root complex > when linkup detected. > > Applied to controller/endpoint, thanks! [1/1] PCI: layerscape: Add the endpoint linkup notifier support https://git.kernel.org/pci/pci/c/71d9ca28015c Thanks, Lorenzo
Re: [PATCH v4 1/1] PCI: layerscape: Add the endpoint linkup notifier support
On Mon, May 15, 2023 at 11:10:49AM -0400, Frank Li wrote: > Layerscape has PME interrupt, which can be used as linkup notifier. > Set CFG_READY bit of PEX_PF0_CONFIG to enable accesses from root complex > when linkup detected. > > Acked-by: Manivannan Sadhasivam > Signed-off-by: Xiaowei Bao > Signed-off-by: Frank Li > --- > Change from v3 to v4 > - swap irq and big_endian > Change from v2 to v3 > - align 80 column > - clear irq firstly > - dev_info to dev_dbg > - remove double space > - update commit message > > Change from v1 to v2 > - pme -> PME > - irq -> IRQ > - update dev_info message according to Bjorn's suggestion > > .../pci/controller/dwc/pci-layerscape-ep.c| 102 +- > 1 file changed, 101 insertions(+), 1 deletion(-) > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > index c640db60edc6..5398641b6b7e 100644 > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > @@ -18,6 +18,20 @@ > > #include "pcie-designware.h" > > +#define PEX_PF0_CONFIG 0xC0014 > +#define PEX_PF0_CFG_READYBIT(0) > + > +/* PEX PFa PCIE PME and message interrupt registers*/ > +#define PEX_PF0_PME_MES_DR 0xC0020 > +#define PEX_PF0_PME_MES_DR_LUD BIT(7) > +#define PEX_PF0_PME_MES_DR_LDD BIT(9) > +#define PEX_PF0_PME_MES_DR_HRD BIT(10) > + > +#define PEX_PF0_PME_MES_IER 0xC0028 > +#define PEX_PF0_PME_MES_IER_LUDIEBIT(7) > +#define PEX_PF0_PME_MES_IER_LDDIEBIT(9) > +#define PEX_PF0_PME_MES_IER_HRDIEBIT(10) > + > #define to_ls_pcie_ep(x) dev_get_drvdata((x)->dev) > > struct ls_pcie_ep_drvdata { > @@ -30,8 +44,86 @@ struct ls_pcie_ep { > struct dw_pcie *pci; > struct pci_epc_features *ls_epc; > const struct ls_pcie_ep_drvdata *drvdata; > + int irq; > + boolbig_endian; > }; > > +static u32 ls_lut_readl(struct ls_pcie_ep *pcie, u32 offset) > +{ > + struct dw_pcie *pci = pcie->pci; > + > + if (pcie->big_endian) > + return ioread32be(pci->dbi_base + offset); > + else > + return 
ioread32(pci->dbi_base + offset); > +} > + > +static void ls_lut_writel(struct ls_pcie_ep *pcie, u32 offset, u32 value) > +{ > + struct dw_pcie *pci = pcie->pci; > + > + if (pcie->big_endian) > + iowrite32be(value, pci->dbi_base + offset); > + else > + iowrite32(value, pci->dbi_base + offset); > +} > + > +static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) > +{ > + struct ls_pcie_ep *pcie = dev_id; > + struct dw_pcie *pci = pcie->pci; > + u32 val, cfg; > + > + val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > + ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > + > + if (!val) > + return IRQ_NONE; > + > + if (val & PEX_PF0_PME_MES_DR_LUD) { > + cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > + cfg |= PEX_PF0_CFG_READY; > + ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > + dw_pcie_ep_linkup(>ep); > + > + dev_dbg(pci->dev, "Link up\n"); > + } else if (val & PEX_PF0_PME_MES_DR_LDD) { > + dev_dbg(pci->dev, "Link down\n"); > + } else if (val & PEX_PF0_PME_MES_DR_HRD) { > + dev_dbg(pci->dev, "Hot reset\n"); > + } > + > + return IRQ_HANDLED; > +} > + > +static int ls_pcie_ep_interrupt_init(struct ls_pcie_ep *pcie, > + struct platform_device *pdev) > +{ > + u32 val; > + int ret; > + > + pcie->irq = platform_get_irq_byname(pdev, "pme"); > + if (pcie->irq < 0) { > + dev_err(>dev, "Can't get 'pme' IRQ\n"); I will remove this log, platform_get_irq_byname() already logs an error. 
Lorenzo > + return pcie->irq; > + } > + > + ret = devm_request_irq(>dev, pcie->irq, ls_pcie_ep_event_handler, > +IRQF_SHARED, pdev->name, pcie); > + if (ret) { > + dev_err(>dev, "Can't register PCIe IRQ\n"); > + return ret; > + } > + > + /* Enable interrupts */ > + val = ls_lut_readl(pcie, PEX_PF0_PME_MES_IER); > + val |= PEX_PF0_PME_MES_IER_LDDIE | PEX_PF0_PME_MES_IER_HRDIE | > + PEX_PF0_PME_MES_IER_LUDIE; > + ls_lut_writel(pcie, PEX_PF0_PME_MES_IER, val); > + > + return 0; > +} > + > static const struct pci_epc_features* > ls_pcie_ep_get_features(struct dw_pcie_ep *ep) > { > @@ -125,6 +217,7 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > struct ls_pcie_ep *pcie; > struct pci_epc_features *ls_epc; > struct resource *dbi_base; > + int ret; > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > if (!pcie) > @@ -144,6 +237,7 @@ static int __init ls_pcie_ep_probe(struct
Re: [kvm-unit-tests v4 00/12] powerpc: updates, P10, PNV support
On Thu, 15 Jun 2023 at 03:02, Nicholas Piggin wrote: > > On Wed Jun 14, 2023 at 11:09 AM AEST, Joel Stanley wrote: > > On Thu, 8 Jun 2023 at 07:58, Nicholas Piggin wrote: > > > > > > Posting again, a couple of patches were merged and accounted for review > > > comments from last time. > > > > I saw some failures in the spr tests running on a power9 powernv system: > > > > $ TESTNAME=sprs TIMEOUT=90s ACCEL= ./powerpc/run powerpc/sprs.elf -smp > > 1 |grep FAIL > > FAIL: WORT ( 895):0xc0deba80 <==> 0x > > This is just TCG machine? I'm not sure why WORT fails, AFAIKS it's the > same on POWER8 and doesn't do anything just a simple register. I think > on real hardware WORT may not have any bits implemented on POWER9 > though. Yeah, just the TCG machine. Now that you point it out all of the failures I reported are for ACCEL=, so they are running in tcg mode. run_migration timeout -k 1s --foreground 90s /usr/bin/qemu-system-ppc64 -nodefaults -machine pseries,accel=tcg -bios powerpc/boot_rom.bin -display none -serial stdio -kernel powerpc/sprs.elf -smp 1 -append -w # -initrd /tmp/tmp.61XhbvCGcb > > > $ MIGRATION=yes TESTNAME=sprs-migration TIMEOUT=90s ACCEL= > > ./powerpc/run powerpc/sprs.elf -smp 1 -append '-w' | grep FAIL > > FAIL: SRR0 ( 26):0xcafefacec0debabc <==> 0x00402244 > > FAIL: SRR1 ( 27):0xc006409ebab6 <==> 0x80001001 > > FAIL: CTRL ( 136):0x <==> 0x8001 > > FAIL: WORT ( 895):0xc0deba80 <==> 0x > > FAIL: PIR (1023):0x0010 <==> 0x0049 > > > > Linux 6.2.0-20-generic > > QEMU emulator version 7.2.0 (Debian 1:7.2+dfsg-5ubuntu2) > > > > On a power8 powernv: > > > > MIGRATION=yes TESTNAME=sprs-migration TIMEOUT=90s ACCEL= ./powerpc/run > > powerpc/sprs.elf -smp 1 -append '-w' |grep FAIL > > FAIL: SRR0 ( 26):0xcafefacec0debabc <==> 0x00402234 > > FAIL: SRR1 ( 27):0xc006409ebab6 <==> 0x80001000 > > FAIL: CTRL ( 136):0x <==> 0x8001 > > FAIL: PIR (1023):0x0060 <==> 0x0030 > > Hmm, seems we take some interrupt over migration test that is not > accounted for (could check 
the address in SRR0 to see where it is). > Either need to prevent that interrupt or avoid failing on SRR0/1 on > this test. > > Interesting about CTRL, I wonder if that not migrating correctly. > PIR looks like a migration issue as well, it can't be changed so > destination CPU has got a different PIR. I would be inclined to > leave those as failing to remind us to look into them. > > I'll take a look at the others though. > > Thanks, > Nick
Re: [PATCH v4 04/34] pgtable: Create struct ptdesc
On Mon, 12 Jun 2023, Vishal Moola (Oracle) wrote: > Currently, page table information is stored within struct page. As part > of simplifying struct page, create struct ptdesc for page table > information. > > Signed-off-by: Vishal Moola (Oracle) Vishal, as I think you have already guessed, your ptdesc series and my pte_free_defer() "mm: free retracted page table by RCU" series are on a collision course. Probably just trivial collisions in most architectures, which either of us can easily adjust to the other; powerpc likely to be more awkward, but fairly easily resolved; s390 quite a problem. I've so far been unable to post a v2 of my series (and powerpc and s390 were stupidly wrong in the v1), because a good s390 patch is not yet decided - Gerald Schaefer and I are currently working on that, on the s390 list (I took off most Ccs until we are settled and I can post v2). As you have no doubt found yourself, s390 has sophisticated handling of free half-pages already, and I need to add rcu_head usage in there too: it's tricky to squeeze it all in, and ptdesc does not appear to help us in any way (though mostly it's just changing some field names, okay). If ptdesc were actually allowing a flexible structure which architectures could add into, that would (in some future) be nice; but of course at present it's still fitting it all into one struct page, and mandating new restrictions which just make an architecture's job harder. Some notes on problematic fields below FYI. > --- > include/linux/pgtable.h | 51 + > 1 file changed, 51 insertions(+) > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > index c5a51481bbb9..330de96ebfd6 100644 > --- a/include/linux/pgtable.h > +++ b/include/linux/pgtable.h > @@ -975,6 +975,57 @@ static inline void ptep_modify_prot_commit(struct > vm_area_struct *vma, > #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ > #endif /* CONFIG_MMU */ > > + > +/** > + * struct ptdesc - Memory descriptor for page tables. 
> + * @__page_flags: Same as page flags. Unused for page tables. > + * @pt_list: List of used page tables. Used for s390 and x86. > + * @_pt_pad_1: Padding that aliases with page's compound head. > + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. > + * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only. > + * @pt_mm: Used for x86 pgds. > + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 > only. > + * @ptl: Lock for the page table. > + * > + * This struct overlays struct page for now. Do not modify without a good > + * understanding of the issues. > + */ > +struct ptdesc { > + unsigned long __page_flags; > + > + union { > + struct list_head pt_list; I shall be needing struct rcu_head rcu_head (or pt_rcu_head or whatever, if you prefer) in this union too. Sharing the lru or pt_list with rcu_head is what's difficult to get right and efficient on s390 - and if ptdesc gave us an independent rcu_head for each page table, that would be a blessing! but sadly not, it still has to squeeze into a struct page. > + struct { > + unsigned long _pt_pad_1; > + pgtable_t pmd_huge_pte; > + }; > + }; > + unsigned long _pt_s390_gaddr; > + > + union { > + struct mm_struct *pt_mm; > + atomic_t pt_frag_refcount; Whether s390 will want pt_mm is not yet decided: I want to use it, Gerald prefers to go without it; but if we do end up using it, then pt_frag_refcount is a luxury we would have to give up. s390 does very well already with its _refcount tricks, and I'd expect powerpc's simpler but more wasteful implementation to work as well with _refcount too - I know that a few years back, powerpc did misuse _refcount (it did not allow for speculative accesses, thought it had sole ownership of that field); but s390 copes well with that, and I expect powerpc can do so too, without the luxury of pt_frag_refcount. But I've no desire to undo powerpc's use of pt_frag_refcount: just warning that we may want to undo any use of it in s390. 
I thought I had more issues to mention, probably Gerald will remind me of a whole new unexplored dimension! gmap perhaps. Hugh > + }; > + > +#if ALLOC_SPLIT_PTLOCKS > + spinlock_t *ptl; > +#else > + spinlock_t ptl; > +#endif > +}; > + > +#define TABLE_MATCH(pg, pt) \ > + static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) > +TABLE_MATCH(flags, __page_flags); > +TABLE_MATCH(compound_head, pt_list); > +TABLE_MATCH(compound_head, _pt_pad_1); > +TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); > +TABLE_MATCH(mapping, _pt_s390_gaddr); > +TABLE_MATCH(pt_mm, pt_mm); > +TABLE_MATCH(ptl, ptl); > +#undef TABLE_MATCH > +static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); > + > /* > * No-op macros that just return the current protection value. Defined here
[PATCH 2/2] tools/perf/tests: perf all metricgroups test fails when perf_event access is restricted
Perf all metricgroups test fails as below when perf_event access is restricted. ./perf test -v "perf all metricgroups test" Testing Memory_BW Error: Access to performance monitoring and observability operations is limited. Enforced MAC policy settings (SELinux) can limit access to performance — access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. — test child finished with -1 end perf all metricgroups test: FAILED! Fix the testcase to skip those metric events which need perf_event access explicitly. The exit code of the testcase is based on return code of the perf stat command ( enabled by set -e option ). Hence save the exit status in a variable and use that to decide success or fail for the testcase. Signed-off-by: Athira Rajeev --- tools/perf/tests/shell/stat_all_metricgroups.sh | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh b/tools/perf/tests/shell/stat_all_metricgroups.sh index cb35e488809a..fa86b562676a 100755 --- a/tools/perf/tests/shell/stat_all_metricgroups.sh +++ b/tools/perf/tests/shell/stat_all_metricgroups.sh @@ -2,11 +2,19 @@ # perf all metricgroups test # SPDX-License-Identifier: GPL-2.0 -set -e - for m in $(perf list --raw-dump metricgroups); do echo "Testing $m" - perf stat -M "$m" -a true + result=$(perf stat -M "$m" -a true 2>&1) + rc=$? + # Skip if there is no access to perf_events monitoring + # Otherwise exit based on the return code of perf command. + if [[ "$result" =~ "Access to performance monitoring and observability operations is limited" ]] + then + continue + else + [ $rc -ne 0 ] && exit $rc + fi + done exit 0 -- 2.31.1
[PATCH 1/2] tools/perf/tests: perf all metrics test fails when perf_event access is restricted
Perf all metrics test fails as below when perf_event access is restricted. ./perf test -v "perf all metrics test" Metric 'Memory_RD_BW_Chip' not printed in: Error: Access to performance monitoring and observability operations is limited. Enforced MAC policy settings (SELinux) can limit access to performance — access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. — test child finished with -1 end perf all metrics test: FAILED! The perf all metrics test picks the input events from "perf list --raw-dump metrics" and runs "perf stat -M "$m"" for each of the metrics in the list. It fails here for some of the metrics which needs access, since it collects system wide resource details/statistics. Fix the testcase to skip those metric events. Signed-off-by: Athira Rajeev --- tools/perf/tests/shell/stat_all_metrics.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/stat_all_metrics.sh b/tools/perf/tests/shell/stat_all_metrics.sh index 54774525e18a..14b96484a359 100755 --- a/tools/perf/tests/shell/stat_all_metrics.sh +++ b/tools/perf/tests/shell/stat_all_metrics.sh @@ -6,7 +6,9 @@ err=0 for m in $(perf list --raw-dump metrics); do echo "Testing $m" result=$(perf stat -M "$m" true 2>&1) - if [[ "$result" =~ ${m:0:50} ]] || [[ "$result" =~ "" ]] + # Skip if there is no access to perf_events monitoring + # and observability operations + if [[ "$result" =~ ${m:0:50} ]] || [[ "$result" =~ "" ]] || [[ "$result" =~ "Access to performance monitoring and observability operations is limited" ]] then continue fi -- 2.31.1