[PATCH v3 11/11] selftests/powerpc/dexcr: Add DEXCR status utility lsdexcr
Add a utility 'lsdexcr' to print the current DEXCR status. Useful for quickly checking the status such as when debugging test failures or verifying the new default DEXCR does what you want (for userspace at least). Example output: # ./lsdexcr uDEXCR: 0400 (NPHIE) HDEXCR: Effective: 0400 (NPHIE) SBHE (0): clear (Speculative branch hint enable) IBRTPD (3): clear (Indirect branch recurrent target ...) SRAPD (4): clear (Subroutine return address ...) NPHIE * (5): set (Non-privileged hash instruction enable) PHIE (6): clear (Privileged hash instruction enable) DEXCR[NPHIE] enabled: hashst/hashchk working Signed-off-by: Benjamin Gray --- v1: * Report if hashst/hashchk actually does something --- .../selftests/powerpc/dexcr/.gitignore| 1 + .../testing/selftests/powerpc/dexcr/Makefile | 2 + .../testing/selftests/powerpc/dexcr/lsdexcr.c | 141 ++ 3 files changed, 144 insertions(+) create mode 100644 tools/testing/selftests/powerpc/dexcr/lsdexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index d12e4560aca9..b82f45dd46b9 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1 +1,2 @@ hashchk_test +lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 16c8b489948a..76210f2bcec3 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,7 +1,9 @@ TEST_GEN_PROGS := hashchk_test +TEST_GEN_FILES := lsdexcr include ../../lib.mk $(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c +$(TEST_GEN_FILES): ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c new file mode 100644 index ..94abbfcc389e --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ 
-0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +static unsigned int dexcr; +static unsigned int hdexcr; +static unsigned int effective; + +struct dexcr_aspect { + const char *name; + const char *desc; + unsigned int index; +}; + +static const struct dexcr_aspect aspects[] = { + { + .name = "SBHE", + .desc = "Speculative branch hint enable", + .index = 0, + }, + { + .name = "IBRTPD", + .desc = "Indirect branch recurrent target prediction disable", + .index = 3, + }, + { + .name = "SRAPD", + .desc = "Subroutine return address prediction disable", + .index = 4, + }, + { + .name = "NPHIE", + .desc = "Non-privileged hash instruction enable", + .index = 5, + }, + { + .name = "PHIE", + .desc = "Privileged hash instruction enable", + .index = 6, + }, +}; + +static void print_list(const char *list[], size_t len) +{ + for (size_t i = 0; i < len; i++) { + printf("%s", list[i]); + if (i + 1 < len) + printf(", "); + } +} + +static void print_dexcr(char *name, unsigned int bits) +{ + const char *enabled_aspects[ARRAY_SIZE(aspects) + 1] = {NULL}; + size_t j = 0; + + printf("%s: %08x", name, bits); + + if (bits == 0) { + printf("\n"); + return; + } + + for (size_t i = 0; i < ARRAY_SIZE(aspects); i++) { + unsigned int mask = DEXCR_PR_BIT(aspects[i].index); + + if (bits & mask) { + enabled_aspects[j++] = aspects[i].name; + bits &= ~mask; + } + } + + if (bits) + enabled_aspects[j++] = "unknown"; + + printf(" ("); + print_list(enabled_aspects, j); + printf(")\n"); +} + +static void print_aspect(const struct dexcr_aspect *aspect) +{ + const char *attributes[8] = {NULL}; + size_t j = 0; + unsigned long mask; + + mask = DEXCR_PR_BIT(aspect->index); + if (dexcr & mask) + attributes[j++] = "set"; + if (hdexcr & mask) + attributes[j++] = "set (hypervisor)"; + if (!(effective & mask)) + attributes[j++] = "clear"; + + printf("%12s %c (%d): ", aspect->name, effective & mask ? 
'*' : ' ', aspect->index); + print_list(attributes, j); + printf(" \t(%s)\n", aspect->desc); +} + +int main(int argc, char *argv[]) +{ + if (!dexcr_exists()) { +
[PATCH v3 10/11] selftests/powerpc/dexcr: Add hashst/hashchk test
Test the kernel DEXCR[NPHIE] interface and hashchk exception handling. Introduces with it a DEXCR utils library for common DEXCR operations. Volatile is used to prevent the compiler optimising away the signal tests. Signed-off-by: Benjamin Gray --- v1: * Clean up dexcr makefile * Include kernel headers in CFLAGS * Use numeric literals for hashst/hashchk to support older toolchains * A lot of other refactoring --- tools/testing/selftests/powerpc/Makefile | 1 + .../selftests/powerpc/dexcr/.gitignore| 1 + .../testing/selftests/powerpc/dexcr/Makefile | 7 + tools/testing/selftests/powerpc/dexcr/dexcr.c | 132 ++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 49 .../selftests/powerpc/dexcr/hashchk_test.c| 227 ++ tools/testing/selftests/powerpc/include/reg.h | 4 + .../testing/selftests/powerpc/include/utils.h | 4 + tools/testing/selftests/powerpc/utils.c | 24 ++ 9 files changed, 449 insertions(+) create mode 100644 tools/testing/selftests/powerpc/dexcr/.gitignore create mode 100644 tools/testing/selftests/powerpc/dexcr/Makefile create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.c create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.h create mode 100644 tools/testing/selftests/powerpc/dexcr/hashchk_test.c diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index ae2bfc0d822f..49f2ad1793fd 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -17,6 +17,7 @@ SUB_DIRS = alignment \ benchmarks \ cache_shape \ copyloops\ + dexcr\ dscr \ mm \ nx-gzip \ diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore new file mode 100644 index ..d12e4560aca9 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -0,0 +1 @@ +hashchk_test diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile new file mode 100644 index ..16c8b489948a --- /dev/null 
+++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -0,0 +1,7 @@ +TEST_GEN_PROGS := hashchk_test + +include ../../lib.mk + +$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) + +$(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.c b/tools/testing/selftests/powerpc/dexcr/dexcr.c new file mode 100644 index ..65ec5347de98 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0+ + +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "reg.h" +#include "utils.h" + +static jmp_buf generic_signal_jump_buf; + +static void generic_signal_handler(int signum, siginfo_t *info, void *context) +{ + longjmp(generic_signal_jump_buf, 0); +} + +bool dexcr_exists(void) +{ + struct sigaction old; + volatile bool exists; + + old = push_signal_handler(SIGILL, generic_signal_handler); + if (setjmp(generic_signal_jump_buf)) + goto out; + + /* +* If the SPR is not recognised by the hardware it triggers +* a hypervisor emulation interrupt. If the kernel does not +* recognise/try to emulate it, we receive a SIGILL signal. +* +* If we do not receive a signal, assume we have the SPR or the +* kernel is trying to emulate it correctly. +*/ + exists = false; + mfspr(SPRN_DEXCR_RO); + exists = true; + +out: + pop_signal_handler(SIGILL, old); + return exists; +} + +/* + * Just test if a bad hashchk triggers a signal, without checking + * for support or if the NPHIE aspect is enabled. 
+ */ +bool hashchk_triggers(void) +{ + struct sigaction old; + volatile bool triggers; + + old = push_signal_handler(SIGILL, generic_signal_handler); + if (setjmp(generic_signal_jump_buf)) + goto out; + + triggers = true; + do_bad_hashchk(); + triggers = false; + +out: + pop_signal_handler(SIGILL, old); + return triggers; +} + +unsigned int get_dexcr(enum dexcr_source source) +{ + switch (source) { + case DEXCR: + return mfspr(SPRN_DEXCR_RO); + case HDEXCR: + return mfspr(SPRN_HDEXCR_RO); + case EFFECTIVE: + return mfspr(SPRN_DEXCR_RO) | mfspr(SPRN_HDEXCR_RO); + default: + FAIL_IF_EXIT_MSG(true, "bad enum dexcr_source"); + } +} + +void await_child_success(pid_t pid) +{ + int wstatus; + + FAIL_IF_EXIT_MSG(pid == -1, "fork failed"); + FAIL_IF_EXIT_MSG(waitpid(pid, , 0) == -1,
[PATCH v3 09/11] selftests/powerpc: Add more utility macros
Adds _MSG assertion variants to provide more context behind why a failure occurred. Also include unistd.h for _exit() and stdio.h for fprintf(), and move ARRAY_SIZE macro to utils.h. The _MSG variants and ARRAY_SIZE will be used by the following DEXCR selftests. Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Reword commit message * Add ruscur reviewed-by v1: * Remove the signal handler variants * Describe why headers are included --- .../testing/selftests/powerpc/include/utils.h | 27 ++- .../powerpc/pmu/sampling_tests/misc.h | 2 -- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h index 44bfd48b93d6..9dc53c4fbfe3 100644 --- a/tools/testing/selftests/powerpc/include/utils.h +++ b/tools/testing/selftests/powerpc/include/utils.h @@ -9,11 +9,17 @@ #define __cacheline_aligned __attribute__((aligned(128))) #include +#include #include #include #include #include #include "reg.h" +#include + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif /* Avoid headaches with PRI?64 - just use %ll? 
always */ typedef unsigned long long u64; @@ -67,7 +73,6 @@ struct perf_event_read { }; #if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 30) -#include #include static inline pid_t gettid(void) @@ -116,6 +121,16 @@ do { \ } \ } while (0) +#define FAIL_IF_MSG(x, msg)\ +do { \ + if ((x)) { \ + fprintf(stderr, \ + "[FAIL] Test FAILED on line %d: %s\n", \ + __LINE__, msg); \ + return 1; \ + } \ +} while (0) + #define FAIL_IF_EXIT(x)\ do { \ if ((x)) { \ @@ -125,6 +140,16 @@ do { \ } \ } while (0) +#define FAIL_IF_EXIT_MSG(x, msg) \ +do { \ + if ((x)) { \ + fprintf(stderr, \ + "[FAIL] Test FAILED on line %d: %s\n", \ + __LINE__, msg); \ + _exit(1); \ + } \ +} while (0) + /* The test harness uses this, yes it's gross */ #define MAGIC_SKIP_RETURN_VALUE99 diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h index 4181755cf5a0..64e25cce1435 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/misc.h @@ -18,8 +18,6 @@ #define MMCR1_RSQ 0x2000ULL /* radix scope qual field */ #define BHRB_DISABLE0x20ULL /* MMCRA BHRB DISABLE bit */ -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - extern int ev_mask_pmcxsel, ev_shift_pmcxsel; extern int ev_mask_marked, ev_shift_marked; extern int ev_mask_comb, ev_shift_comb; -- 2.40.1
[PATCH v3 08/11] Documentation: Document PowerPC kernel DEXCR interface
Describe the DEXCR and document how to configure it. Signed-off-by: Benjamin Gray --- v3: * Revise (H)DEXCR width to 64 bits * Revise configuration section v2: * Document coredump & ptrace support v1: * Remove the dynamic control docs, describe the static config option This documentation is a little bare for now, but will be expanded on when dynamic DEXCR control is added. --- Documentation/powerpc/dexcr.rst | 58 + Documentation/powerpc/index.rst | 1 + 2 files changed, 59 insertions(+) create mode 100644 Documentation/powerpc/dexcr.rst diff --git a/Documentation/powerpc/dexcr.rst b/Documentation/powerpc/dexcr.rst new file mode 100644 index ..615a631f51fa --- /dev/null +++ b/Documentation/powerpc/dexcr.rst @@ -0,0 +1,58 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +== +DEXCR (Dynamic Execution Control Register) +== + +Overview + + +The DEXCR is a privileged special purpose register (SPR) introduced in +PowerPC ISA 3.1B (Power10) that allows per-cpu control over several dynamic +execution behaviours. These behaviours include speculation (e.g., indirect +branch target prediction) and enabling return-oriented programming (ROP) +protection instructions. + +The execution control is exposed in hardware as up to 32 bits ('aspects') in +the DEXCR. Each aspect controls a certain behaviour, and can be set or cleared +to enable/disable the aspect. There are several variants of the DEXCR for +different purposes: + +DEXCR +A privileged SPR that can control aspects for userspace and kernel space +HDEXCR +A hypervisor-privileged SPR that can control aspects for the hypervisor and +enforce aspects for the kernel and userspace. +UDEXCR +An optional ultravisor-privileged SPR that can control aspects for the ultravisor. + +Userspace can examine the current DEXCR state using a dedicated SPR that +provides a non-privileged read-only view of the userspace DEXCR aspects. 
+There is also an SPR that provides a read-only view of the hypervisor enforced +aspects, which ORed with the userspace DEXCR view gives the effective DEXCR +state for a process. + + +Configuration += + +The DEXCR is currently unconfigurable. All threads are run with the +NPHIE aspect enabled. + + +coredump and ptrace +=== + +The userspace values of the DEXCR and HDEXCR (in this order) are exposed under +``NT_PPC_DEXCR``. These are each 64 bits and readonly, and are intended to +assist with core dumps. The DEXCR may be made writable in future. The top 32 +bits of both registers (corresponding to the non-userspace bits) are masked off. + +If the kernel config ``CONFIG_CHECKPOINT_RESTORE`` is enabled, then +``NT_PPC_HASHKEYR`` is available and exposes the HASHKEYR value of the process +for reading and writing. This is a tradeoff between increased security and +checkpoint/restore support: a process should normally have no need to know its +secret key, but restoring a process requires setting its original key. The key +therefore appears in core dumps, and an attacker may be able to retrieve it from +a coredump and effectively bypass ROP protection on any threads that share this +key (potentially all threads from the same parent that have not run ``exec()``). diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index 85e80e30160b..d33b554ca7ba 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -15,6 +15,7 @@ powerpc cxl cxlflash dawr-power9 +dexcr dscr eeh-pci-error-recovery elf_hwcaps -- 2.40.1
[PATCH v3 07/11] powerpc/ptrace: Expose HASHKEYR register to ptrace
The HASHKEYR register contains a secret per-process key to enable unique hashes per process. In general it should not be exposed to userspace at all and a regular process has no need to know its key. However, checkpoint restore in userspace (CRIU) functionality requires that a process be able to set the HASHKEYR of another process, otherwise existing hashes on the stack would be invalidated by a new random key. Exposing HASHKEYR in this way also makes it appear in core dumps, which is a security concern. Multiple threads may share a key, for example just after a fork() call, where the kernel cannot know if the child is going to return back along the parent's stack. If such a thread is coerced into making a core dump, then the HASHKEYR value will be readable and able to be used against all other threads sharing that key, effectively undoing any protection offered by hashst/hashchk. Therefore we expose HASHKEYR to ptrace when CONFIG_CHECKPOINT_RESTORE is enabled, providing a choice of increased security or migratable ROP protected processes. This is similar to how ARM exposes its PAC keys. 
Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by v2: * New in v2 --- arch/powerpc/include/uapi/asm/elf.h | 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 3 ++ arch/powerpc/kernel/ptrace/ptrace-view.c | 36 include/uapi/linux/elf.h | 1 + 4 files changed, 41 insertions(+) diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index e0d323c808dd..a5377f494fa3 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -99,6 +99,7 @@ #define ELF_NPMU 5 /* includes siar, sdar, sier, mmcr2, mmcr0 */ #define ELF_NPKEY 3 /* includes amr, iamr, uamor */ #define ELF_NDEXCR 2 /* includes dexcr, hdexcr */ +#define ELF_NHASHKEYR 1 /* includes hashkeyr */ typedef unsigned long elf_greg_t64; typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG]; diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 998a84f64804..4171a5727197 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -58,6 +58,9 @@ enum powerpc_regset { REGSET_EBB, /* EBB registers */ REGSET_PMR, /* Performance Monitor Registers */ REGSET_DEXCR, /* DEXCR registers */ +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_HASHKEYR,/* HASHKEYR register */ +#endif #endif #ifdef CONFIG_PPC_MEM_KEYS REGSET_PKEY,/* AMR register */ diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 87a46edb3efb..600fa72c2934 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -483,6 +483,35 @@ static int dexcr_get(struct task_struct *target, const struct user_regset *regse return membuf_store(, (unsigned long)(mfspr(SPRN_HDEXCR_RO) & 0x)); } +#ifdef CONFIG_CHECKPOINT_RESTORE +static int hashkeyr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + 
+static int hashkeyr_get(struct task_struct *target, const struct user_regset *regset, + struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return membuf_store(, target->thread.hashkeyr); +} + +static int hashkeyr_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, const void *kbuf, + const void __user *ubuf) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return user_regset_copyin(, , , , >thread.hashkeyr, + 0, sizeof(unsigned long)); +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_MEM_KEYS @@ -649,6 +678,13 @@ static const struct user_regset native_regsets[] = { .size = sizeof(u64), .align = sizeof(u64), .active = dexcr_active, .regset_get = dexcr_get }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_HASHKEYR] = { + .core_note_type = NT_PPC_HASHKEYR, .n = ELF_NHASHKEYR, + .size = sizeof(u64), .align = sizeof(u64), + .active = hashkeyr_active, .regset_get = hashkeyr_get, .set = hashkeyr_set + }, +#endif #endif #ifdef CONFIG_PPC_MEM_KEYS [REGSET_PKEY] = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index cfa31f1eb5d7..b705b301d88f 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -404,6 +404,7 @@ typedef struct elf64_shdr { #define NT_PPC_TM_CDSCR0x10f
[PATCH v3 06/11] powerpc/ptrace: Expose DEXCR and HDEXCR registers to ptrace
The DEXCR register is of interest when ptracing processes. Currently it is static, but eventually will be dynamically controllable by a process. If a process can control its own, then it is useful for it to be ptrace-able to (e.g., for checkpoint-restore functionality). It is also relevant to core dumps (the NPHIE aspect in particular), which use the ptrace mechanism (or is it the other way around?) to decide what to dump. The HDEXCR is useful here too, as the NPHIE aspect may be set in the HDEXCR without being set in the DEXCR. Although the HDEXCR is per-cpu and we don't track it in the task struct (it's useless in normal operation), it would be difficult to imagine why a hypervisor would set it to different values within a guest. A hypervisor cannot safely set NPHIE differently at least, as that would break programs. Expose a read-only view of the userspace DEXCR and HDEXCR to ptrace. The HDEXCR is always readonly, and is useful for diagnosing the core dumps (as the HDEXCR may set NPHIE without the DEXCR setting it). 
Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by * Expose values as 64 bits * DEXCR is no longer a config v2: * New in v2 --- arch/powerpc/include/uapi/asm/elf.h | 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 1 + arch/powerpc/kernel/ptrace/ptrace-view.c | 36 +++- include/uapi/linux/elf.h | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index dbc4a5b8d02d..e0d323c808dd 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -98,6 +98,7 @@ #define ELF_NEBB 3 /* includes ebbrr, ebbhr, bescr */ #define ELF_NPMU 5 /* includes siar, sdar, sier, mmcr2, mmcr0 */ #define ELF_NPKEY 3 /* includes amr, iamr, uamor */ +#define ELF_NDEXCR 2 /* includes dexcr, hdexcr */ typedef unsigned long elf_greg_t64; typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG]; diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index 463a63eb8cc7..998a84f64804 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -57,6 +57,7 @@ enum powerpc_regset { REGSET_TAR, /* TAR register */ REGSET_EBB, /* EBB registers */ REGSET_PMR, /* Performance Monitor Registers */ + REGSET_DEXCR, /* DEXCR registers */ #endif #ifdef CONFIG_PPC_MEM_KEYS REGSET_PKEY,/* AMR register */ diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 5fff0d04b23f..87a46edb3efb 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -454,7 +454,36 @@ static int pmu_set(struct task_struct *target, const struct user_regset *regset, 5 * sizeof(unsigned long)); return ret; } -#endif + +static int dexcr_active(struct task_struct *target, const struct user_regset *regset) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + return regset->n; +} + +static int dexcr_get(struct 
task_struct *target, const struct user_regset *regset, +struct membuf to) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_31)) + return -ENODEV; + + /* +* The DEXCR is currently static across all CPUs, so we don't +* store the target's value anywhere, but the static value +* will also be correct. +*/ + membuf_store(, (unsigned long)(DEXCR_INIT & 0x)); + + /* +* Technically the HDEXCR is per-cpu, but a hypervisor can't reasonably +* change it between CPUs of the same guest. +*/ + return membuf_store(, (unsigned long)(mfspr(SPRN_HDEXCR_RO) & 0x)); +} + +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_MEM_KEYS static int pkey_active(struct task_struct *target, const struct user_regset *regset) @@ -615,6 +644,11 @@ static const struct user_regset native_regsets[] = { .size = sizeof(u64), .align = sizeof(u64), .active = pmu_active, .regset_get = pmu_get, .set = pmu_set }, + [REGSET_DEXCR] = { + .core_note_type = NT_PPC_DEXCR, .n = ELF_NDEXCR, + .size = sizeof(u64), .align = sizeof(u64), + .active = dexcr_active, .regset_get = dexcr_get + }, #endif #ifdef CONFIG_PPC_MEM_KEYS [REGSET_PKEY] = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index ac3da855fb19..cfa31f1eb5d7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -403,6 +403,7 @@ typedef struct elf64_shdr { #define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */
[PATCH v3 05/11] powerpc/dexcr: Support userspace ROP protection
The ISA 3.1B hashst and hashchk instructions use a per-cpu SPR HASHKEYR to hold a key used in the hash calculation. This key should be different for each process to make it harder for a malicious process to recreate valid hash values for a victim process. Add support for storing a per-thread hash key, and setting/clearing HASHKEYR appropriately. Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by v1: * Guard HASHKEYR update behind change check * HASHKEYR reset moved earlier to patch 2 --- arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/kernel/process.c| 17 + 2 files changed, 18 insertions(+) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index e96c9b8c2a60..8a6754ffdc7e 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -264,6 +264,7 @@ struct thread_struct { unsigned long mmcr3; unsigned long sier2; unsigned long sier3; + unsigned long hashkeyr; #endif }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 1fefafb2b29b..b68898ac07e1 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1182,6 +1182,9 @@ static inline void save_sprs(struct thread_struct *t) */ t->tar = mfspr(SPRN_TAR); } + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) + t->hashkeyr = mfspr(SPRN_HASHKEYR); #endif } @@ -1260,6 +1263,10 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (cpu_has_feature(CPU_FTR_P9_TIDR) && old_thread->tidr != new_thread->tidr) mtspr(SPRN_TIDR, new_thread->tidr); + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE) && + old_thread->hashkeyr != new_thread->hashkeyr) + mtspr(SPRN_HASHKEYR, new_thread->hashkeyr); #endif } @@ -1867,6 +1874,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) } p->thread.tidr = 0; +#endif +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) + p->thread.hashkeyr = current->thread.hashkeyr; 
#endif return 0; } @@ -1984,6 +1995,12 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) current->thread.tm_tfiar = 0; current->thread.load_tm = 0; #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) { + current->thread.hashkeyr = get_random_long(); + mtspr(SPRN_HASHKEYR, current->thread.hashkeyr); + } +#endif /* CONFIG_PPC_BOOK3S_64 */ } EXPORT_SYMBOL(start_thread); -- 2.40.1
[PATCH v3 04/11] powerpc/dexcr: Handle hashchk exception
Recognise and pass the appropriate signal to the user program when a hashchk instruction triggers. This is independent of allowing configuration of DEXCR[NPHIE], as a hypervisor can enforce this aspect regardless of the kernel. The signal mirrors how ARM reports their similar check failure. For example, their FPAC handler in arch/arm64/kernel/traps.c do_el0_fpac() does this. When we fail to read the instruction that caused the fault we send a segfault, similar to how emulate_math() does it. Signed-off-by: Benjamin Gray --- v3: * Inline hashchk detection, remove dexcr.c and associated files v1: * Refactor the hashchk check to return 0 on success, an error code on failure. Determine what to do based on specific error code. * Motivate signal and code --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/kernel/traps.c | 16 2 files changed, 17 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index ca5a0da7df4e..ef6972aa33b9 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -222,6 +222,7 @@ #define OP_31_XOP_STFSX663 #define OP_31_XOP_STFSUX695 #define OP_31_XOP_STFDX 727 +#define OP_31_XOP_HASHCHK 754 #define OP_31_XOP_STFDUX759 #define OP_31_XOP_LHBRX 790 #define OP_31_XOP_LFIWAX855 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 9bdd79aa51cf..e59ec6d32d37 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1516,6 +1516,22 @@ static void do_program_check(struct pt_regs *regs) return; } } + + if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE) && user_mode(regs)) { + ppc_inst_t insn; + + if (get_user_instr(insn, (void __user *)regs->nip)) { + _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); + return; + } + + if (ppc_inst_primary_opcode(insn) == 31 && + get_xop(ppc_inst_val(insn)) == OP_31_XOP_HASHCHK) { + _exception(SIGILL, regs, ILL_ILLOPN, regs->nip); + return; + } + } + _exception(SIGTRAP, regs, TRAP_BRKPT, 
regs->nip); return; } -- 2.40.1
[PATCH v3 03/11] powerpc/dexcr: Add initial Dynamic Execution Control Register (DEXCR) support
ISA 3.1B introduces the Dynamic Execution Control Register (DEXCR). It is a per-cpu register that allows control over various CPU behaviours including branch hint usage, indirect branch speculation, and hashst/hashchk support. Add some definitions and basic support for the DEXCR in the kernel. Right now it just * Initialises the DEXCR and HASHKEYR to a fixed value when a CPU onlines. * Clears them in reset_sprs(). * Detects when the NPHIE aspect is supported (the others don't get looked at in this series, so there's no need to waste a CPU_FTR on them). We initialise the HASHKEYR to ensure that all cores have the same key, so an HV enforced NPHIE + swapping cores doesn't randomly crash a process using hash instructions. The stores to HASHKEYR are unconditional because the ISA makes no mention of the SPR being missing if support for doing the hashes isn't present. So all that would happen is the HASHKEYR value gets ignored. This helps slightly if NPHIE detection fails; e.g., we currently only detect it on pseries. Signed-off-by: Benjamin Gray --- v3: * Initialise DEXCR with NPHIE enabled v2: * More definitions for ptrace v1: * Only make a CPU feature for NPHIE. We only need to know if the hashst/hashchk functionality is supported for a static DEXCR. * Initialise the DEXCR to 0 when each CPU comes online. Remove the dexcr_init() and get_thread_dexcr() functions. * No longer track the DEXCR in a per-thread field. 
* Remove the made-up Opal features --- arch/powerpc/include/asm/book3s/64/kexec.h | 5 + arch/powerpc/include/asm/cputable.h| 4 +++- arch/powerpc/include/asm/reg.h | 11 +++ arch/powerpc/kernel/cpu_setup_power.c | 8 arch/powerpc/kernel/prom.c | 1 + 5 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/kexec.h b/arch/powerpc/include/asm/book3s/64/kexec.h index d4b9d476ecba..df37a76c1e9f 100644 --- a/arch/powerpc/include/asm/book3s/64/kexec.h +++ b/arch/powerpc/include/asm/book3s/64/kexec.h @@ -21,6 +21,11 @@ static inline void reset_sprs(void) plpar_set_ciabr(0); } + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + mtspr(SPRN_DEXCR, 0); + mtspr(SPRN_HASHKEYR, 0); + } + /* Do we need isync()? We are going via a kexec reset */ isync(); } diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 757dbded11dc..443a9d482b15 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -192,6 +192,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_P9_RADIX_PREFETCH_BUG LONG_ASM_CONST(0x0002) #define CPU_FTR_ARCH_31 LONG_ASM_CONST(0x0004) #define CPU_FTR_DAWR1 LONG_ASM_CONST(0x0008) +#define CPU_FTR_DEXCR_NPHIELONG_ASM_CONST(0x0010) #ifndef __ASSEMBLY__ @@ -451,7 +452,8 @@ static inline void cpu_feature_keys_init(void) { } CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_ARCH_207S | \ CPU_FTR_ARCH_300 | CPU_FTR_ARCH_31 | \ - CPU_FTR_DAWR | CPU_FTR_DAWR1) + CPU_FTR_DAWR | CPU_FTR_DAWR1 | \ + CPU_FTR_DEXCR_NPHIE) #define CPU_FTRS_CELL (CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 6372e5f55ef0..ed6acb0f17e7 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -382,7 +382,18 @@ #define SPRN_HIOR 0x137 /* 970 Hypervisor interrupt 
offset */ #define SPRN_RMOR 0x138 /* Real mode offset register */ #define SPRN_HRMOR 0x139 /* Real mode offset register */ +#define SPRN_HDEXCR_RO 0x1C7 /* Hypervisor DEXCR (non-privileged, readonly) */ +#define SPRN_HASHKEYR 0x1D4 /* Non-privileged hashst/hashchk key register */ +#define SPRN_HDEXCR0x1D7 /* Hypervisor dynamic execution control register */ +#define SPRN_DEXCR_RO 0x32C /* DEXCR (non-privileged, readonly) */ #define SPRN_ASDR 0x330 /* Access segment descriptor register */ +#define SPRN_DEXCR 0x33C /* Dynamic execution control register */ +#define DEXCR_PR_BIT(aspect) PPC_BIT(32 + (aspect)) +#define DEXCR_PR_SBHEDEXCR_PR_BIT(0) /* Speculative Branch Hint Enable */ +#define DEXCR_PR_IBRTPD DEXCR_PR_BIT(3) /* Indirect Branch Recurrent Target Prediction Disable */ +#define DEXCR_PR_SRAPD DEXCR_PR_BIT(4) /* Subroutine Return Address Prediction Disable */ +#define DEXCR_PR_NPHIE DEXCR_PR_BIT(5) /* Non-Privileged Hash Instruction Enable */ +#define DEXCR_INIT DEXCR_PR_NPHIE /* Fixed
[PATCH v3 02/11] powerpc/ptrace: Add missing include
ptrace-decl.h uses user_regset_get2_fn (among other things) from regset.h. While all current users of ptrace-decl.h include regset.h before it anyway, it adds an implicit ordering dependency and breaks source tooling that tries to inspect ptrace-decl.h by itself. Signed-off-by: Benjamin Gray Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by v2: * New in v2 --- arch/powerpc/kernel/ptrace/ptrace-decl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h b/arch/powerpc/kernel/ptrace/ptrace-decl.h index eafe5f0f6289..463a63eb8cc7 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-decl.h +++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ +#include <linux/regset.h> + /* * Set of msr bits that gdb can change on behalf of a process. */ -- 2.40.1
[PATCH v3 01/11] powerpc/book3s: Add missing include
The functions here use struct task_struct fields, so need to import the full definition from <linux/sched.h>. The header that defines current only forward declares struct task_struct. Failing to include this header leads to a compilation error when a translation unit does not also include <linux/sched.h> indirectly. Signed-off-by: Benjamin Gray Reviewed-by: Nicholas Piggin Reviewed-by: Russell Currey --- v3: * Add ruscur reviewed-by * thread_struct -> task_struct v1: * Add npiggin reviewed-by --- arch/powerpc/include/asm/book3s/64/kup.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index 54cf46808157..84c09e546115 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -194,6 +194,7 @@ #else /* !__ASSEMBLY__ */ #include <linux/jump_label.h> +#include <linux/sched.h> DECLARE_STATIC_KEY_FALSE(uaccess_flush_key); -- 2.40.1
[PATCH v3 00/11] Add static DEXCR support
v3: * Expose (H)DEXCR in ptrace as 64 bits * Remove build config for DEXCR, always enable NPHIE * Fix up documentation to reflect this * Some commit message fixes Previous versions: v2: https://lore.kernel.org/all/20230330055040.434133-1-bg...@linux.ibm.com/ v1: https://lore.kernel.org/all/20230322054612.1340573-1-bg...@linux.ibm.com/ RFC: https://lore.kernel.org/all/20221128024458.46121-1-bg...@linux.ibm.com/ Benjamin Gray (11): powerpc/book3s: Add missing include powerpc/ptrace: Add missing include powerpc/dexcr: Add initial Dynamic Execution Control Register (DEXCR) support powerpc/dexcr: Handle hashchk exception powerpc/dexcr: Support userspace ROP protection powerpc/ptrace: Expose DEXCR and HDEXCR registers to ptrace powerpc/ptrace: Expose HASHKEYR register to ptrace Documentation: Document PowerPC kernel DEXCR interface selftests/powerpc: Add more utility macros selftests/powerpc/dexcr: Add hashst/hashchk test selftests/powerpc/dexcr: Add DEXCR status utility lsdexcr Documentation/powerpc/dexcr.rst | 58 + Documentation/powerpc/index.rst | 1 + arch/powerpc/include/asm/book3s/64/kexec.h| 5 + arch/powerpc/include/asm/book3s/64/kup.h | 1 + arch/powerpc/include/asm/cputable.h | 4 +- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/include/asm/reg.h| 11 + arch/powerpc/include/uapi/asm/elf.h | 2 + arch/powerpc/kernel/cpu_setup_power.c | 8 + arch/powerpc/kernel/process.c | 17 ++ arch/powerpc/kernel/prom.c| 1 + arch/powerpc/kernel/ptrace/ptrace-decl.h | 6 + arch/powerpc/kernel/ptrace/ptrace-view.c | 72 +- arch/powerpc/kernel/traps.c | 16 ++ include/uapi/linux/elf.h | 2 + tools/testing/selftests/powerpc/Makefile | 1 + .../selftests/powerpc/dexcr/.gitignore| 2 + .../testing/selftests/powerpc/dexcr/Makefile | 9 + tools/testing/selftests/powerpc/dexcr/dexcr.c | 132 ++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 49 .../selftests/powerpc/dexcr/hashchk_test.c| 227 ++ .../testing/selftests/powerpc/dexcr/lsdexcr.c | 141 
+++ tools/testing/selftests/powerpc/include/reg.h | 4 + .../testing/selftests/powerpc/include/utils.h | 31 ++- .../powerpc/pmu/sampling_tests/misc.h | 2 - tools/testing/selftests/powerpc/utils.c | 24 ++ 27 files changed, 823 insertions(+), 5 deletions(-) create mode 100644 Documentation/powerpc/dexcr.rst create mode 100644 tools/testing/selftests/powerpc/dexcr/.gitignore create mode 100644 tools/testing/selftests/powerpc/dexcr/Makefile create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.c create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr.h create mode 100644 tools/testing/selftests/powerpc/dexcr/hashchk_test.c create mode 100644 tools/testing/selftests/powerpc/dexcr/lsdexcr.c -- 2.40.1
[powerpc:next] BUILD SUCCESS 4adc95a3edc152ffc3b0c88117767a31c9c01a1e
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next branch HEAD: 4adc95a3edc152ffc3b0c88117767a31c9c01a1e powerpc: update ppc_save_regs to save current r1 in pt_regs elapsed time: 726m configs tested: 179 configs skipped: 13 The following configs have been built successfully. More configs may be tested in the coming days. tested configs: alphaallyesconfig gcc alphabuildonly-randconfig-r004-20230615 gcc alpha defconfig gcc alpharandconfig-r002-20230615 gcc alpharandconfig-r011-20230614 gcc alpharandconfig-r013-20230614 gcc alpharandconfig-r025-20230615 gcc alpharandconfig-r026-20230615 gcc arc allyesconfig gcc arc defconfig gcc arc randconfig-r003-20230615 gcc arc randconfig-r034-20230615 gcc arc randconfig-r043-20230614 gcc arc randconfig-r043-20230615 gcc arm allmodconfig gcc arm allyesconfig gcc arm axm55xx_defconfig gcc arm defconfig gcc arm footbridge_defconfig gcc arm randconfig-r046-20230614 clang arm randconfig-r046-20230615 gcc arm64allyesconfig gcc arm64 defconfig gcc arm64randconfig-r033-20230615 gcc arm64randconfig-r036-20230615 gcc csky buildonly-randconfig-r003-20230614 gcc cskydefconfig gcc csky randconfig-r003-20230615 gcc csky randconfig-r034-20230615 gcc hexagon randconfig-r015-20230614 clang hexagon randconfig-r025-20230615 clang hexagon randconfig-r035-20230615 clang hexagon randconfig-r041-20230614 clang hexagon randconfig-r041-20230615 clang hexagon randconfig-r045-20230614 clang hexagon randconfig-r045-20230615 clang i386 allyesconfig gcc i386 debian-10.3 gcc i386defconfig gcc i386 randconfig-i001-20230614 clang i386 randconfig-i001-20230615 gcc i386 randconfig-i002-20230614 clang i386 randconfig-i002-20230615 gcc i386 randconfig-i003-20230614 clang i386 randconfig-i003-20230615 gcc i386 randconfig-i004-20230614 clang i386 randconfig-i004-20230615 gcc i386 randconfig-i005-20230614 clang i386 randconfig-i005-20230615 gcc i386 randconfig-i006-20230614 clang i386 randconfig-i006-20230615 gcc i386 
randconfig-i011-20230614 gcc i386 randconfig-i011-20230615 clang i386 randconfig-i012-20230614 gcc i386 randconfig-i012-20230615 clang i386 randconfig-i013-20230614 gcc i386 randconfig-i013-20230615 clang i386 randconfig-i014-20230614 gcc i386 randconfig-i014-20230615 clang i386 randconfig-i015-20230614 gcc i386 randconfig-i015-20230615 clang i386 randconfig-i016-20230614 gcc i386 randconfig-i016-20230615 clang i386 randconfig-r014-20230614 gcc i386 randconfig-r021-20230615 clang i386 randconfig-r032-20230615 gcc loongarchallmodconfig gcc loongarch allnoconfig gcc loongarchbuildonly-randconfig-r005-20230614 gcc loongarch defconfig gcc loongarchrandconfig-r004-20230615 gcc loongarchrandconfig-r024-20230615 gcc m68k allmodconfig gcc m68k allyesconfig gcc m68kdefconfig gcc m68k randconfig-r002-20230615 gcc m68k randconfig-r013-20230614 gcc m68k randconfig-r023-20230615 gcc m68k randconfig-r033-20230615 gcc microblaze buildonly-randconfig-r002-20230614 gcc microblaze randconfig-r005-20230615 gcc microblaze randconfig-r023-20230615 gcc microblaze randconfig-r025-20230615 gcc microblaze randconfig-r033-20230615 gcc mips allmodconfig gcc mips
[PATCH v2 07/23 replacement] mips: add pte_unmap() to balance pte_offset_map()
To keep balance in future, __update_tlb() remember to pte_unmap() after pte_offset_map(). This is an odd case, since the caller has already done pte_offset_map_lock(), then mips forgets the address and recalculates it; but my two naive attempts to clean that up did more harm than good. Tested-by: Nathan Chancellor Signed-off-by: Hugh Dickins --- Andrew, please replace my mips patch, and its build warning fix patch, in mm-unstable by this less ambitious but working replacement - thanks. arch/mips/mm/tlb-r4k.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index 1b939abbe4ca..93c2d695588a 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -297,7 +297,7 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte) p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; - pte_t *ptep; + pte_t *ptep, *ptemap = NULL; int idx, pid; /* @@ -344,7 +344,12 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte) } else #endif { - ptep = pte_offset_map(pmdp, address); + ptemap = ptep = pte_offset_map(pmdp, address); + /* +* update_mmu_cache() is called between pte_offset_map_lock() +* and pte_unmap_unlock(), so we can assume that ptep is not +* NULL here: and what should be done below if it were NULL? +*/ #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) #ifdef CONFIG_XPA @@ -373,6 +378,9 @@ void __update_tlb(struct vm_area_struct * vma, unsigned long address, pte_t pte) tlbw_use_hazard(); htw_start(); flush_micro_tlb_vm(vma); + + if (ptemap) + pte_unmap(ptemap); local_irq_restore(flags); } -- 2.35.3
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
Le 15/06/2023 à 18:34, Randy Dunlap a écrit : > > > On 6/15/23 09:13, Randy Dunlap wrote: >> >> >> On 6/15/23 09:05, Timothy Pearson wrote: >>> >>> >>> - Original Message - From: "Randy Dunlap" To: "Timothy Pearson" , "Michael Ellerman" Cc: "Stephen Rothwell" , "Linux Next Mailing List" , "linux-kernel" , "linuxppc-dev" , "Alexey Kardashevskiy" Sent: Thursday, June 15, 2023 11:00:08 AM Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >>> Hi Timothy, On 6/3/23 20:57, Timothy Pearson wrote: > > > - Original Message - >> From: "Michael Ellerman" >> To: "Randy Dunlap" , "Stephen Rothwell" >> , "Linux Next Mailing List" >> >> Cc: "linux-kernel" , "linuxppc-dev" >> , "Alexey >> Kardashevskiy" , "Timothy Pearson" >> >> Sent: Saturday, June 3, 2023 7:22:51 PM >> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > >> Randy Dunlap writes: >>> On 6/1/23 21:01, Stephen Rothwell wrote: Hi all, Changes since 20230601: >>> >>> On powerpc64, a randconfig failed with: >>> >>> In file included from ../include/linux/list.h:5, >>> from ../include/linux/preempt.h:11, >>> from ../include/linux/spinlock.h:56, >>> from ../include/linux/mmzone.h:8, >>> from ../include/linux/gfp.h:7, >>> from ../include/linux/slab.h:15, >>> from ../arch/powerpc/kernel/iommu.c:15: >>> ../arch/powerpc/kernel/iommu.c: In function >>> 'spapr_tce_setup_phb_iommus_initcall': >>> ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared >>> (first use >>> in this function); did you mean 'zonelist'? >>> 1391 | list_for_each_entry(hose, _list, list_node) { >>>|^ >> ... >> >> hose_list is in pci-common.c which is built when PCI=y. >> >> PSERIES and POWERNV force PCI=y. 
>> >> But this config has neither: >> >> # CONFIG_PPC_POWERNV is not set >> # CONFIG_PPC_PSERIES is not set >> CONFIG_HAVE_PCI=y >> # CONFIG_PCI is not set >> # CONFIG_COMMON_CLK_RS9_PCIE is not set >> >> >> Probably the spapr_tce code should be wrapped in an #ifdef that is only >> enabled when POWERNV || PSERIES is enabled. >> >> cheers > > Sounds reasonable, I was going to look into this further over the > weekend. I > can put together a patch for Monday if that works? Did you prepare a patch for this? I am still seeing this build error. thanks. -- ~Randy >>> >>> Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not >>> arrive? >> >> I don't know - I'm not subscribed to that list. >> >> It's probably still in the patchworks review cycle >> so it hasn't been applied anywhere that gets into linux-next. > > OK, it's here, mark with Success: > http://patchwork.ozlabs.org/project/linuxppc-dev/patch/2015925968.3546872.1685990936823.javamail.zim...@raptorengineeringinc.com/ > > I don't know what happens to it next or when. Is that supposed to fix anything ? I doesn't have any Fixes: tag Christophe
Re: [PATCH v2 07/23] mips: update_mmu_cache() can replace __update_tlb()
On Wed, Jun 14, 2023 at 04:17:58PM -0700, Nathan Chancellor wrote: > Hi Hugh, > > On Thu, Jun 08, 2023 at 12:17:24PM -0700, Hugh Dickins wrote: > > Don't make update_mmu_cache() a wrapper around __update_tlb(): call it > > directly, and use the ptep (or pmdp) provided by the caller, instead of > > re-calling pte_offset_map() - which would raise a question of whether a > > pte_unmap() is needed to balance it. > > > > Check whether the "ptep" provided by the caller is actually the pmdp, > > instead of testing pmd_huge(): or test pmd_huge() too and warn if it > > disagrees? This is "hazardous" territory: needs review and testing. > > > > Signed-off-by: Hugh Dickins > > --- > > arch/mips/include/asm/pgtable.h | 15 +++ > > arch/mips/mm/tlb-r3k.c | 5 +++-- > > arch/mips/mm/tlb-r4k.c | 9 +++-- > > 3 files changed, 9 insertions(+), 20 deletions(-) > > > > I just bisected a crash while powering down a MIPS machine in QEMU to > this change as commit 8044511d3893 ("mips: update_mmu_cache() can > replace __update_tlb()") in linux-next. Unfortunately, I can still > reproduce it with the existing fix you have for this change on the > mailing list, which is present in next-20230614. > > I can reproduce it with the GCC 13.1.0 on kernel.org [1]. > > $ make -skj"$(nproc)" ARCH=mips CROSS_COMPILE=mips-linux- mrproper > malta_defconfig vmlinux > > $ qemu-system-mipsel \ > -display none \ > -nodefaults \ > -cpu 24Kf \ > -machine malta \ > -kernel vmlinux \ > -initrd rootfs.cpio \ > -m 512m \ > -serial mon:stdio > ... > Linux version 6.4.0-rc6-next-20230614 (nathan@dev-arch.thelio-3990X) > (mips-linux-gcc (GCC) 13.1.0, GNU ld (GNU Binutils) 2.40) #1 SMP Wed Jun 14 > 16:13:02 MST 2023 > ... 
> Run /init as init process > process '/bin/busybox' started with executable stack > do_page_fault(): sending SIGSEGV to init for invalid read access from > 003c > epc = 77b893dc in ld-uClibc-1.0.39.so[77b84000+8000] > ra = 77b8930c in ld-uClibc-1.0.39.so[77b84000+8000] > Kernel panic - not syncing: Attempted to kill init! exitcode=0x000b > ---[ end Kernel panic - not syncing: Attempted to kill init! > exitcode=0x000b ]--- > > The rootfs is available at [2] if it is needed. I am more than happy to > provide additional information or test patches if necessary. > > [1]: https://mirrors.edge.kernel.org/pub/tools/crosstool/ > [2]: > https://github.com/ClangBuiltLinux/boot-utils/releases/download/20230609-194440/mipsel-rootfs.cpio.zst Seeing this on real h/w as well (just to confirm). Linux version 6.4.0-rc4-00437-g4bab5c42a698 (r...@yuzhao.bld.corp.google.com) (mips64el-linux-gnuabi64-gcc (Debian 12.2.0-14) 12.2.0, GNU ld (GNU Binutils for Debian) 2.40) #3 SMP PREEMPT Thu Jun 15 01:05:20 MDT 2023 Skipping L2 locking due to reduced L2 cache size CVMSEG size: 2 cache lines (256 bytes) printk: bootconsole [early0] enabled CPU0 revision is: 000d9602 (Cavium Octeon III) FPU revision is: 00739600 Kernel sections are not in the memory maps Wasting 243712 bytes for tracking 4352 unused pages Initrd not found or empty - disabling initrd Using passed Device Tree. software IO TLB: SWIOTLB bounce buffer size adjusted to 0MB software IO TLB: area num 1. software IO TLB: mapped [mem 0x0370d000-0x0374d000] (0MB) Primary instruction cache 78kB, virtually tagged, 39 way, 16 sets, linesize 128 bytes. Primary data cache 32kB, 32-way, 8 sets, linesize 128 bytes. 
Zone ranges: DMA32[mem 0x0110-0xefff] Normal empty Movable zone start for each node Early memory node ranges node 0: [mem 0x0110-0x03646fff] node 0: [mem 0x0370-0x0faf] node 0: [mem 0x2000-0x4ebf] Initmem setup node 0 [mem 0x0110-0x4ebf] On node 0, zone DMA32: 4352 pages in unavailable ranges On node 0, zone DMA32: 185 pages in unavailable ranges On node 0, zone DMA32: 1280 pages in unavailable ranges On node 0, zone DMA32: 5120 pages in unavailable ranges percpu: Embedded 15 pages/cpu s24368 r8192 d28880 u61440 pcpu-alloc: s24368 r8192 d28880 u61440 alloc=15*4096 pcpu-alloc: [0] 0 [0] 1 [0] 2 [0] 3 Kernel command line: loglevel=8 console=ttyS0,115200 printk: log_buf_len individual max cpu contribution: 4096 bytes printk: log_buf_len total cpu_extra contributions: 12288 bytes printk: log_buf_len min size: 16384 bytes printk: log_buf_len: 32768 bytes printk: early log buf free: 14184(86%) Dentry cache hash table entries: 131072 (order: 8, 1048576 bytes, linear) Inode-cache hash table entries: 65536 (order: 7, 524288 bytes, linear) Built 1 zonelists, mobility grouping on. Total pages: 247772 mem auto-init: stack:all(zero), heap alloc:off, heap free:off Memory: 950032K/1004828K available (8058K kernel code, 575K rwdata, 1880K rodata, 27488K init, 158K bss, 54796K reserved, 0K cma-reserved) rcu: Preemptible
Re: [PATCH v2 07/23] mips: update_mmu_cache() can replace __update_tlb()
On Thu, 15 Jun 2023, Nathan Chancellor wrote: > On Wed, Jun 14, 2023 at 10:43:30PM -0700, Hugh Dickins wrote: > > > > I do hope that you find the first fixes the breakage; but if not, then > > I hate to be the bearer of bad news but the first patch did not fix the > breakage, I see the same issue. Boo! > > > I even more fervently hope that the second will, despite my hating it. > > Touch wood for the first, fingers crossed for the second, thanks, > > Thankfully, the second one does. Thanks for the quick and thoughtful > responses! Hurrah! Thanks a lot, Nathan. I'll set aside my disappointment and curiosity, clearly I'm not going to have much of a future as a MIPS programmer. I must take a break, then rush Andrew the second patch, well, not exactly that second patch, since most of that is revert: I'll just send the few lines of replacement patch (with a new Subject line, as update_mmu_cache() goes back to being separate from __update_tlb()). Unless you object, I'll include a Tested-by: you. I realize that your testing is limited to seeing it running; but that's true of most of the testing at this stage - it gets to be more interesting when the patch that adds the rcu_read_lock() and rcu_read_unlock() is added on top later. Thanks again, Hugh
Re: [PATCH v9 00/14] pci: Work around ASMedia ASM2824 PCIe link training failures
On Thu, Jun 15, 2023 at 01:41:10AM +0100, Maciej W. Rozycki wrote: > On Wed, 14 Jun 2023, Bjorn Helgaas wrote: > > > > This is v9 of the change to work around a PCIe link training phenomenon > > > where a pair of devices both capable of operating at a link speed above > > > 2.5GT/s seems unable to negotiate the link speed and continues training > > > indefinitely with the Link Training bit switching on and off repeatedly > > > and the data link layer never reaching the active state. > > > > > > With several requests addressed and a few extra issues spotted this > > > version has now grown to 14 patches. It has been verified for device > > > enumeration with and without PCI_QUIRKS enabled, using the same piece of > > > RISC-V hardware as previously. Hot plug or reset events have not been > > > verified, as this is difficult if at all feasible with hardware in > > > question. > > static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) > > { > > - bool retrain = true; > > int delay = 1; > > + bool retrain = false; > > + struct pci_dev *bridge; > > + > > + if (pci_is_pcie(dev)) { > > + retrain = true; > > + bridge = pci_upstream_bridge(dev); > > + } > > If doing it this way, which I actually like, I think it would be a little > bit better performance- and style-wise if this was written as: > > if (pci_is_pcie(dev)) { > bridge = pci_upstream_bridge(dev); > retrain = !!bridge; > } > > (or "retrain = bridge != NULL" if you prefer this style), and then we > don't have to repeatedly check two variables iff (pcie && !bridge) in the > loop below: Done, thanks, I do like that better. I did: bridge = pci_upstream_bridge(dev); if (bridge) retrain = true; because it seems like it flows more naturally when reading. Bjorn
Re: [PATCH v1 10/21] powerpc/kexec: refactor for kernel/Kconfig.kexec
On Thu, Jun 15, 2023 at 01:34:25PM +1000, Michael Ellerman wrote: > Eric DeVolder writes: > > -config KEXEC_FILE > > - bool "kexec file based system call" > > - select KEXEC_CORE > > - select HAVE_IMA_KEXEC if IMA > > - select KEXEC_ELF > > - depends on PPC64 > > - depends on CRYPTO=y > > - depends on CRYPTO_SHA256=y > ... > > + > > +config ARCH_HAS_KEXEC_FILE > > + def_bool PPC64 && CRYPTO && CRYPTO_SHA256 > > The =y's got lost here. > > I think they were both meaningful, because both options are tristate. So > this previously required them to be built-in (=y), whereas after your > patch it will allow them to be modules. > > I don't know for sure that those options need to be built-in, but that's > what the code does now, so this patch shouldn't change it, at least > without an explanation. This patch shouldn't change it at all, period. If you want to change it (and that sounds like a good idea, if it is possible anyway), that should be a separate patch. Segher
Re: [PATCH v1 00/21] refactor Kconfig to consolidate KEXEC and CRASH options
On 6/14/23 22:26, Michael Ellerman wrote: Eric DeVolder writes: On 6/13/23 15:21, Kees Cook wrote: On Mon, Jun 12, 2023 at 01:27:52PM -0400, Eric DeVolder wrote: The Kconfig is refactored to consolidate KEXEC and CRASH options from various arch//Kconfig files into new file kernel/Kconfig.kexec. This looks very nice! Thank you Kees! [...] - The boolean ARCH_HAS_ in effect allows the arch to determine when the feature is allowed. Archs which don't have the feature simply do not provide the corresponding ARCH_HAS_. For each arch, where there previously were KEXEC and/or CRASH options, these have been replaced with the corresponding boolean ARCH_HAS_, and an appropriate def_bool statement. For example, if the arch supports KEXEC_FILE, then the ARCH_HAS_KEXEC_FILE simply has a 'def_bool y'. This permits the KEXEC_FILE option to be available. If the arch has a 'depends on' statement in its original coding of the option, then that expression becomes part of the def_bool expression. For example, arm64 had: config KEXEC depends on PM_SLEEP_SMP and in this solution, this converts to: config ARCH_HAS_KEXEC def_bool PM_SLEEP_SMP - In order to account for the differences in the config coding for the three common options, the ARCH_SUPPORTS_ is used. This options has a 'depends on ' statement to couple it to the main option, and from there can insert the differences from the common option and the arch original coding of that option. For example, a few archs enable CRYPTO and CRYTPO_SHA256 for KEXEC_FILE. These require a ARCH_SUPPORTS_KEXEC_FILE and 'select CRYPTO' and 'select CRYPTO_SHA256' statements. Naming nit: "HAS" and "SUPPORTS" feel very similar, and looking at existing configs, "ARCH_SUPPORTS_..." is already used for doing this kind of bare "bool" management. e.g. see ARCH_SUPPORTS_INT128 It looks like you need to split "depends" and "select" so the options can be chosen separately from the "selectable" configs. 
How about naming this ARCH_SELECTS_, since that's what it's there for? I'm OK with this. Let's see if others agree? Yeah please rename one or both of them. At a glance the difference between HAS and SUPPORTS is very non-obvious. I like Kees' suggestion to use ARCH_SUPPORTS and ARCH_SELECTS. cheers Michael, ok thanks! eric
Re: [PATCH v1 10/21] powerpc/kexec: refactor for kernel/Kconfig.kexec
On 6/14/23 22:34, Michael Ellerman wrote: Eric DeVolder writes: The kexec and crash kernel options are provided in the common kernel/Kconfig.kexec. Utilize the common options and provide the ARCH_HAS_ and ARCH_SUPPORTS_ entries to recreate the equivalent set of KEXEC and CRASH options. Signed-off-by: Eric DeVolder Reviewed-by: Sourabh Jain --- arch/powerpc/Kconfig | 55 ++-- 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bff5820b7cda..36f2fe0cc8a5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -588,41 +588,21 @@ config PPC64_SUPPORTS_MEMORY_FAILURE default "y" if PPC_POWERNV select ARCH_SUPPORTS_MEMORY_FAILURE -config KEXEC - bool "kexec system call" - depends on PPC_BOOK3S || PPC_E500 || (44x && !SMP) - select KEXEC_CORE - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot - but it is independent of the system firmware. And like a reboot - you can start any kernel with it, not just Linux. - - The name comes from the similarity to the exec system call. - - It is an ongoing process to be certain the hardware in a machine - is properly shutdown, so do not be surprised if this code does not - initially work for you. As of this writing the exact hardware - interface is strongly in flux, so no good recommendation can be - made. - -config KEXEC_FILE - bool "kexec file based system call" - select KEXEC_CORE - select HAVE_IMA_KEXEC if IMA - select KEXEC_ELF - depends on PPC64 - depends on CRYPTO=y - depends on CRYPTO_SHA256=y ... + +config ARCH_HAS_KEXEC_FILE + def_bool PPC64 && CRYPTO && CRYPTO_SHA256 The =y's got lost here. I think they were both meaningful, because both options are tristate. So this previously required them to be built-in (=y), whereas after your patch it will allow them to be modules. 
I don't know for sure that those options need to be built-in, but that's what the code does now, so this patch shouldn't change it, at least without an explanation. cheers Thanks Michael, I've applied =y's. Good catch! eric
Re: [PATCH mm-unstable v2 08/10] kvm/x86: move tdp_mmu_enabled and shadow_accessed_mask
On Fri, May 26, 2023, Yu Zhao wrote: > tdp_mmu_enabled and shadow_accessed_mask are needed to implement > kvm_arch_has_test_clear_young(). > > Signed-off-by: Yu Zhao > --- > arch/x86/include/asm/kvm_host.h | 6 ++ > arch/x86/kvm/mmu.h | 6 -- > arch/x86/kvm/mmu/spte.h | 1 - > 3 files changed, 6 insertions(+), 7 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index fb9d1f2d6136..753c67072c47 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -1772,6 +1772,7 @@ struct kvm_arch_async_pf { > > extern u32 __read_mostly kvm_nr_uret_msrs; > extern u64 __read_mostly host_efer; > +extern u64 __read_mostly shadow_accessed_mask; > extern bool __read_mostly allow_smaller_maxphyaddr; > extern bool __read_mostly enable_apicv; > extern struct kvm_x86_ops kvm_x86_ops; > @@ -1855,6 +1856,11 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned > irqchip, unsigned pin, >bool mask); > > extern bool tdp_enabled; > +#ifdef CONFIG_X86_64 > +extern bool tdp_mmu_enabled; > +#else > +#define tdp_mmu_enabled false > +#endif I would much prefer that these be kept in kvm/mmu.h. And looking at all the arch code, there's no reason to make kvm_arch_has_test_clear_young() a runtime callback, all of the logic is constant relative to when KVM is loaded. So rather than have generic KVM pull from arch code, what if we have arch code push info to generic KVM? We could even avoid #ifdefs if arch code passed in its handler. That might result in an extra indirect branch though, so it might be better to just use a flag? E.g. the x86 conversion would be something like this. 
--- arch/x86/kvm/mmu/mmu.c | 5 + arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.h | 1 + include/linux/kvm_host.h | 24 virt/kvm/kvm_main.c| 14 ++ 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8ebe542c565..84a4a83540f0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5809,6 +5809,11 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, max_huge_page_level = PG_LEVEL_1G; else max_huge_page_level = PG_LEVEL_2M; + + if (tdp_mmu_enabled && kvm_ad_enabled()) + kvm_init_test_clear_young(kvm_tdp_mmu_test_clear_young); + else + kvm_init_test_clear_young(NULL); } EXPORT_SYMBOL_GPL(kvm_configure_mmu); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index f463d54228f8..e878c88f0e02 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1308,7 +1308,7 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); } -bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_tdp_mmu_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range) { struct kvm_mmu_page *root; int offset = ffs(shadow_accessed_mask) - 1; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 0a63b1afabd3..aaa0b75b3896 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -34,6 +34,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); +bool kvm_tdp_mmu_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, const struct kvm_memory_slot *slot, int min_level); diff --git 
a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1714f82a0c47..7a0922cbc36f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -264,31 +264,15 @@ struct kvm_gfn_range { pte_t pte; bool may_block; }; + +typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); + bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_should_clear_young(struct kvm_gfn_range *range, gfn_t gfn); -bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range); -#endif - -/* - * Architectures that implement kvm_arch_test_clear_young() should override - * kvm_arch_has_test_clear_young(). - * - * kvm_arch_has_test_clear_young() is allowed to return false positive, i.e., it - * can return true if kvm_arch_test_clear_young() is supported but disabled due - * to some
[PATCH 2/2] PCI: layerscape: Add the workaround for lost link capablities during reset
From: Xiaowei Bao A workaround for the issue where the PCI Express Endpoint (EP) controller loses the values of the Maximum Link Width and Supported Link Speed from the Link Capabilities Register, which initially configured by the Reset Configuration Word (RCW) during a link-down or hot reset event. Signed-off-by: Xiaowei Bao Signed-off-by: Hou Zhiqiang Signed-off-by: Frank Li --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index 4e4fdd1dfea7..2ef02d827eeb 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -45,6 +45,7 @@ struct ls_pcie_ep { struct pci_epc_features *ls_epc; const struct ls_pcie_ep_drvdata *drvdata; int irq; + u32 lnkcap; bool big_endian; }; @@ -73,6 +74,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) struct ls_pcie_ep *pcie = dev_id; struct dw_pcie *pci = pcie->pci; u32 val, cfg; + u8 offset; val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); @@ -81,6 +83,13 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) return IRQ_NONE; if (val & PEX_PF0_PME_MES_DR_LUD) { + + offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + + dw_pcie_dbi_ro_wr_en(pci); + dw_pcie_writew_dbi(pci, offset + PCI_EXP_LNKCAP, pcie->lnkcap); + dw_pcie_dbi_ro_wr_dis(pci); + cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); cfg |= PEX_PF0_CFG_READY; ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); @@ -216,6 +225,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) struct ls_pcie_ep *pcie; struct pci_epc_features *ls_epc; struct resource *dbi_base; + u8 offset; int ret; pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); @@ -252,6 +262,9 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pcie); + offset = dw_pcie_find_capability(pci,
PCI_CAP_ID_EXP); + pcie->lnkcap = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); + ret = dw_pcie_ep_init(>ep); if (ret) return ret; -- 2.34.1
Re: [PATCH mm-unstable v2 01/10] mm/kvm: add mmu_notifier_ops->test_clear_young()
On Fri, May 26, 2023, Yu Zhao wrote: > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 0e571e973bc2..374262545f96 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -258,6 +258,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); > #ifdef KVM_ARCH_WANT_MMU_NOTIFIER > struct kvm_gfn_range { > struct kvm_memory_slot *slot; > + void *args; There's no reason to make this "void *", just declare "struct test_clear_young_args" in the header. Arch code won't be able to use it regardless. And I vote for something more like "test_clear_young_metadata", as there's far more information in there than just function arguments. And to stave off the argument that "void *" would allow reuse, take this opportunity to unionize the test_clear_young field with the change_pte field, e.g. /* comment about these fields being callback specific. */ union { struct test_clear_young_metadata *metadata; pte_t pte; unsigned long callback_arg; /* needs a better name */ }; > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 51e4882d0873..31ee58754b19 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -541,6 +541,7 @@ typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned > long start, > typedef void (*on_unlock_fn_t)(struct kvm *kvm); > > struct kvm_hva_range { > + void *args; Same feedback as kvm_gfn_range. > unsigned long start; > unsigned long end; > pte_t pte; > @@ -549,6 +550,7 @@ struct kvm_hva_range { > on_unlock_fn_t on_unlock; > bool flush_on_ret; > bool may_block; > + bool lockless; > }; > > /* > @@ -602,6 +604,8 @@ static __always_inline int __kvm_handle_hva_range(struct > kvm *kvm, > hva_end = min(range->end, slot->userspace_addr + > (slot->npages << PAGE_SHIFT)); > > + gfn_range.args = range->args; And this goes away because the generic callback_arg is what gets propagated. 
> + > /* >* To optimize for the likely case where the address >* range is covered by zero or one memslots, don't > @@ -619,7 +623,7 @@ static __always_inline int __kvm_handle_hva_range(struct > kvm *kvm, > gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE > - 1, slot); > gfn_range.slot = slot; > > - if (!locked) { > + if (!range->lockless && !locked) { > locked = true; > KVM_MMU_LOCK(kvm); > if (!IS_KVM_NULL_FN(range->on_lock)) > @@ -628,6 +632,9 @@ static __always_inline int __kvm_handle_hva_range(struct > kvm *kvm, > break; > } > ret |= range->handler(kvm, _range); > + > + if (range->lockless && ret) I don't like overloading "lockless" to also mean "stop on ret". Just add another flag, there's literally no cost for most callbacks as everything is constant at compile time and gets optimized away. > + range.args = > + range.lockless = true; The lockless and stop_on_ret behavior needs comments. > + range.handler = kvm_arch_test_clear_young; > + > + if (!__kvm_handle_hva_range(kvm, )) > + return args.young ? MMU_NOTIFIER_RANGE_LOCKLESS : 0; > + } > + > + if (bitmap) > + return 0; > + > + range.args = NULL; > + range.lockless = false; No need to manually clear these, they'll be zeroed by the initialization code. E.g. 
all in all, something like so --- include/linux/kvm_host.h | 9 +++-- virt/kvm/kvm_main.c | 29 + 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7a0922cbc36f..e04605061f5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -256,12 +256,17 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif #ifdef KVM_ARCH_WANT_MMU_NOTIFIER +struct test_clear_young_metadata; + struct kvm_gfn_range { struct kvm_memory_slot *slot; - void *args; gfn_t start; gfn_t end; - pte_t pte; + union { + struct test_clear_young_metadata *metadata; + pte_t pte; + unsigned long callback_arg; + }; bool may_block; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ac83cfb30771..8cf4fee9cd8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -536,16 +536,20 @@ typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, typedef void (*on_unlock_fn_t)(struct kvm *kvm); struct kvm_hva_range { - void *args; unsigned
Re: [PATCH mm-unstable v2 09/10] kvm/x86: add kvm_arch_test_clear_young()
On Fri, May 26, 2023, Yu Zhao wrote: > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c > index 08340219c35a..6875a819e007 100644 > --- a/arch/x86/kvm/mmu/tdp_mmu.c > +++ b/arch/x86/kvm/mmu/tdp_mmu.c > @@ -1232,6 +1232,40 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct > kvm_gfn_range *range) > return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); > } > > +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range) > +{ > + struct kvm_mmu_page *root; > + int offset = ffs(shadow_accessed_mask) - 1; > + > + if (kvm_shadow_root_allocated(kvm)) This needs a comment. > + return true; > + > + rcu_read_lock(); > + > + list_for_each_entry_rcu(root, >arch.tdp_mmu_roots, link) { As requested in v1[1], please add a macro for a lockless walk. [1] https://lkml.kernel.org/r/Y%2Fed0XYAPx%2B7pukA%40google.com > + struct tdp_iter iter; > + > + if (kvm_mmu_page_as_id(root) != range->slot->as_id) > + continue; > + > + tdp_root_for_each_leaf_pte(iter, root, range->start, > range->end) { > + u64 *sptep = rcu_dereference(iter.sptep); > + > + VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep))); Hrm, I don't like adding this in KVM. The primary MMU might guarantee that this callback is invoked if and only if the SPTE is backed by struct page memory, but there's no reason to assume that's true in KVM. If we want the sanity check, then this needs to use kvm_pfn_to_refcounted_page(). And it should use KVM's MMU_WARN_ON(), which is a mess and effectively dead code, but I'm working on changing that[*], i.e. by the time this gets to Linus' tree, the sanity check should have a much cleaner implementation. 
[2] https://lore.kernel.org/all/20230511235917.639770-8-sea...@google.com > + > + if (!(iter.old_spte & shadow_accessed_mask)) > + continue; > + > + if (kvm_should_clear_young(range, iter.gfn)) > + clear_bit(offset, (unsigned long *)sptep); If/when you rebase on https://github.com/kvm-x86/linux/tree/next, can you pull out the atomic bits of tdp_mmu_clear_spte_bits() and use that new helper? E.g. diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h index fae559559a80..914c34518829 100644 --- a/arch/x86/kvm/mmu/tdp_iter.h +++ b/arch/x86/kvm/mmu/tdp_iter.h @@ -58,15 +58,18 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte, return old_spte; } +static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask) +{ + atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep); + + return (u64)atomic64_fetch_and(~mask, sptep_atomic); +} + static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte, u64 mask, int level) { - atomic64_t *sptep_atomic; - - if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) { - sptep_atomic = (atomic64_t *)rcu_dereference(sptep); - return (u64)atomic64_fetch_and(~mask, sptep_atomic); - } + if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) + return tdp_mmu_clear_spte_bits_atomic(sptep, mask); __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask); return old_spte;
[PATCH 1/2] PCI: layerscape: Add support for Link down notification
Add support to pass Link down notification to Endpoint function driver so that the LINK_DOWN event can be processed by the function. Signed-off-by: Frank Li --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index de4c1758a6c3..4e4fdd1dfea7 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -88,6 +88,7 @@ static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) dev_dbg(pci->dev, "Link up\n"); } else if (val & PEX_PF0_PME_MES_DR_LDD) { + pci_epc_linkdown(pci->ep.epc); dev_dbg(pci->dev, "Link down\n"); } else if (val & PEX_PF0_PME_MES_DR_HRD) { dev_dbg(pci->dev, "Hot reset\n"); -- 2.34.1
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
On 6/15/23 09:13, Randy Dunlap wrote: > > > On 6/15/23 09:05, Timothy Pearson wrote: >> >> >> - Original Message - >>> From: "Randy Dunlap" >>> To: "Timothy Pearson" , "Michael Ellerman" >>> >>> Cc: "Stephen Rothwell" , "Linux Next Mailing List" >>> , "linux-kernel" >>> , "linuxppc-dev" >>> , "Alexey Kardashevskiy" >>> Sent: Thursday, June 15, 2023 11:00:08 AM >>> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >> >>> Hi Timothy, >>> >>> On 6/3/23 20:57, Timothy Pearson wrote: - Original Message - > From: "Michael Ellerman" > To: "Randy Dunlap" , "Stephen Rothwell" > , "Linux Next Mailing List" > > Cc: "linux-kernel" , "linuxppc-dev" > , "Alexey > Kardashevskiy" , "Timothy Pearson" > > Sent: Saturday, June 3, 2023 7:22:51 PM > Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > Randy Dunlap writes: >> On 6/1/23 21:01, Stephen Rothwell wrote: >>> Hi all, >>> >>> Changes since 20230601: >>> >> >> On powerpc64, a randconfig failed with: >> >> In file included from ../include/linux/list.h:5, >> from ../include/linux/preempt.h:11, >> from ../include/linux/spinlock.h:56, >> from ../include/linux/mmzone.h:8, >> from ../include/linux/gfp.h:7, >> from ../include/linux/slab.h:15, >> from ../arch/powerpc/kernel/iommu.c:15: >> ../arch/powerpc/kernel/iommu.c: In function >> 'spapr_tce_setup_phb_iommus_initcall': >> ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared >> (first use >> in this function); did you mean 'zonelist'? >> 1391 | list_for_each_entry(hose, _list, list_node) { >> |^ > ... > > hose_list is in pci-common.c which is built when PCI=y. > > PSERIES and POWERNV force PCI=y. > > But this config has neither: > > # CONFIG_PPC_POWERNV is not set > # CONFIG_PPC_PSERIES is not set > CONFIG_HAVE_PCI=y > # CONFIG_PCI is not set > # CONFIG_COMMON_CLK_RS9_PCIE is not set > > > Probably the spapr_tce code should be wrapped in an #ifdef that is only > enabled when POWERNV || PSERIES is enabled. 
> > cheers Sounds reasonable, I was going to look into this further over the weekend. I can put together a patch for Monday if that works? >>> >>> Did you prepare a patch for this? I am still seeing this build error. >>> >>> thanks. >>> -- >>> ~Randy >> >> Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not >> arrive? > > I don't know - I'm not subscribed to that list. > > It's probably still in the patchworks review cycle > so it hasn't been applied anywhere that gets into linux-next. OK, it's here, mark with Success: http://patchwork.ozlabs.org/project/linuxppc-dev/patch/2015925968.3546872.1685990936823.javamail.zim...@raptorengineeringinc.com/ I don't know what happens to it next or when. thanks. -- ~Randy
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
On 6/15/23 09:05, Timothy Pearson wrote: > > > - Original Message - >> From: "Randy Dunlap" >> To: "Timothy Pearson" , "Michael Ellerman" >> >> Cc: "Stephen Rothwell" , "Linux Next Mailing List" >> , "linux-kernel" >> , "linuxppc-dev" >> , "Alexey Kardashevskiy" >> Sent: Thursday, June 15, 2023 11:00:08 AM >> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > >> Hi Timothy, >> >> On 6/3/23 20:57, Timothy Pearson wrote: >>> >>> >>> - Original Message - From: "Michael Ellerman" To: "Randy Dunlap" , "Stephen Rothwell" , "Linux Next Mailing List" Cc: "linux-kernel" , "linuxppc-dev" , "Alexey Kardashevskiy" , "Timothy Pearson" Sent: Saturday, June 3, 2023 7:22:51 PM Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >>> Randy Dunlap writes: > On 6/1/23 21:01, Stephen Rothwell wrote: >> Hi all, >> >> Changes since 20230601: >> > > On powerpc64, a randconfig failed with: > > In file included from ../include/linux/list.h:5, > from ../include/linux/preempt.h:11, > from ../include/linux/spinlock.h:56, > from ../include/linux/mmzone.h:8, > from ../include/linux/gfp.h:7, > from ../include/linux/slab.h:15, > from ../arch/powerpc/kernel/iommu.c:15: > ../arch/powerpc/kernel/iommu.c: In function > 'spapr_tce_setup_phb_iommus_initcall': > ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared > (first use > in this function); did you mean 'zonelist'? > 1391 | list_for_each_entry(hose, _list, list_node) { > |^ ... hose_list is in pci-common.c which is built when PCI=y. PSERIES and POWERNV force PCI=y. But this config has neither: # CONFIG_PPC_POWERNV is not set # CONFIG_PPC_PSERIES is not set CONFIG_HAVE_PCI=y # CONFIG_PCI is not set # CONFIG_COMMON_CLK_RS9_PCIE is not set Probably the spapr_tce code should be wrapped in an #ifdef that is only enabled when POWERNV || PSERIES is enabled. cheers >>> >>> Sounds reasonable, I was going to look into this further over the weekend. 
>>> I >>> can put together a patch for Monday if that works? >> >> Did you prepare a patch for this? I am still seeing this build error. >> >> thanks. >> -- >> ~Randy > > Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not > arrive? I don't know - I'm not subscribed to that list. It's probably still in the patchworks review cycle so it hasn't been applied anywhere that gets into linux-next. thanks. -- ~Randy
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
- Original Message - > From: "Randy Dunlap" > To: "Timothy Pearson" , "Michael Ellerman" > > Cc: "Stephen Rothwell" , "Linux Next Mailing List" > , "linux-kernel" > , "linuxppc-dev" > , "Alexey Kardashevskiy" > Sent: Thursday, June 15, 2023 11:00:08 AM > Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > Hi Timothy, > > On 6/3/23 20:57, Timothy Pearson wrote: >> >> >> - Original Message - >>> From: "Michael Ellerman" >>> To: "Randy Dunlap" , "Stephen Rothwell" >>> , "Linux Next Mailing List" >>> >>> Cc: "linux-kernel" , "linuxppc-dev" >>> , "Alexey >>> Kardashevskiy" , "Timothy Pearson" >>> >>> Sent: Saturday, June 3, 2023 7:22:51 PM >>> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) >> >>> Randy Dunlap writes: On 6/1/23 21:01, Stephen Rothwell wrote: > Hi all, > > Changes since 20230601: > On powerpc64, a randconfig failed with: In file included from ../include/linux/list.h:5, from ../include/linux/preempt.h:11, from ../include/linux/spinlock.h:56, from ../include/linux/mmzone.h:8, from ../include/linux/gfp.h:7, from ../include/linux/slab.h:15, from ../arch/powerpc/kernel/iommu.c:15: ../arch/powerpc/kernel/iommu.c: In function 'spapr_tce_setup_phb_iommus_initcall': ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared (first use in this function); did you mean 'zonelist'? 1391 | list_for_each_entry(hose, _list, list_node) { |^ >>> ... >>> >>> hose_list is in pci-common.c which is built when PCI=y. >>> >>> PSERIES and POWERNV force PCI=y. >>> >>> But this config has neither: >>> >>> # CONFIG_PPC_POWERNV is not set >>> # CONFIG_PPC_PSERIES is not set >>> CONFIG_HAVE_PCI=y >>> # CONFIG_PCI is not set >>> # CONFIG_COMMON_CLK_RS9_PCIE is not set >>> >>> >>> Probably the spapr_tce code should be wrapped in an #ifdef that is only >>> enabled when POWERNV || PSERIES is enabled. >>> >>> cheers >> >> Sounds reasonable, I was going to look into this further over the weekend. 
I >> can put together a patch for Monday if that works? > > Did you prepare a patch for this? I am still seeing this build error. > > thanks. > -- > ~Randy Yes, it was sent in to the linuxppc-dev list some weeks ago. Did it not arrive?
file removal issue in tools/testing/selftests/powerpc/mm/tlbie_test.c
Hi, Static analysis with cppcheck has found an issue in the following commit: commit 047e6575aec71d75b765c22111820c4776cd1c43 Author: Aneesh Kumar K.V Date: Tue Sep 24 09:22:53 2019 +0530 powerpc/mm: Fixup tlbie vs mtpidr/mtlpidr ordering issue on POWER9 The issue in tools/testing/selftests/powerpc/mm/tlbie_test.c in end_verification_log() is as follows: static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies) { FILE *f = fp[tid]; char logfile[30]; char path[LOGDIR_NAME_SIZE + 30]; char separator[] = "/"; fclose(f); if (nr_anamolies == 0) { remove(path); return; } etc in the case where nr_anamolies is zero the remove(path) call is using an uninitialized path, this potentially could contain uninitialized garbage on the stack (and if one is unlucky enough it may be a valid filename that one does not want to be removed). Not sure what the original intention was, but this code looks incorrect to me. Colin
Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c)
Hi Timothy, On 6/3/23 20:57, Timothy Pearson wrote: > > > - Original Message - >> From: "Michael Ellerman" >> To: "Randy Dunlap" , "Stephen Rothwell" >> , "Linux Next Mailing List" >> >> Cc: "linux-kernel" , "linuxppc-dev" >> , "Alexey >> Kardashevskiy" , "Timothy Pearson" >> >> Sent: Saturday, June 3, 2023 7:22:51 PM >> Subject: Re: linux-next: Tree for Jun 2 (arch/powerpc/kernel/iommu.c) > >> Randy Dunlap writes: >>> On 6/1/23 21:01, Stephen Rothwell wrote: Hi all, Changes since 20230601: >>> >>> On powerpc64, a randconfig failed with: >>> >>> In file included from ../include/linux/list.h:5, >>> from ../include/linux/preempt.h:11, >>> from ../include/linux/spinlock.h:56, >>> from ../include/linux/mmzone.h:8, >>> from ../include/linux/gfp.h:7, >>> from ../include/linux/slab.h:15, >>> from ../arch/powerpc/kernel/iommu.c:15: >>> ../arch/powerpc/kernel/iommu.c: In function >>> 'spapr_tce_setup_phb_iommus_initcall': >>> ../arch/powerpc/kernel/iommu.c:1391:36: error: 'hose_list' undeclared >>> (first use >>> in this function); did you mean 'zonelist'? >>> 1391 | list_for_each_entry(hose, _list, list_node) { >>> |^ >> ... >> >> hose_list is in pci-common.c which is built when PCI=y. >> >> PSERIES and POWERNV force PCI=y. >> >> But this config has neither: >> >> # CONFIG_PPC_POWERNV is not set >> # CONFIG_PPC_PSERIES is not set >> CONFIG_HAVE_PCI=y >> # CONFIG_PCI is not set >> # CONFIG_COMMON_CLK_RS9_PCIE is not set >> >> >> Probably the spapr_tce code should be wrapped in an #ifdef that is only >> enabled when POWERNV || PSERIES is enabled. >> >> cheers > > Sounds reasonable, I was going to look into this further over the weekend. I > can put together a patch for Monday if that works? Did you prepare a patch for this? I am still seeing this build error. thanks. -- ~Randy
Re: [PATCH v2 07/23] mips: update_mmu_cache() can replace __update_tlb()
On Wed, Jun 14, 2023 at 10:43:30PM -0700, Hugh Dickins wrote: > On Wed, 14 Jun 2023, Hugh Dickins wrote: > > On Wed, 14 Jun 2023, Nathan Chancellor wrote: > > > > > > I just bisected a crash while powering down a MIPS machine in QEMU to > > > this change as commit 8044511d3893 ("mips: update_mmu_cache() can > > > replace __update_tlb()") in linux-next. > > > > Thank you, Nathan, that's very helpful indeed. This patch certainly knew > > that it wanted testing, and I'm glad to hear that it is now seeing some. > > > > While powering down? The messages below look like it was just coming up, > > but no doubt that's because you were bisecting (or because I'm unfamiliar > > with what messages to expect there). It's probably irrelevant information, > > but I wonder whether the (V)machine worked well enough for a while before > > you first powered down and spotted the problem, or whether it's never got > > much further than trying to run init (busybox)? I'm trying to get a feel > > for whether the problem occurs under common or uncommon conditions. Ugh sorry, I have been looking into too many bugs lately and got my wires crossed :) this is indeed a problem when running init (which is busybox, this is a simple Buildroot file system). > > > Unfortunately, I can still > > > reproduce it with the existing fix you have for this change on the > > > mailing list, which is present in next-20230614. > > > > Right, that later fix was only for a build warning, nothing functional > > (or at least I hoped that it wasn't making any functional difference). > > > > Thanks a lot for the detailed instructions below: unfortunately, those > > would draw me into a realm of testing I've never needed to enter before, > > so a lot of time spent on setup and learning. Usually, I just stare at > > the source. > > > > What this probably says is that I should revert most my cleanup there, > > and keep as close to the existing code as possible. 
But some change is > > needed, and I may need to understand (or have a good guess at) what was > > going wrong, to decide what kind of retreat will be successful. > > > > Back to the source for a while: I hope I'll find examples in nearby MIPS > > kernel source (and git history), which will hint at the right way forward. > > Then send you a patch against next-20230614 to try, when I'm reasonably > > confident that it's enough to satisfy my purpose, but likely not to waste > > your time. > > I'm going to take advantage of your good nature by attaching > two alternative patches, either to go on top of next-20230614. > > mips1.patch, > arch/mips/mm/tlb-r4k.c | 12 +--- > 1 file changed, 1 insertion(+), 11 deletions(-) > > is by far my favourite. I couldn't see anything wrong with what's > already there for mips, but it seems possible that (though I didn't > find it) somewhere calls update_mmu_cache_pmd() on a page table. So > mips1.patch restores the pmd_huge() check, and cleans up further by > removing the silly pgdp, p4dp, pudp, pmdp stuff: the pointer has now > been passed in by the caller, why walk the tree again? I should have > done it this way before. > > But if that doesn't work, then I'm afraid it will have to be > mips2.patch, > arch/mips/include/asm/pgtable.h | 15 --- > arch/mips/mm/tlb-r3k.c |5 ++--- > arch/mips/mm/tlb-r4k.c | 27 ++- > 3 files changed, 32 insertions(+), 15 deletions(-) > > which reverts all of the original patch and its build warning fix, > and does a pte_unmap() to balance the silly pte_offset_map() there; > with an apologetic comment for this being about the only place in > the tree where I have no idea what to do if ptep were NULL. > > I do hope that you find the first fixes the breakage; but if not, then I hate to be the bearer of bad news but the first patch did not fix the breakage, I see the same issue. > I even more fervently hope that the second will, despite my hating it. 
> Touch wood for the first, fingers crossed for the second, thanks, Thankfully, the second one does. Thanks for the quick and thoughtful responses! Cheers, Nathan
[PATCH 04/10] cpu/SMT: Remove topology_smt_supported()
Since the maximum number of threads is now passed to cpu_smt_set_num_threads(), checking that value is enough to know if SMT is supported. Cc: Michael Ellerman Suggested-by: Thomas Gleixner Signed-off-by: Laurent Dufour --- arch/x86/include/asm/topology.h | 2 -- arch/x86/kernel/smpboot.c | 8 kernel/cpu.c| 2 +- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 66927a59e822..87358a8fe843 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,7 +143,6 @@ int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); -bool topology_smt_supported(void); #else #define topology_max_packages()(1) static inline int @@ -156,7 +155,6 @@ static inline int topology_phys_to_logical_die(unsigned int die, static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } -static inline bool topology_smt_supported(void) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 352f0ce1ece4..3052c171668d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -278,14 +278,6 @@ bool topology_is_primary_thread(unsigned int cpu) return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); } -/** - * topology_smt_supported - Check whether SMT is supported by the CPUs - */ -bool topology_smt_supported(void) -{ - return smp_num_siblings > 1; -} - /** * topology_phys_to_logical_pkg - Map a physical package id to a logical * diff --git a/kernel/cpu.c b/kernel/cpu.c index edca8b7bd400..e354af92b2b8 100644 --- a/kernel/cpu.c +++ 
b/kernel/cpu.c @@ -442,7 +442,7 @@ void __init cpu_smt_set_num_threads(unsigned int num_threads, { WARN_ON(!num_threads || (num_threads > max_threads)); - if (!topology_smt_supported()) + if (max_threads == 1) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; cpu_smt_max_threads = max_threads; -- 2.41.0
[PATCH 08/10] powerpc/pseries: Initialise CPU hotplug callbacks earlier
From: Michael Ellerman As part of the generic HOTPLUG_SMT code, there is support for disabling secondary SMT threads at boot time, by passing "nosmt" on the kernel command line. The way that is implemented is the secondary threads are brought partly online, and then taken back offline again. That is done to support x86 CPUs needing certain initialisation done on all threads. However powerpc has similar needs, see commit d70a54e2d085 ("powerpc/powernv: Ignore smt-enabled on Power8 and later"). For that to work the powerpc CPU hotplug callbacks need to be registered before secondary CPUs are brought online, otherwise __cpu_disable() fails due to smp_ops->cpu_disable being NULL. So split the basic initialisation into pseries_cpu_hotplug_init() which can be called early from setup_arch(). The DLPAR related initialisation can still be done later, because it needs to do allocations. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 22 arch/powerpc/platforms/pseries/pseries.h | 2 ++ arch/powerpc/platforms/pseries/setup.c | 2 ++ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 1a3cb313976a..61fb7cb00880 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -845,15 +845,9 @@ static struct notifier_block pseries_smp_nb = { .notifier_call = pseries_smp_notifier, }; -static int __init pseries_cpu_hotplug_init(void) +void __init pseries_cpu_hotplug_init(void) { int qcss_tok; - unsigned int node; - -#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE - ppc_md.cpu_probe = dlpar_cpu_probe; - ppc_md.cpu_release = dlpar_cpu_release; -#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ rtas_stop_self_token = rtas_function_token(RTAS_FN_STOP_SELF); qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE); @@ -862,12 +856,22 @@ static int __init pseries_cpu_hotplug_init(void) qcss_tok == 
RTAS_UNKNOWN_SERVICE) { printk(KERN_INFO "CPU Hotplug not supported by firmware " "- disabling.\n"); - return 0; + return; } smp_ops->cpu_offline_self = pseries_cpu_offline_self; smp_ops->cpu_disable = pseries_cpu_disable; smp_ops->cpu_die = pseries_cpu_die; +} + +static int __init pseries_dlpar_init(void) +{ + unsigned int node; + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ /* Processors can be added/removed only on LPAR */ if (firmware_has_feature(FW_FEATURE_LPAR)) { @@ -886,4 +890,4 @@ static int __init pseries_cpu_hotplug_init(void) return 0; } -machine_arch_initcall(pseries, pseries_cpu_hotplug_init); +machine_arch_initcall(pseries, pseries_dlpar_init); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index f8bce40ebd0c..f8893ba46e83 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -75,11 +75,13 @@ static inline int dlpar_hp_pmem(struct pseries_hp_errorlog *hp_elog) #ifdef CONFIG_HOTPLUG_CPU int dlpar_cpu(struct pseries_hp_errorlog *hp_elog); +void pseries_cpu_hotplug_init(void); #else static inline int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { return -EOPNOTSUPP; } +static inline void pseries_cpu_hotplug_init(void) { } #endif /* PCI root bridge prepare function override for pseries */ diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index e2a57cfa6c83..41451b76c6e5 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -816,6 +816,8 @@ static void __init pSeries_setup_arch(void) /* Discover PIC type and setup ppc_md accordingly */ smp_init_pseries(); + // Setup CPU hotplug callbacks + pseries_cpu_hotplug_init(); if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE)) if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) -- 2.41.0
[PATCH 01/10] cpu/SMT: Move SMT prototypes into cpu_smt.h
From: Michael Ellerman A subsequent patch would like to use the cpuhp_smt_control enum as part of the interface between generic and arch code. Currently that leads to circular header dependencies. So split the enum and related declarations into a separate header. Signed-off-by: Michael Ellerman --- arch/x86/include/asm/topology.h | 2 ++ include/linux/cpu.h | 25 + include/linux/cpu_smt.h | 29 + kernel/cpu.c| 1 + 4 files changed, 33 insertions(+), 24 deletions(-) create mode 100644 include/linux/cpu_smt.h diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 458c891a8273..66927a59e822 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -136,6 +136,8 @@ static inline int topology_max_smt_threads(void) return __max_smt_threads; } +#include + int topology_update_package_map(unsigned int apicid, unsigned int cpu); int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 8582a7142623..40548f3c201c 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -18,6 +18,7 @@ #include #include #include +#include struct device; struct device_node; @@ -202,30 +203,6 @@ void cpuhp_report_idle_dead(void); static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -enum cpuhp_smt_control { - CPU_SMT_ENABLED, - CPU_SMT_DISABLED, - CPU_SMT_FORCE_DISABLED, - CPU_SMT_NOT_SUPPORTED, - CPU_SMT_NOT_IMPLEMENTED, -}; - -#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) -extern enum cpuhp_smt_control cpu_smt_control; -extern void cpu_smt_disable(bool force); -extern void cpu_smt_check_topology(void); -extern bool cpu_smt_possible(void); -extern int cpuhp_smt_enable(void); -extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); -#else -# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) -static inline void cpu_smt_disable(bool force) { } -static inline void 
cpu_smt_check_topology(void) { } -static inline bool cpu_smt_possible(void) { return false; } -static inline int cpuhp_smt_enable(void) { return 0; } -static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } -#endif - extern bool cpu_mitigations_off(void); extern bool cpu_mitigations_auto_nosmt(void); diff --git a/include/linux/cpu_smt.h b/include/linux/cpu_smt.h new file mode 100644 index ..722c2e306fef --- /dev/null +++ b/include/linux/cpu_smt.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CPU_SMT_H_ +#define _LINUX_CPU_SMT_H_ + +enum cpuhp_smt_control { + CPU_SMT_ENABLED, + CPU_SMT_DISABLED, + CPU_SMT_FORCE_DISABLED, + CPU_SMT_NOT_SUPPORTED, + CPU_SMT_NOT_IMPLEMENTED, +}; + +#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) +extern enum cpuhp_smt_control cpu_smt_control; +extern void cpu_smt_disable(bool force); +extern void cpu_smt_check_topology(void); +extern bool cpu_smt_possible(void); +extern int cpuhp_smt_enable(void); +extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); +#else +# define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) +static inline void cpu_smt_disable(bool force) { } +static inline void cpu_smt_check_topology(void) { } +static inline bool cpu_smt_possible(void) { return false; } +static inline int cpuhp_smt_enable(void) { return 0; } +static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } +#endif + +#endif /* _LINUX_CPU_SMT_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index f4a2c5845bcb..237394e0574a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,7 @@ static void lockdep_release_cpus_lock(void) void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT + enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; void __init cpu_smt_disable(bool force) -- 2.41.0
[PATCH 03/10] cpu/SMT: Store the current/max number of threads
From: Michael Ellerman Some architectures (ppc64) allows partial SMT states at boot time, ie. when not all SMT threads are brought online. To support that the SMT code needs to know the maximum number of SMT threads, and also the currently configured number. The architecture code knows the max number of threads, so have the architecture code pass that value to cpu_smt_set_num_threads(). Note that although topology_max_smt_threads() exists, it is not configured early enough to be used here. As architecture, like PowerPC, allows the threads number to be set through the kernel command line, also pass that value. Signed-off-by: Michael Ellerman [ldufour: slightly reword the commit message] [ldufour: rename cpu_smt_check_topology and add a num_threads argument] Signed-off-by: Laurent Dufour --- arch/x86/kernel/cpu/bugs.c | 3 ++- include/linux/cpu_smt.h| 8 ++-- kernel/cpu.c | 21 - 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 182af64387d0..ed71ad385ea7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "cpu.h" @@ -133,7 +134,7 @@ void __init check_bugs(void) * identify_boot_cpu() initialized SMT support information, let the * core code know. 
*/ - cpu_smt_check_topology(); + cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/include/linux/cpu_smt.h b/include/linux/cpu_smt.h index 722c2e306fef..0c1664294b57 100644 --- a/include/linux/cpu_smt.h +++ b/include/linux/cpu_smt.h @@ -12,15 +12,19 @@ enum cpuhp_smt_control { #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; +extern unsigned int cpu_smt_num_threads; extern void cpu_smt_disable(bool force); -extern void cpu_smt_check_topology(void); +extern void cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads); extern bool cpu_smt_possible(void); extern int cpuhp_smt_enable(void); extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); #else # define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) +# define cpu_smt_num_threads 1 static inline void cpu_smt_disable(bool force) { } -static inline void cpu_smt_check_topology(void) { } +static inline void cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads) { } static inline bool cpu_smt_possible(void) { return false; } static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index c67049bb3fc8..edca8b7bd400 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -415,6 +415,8 @@ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; +static unsigned int cpu_smt_max_threads __ro_after_init; +unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX; void __init cpu_smt_disable(bool force) { @@ -428,16 +430,33 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } + cpu_smt_num_threads = 1; } /* * The decision whether SMT is supported can only be done after the full * CPU identification. 
Called from architecture code. */ -void __init cpu_smt_check_topology(void) +void __init cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads) { + WARN_ON(!num_threads || (num_threads > max_threads)); + if (!topology_smt_supported()) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + + cpu_smt_max_threads = max_threads; + + /* +* If SMT has been disabled via the kernel command line or SMT is +* not supported, set cpu_smt_num_threads to 1 for consistency. +* If enabled, take the architecture requested number of threads +* to bring up into account. +*/ + if (cpu_smt_control != CPU_SMT_ENABLED) + cpu_smt_num_threads = 1; + else if (num_threads < cpu_smt_num_threads) + cpu_smt_num_threads = num_threads; } static int __init smt_cmdline_disable(char *str) -- 2.41.0
[PATCH 09/10] powerpc: Add HOTPLUG_SMT support
From: Michael Ellerman Add support for HOTPLUG_SMT, which enables the generic sysfs SMT support files in /sys/devices/system/cpu/smt, as well as the "nosmt" boot parameter. Implement the recently added hooks to allow partial SMT states, allow any number of threads per core. Tie the config symbol to HOTPLUG_CPU, which enables it on the major platforms that support SMT. If there are other platforms that want the SMT support that can be tweaked in future. Signed-off-by: Michael Ellerman [ldufour: pass current SMT level to cpu_smt_set_num_threads] [ldufour: remove topology_smt_supported] Signed-off-by: Laurent Dufour --- arch/powerpc/Kconfig| 1 + arch/powerpc/include/asm/topology.h | 20 arch/powerpc/kernel/smp.c | 8 +++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 539d1f03ff42..5cf87ca10a9c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -273,6 +273,7 @@ config PPC select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_VIRT_CPU_ACCOUNTING_GEN + select HOTPLUG_SMT if HOTPLUG_CPU select HUGETLB_PAGE_SIZE_VARIABLE if PPC_BOOK3S_64 && HUGETLB_PAGE select IOMMU_HELPER if PPC64 select IRQ_DOMAIN diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 8a4d4f4d9749..7602f17d688a 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -143,5 +143,25 @@ static inline int cpu_to_coregroup_id(int cpu) #endif #endif +#ifdef CONFIG_HOTPLUG_SMT +#include +#include + +static inline bool topology_smt_threads_supported(unsigned int num_threads) +{ + return num_threads <= threads_per_core; +} + +static inline bool topology_is_primary_thread(unsigned int cpu) +{ + return cpu == cpu_first_thread_sibling(cpu); +} + +static inline bool topology_smt_thread_allowed(unsigned int cpu) +{ + return cpu_thread_in_core(cpu) < cpu_smt_num_threads; +} +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_TOPOLOGY_H */ 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 265801a3e94c..cdb77d36cdd0 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1087,7 +1087,7 @@ static int __init init_big_cores(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - unsigned int cpu; + unsigned int cpu, num_threads; DBG("smp_prepare_cpus\n"); @@ -1154,6 +1154,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (smp_ops && smp_ops->probe) smp_ops->probe(); + + // Initialise the generic SMT topology support + num_threads = 1; + if (smt_enabled_at_boot) + num_threads = smt_enabled_at_boot; + cpu_smt_set_num_threads(num_threads, threads_per_core); } void smp_prepare_boot_cpu(void) -- 2.41.0
[PATCH 07/10] cpu/SMT: Allow enabling partial SMT states via sysfs
From: Michael Ellerman Add support to the /sys/devices/system/cpu/smt/control interface for enabling a specified number of SMT threads per core, including partial SMT states where not all threads are brought online. The current interface accepts "on" and "off", to enable either 1 or all SMT threads per core. This commit allows writing an integer, between 1 and the number of SMT threads supported by the machine. Writing 1 is a synonym for "off", 2 or more enables SMT with the specified number of threads. When reading the file, if all threads are online "on" is returned, to avoid changing behaviour for existing users. If some other number of threads is online then the integer value is returned. There is a hook which allows arch code to control how many threads per core are supported. To retain the existing behaviour, the x86 hook only supports 1 thread or all threads. Signed-off-by: Michael Ellerman --- .../ABI/testing/sysfs-devices-system-cpu | 1 + kernel/cpu.c | 39 --- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index f54867cadb0f..3c4cfb59d495 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -555,6 +555,7 @@ Description:Control Symmetric Multi Threading (SMT) = "on" SMT is enabled "off"SMT is disabled +""SMT is enabled with N threads per core. "forceoff" SMT is force disabled. Cannot be changed. 
"notsupported" SMT is not supported by the CPU "notimplemented" SMT runtime toggling is not diff --git a/kernel/cpu.c b/kernel/cpu.c index ae2fa26a5b63..248f0734098a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2507,7 +2507,7 @@ static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int ctrlval, ret; + int ctrlval, ret, num_threads, orig_threads; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) return -EPERM; @@ -2515,20 +2515,38 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return -ENODEV; - if (sysfs_streq(buf, "on")) + if (sysfs_streq(buf, "on")) { ctrlval = CPU_SMT_ENABLED; - else if (sysfs_streq(buf, "off")) + num_threads = cpu_smt_max_threads; + } else if (sysfs_streq(buf, "off")) { ctrlval = CPU_SMT_DISABLED; - else if (sysfs_streq(buf, "forceoff")) + num_threads = 1; + } else if (sysfs_streq(buf, "forceoff")) { ctrlval = CPU_SMT_FORCE_DISABLED; - else + num_threads = 1; + } else if (kstrtoint(buf, 10, _threads) == 0) { + if (num_threads == 1) + ctrlval = CPU_SMT_DISABLED; + else if (num_threads > 1 && topology_smt_threads_supported(num_threads)) + ctrlval = CPU_SMT_ENABLED; + else + return -EINVAL; + } else { return -EINVAL; + } ret = lock_device_hotplug_sysfs(); if (ret) return ret; - if (ctrlval != cpu_smt_control) { + orig_threads = cpu_smt_num_threads; + cpu_smt_num_threads = num_threads; + + if (num_threads > orig_threads) { + ret = cpuhp_smt_enable(); + } else if (num_threads < orig_threads) { + ret = cpuhp_smt_disable(ctrlval); + } else if (ctrlval != cpu_smt_control) { switch (ctrlval) { case CPU_SMT_ENABLED: ret = cpuhp_smt_enable(); @@ -2566,6 +2584,15 @@ static ssize_t control_show(struct device *dev, { const char *state = smt_states[cpu_smt_control]; + /* +* If SMT is enabled but not all threads are enabled then show the +* number of threads. If all threads are enabled show "on". 
Otherwise +* show the state name. +*/ + if (cpu_smt_control == CPU_SMT_ENABLED && + cpu_smt_num_threads != cpu_smt_max_threads) + return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); } -- 2.41.0
[PATCH 02/10] cpu/SMT: Move smt/control simple exit cases earlier
From: Michael Ellerman Move the simple exit cases, ie. which don't depend on the value written, earlier in the function. That makes it clearer that regardless of the input those states can not be transitioned out of. That does have a user-visible effect, in that the error returned will now always be EPERM/ENODEV for those states, regardless of the value written. Previously writing an invalid value would return EINVAL even when in those states. Signed-off-by: Michael Ellerman --- kernel/cpu.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 237394e0574a..c67049bb3fc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2482,6 +2482,12 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, { int ctrlval, ret; + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) + return -EPERM; + + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return -ENODEV; + if (sysfs_streq(buf, "on")) ctrlval = CPU_SMT_ENABLED; else if (sysfs_streq(buf, "off")) @@ -2491,12 +2497,6 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, else return -EINVAL; - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) - return -EPERM; - - if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) - return -ENODEV; - ret = lock_device_hotplug_sysfs(); if (ret) return ret; -- 2.41.0
[PATCH 00/10] Introduce SMT level and add PowerPC support
I'm taking over the series Michael sent previously [1] which is smartly reviewing the initial series I sent [2]. This series is addressing the comments sent by Thomas and me on Michael's one. Here is a short introduction to the issue this series is addressing: When a new CPU is added, the kernel is activating all its threads. This leads to weird, but functional, results when adding a CPU on an SMT 4 system for instance. Here the newly added CPU 1 has 8 threads while the other one has 4 threads active (system has been booted with the 'smt-enabled=4' kernel option): ltcden3-lp12:~ # ppc64_cpu --info Core 0:0*1*2*3*4 5 6 7 Core 1:8*9* 10* 11* 12* 13* 14* 15* This mixed SMT level may confuse end users and/or some applications. There is no SMT level recorded in the kernel (common code), nor in user space, as far as I know. Such a level is helpful when adding a new CPU or when optimizing the energy efficiency (when reactivating CPUs). When SMP and HOTPLUG_SMT are defined, this series is adding a new SMT level (cpu_smt_num_threads) and a few callbacks allowing the architecture code to fine control this value, setting a max and an "at boot" level, and controlling whether a thread should be onlined or not.
[1] https://lore.kernel.org/linuxppc-dev/20230524155630.794584-1-...@ellerman.id.au/ [2] https://lore.kernel.org/linuxppc-dev/20230331153905.31698-1-lduf...@linux.ibm.com/ Laurent Dufour (1): cpu/SMT: Remove topology_smt_supported() Michael Ellerman (9): cpu/SMT: Move SMT prototypes into cpu_smt.h cpu/SMT: Move smt/control simple exit cases earlier cpu/SMT: Store the current/max number of threads cpu/SMT: Create topology_smt_threads_supported() cpu/SMT: Create topology_smt_thread_allowed() cpu/SMT: Allow enabling partial SMT states via sysfs powerpc/pseries: Initialise CPU hotplug callbacks earlier powerpc: Add HOTPLUG_SMT support powerpc/pseries: Honour current SMT state when DLPAR onlining CPUs .../ABI/testing/sysfs-devices-system-cpu | 1 + arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/topology.h | 20 + arch/powerpc/kernel/smp.c | 8 +- arch/powerpc/platforms/pseries/hotplug-cpu.c | 30 +-- arch/powerpc/platforms/pseries/pseries.h | 2 + arch/powerpc/platforms/pseries/setup.c| 2 + arch/x86/include/asm/topology.h | 8 +- arch/x86/kernel/cpu/bugs.c| 3 +- arch/x86/kernel/smpboot.c | 25 +- include/linux/cpu.h | 25 +- include/linux/cpu_smt.h | 33 kernel/cpu.c | 83 +++ 13 files changed, 187 insertions(+), 54 deletions(-) create mode 100644 include/linux/cpu_smt.h -- 2.41.0
[PATCH 06/10] cpu/SMT: Create topology_smt_thread_allowed()
From: Michael Ellerman A subsequent patch will enable partial SMT states, ie. when not all SMT threads are brought online. To support that, add an arch helper which checks whether a given CPU is allowed to be brought online depending on how many SMT threads are currently enabled. Call the helper from cpu_smt_enable(), and cpu_smt_allowed() when SMT is enabled, to check if the particular thread should be onlined. Notably, also call it from cpu_smt_disable() if CPU_SMT_ENABLED, to allow offlining some threads to move from a higher to lower number of threads online. Signed-off-by: Michael Ellerman --- arch/x86/include/asm/topology.h | 2 ++ arch/x86/kernel/smpboot.c | 15 +++ kernel/cpu.c| 10 +- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 232df5ffab34..4696d4566cb5 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -144,6 +144,7 @@ int topology_phys_to_logical_pkg(unsigned int pkg); int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); bool topology_smt_threads_supported(unsigned int threads); +bool topology_smt_thread_allowed(unsigned int cpu); #else #define topology_max_packages()(1) static inline int @@ -157,6 +158,7 @@ static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline bool topology_smt_threads_supported(unsigned int threads) { return false; } +static inline bool topology_smt_thread_allowed(unsigned int cpu) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d163ef55577b..cfae55c2d1b0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -290,6 +290,21 @@ bool 
topology_smt_threads_supported(unsigned int threads) return threads == 1 || threads == smp_num_siblings; } +/** + * topology_smt_thread_allowed - When enabling SMT check whether this particular + * CPU thread is allowed to be brought online. + * @cpu: CPU to check + */ +bool topology_smt_thread_allowed(unsigned int cpu) +{ + /* +* No extra logic is required here to support different thread values +* because threads will always == 1 or smp_num_siblings because of +* topology_smt_threads_supported(). +*/ + return true; +} + /** * topology_phys_to_logical_pkg - Map a physical package id to a logical * diff --git a/kernel/cpu.c b/kernel/cpu.c index e354af92b2b8..ae2fa26a5b63 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -468,7 +468,7 @@ early_param("nosmt", smt_cmdline_disable); static inline bool cpu_smt_allowed(unsigned int cpu) { - if (cpu_smt_control == CPU_SMT_ENABLED) + if (cpu_smt_control == CPU_SMT_ENABLED && topology_smt_thread_allowed(cpu)) return true; if (topology_is_primary_thread(cpu)) @@ -2283,6 +2283,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; + /* +* Disable can be called with CPU_SMT_ENABLED when changing +* from a higher to lower number of SMT threads per core. +*/ + if (ctrlval == CPU_SMT_ENABLED && topology_smt_thread_allowed(cpu)) + continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; @@ -2317,6 +2323,8 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; + if (!topology_smt_thread_allowed(cpu)) + continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; -- 2.41.0
[PATCH 10/10] powerpc/pseries: Honour current SMT state when DLPAR onlining CPUs
From: Michael Ellerman Integrate with the generic SMT support, so that when a CPU is DLPAR onlined it is brought up with the correct SMT mode. Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 61fb7cb00880..e62835a12d73 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -398,6 +398,14 @@ static int dlpar_online_cpu(struct device_node *dn) for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != thread) continue; + + if (!topology_is_primary_thread(cpu)) { + if (cpu_smt_control != CPU_SMT_ENABLED) + break; + if (!topology_smt_thread_allowed(cpu)) + break; + } + cpu_maps_update_done(); find_and_update_cpu_nid(cpu); rc = device_online(get_cpu_device(cpu)); -- 2.41.0
[PATCH 05/10] cpu/SMT: Create topology_smt_threads_supported()
From: Michael Ellerman A subsequent patch will enable partial SMT states, ie. when not all SMT threads are brought online. To support that, add an arch helper to check how many SMT threads are supported. To retain existing behaviour, the x86 implementation only allows a single thread or all threads to be online. Signed-off-by: Michael Ellerman --- arch/x86/include/asm/topology.h | 2 ++ arch/x86/kernel/smpboot.c | 12 2 files changed, 14 insertions(+) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 87358a8fe843..232df5ffab34 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,6 +143,7 @@ int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); +bool topology_smt_threads_supported(unsigned int threads); #else #define topology_max_packages()(1) static inline int @@ -155,6 +156,7 @@ static inline int topology_phys_to_logical_die(unsigned int die, static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } +static inline bool topology_smt_threads_supported(unsigned int threads) { return false; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3052c171668d..d163ef55577b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -278,6 +278,18 @@ bool topology_is_primary_thread(unsigned int cpu) return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu)); } +/** + * topology_smt_threads_supported - Check if the given number of SMT threads + * is supported. + * + * @threads: The number of SMT threads. 
+ */ +bool topology_smt_threads_supported(unsigned int threads) +{ + // Only support a single thread or all threads. + return threads == 1 || threads == smp_num_siblings; +} + /** * topology_phys_to_logical_pkg - Map a physical package id to a logical * -- 2.41.0
[RFC PATCH] powerpc/ftrace: Create a dummy stackframe to fix stack unwind
With ppc64 -mprofile-kernel and ppc32 -pg, profiling instructions to call into ftrace are emitted right at function entry. The instruction sequence used is minimal to reduce overhead. Crucially, a stackframe is not created for the function being traced. This breaks stack unwinding since the function being traced does not have a stackframe for itself. As such, it never shows up in the backtrace: /sys/kernel/debug/tracing # echo 1 > /proc/sys/kernel/stack_tracer_enabled /sys/kernel/debug/tracing # cat stack_trace DepthSize Location(17 entries) - 0) 4144 32 ftrace_call+0x4/0x44 1) 4112 432 get_page_from_freelist+0x26c/0x1ad0 2) 3680 496 __alloc_pages+0x290/0x1280 3) 3184 336 __folio_alloc+0x34/0x90 4) 2848 176 vma_alloc_folio+0xd8/0x540 5) 2672 272 __handle_mm_fault+0x700/0x1cc0 6) 2400 208 handle_mm_fault+0xf0/0x3f0 7) 2192 80 ___do_page_fault+0x3e4/0xbe0 8) 2112 160 do_page_fault+0x30/0xc0 9) 1952 256 data_access_common_virt+0x210/0x220 10) 1696 400 0xcf16b100 11) 1296 384 load_elf_binary+0x804/0x1b80 12) 912 208 bprm_execve+0x2d8/0x7e0 13) 704 64 do_execveat_common+0x1d0/0x2f0 14) 640 160 sys_execve+0x54/0x70 15) 480 64 system_call_exception+0x138/0x350 16) 416 416 system_call_common+0x160/0x2c4 Fix this by having ftrace create a dummy stackframe for the function being traced. Since this is only relevant when ftrace is active, we nop out the instruction to store LR in the LR save area in the profiling instruction sequence on ppc32 (and in ppc64 with older gcc versions). Instead, in ftrace, we store LR in the LR save area of the previous stackframe, and create a minimal stackframe to represent the function being traced. 
With this, backtraces now capture the function being traced: /sys/kernel/debug/tracing # cat stack_trace DepthSize Location(17 entries) - 0) 3888 32 _raw_spin_trylock+0x8/0x70 1) 3856 576 get_page_from_freelist+0x26c/0x1ad0 2) 3280 64 __alloc_pages+0x290/0x1280 3) 3216 336 __folio_alloc+0x34/0x90 4) 2880 176 vma_alloc_folio+0xd8/0x540 5) 2704 416 __handle_mm_fault+0x700/0x1cc0 6) 2288 96 handle_mm_fault+0xf0/0x3f0 7) 2192 48 ___do_page_fault+0x3e4/0xbe0 8) 2144 192 do_page_fault+0x30/0xc0 9) 1952 608 data_access_common_virt+0x210/0x220 10) 1344 16 0xc000334bbb50 11) 1328 416 load_elf_binary+0x804/0x1b80 12) 912 64 bprm_execve+0x2d8/0x7e0 13) 848 176 do_execveat_common+0x1d0/0x2f0 14) 672 192 sys_execve+0x54/0x70 15) 480 64 system_call_exception+0x138/0x350 16) 416 416 system_call_common+0x160/0x2c4 This results in two additional stores in the ftrace entry code, but produces reliable backtraces. Note that this change now aligns with other architectures (arm64, s390, x86). Signed-off-by: Naveen N Rao --- This applies atop the below RFC patchset: http://lore.kernel.org/cover.1686151854.git.nav...@kernel.org - Naveen arch/powerpc/kernel/trace/ftrace.c | 8 +--- arch/powerpc/kernel/trace/ftrace_entry.S | 11 --- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index a03aa4fd7a08ba..3f090f037d8ef5 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -229,13 +229,15 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) /* Expected sequence: 'mflr r0', 'stw r0,4(r1)', 'bl _mcount' */ ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); if (!ret) - ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4))); + ret = ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STW(_R0, _R1, 4)), +ppc_inst(PPC_RAW_NOP())); } else if (IS_ENABLED(CONFIG_MPROFILE_KERNEL)) { /* Expected sequence: 'mflr r0', ['std r0,16(r1)'], 'bl _mcount' 
*/ ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_MFLR(_R0))); if (ret) { - ret = ftrace_validate_inst(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16))); - ret |= ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); + ret = ftrace_validate_inst(ip - 8, ppc_inst(PPC_RAW_MFLR(_R0))); + ret |= ftrace_modify_code(ip - 4, ppc_inst(PPC_RAW_STD(_R0, _R1, 16)), + ppc_inst(PPC_RAW_NOP())); } } else { return -EINVAL;
[PATCH] powerpc/ftrace: Fix dropping weak symbols with older toolchains
The minimum level of gcc supported for building the kernel is v5.1. v5.x releases of gcc emitted a three instruction sequence for -mprofile-kernel: mflrr0 std r0, 16(r1) bl _mcount It is only with the v6.x releases that gcc started emitting the two instruction sequence for -mprofile-kernel, omitting the second store instruction. With the older three instruction sequence, the actual ftrace location can be the 5th instruction into a function. Update the allowed offset for ftrace location from 12 to 16 to accommodate the same. Cc: sta...@vger.kernel.org Fixes: 7af82ff90a2b06 ("powerpc/ftrace: Ignore weak functions") Signed-off-by: Naveen N Rao --- arch/powerpc/include/asm/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 91c049d51d0e10..2edc6269b1a357 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -12,7 +12,7 @@ /* Ignore unused weak functions which will have larger offsets */ #ifdef CONFIG_MPROFILE_KERNEL -#define FTRACE_MCOUNT_MAX_OFFSET 12 +#define FTRACE_MCOUNT_MAX_OFFSET 16 #elif defined(CONFIG_PPC32) #define FTRACE_MCOUNT_MAX_OFFSET 8 #endif base-commit: bd517a8442b6c6646a136421cd4c1b95bf4ce32b -- 2.40.1
[PATCH v2 2/2] powerpc/tpm: Reserve SML log when kexec'ing with kexec_file_load()
The TPM code in prom_init.c creates a small buffer of memory to store the TPM's SML (Stored Measurement Log). It's communicated to Linux via the linux,sml-base/size device tree properties of the TPM node. When kexec'ing that buffer can be overwritten, or when kdump'ing it may not be mapped by the second kernel. The latter can lead to a crash when booting the second kernel such as: tpm_ibmvtpm 7103: CRQ initialization completed BUG: Unable to handle kernel data access on read at 0xc0002ffb Faulting instruction address: 0xc000200a70e0 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc2-00134-g9307ce092f5d #314 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1200 0xf05 of:SLOF,git-5b4c5a pSeries NIP: c000200a70e0 LR: c000203dd5dc CTR: 0800 REGS: c00024543280 TRAP: 0300 Not tainted (6.2.0-rc2-00134-g9307ce092f5d) MSR: 82009033 CR: 24002280 XER: 0006 CFAR: c000200a70c8 DAR: c0002ffb DSISR: 4000 IRQMASK: 0 ... NIP memcpy_power7+0x400/0x7d0 LR kmemdup+0x5c/0x80 Call Trace: memcpy_power7+0x274/0x7d0 (unreliable) kmemdup+0x5c/0x80 tpm_read_log_of+0xe8/0x1b0 tpm_bios_log_setup+0x60/0x210 tpm_chip_register+0x134/0x320 tpm_ibmvtpm_probe+0x520/0x7d0 vio_bus_probe+0x9c/0x460 really_probe+0x104/0x420 __driver_probe_device+0xb0/0x170 driver_probe_device+0x58/0x180 __driver_attach+0xd8/0x250 bus_for_each_dev+0xb4/0x140 driver_attach+0x34/0x50 bus_add_driver+0x1e8/0x2d0 driver_register+0xb4/0x1c0 __vio_register_driver+0x74/0x9c ibmvtpm_module_init+0x34/0x48 do_one_initcall+0x80/0x320 kernel_init_freeable+0x304/0x3ac kernel_init+0x30/0x1a0 ret_from_kernel_thread+0x5c/0x64 To fix the crash, add the SML region to the usable memory areas for the kdump kernel, so that the second kernel will map the region. To avoid corruption of the region, add the region to the reserved memory areas, so that the second kernel does not use the memory for something else. 
Note that when loading a kdump kernel with the regular kexec_load() syscall the SML may be overwritten by the kdump kernel, depending on where the SML is in memory in relation to the crashkernel region. That is a separate problem that is not solved by this patch. Fixes: a0458284f062 ("powerpc: Add support code for kexec_file_load()") Reported-by: Stefan Berger Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kexec_ranges.h | 1 + arch/powerpc/kexec/file_load_64.c | 12 arch/powerpc/kexec/ranges.c | 20 3 files changed, 33 insertions(+) v2: Add fixes tag as suggested by Jarkko. Make change log clearer that this only fixes kexec_file_load(). diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index f83866a19e87..cf6a97f9113d 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -21,5 +21,6 @@ int add_kernel_mem_range(struct crash_mem **mem_ranges); int add_rtas_mem_range(struct crash_mem **mem_ranges); int add_opal_mem_range(struct crash_mem **mem_ranges); int add_reserved_mem_ranges(struct crash_mem **mem_ranges); +int add_sml_mem_range(struct crash_mem **mem_ranges); #endif /* _ASM_POWERPC_KEXEC_RANGES_H */ diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 110d28bede2a..90c10a89fcbc 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -79,6 +79,10 @@ static int get_exclude_memory_ranges(struct crash_mem **mem_ranges) if (ret) goto out; + ret = add_sml_mem_range(mem_ranges); + if (ret) + goto out; + ret = add_opal_mem_range(mem_ranges); if (ret) goto out; @@ -122,6 +126,10 @@ static int get_usable_memory_ranges(struct crash_mem **mem_ranges) if (ret) goto out; + ret = add_sml_mem_range(mem_ranges); + if (ret) + goto out; + ret = add_opal_mem_range(mem_ranges); if (ret) goto out; @@ -225,6 +233,10 @@ static int get_reserved_memory_ranges(struct crash_mem **mem_ranges) if (ret) goto out; + ret = 
add_sml_mem_range(mem_ranges); + if (ret) + goto out; + ret = add_tce_mem_ranges(mem_ranges); if (ret) goto out; diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c index 5fc53a5fcfdf..8b01655ceb5e 100644 --- a/arch/powerpc/kexec/ranges.c +++ b/arch/powerpc/kexec/ranges.c @@ -350,6 +350,26 @@ int add_rtas_mem_range(struct crash_mem **mem_ranges) return ret; } +int add_sml_mem_range(struct crash_mem **mem_ranges) +{ + struct
[PATCH v2 1/2] powerpc/tpm: Create linux,sml-base/size as big endian
There's code in prom_instantiate_sml() to do a "SML handover" (Stored Measurement Log) from OF to Linux, before Linux shuts down Open Firmware. This involves creating a buffer to hold the SML, and creating two device tree properties to record its base address and size. The kernel then later reads those properties from the device tree to find the SML. When the code was initially added in commit 4a727429abec ("PPC64: Add support for instantiating SML from Open Firmware") the powerpc kernel was always built big endian, so the properties were created big endian by default. However since then little endian support was added to powerpc, and now the code lacks conversions to big endian when creating the properties. This means on little endian kernels the device tree properties are little endian, which is contrary to the device tree spec, and in contrast to all other device tree properties. To cope with that a workaround was added in tpm_read_log_of() to skip the endian conversion if the properties were created via the SML handover. A better solution is to encode the properties as big endian as they should be, and remove the workaround. Typically changing the encoding of a property like this would present problems for kexec. However the SML is not propagated across kexec, so changing the encoding of the properties is a non-issue. Fixes: e46e22f12b19 ("tpm: enhance read_log_of() to support Physical TPM event log") Signed-off-by: Michael Ellerman Reviewed-by: Stefan Berger --- arch/powerpc/kernel/prom_init.c | 8 ++-- drivers/char/tpm/eventlog/of.c | 23 --- 2 files changed, 10 insertions(+), 21 deletions(-) v2: Add Stefan's reviewed-by. 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index d464ba412084..72fe306b6820 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1900,6 +1900,7 @@ static void __init prom_instantiate_sml(void) u32 entry = 0, size = 0, succ = 0; u64 base; __be32 val; + __be64 val64; prom_debug("prom_instantiate_sml: start...\n"); @@ -1956,10 +1957,13 @@ static void __init prom_instantiate_sml(void) reserve_mem(base, size); + val64 = cpu_to_be64(base); prom_setprop(ibmvtpm_node, "/vdevice/vtpm", "linux,sml-base", -, sizeof(base)); +, sizeof(val64)); + + val = cpu_to_be32(size); prom_setprop(ibmvtpm_node, "/vdevice/vtpm", "linux,sml-size", -, sizeof(size)); +, sizeof(val)); prom_debug("sml base = 0x%llx\n", base); prom_debug("sml size = 0x%x\n", size); diff --git a/drivers/char/tpm/eventlog/of.c b/drivers/char/tpm/eventlog/of.c index 930fe43d5daf..0bc0cb6333c6 100644 --- a/drivers/char/tpm/eventlog/of.c +++ b/drivers/char/tpm/eventlog/of.c @@ -51,8 +51,8 @@ static int tpm_read_log_memory_region(struct tpm_chip *chip) int tpm_read_log_of(struct tpm_chip *chip) { struct device_node *np; - const u32 *sizep; - const u64 *basep; + const __be32 *sizep; + const __be64 *basep; struct tpm_bios_log *log; u32 size; u64 base; @@ -73,23 +73,8 @@ int tpm_read_log_of(struct tpm_chip *chip) if (sizep == NULL || basep == NULL) return -EIO; - /* -* For both vtpm/tpm, firmware has log addr and log size in big -* endian format. But in case of vtpm, there is a method called -* sml-handover which is run during kernel init even before -* device tree is setup. This sml-handover function takes care -* of endianness and writes to sml-base and sml-size in little -* endian format. For this reason, vtpm doesn't need conversion -* but physical tpm needs the conversion. 
-*/ - if (of_property_match_string(np, "compatible", "IBM,vtpm") < 0 && - of_property_match_string(np, "compatible", "IBM,vtpm20") < 0) { - size = be32_to_cpup((__force __be32 *)sizep); - base = be64_to_cpup((__force __be64 *)basep); - } else { - size = *sizep; - base = *basep; - } + size = be32_to_cpup(sizep); + base = be64_to_cpup(basep); if (size == 0) { dev_warn(>dev, "%s: Event log area empty\n", __func__); -- 2.40.1
Re: [6.4-rc6] Crash during a kexec operation (tpm_amd_is_rng_defective)
[CCing the regression list, as it should be in the loop for regressions: https://docs.kernel.org/admin-guide/reporting-regressions.html] [TLDR: I'm adding this report to the list of tracked Linux kernel regressions; the text you find below is based on a few templates paragraphs you might have encountered already in similar form. See link in footer if these mails annoy you.] On 14.06.23 17:12, Sachin Sant wrote: > Following crash is observed during a kexec operation on > IBM Power10 server: > > [ 34.381548] Kernel attempted to read user page (50) - exploit attempt? (uid: > 0) > [ 34.381562] BUG: Kernel NULL pointer dereference on read at 0x0050 > [ 34.381565] Faulting instruction address: 0xc09db1e4 > [ 34.381569] Oops: Kernel access of bad area, sig: 11 [#1] > [ 34.381572] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries > [ 34.381576] Modules linked in: dm_mod(E) nft_fib_inet(E) nft_fib_ipv4(E) > nft_fib_ipv6(E) nft_fib(E) nft_reject_inet(E) nf_reject_ipv4(E) > nf_reject_ipv6(E) nft_reject(E) nft_ct(E) nft_chain_nat(E) nf_nat(E) > nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) bonding(E) tls(E) > rfkill(E) ip_set(E) sunrpc(E) nf_tables(E) nfnetlink(E) pseries_rng(E) > aes_gcm_p10_crypto(E) drm(E) drm_panel_orientation_quirks(E) xfs(E) > libcrc32c(E) sd_mod(E) sr_mod(E) t10_pi(E) crc64_rocksoft_generic(E) cdrom(E) > crc64_rocksoft(E) crc64(E) sg(E) ibmvscsi(E) scsi_transport_srp(E) ibmveth(E) > vmx_crypto(E) fuse(E) > [ 34.381613] CPU: 18 PID: 5918 Comm: kexec Kdump: loaded Tainted: G E > 6.4.0-rc6-00037-gb6dad5178cea #3 > [ 34.381618] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf06 > of:IBM,FW1030.20 (NH1030_058) hv:phyp pSeries > [ 34.381621] NIP: c09db1e4 LR: c09db928 CTR: c09eab60 > [ 34.381625] REGS: c0009742f780 TRAP: 0300 Tainted: G E > (6.4.0-rc6-00037-gb6dad5178cea) > [ 34.381628] MSR: 8280b033 CR: > 4444 XER: 0001 > [ 34.381638] CFAR: c09db19c DAR: 0050 DSISR: 4000 > IRQMASK: 0 > [ 34.381638] GPR00: c09db928 c0009742fa20 c14a1500 > 
c81d > [ 34.381638] GPR04: cd842c50 cd842c50 0025 > fffe > [ 34.381638] GPR08: 0009 > c00800785280 > [ 34.381638] GPR12: c09eab60 c0135fab7f00 > > [ 34.381638] GPR16: > > [ 34.381638] GPR20: > > [ 34.381638] GPR24: > c2e21e08 > [ 34.381638] GPR28: cd842c48 c2a02208 c321c0c0 > c81d > [ 34.381674] NIP [c09db1e4] tpm_amd_is_rng_defective+0x74/0x240 > [ 34.381681] LR [c09db928] tpm_chip_unregister+0x138/0x160 > [ 34.381685] Call Trace: > [ 34.381686] [c0009742faa0] [c09db928] > tpm_chip_unregister+0x138/0x160 > [ 34.381690] [c0009742fae0] [c09eab94] > tpm_ibmvtpm_remove+0x34/0x130 > [ 34.381695] [c0009742fb50] [c0115738] vio_bus_remove+0x58/0xd0 > [ 34.381701] [c0009742fb90] [c0a01ecc] device_shutdown+0x21c/0x39c > [ 34.381705] [c0009742fc20] [c01a2684] > kernel_restart_prepare+0x54/0x70 > [ 34.381710] [c0009742fc40] [c0292c48] kernel_kexec+0xa8/0x100 > [ 34.381714] [c0009742fcb0] [c01a2cd4] __do_sys_reboot+0x214/0x2c0 > [ 34.381718] [c0009742fe10] [c0034adc] > system_call_exception+0x13c/0x340 > [ 34.381723] [c0009742fe50] [c000d05c] > system_call_vectored_common+0x15c/0x2ec > [ 34.381729] --- interrupt: 3000 at 0x7fff9c5459f0 > [ 34.381732] NIP: 7fff9c5459f0 LR: CTR: > [ 34.381735] REGS: c0009742fe80 TRAP: 3000 Tainted: G E > (6.4.0-rc6-00037-gb6dad5178cea) > [ 34.381738] MSR: 8280f033 CR: > 42422884 XER: > [ 34.381747] IRQMASK: 0 > [ 34.381747] GPR00: 0058 7ad83d70 00012fc47f00 > fee1dead > [ 34.381747] GPR04: 28121969 45584543 > 0003 > [ 34.381747] GPR08: 0010 > > [ 34.381747] GPR12: 7fff9c7bb2c0 00012fc3f598 > > [ 34.381747] GPR16: 00012fc1fcc0 > > [ 34.381747] GPR20: 8913 8914 00014b891020 > 0003 > [ 34.381747] GPR24: 0001 0003 > 7ad83ef0 > [ 34.381747] GPR28: 00012fc19f10 7fff9c6419c0 00014b891080 > 00014b891040 > [ 34.381781] NIP [7fff9c5459f0] 0x7fff9c5459f0 > [ 34.381784] LR [] 0x0 > [ 34.381786] --- interrupt: 3000 > [ 34.381788] Code: 5463063e 408201c8 38210080 4e800020
Re: [RFC PATCH v2 1/1] powerpc: update ppc_save_regs to save current r1 in pt_regs
On Thu Jun 15, 2023 at 7:10 PM AEST, Aditya Gupta wrote: > ppc_save_regs() skips one stack frame while saving the CPU register states. > Instead of saving current R1, it pulls the previous stack frame pointer. > > When vmcores caused by direct panic call (such as `echo c > > /proc/sysrq-trigger`), are debugged with gdb, gdb fails to show the > backtrace correctly. On further analysis, it was found that it was because > of mismatch between r1 and NIP. > > GDB uses NIP to get current function symbol and uses corresponding debug > info of that function to unwind previous frames, but due to the > mismatching r1 and NIP, the unwinding does not work, and it fails to > unwind to the 2nd frame and hence does not show the backtrace. > > GDB backtrace with vmcore of kernel without this patch: > > - > (gdb) bt > #0 0xc02a53e8 in crash_setup_regs (oldregs=, > newregs=0xc4f8f8d8) at ./arch/powerpc/include/asm/kexec.h:69 > #1 __crash_kexec (regs=) at kernel/kexec_core.c:974 > #2 0x0063 in ?? () > #3 0xc3579320 in ?? () > - > > Further analysis revealed that the mismatch occurred because > "ppc_save_regs" was saving the previous stack's SP instead of the current > r1. This patch fixes this by storing current r1 in the saved pt_regs. 
> > GDB backtrace with vmcore of patched kernel: > > > (gdb) bt > #0 0xc02a53e8 in crash_setup_regs (oldregs=0x0, > newregs=0xc670b8d8) > at ./arch/powerpc/include/asm/kexec.h:69 > #1 __crash_kexec (regs=regs@entry=0x0) at kernel/kexec_core.c:974 > #2 0xc0168918 in panic (fmt=fmt@entry=0xc1654a60 "sysrq > triggered crash\n") > at kernel/panic.c:358 > #3 0xc0b735f8 in sysrq_handle_crash (key=) at > drivers/tty/sysrq.c:155 > #4 0xc0b742cc in __handle_sysrq (key=key@entry=99, > check_mask=check_mask@entry=false) > at drivers/tty/sysrq.c:602 > #5 0xc0b7506c in write_sysrq_trigger (file=, > buf=, > count=2, ppos=) at drivers/tty/sysrq.c:1163 > #6 0xc069a7bc in pde_write (ppos=, count= out>, > buf=, file=, pde=0xc362cb40) at > fs/proc/inode.c:340 > #7 proc_reg_write (file=, buf=, > count=, > ppos=) at fs/proc/inode.c:352 > #8 0xc05b3bbc in vfs_write (file=file@entry=0xc6aa6b00, > buf=buf@entry=0x61f498b4f60 0x61f498b4f60>, > count=count@entry=2, pos=pos@entry=0xc670bda0) at > fs/read_write.c:582 > #9 0xc05b4264 in ksys_write (fd=, > buf=0x61f498b4f60 , > count=2) > at fs/read_write.c:637 > #10 0xc002ea2c in system_call_exception (regs=0xc670be80, > r0=) > at arch/powerpc/kernel/syscall.c:171 > #11 0xc000c270 in system_call_vectored_common () > at arch/powerpc/kernel/interrupt_64.S:192 > > > Signed-off-by: Aditya Gupta So this now saves regs as though it was an interrupt taken in the caller, at the instruction after the call to ppc_save_regs, whereas previously the NIP was there, but R1 came from the caller's caller and that mismatch is what causes gdb's dwarf unwinder to go haywire. Nice catch, and I think I follow the fix and I think I agree with it. Before the bug was introduced, NIP also came from the grandparent. Which is what xmon presumably wanted, but since then I think maybe it makes more sense to just have the parent caller. Reviewed-by: Nicholas Piggin Fixes: d16a58f8854b1 ("powerpc: Improve ppc_save_regs()") Thanks, Nick
[RFC PATCH v2 1/1] powerpc: update ppc_save_regs to save current r1 in pt_regs
ppc_save_regs() skips one stack frame while saving the CPU register states. Instead of saving current R1, it pulls the previous stack frame pointer. When vmcores caused by direct panic call (such as `echo c > /proc/sysrq-trigger`), are debugged with gdb, gdb fails to show the backtrace correctly. On further analysis, it was found that it was because of mismatch between r1 and NIP. GDB uses NIP to get current function symbol and uses corresponding debug info of that function to unwind previous frames, but due to the mismatching r1 and NIP, the unwinding does not work, and it fails to unwind to the 2nd frame and hence does not show the backtrace. GDB backtrace with vmcore of kernel without this patch: - (gdb) bt #0 0xc02a53e8 in crash_setup_regs (oldregs=, newregs=0xc4f8f8d8) at ./arch/powerpc/include/asm/kexec.h:69 #1 __crash_kexec (regs=) at kernel/kexec_core.c:974 #2 0x0063 in ?? () #3 0xc3579320 in ?? () - Further analysis revealed that the mismatch occurred because "ppc_save_regs" was saving the previous stack's SP instead of the current r1. This patch fixes this by storing current r1 in the saved pt_regs. 
GDB backtrace with vmcore of patched kernel: (gdb) bt #0 0xc02a53e8 in crash_setup_regs (oldregs=0x0, newregs=0xc670b8d8) at ./arch/powerpc/include/asm/kexec.h:69 #1 __crash_kexec (regs=regs@entry=0x0) at kernel/kexec_core.c:974 #2 0xc0168918 in panic (fmt=fmt@entry=0xc1654a60 "sysrq triggered crash\n") at kernel/panic.c:358 #3 0xc0b735f8 in sysrq_handle_crash (key=) at drivers/tty/sysrq.c:155 #4 0xc0b742cc in __handle_sysrq (key=key@entry=99, check_mask=check_mask@entry=false) at drivers/tty/sysrq.c:602 #5 0xc0b7506c in write_sysrq_trigger (file=, buf=, count=2, ppos=) at drivers/tty/sysrq.c:1163 #6 0xc069a7bc in pde_write (ppos=, count=, buf=, file=, pde=0xc362cb40) at fs/proc/inode.c:340 #7 proc_reg_write (file=, buf=, count=, ppos=) at fs/proc/inode.c:352 #8 0xc05b3bbc in vfs_write (file=file@entry=0xc6aa6b00, buf=buf@entry=0x61f498b4f60 , count=count@entry=2, pos=pos@entry=0xc670bda0) at fs/read_write.c:582 #9 0xc05b4264 in ksys_write (fd=, buf=0x61f498b4f60 , count=2) at fs/read_write.c:637 #10 0xc002ea2c in system_call_exception (regs=0xc670be80, r0=) at arch/powerpc/kernel/syscall.c:171 #11 0xc000c270 in system_call_vectored_common () at arch/powerpc/kernel/interrupt_64.S:192 Signed-off-by: Aditya Gupta --- More information: This problem with gdb backtrace was discovered while working on a crash tool enhancement to improve crash analysis using gdb passthrough to be able print function arguments and local variables inside crash tool. gdb passthrough simply asks gdb to handle the backtrace printing, where it was noticed that it could not print correct backtrace in some vmcores. The changes introduced here has an implication on xmon, that it might show one extra `xmon` frame in backtrace. By looking at older commits it seems that originally the ppc_save_regs function was introduced as xmon_save_regs(). But now the same function has been renamed to ppc_save_regs() and been used in few other places as well. Tested this patch with multiple ways of crashing: 1. 
direct panic call (`echo c > /proc/sysrq-trigger`) 2. null dereference/oops path (the earlier implementation of `sysrq_handle_crash`) 3. sys reset 4. sys reset inside qemu (to test for any regressions, that were fixed by commit d16a58f8854b19) Changelog - V2: - fixed bogus LR by storing caller's LR area as pointed out by Naveen and Nick --- arch/powerpc/kernel/ppc_save_regs.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index 49813f982468..a9b9c32d0c1f 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -31,10 +31,10 @@ _GLOBAL(ppc_save_regs) lbz r0,PACAIRQSOFTMASK(r13) PPC_STL r0,SOFTE(r3) #endif - /* go up one stack frame for SP */ - PPC_LL r4,0(r1) - PPC_STL r4,GPR1(r3) + /* store current SP */ + PPC_STL r1,GPR1(r3) /* get caller's LR */ + PPC_LL r4,0(r1) PPC_LL r0,LRSAVE(r4) PPC_STL r0,_LINK(r3) mflr r0 -- 2.40.1
Re: [PATCH v4 1/1] PCI: layerscape: Add the endpoint linkup notifier support
On Mon, 15 May 2023 11:10:49 -0400, Frank Li wrote: > Layerscape has PME interrupt, which can be used as linkup notifier. > Set CFG_READY bit of PEX_PF0_CONFIG to enable accesses from root complex > when linkup detected. > > Applied to controller/endpoint, thanks! [1/1] PCI: layerscape: Add the endpoint linkup notifier support https://git.kernel.org/pci/pci/c/71d9ca28015c Thanks, Lorenzo
Re: [PATCH v4 1/1] PCI: layerscape: Add the endpoint linkup notifier support
On Mon, May 15, 2023 at 11:10:49AM -0400, Frank Li wrote: > Layerscape has PME interrupt, which can be used as linkup notifier. > Set CFG_READY bit of PEX_PF0_CONFIG to enable accesses from root complex > when linkup detected. > > Acked-by: Manivannan Sadhasivam > Signed-off-by: Xiaowei Bao > Signed-off-by: Frank Li > --- > Change from v3 to v4 > - swap irq and big_endian > Change from v2 to v3 > - align 80 column > - clear irq firstly > - dev_info to dev_dbg > - remove double space > - update commit message > > Change from v1 to v2 > - pme -> PME > - irq -> IRQ > - update dev_info message according to Bjorn's suggestion > > .../pci/controller/dwc/pci-layerscape-ep.c| 102 +- > 1 file changed, 101 insertions(+), 1 deletion(-) > > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c > b/drivers/pci/controller/dwc/pci-layerscape-ep.c > index c640db60edc6..5398641b6b7e 100644 > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c > @@ -18,6 +18,20 @@ > > #include "pcie-designware.h" > > +#define PEX_PF0_CONFIG 0xC0014 > +#define PEX_PF0_CFG_READYBIT(0) > + > +/* PEX PFa PCIE PME and message interrupt registers*/ > +#define PEX_PF0_PME_MES_DR 0xC0020 > +#define PEX_PF0_PME_MES_DR_LUD BIT(7) > +#define PEX_PF0_PME_MES_DR_LDD BIT(9) > +#define PEX_PF0_PME_MES_DR_HRD BIT(10) > + > +#define PEX_PF0_PME_MES_IER 0xC0028 > +#define PEX_PF0_PME_MES_IER_LUDIEBIT(7) > +#define PEX_PF0_PME_MES_IER_LDDIEBIT(9) > +#define PEX_PF0_PME_MES_IER_HRDIEBIT(10) > + > #define to_ls_pcie_ep(x) dev_get_drvdata((x)->dev) > > struct ls_pcie_ep_drvdata { > @@ -30,8 +44,86 @@ struct ls_pcie_ep { > struct dw_pcie *pci; > struct pci_epc_features *ls_epc; > const struct ls_pcie_ep_drvdata *drvdata; > + int irq; > + boolbig_endian; > }; > > +static u32 ls_lut_readl(struct ls_pcie_ep *pcie, u32 offset) > +{ > + struct dw_pcie *pci = pcie->pci; > + > + if (pcie->big_endian) > + return ioread32be(pci->dbi_base + offset); > + else > + return 
ioread32(pci->dbi_base + offset); > +} > + > +static void ls_lut_writel(struct ls_pcie_ep *pcie, u32 offset, u32 value) > +{ > + struct dw_pcie *pci = pcie->pci; > + > + if (pcie->big_endian) > + iowrite32be(value, pci->dbi_base + offset); > + else > + iowrite32(value, pci->dbi_base + offset); > +} > + > +static irqreturn_t ls_pcie_ep_event_handler(int irq, void *dev_id) > +{ > + struct ls_pcie_ep *pcie = dev_id; > + struct dw_pcie *pci = pcie->pci; > + u32 val, cfg; > + > + val = ls_lut_readl(pcie, PEX_PF0_PME_MES_DR); > + ls_lut_writel(pcie, PEX_PF0_PME_MES_DR, val); > + > + if (!val) > + return IRQ_NONE; > + > + if (val & PEX_PF0_PME_MES_DR_LUD) { > + cfg = ls_lut_readl(pcie, PEX_PF0_CONFIG); > + cfg |= PEX_PF0_CFG_READY; > + ls_lut_writel(pcie, PEX_PF0_CONFIG, cfg); > + dw_pcie_ep_linkup(>ep); > + > + dev_dbg(pci->dev, "Link up\n"); > + } else if (val & PEX_PF0_PME_MES_DR_LDD) { > + dev_dbg(pci->dev, "Link down\n"); > + } else if (val & PEX_PF0_PME_MES_DR_HRD) { > + dev_dbg(pci->dev, "Hot reset\n"); > + } > + > + return IRQ_HANDLED; > +} > + > +static int ls_pcie_ep_interrupt_init(struct ls_pcie_ep *pcie, > + struct platform_device *pdev) > +{ > + u32 val; > + int ret; > + > + pcie->irq = platform_get_irq_byname(pdev, "pme"); > + if (pcie->irq < 0) { > + dev_err(>dev, "Can't get 'pme' IRQ\n"); I will remove this log, platform_get_irq_byname() already logs an error. 
Lorenzo > + return pcie->irq; > + } > + > + ret = devm_request_irq(>dev, pcie->irq, ls_pcie_ep_event_handler, > +IRQF_SHARED, pdev->name, pcie); > + if (ret) { > + dev_err(>dev, "Can't register PCIe IRQ\n"); > + return ret; > + } > + > + /* Enable interrupts */ > + val = ls_lut_readl(pcie, PEX_PF0_PME_MES_IER); > + val |= PEX_PF0_PME_MES_IER_LDDIE | PEX_PF0_PME_MES_IER_HRDIE | > + PEX_PF0_PME_MES_IER_LUDIE; > + ls_lut_writel(pcie, PEX_PF0_PME_MES_IER, val); > + > + return 0; > +} > + > static const struct pci_epc_features* > ls_pcie_ep_get_features(struct dw_pcie_ep *ep) > { > @@ -125,6 +217,7 @@ static int __init ls_pcie_ep_probe(struct platform_device > *pdev) > struct ls_pcie_ep *pcie; > struct pci_epc_features *ls_epc; > struct resource *dbi_base; > + int ret; > > pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL); > if (!pcie) > @@ -144,6 +237,7 @@ static int __init ls_pcie_ep_probe(struct
Re: [kvm-unit-tests v4 00/12] powerpc: updates, P10, PNV support
On Thu, 15 Jun 2023 at 03:02, Nicholas Piggin wrote: > > On Wed Jun 14, 2023 at 11:09 AM AEST, Joel Stanley wrote: > > On Thu, 8 Jun 2023 at 07:58, Nicholas Piggin wrote: > > > > > > Posting again, a couple of patches were merged and accounted for review > > > comments from last time. > > > > I saw some failures in the spr tests running on a power9 powernv system: > > > > $ TESTNAME=sprs TIMEOUT=90s ACCEL= ./powerpc/run powerpc/sprs.elf -smp > > 1 |grep FAIL > > FAIL: WORT ( 895):0xc0deba80 <==> 0x > > This is just TCG machine? I'm not sure why WORT fails, AFAIKS it's the > same on POWER8 and doesn't do anything just a simple register. I think > on real hardware WORT may not have any bits implemented on POWER9 > though. Yeah, just the TCG machine. Now that you point it out all of the failures I reported are for ACCEL=, so they are running in tcg mode. run_migration timeout -k 1s --foreground 90s /usr/bin/qemu-system-ppc64 -nodefaults -machine pseries,accel=tcg -bios powerpc/boot_rom.bin -display none -serial stdio -kernel powerpc/sprs.elf -smp 1 -append -w # -initrd /tmp/tmp.61XhbvCGcb > > > $ MIGRATION=yes TESTNAME=sprs-migration TIMEOUT=90s ACCEL= > > ./powerpc/run powerpc/sprs.elf -smp 1 -append '-w' | grep FAIL > > FAIL: SRR0 ( 26):0xcafefacec0debabc <==> 0x00402244 > > FAIL: SRR1 ( 27):0xc006409ebab6 <==> 0x80001001 > > FAIL: CTRL ( 136):0x <==> 0x8001 > > FAIL: WORT ( 895):0xc0deba80 <==> 0x > > FAIL: PIR (1023):0x0010 <==> 0x0049 > > > > Linux 6.2.0-20-generic > > QEMU emulator version 7.2.0 (Debian 1:7.2+dfsg-5ubuntu2) > > > > On a power8 powernv: > > > > MIGRATION=yes TESTNAME=sprs-migration TIMEOUT=90s ACCEL= ./powerpc/run > > powerpc/sprs.elf -smp 1 -append '-w' |grep FAIL > > FAIL: SRR0 ( 26):0xcafefacec0debabc <==> 0x00402234 > > FAIL: SRR1 ( 27):0xc006409ebab6 <==> 0x80001000 > > FAIL: CTRL ( 136):0x <==> 0x8001 > > FAIL: PIR (1023):0x0060 <==> 0x0030 > > Hmm, seems we take some interrupt over migration test that is not > accounted for (could check 
the address in SRR0 to see where it is). > Either need to prevent that interrupt or avoid failing on SRR0/1 on > this test. > > Interesting about CTRL, I wonder if that not migrating correctly. > PIR looks like a migration issue as well, it can't be changed so > destination CPU has got a different PIR. I would be inclined to > leave those as failing to remind us to look into them. > > I'll take a look at the others though. > > Thanks, > Nick
Re: [PATCH v4 04/34] pgtable: Create struct ptdesc
On Mon, 12 Jun 2023, Vishal Moola (Oracle) wrote: > Currently, page table information is stored within struct page. As part > of simplifying struct page, create struct ptdesc for page table > information. > > Signed-off-by: Vishal Moola (Oracle) Vishal, as I think you have already guessed, your ptdesc series and my pte_free_defer() "mm: free retracted page table by RCU" series are on a collision course. Probably just trivial collisions in most architectures, which either of us can easily adjust to the other; powerpc likely to be more awkward, but fairly easily resolved; s390 quite a problem. I've so far been unable to post a v2 of my series (and powerpc and s390 were stupidly wrong in the v1), because a good s390 patch is not yet decided - Gerald Schaefer and I are currently working on that, on the s390 list (I took off most Ccs until we are settled and I can post v2). As you have no doubt found yourself, s390 has sophisticated handling of free half-pages already, and I need to add rcu_head usage in there too: it's tricky to squeeze it all in, and ptdesc does not appear to help us in any way (though mostly it's just changing some field names, okay). If ptdesc were actually allowing a flexible structure which architectures could add into, that would (in some future) be nice; but of course at present it's still fitting it all into one struct page, and mandating new restrictions which just make an architecture's job harder. Some notes on problematic fields below FYI. > --- > include/linux/pgtable.h | 51 + > 1 file changed, 51 insertions(+) > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > index c5a51481bbb9..330de96ebfd6 100644 > --- a/include/linux/pgtable.h > +++ b/include/linux/pgtable.h > @@ -975,6 +975,57 @@ static inline void ptep_modify_prot_commit(struct > vm_area_struct *vma, > #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ > #endif /* CONFIG_MMU */ > > + > +/** > + * struct ptdesc - Memory descriptor for page tables. 
> + * @__page_flags: Same as page flags. Unused for page tables. > + * @pt_list: List of used page tables. Used for s390 and x86. > + * @_pt_pad_1: Padding that aliases with page's compound head. > + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. > + * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only. > + * @pt_mm: Used for x86 pgds. > + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 > only. > + * @ptl: Lock for the page table. > + * > + * This struct overlays struct page for now. Do not modify without a good > + * understanding of the issues. > + */ > +struct ptdesc { > + unsigned long __page_flags; > + > + union { > + struct list_head pt_list; I shall be needing struct rcu_head rcu_head (or pt_rcu_head or whatever, if you prefer) in this union too. Sharing the lru or pt_list with rcu_head is what's difficult to get right and efficient on s390 - and if ptdesc gave us an independent rcu_head for each page table, that would be a blessing! but sadly not, it still has to squeeze into a struct page. > + struct { > + unsigned long _pt_pad_1; > + pgtable_t pmd_huge_pte; > + }; > + }; > + unsigned long _pt_s390_gaddr; > + > + union { > + struct mm_struct *pt_mm; > + atomic_t pt_frag_refcount; Whether s390 will want pt_mm is not yet decided: I want to use it, Gerald prefers to go without it; but if we do end up using it, then pt_frag_refcount is a luxury we would have to give up. s390 does very well already with its _refcount tricks, and I'd expect powerpc's simpler but more wasteful implementation to work as well with _refcount too - I know that a few years back, powerpc did misuse _refcount (it did not allow for speculative accesses, thought it had sole ownership of that field); but s390 copes well with that, and I expect powerpc can do so too, without the luxury of pt_frag_refcount. But I've no desire to undo powerpc's use of pt_frag_refcount: just warning that we may want to undo any use of it in s390. 
I thought I had more issues to mention, probably Gerald will remind me of a whole new unexplored dimension! gmap perhaps. Hugh > + }; > + > +#if ALLOC_SPLIT_PTLOCKS > + spinlock_t *ptl; > +#else > + spinlock_t ptl; > +#endif > +}; > + > +#define TABLE_MATCH(pg, pt) \ > + static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) > +TABLE_MATCH(flags, __page_flags); > +TABLE_MATCH(compound_head, pt_list); > +TABLE_MATCH(compound_head, _pt_pad_1); > +TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); > +TABLE_MATCH(mapping, _pt_s390_gaddr); > +TABLE_MATCH(pt_mm, pt_mm); > +TABLE_MATCH(ptl, ptl); > +#undef TABLE_MATCH > +static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); > + > /* > * No-op macros that just return the current protection value. Defined here
[PATCH 2/2] tools/perf/tests: perf all metricgroups test fails when perf_event access is restricted
Perf all metricgroups test fails as below when perf_event access is restricted. ./perf test -v "perf all metricgroups test" Testing Memory_BW Error: Access to performance monitoring and observability operations is limited. Enforced MAC policy settings (SELinux) can limit access to performance — access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. — test child finished with -1 end perf all metricgroups test: FAILED! Fix the testcase to skip those metric events which need perf_event access explicitly. The exit code of the testcase is based on return code of the perf stat command ( enabled by set -e option ). Hence save the exit status in a variable and use that to decide success or fail for the testcase. Signed-off-by: Athira Rajeev --- tools/perf/tests/shell/stat_all_metricgroups.sh | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/shell/stat_all_metricgroups.sh b/tools/perf/tests/shell/stat_all_metricgroups.sh index cb35e488809a..fa86b562676a 100755 --- a/tools/perf/tests/shell/stat_all_metricgroups.sh +++ b/tools/perf/tests/shell/stat_all_metricgroups.sh @@ -2,11 +2,19 @@ # perf all metricgroups test # SPDX-License-Identifier: GPL-2.0 -set -e - for m in $(perf list --raw-dump metricgroups); do echo "Testing $m" - perf stat -M "$m" -a true + result=$(perf stat -M "$m" -a true 2>&1) + rc=$? + # Skip if there is no access to perf_events monitoring + # Otherwise exit based on the return code of perf command. + if [[ "$result" =~ "Access to performance monitoring and observability operations is limited" ]] + then + continue + else + [ $rc -ne 0 ] && exit $rc + fi + done exit 0 -- 2.31.1
[PATCH 1/2] tools/perf/tests: perf all metrics test fails when perf_event access is restricted
Perf all metrics test fails as below when perf_event access is restricted. ./perf test -v "perf all metrics test" Metric 'Memory_RD_BW_Chip' not printed in: Error: Access to performance monitoring and observability operations is limited. Enforced MAC policy settings (SELinux) can limit access to performance — access to performance monitoring and observability operations for processes without CAP_PERFMON, CAP_SYS_PTRACE or CAP_SYS_ADMIN Linux capability. — test child finished with -1 end perf all metrics test: FAILED! The perf all metrics test picks the input events from "perf list --raw-dump metrics" and runs "perf stat -M "$m"" for each of the metrics in the list. It fails here for some of the metrics which needs access, since it collects system wide resource details/statistics. Fix the testcase to skip those metric events. Signed-off-by: Athira Rajeev --- tools/perf/tests/shell/stat_all_metrics.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/stat_all_metrics.sh b/tools/perf/tests/shell/stat_all_metrics.sh index 54774525e18a..14b96484a359 100755 --- a/tools/perf/tests/shell/stat_all_metrics.sh +++ b/tools/perf/tests/shell/stat_all_metrics.sh @@ -6,7 +6,9 @@ err=0 for m in $(perf list --raw-dump metrics); do echo "Testing $m" result=$(perf stat -M "$m" true 2>&1) - if [[ "$result" =~ ${m:0:50} ]] || [[ "$result" =~ "" ]] + # Skip if there is no access to perf_events monitoring + # and observability operations + if [[ "$result" =~ ${m:0:50} ]] || [[ "$result" =~ "" ]] || [[ "$result" =~ "Access to performance monitoring and observability operations is limited" ]] then continue fi -- 2.31.1