[PATCH] powerpc: Drop zalloc_maybe_bootmem()

2023-08-22 Thread Michael Ellerman
The only callers of zalloc_maybe_bootmem() are PCI setup routines. These
used to be called early during boot before slab setup, and also during
runtime due to hotplug.

But commit 5537fcb319d0 ("powerpc/pci: Add ppc_md.discover_phbs()")
moved the boot-time calls later, after slab setup, meaning there's no
longer any need for zalloc_maybe_bootmem(), kzalloc() can be used in all
cases.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/setup.h |  1 -
 arch/powerpc/kernel/pci-common.c |  2 +-
 arch/powerpc/lib/Makefile        |  2 +-
 arch/powerpc/lib/alloc.c         | 23 ---
 arch/powerpc/sysdev/fsl_pci.c    |  2 +-
 5 files changed, 3 insertions(+), 27 deletions(-)
 delete mode 100644 arch/powerpc/lib/alloc.c

diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index e29e83f8a89c..eed74c1fb832 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -8,7 +8,6 @@
 extern void ppc_printk_progress(char *s, unsigned short hex);
 
 extern unsigned long long memory_limit;
-extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
 
 struct device_node;
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index e88d7c9feeec..040255ddb569 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -125,7 +125,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
 {
struct pci_controller *phb;
 
-   phb = zalloc_maybe_bootmem(sizeof(struct pci_controller), GFP_KERNEL);
+   phb = kzalloc(sizeof(struct pci_controller), GFP_KERNEL);
if (phb == NULL)
return NULL;
 
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 9aa8286c9687..51ad0397c17a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -27,7 +27,7 @@ endif
 CFLAGS_code-patching.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 CFLAGS_feature-fixups.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 
-obj-y += alloc.o code-patching.o feature-fixups.o pmem.o
+obj-y += code-patching.o feature-fixups.o pmem.o
 
 obj-$(CONFIG_CODE_PATCHING_SELFTEST) += test-code-patching.o
 
diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c
deleted file mode 100644
index ce180870bd52..000000000000
--- a/arch/powerpc/lib/alloc.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-
-void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask)
-{
-   void *p;
-
-   if (slab_is_available())
-   p = kzalloc(size, mask);
-   else {
-   p = memblock_alloc(size, SMP_CACHE_BYTES);
-   if (!p)
-   panic("%s: Failed to allocate %zu bytes\n", __func__,
- size);
-   }
-   return p;
-}
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index 5f7219df35ef..3868483fbe29 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -767,7 +767,7 @@ static int __init mpc83xx_pcie_setup(struct pci_controller *hose,
u32 cfg_bar;
int ret = -ENOMEM;
 
-   pcie = zalloc_maybe_bootmem(sizeof(*pcie), GFP_KERNEL);
+   pcie = kzalloc(sizeof(*pcie), GFP_KERNEL);
if (!pcie)
return ret;
 
-- 
2.41.0



[PATCH 1/4] powerpc/pseries: Move VPHN constants into vphn.h

2023-08-22 Thread Michael Ellerman
These don't have any particularly good reason to belong in lppaca.h,
move them into their own header.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/lppaca.h | 22 ---
 arch/powerpc/include/asm/vphn.h   | 22 +++
 arch/powerpc/mm/numa.c                |  1 +
 arch/powerpc/platforms/pseries/lpar.c |  1 +
 arch/powerpc/platforms/pseries/vas.c  |  1 +
 arch/powerpc/platforms/pseries/vphn.c |  2 +-
 .../selftests/powerpc/vphn/asm/lppaca.h   |  1 -
 .../testing/selftests/powerpc/vphn/asm/vphn.h |  1 +
 8 files changed, 27 insertions(+), 24 deletions(-)
 create mode 100644 arch/powerpc/include/asm/vphn.h
 delete mode 120000 tools/testing/selftests/powerpc/vphn/asm/lppaca.h
 create mode 120000 tools/testing/selftests/powerpc/vphn/asm/vphn.h

diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index 34d44cb17c87..12159e5b6888 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -6,28 +6,6 @@
 #ifndef _ASM_POWERPC_LPPACA_H
 #define _ASM_POWERPC_LPPACA_H
 
-/*
- * The below VPHN macros are outside the __KERNEL__ check since these are
- * used for compiling the vphn selftest in userspace
- */
-
-/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */
-#define VPHN_REGISTER_COUNT 6
-
-/*
- * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
- * form the complete property we have to add the length in the first cell.
- */
-#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
-
-/*
- * The H_HOME_NODE_ASSOCIATIVITY hcall takes two values for flags:
- * 1 for retrieving associativity information for a guest cpu
- * 2 for retrieving associativity information for a host/hypervisor cpu
- */
-#define VPHN_FLAG_VCPU 1
-#define VPHN_FLAG_PCPU 2
-
 #ifdef __KERNEL__
 
 /*
diff --git a/arch/powerpc/include/asm/vphn.h b/arch/powerpc/include/asm/vphn.h
new file mode 100644
index 000000000000..e0970603fce2
--- /dev/null
+++ b/arch/powerpc/include/asm/vphn.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ASM_POWERPC_VPHN_H
+#define _ASM_POWERPC_VPHN_H
+
+/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. */
+#define VPHN_REGISTER_COUNT 6
+
+/*
+ * 6 64-bit registers unpacked into up to 24 be32 associativity values. To
+ * form the complete property we have to add the length in the first cell.
+ */
+#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1)
+
+/*
+ * The H_HOME_NODE_ASSOCIATIVITY hcall takes two values for flags:
+ * 1 for retrieving associativity information for a guest cpu
+ * 2 for retrieving associativity information for a host/hypervisor cpu
+ */
+#define VPHN_FLAG_VCPU 1
+#define VPHN_FLAG_PCPU 2
+
+#endif // _ASM_POWERPC_VPHN_H
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9f73d089eac1..f6c4ace3b221 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 static int numa_enabled = 1;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 2eab323f6970..27fb656bd6ba 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "pseries.h"
 
diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c
index 3fbc2a6aa319..e25ac52acf50 100644
--- a/arch/powerpc/platforms/pseries/vas.c
+++ b/arch/powerpc/platforms/pseries/vas.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "vas.h"
 
diff --git a/arch/powerpc/platforms/pseries/vphn.c b/arch/powerpc/platforms/pseries/vphn.c
index cca474a2c396..3f85ece3c872 100644
--- a/arch/powerpc/platforms/pseries/vphn.c
+++ b/arch/powerpc/platforms/pseries/vphn.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include 
-#include 
+#include 
 
 /*
  * The associativity domain numbers are returned from the hypervisor as a
diff --git a/tools/testing/selftests/powerpc/vphn/asm/lppaca.h b/tools/testing/selftests/powerpc/vphn/asm/lppaca.h
deleted file mode 120000
index 942b1d00999c..000000000000
--- a/tools/testing/selftests/powerpc/vphn/asm/lppaca.h
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../../arch/powerpc/include/asm/lppaca.h
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/vphn/asm/vphn.h b/tools/testing/selftests/powerpc/vphn/asm/vphn.h
new file mode 120000
index 000000000000..3a0b2a00171c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/vphn/asm/vphn.h
@@ -0,0 +1 @@
+../../../../../../arch/powerpc/include/asm/vphn.h
\ No newline at end of file
-- 
2.41.0



[PATCH 3/4] powerpc: Don't include lppaca.h in paca.h

2023-08-22 Thread Michael Ellerman
By adding a forward declaration for struct lppaca we can untangle paca.h
and lppaca.h. Also move get_lppaca() into lppaca.h for consistency.

Add includes of lppaca.h to some files that need it.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/lppaca.h | 4 
 arch/powerpc/include/asm/paca.h   | 6 +-
 arch/powerpc/include/asm/paravirt.h   | 1 +
 arch/powerpc/include/asm/plpar_wrappers.h | 1 +
 arch/powerpc/kvm/book3s_hv_ras.c  | 1 +
 arch/powerpc/mm/book3s64/slb.c| 1 +
 arch/powerpc/xmon/xmon.c  | 1 +
 7 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index 27f0421188ec..b6a63fa0965f 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -112,6 +112,10 @@ static inline bool lppaca_shared_proc(struct lppaca *l)
return !!(l->__old_status & LPPACA_OLD_SHARED_PROC);
 }
 
+#ifdef CONFIG_PPC_PSERIES
+#define get_lppaca()   (get_paca()->lppaca_ptr)
+#endif
+
 /*
  * SLB shadow buffer structure as defined in the PAPR.  The save_area
  * contains adjacent ESID and VSID pairs for each shadowed SLB.  The
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index cb325938766a..e667d455ecb4 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -15,7 +15,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #ifdef CONFIG_PPC_BOOK3E_64
@@ -47,14 +46,11 @@ extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */
 #define get_paca() local_paca
 #endif
 
-#ifdef CONFIG_PPC_PSERIES
-#define get_lppaca()   (get_paca()->lppaca_ptr)
-#endif
-
 #define get_slb_shadow()   (get_paca()->slb_shadow_ptr)
 
 struct task_struct;
 struct rtas_args;
+struct lppaca;
 
 /*
  * Defines the layout of the paca.
diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index f5ba1a3c41f8..e08513d73119 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -6,6 +6,7 @@
 #include 
 #ifdef CONFIG_PPC64
 #include 
+#include 
 #include 
 #endif
 
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 8239c0af5eb2..fe3d0ea0058a 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 static inline long poll_pending(void)
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index ccfd96965630..82be6d87514b 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index 6956f637a38c..f2708c8629a5 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 78453b9b1ba0..6c6f90f1da94 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -58,6 +58,7 @@
 #ifdef CONFIG_PPC64
 #include 
 #include 
+#include 
 #endif
 
 #include "nonstdio.h"
-- 
2.41.0



[PATCH v2 4/4] powerpc/pseries: Rework lppaca_shared_proc() to avoid DEBUG_PREEMPT

2023-08-22 Thread Michael Ellerman
From: Russell Currey 

lppaca_shared_proc() takes a pointer to the lppaca which is typically
accessed through get_lppaca().  With DEBUG_PREEMPT enabled, this leads
to checking if preemption is enabled, for example:

  BUG: using smp_processor_id() in preemptible [] code: grep/10693
  caller is lparcfg_data+0x408/0x19a0
  CPU: 4 PID: 10693 Comm: grep Not tainted 6.5.0-rc3 #2
  Call Trace:
dump_stack_lvl+0x154/0x200 (unreliable)
check_preemption_disabled+0x214/0x220
lparcfg_data+0x408/0x19a0
...

This isn't actually a problem however, as it does not matter which
lppaca is accessed, the shared proc state will be the same.
vcpudispatch_stats_procfs_init() already works around this by disabling
preemption, but the lparcfg code does not, erroring any time
/proc/powerpc/lparcfg is accessed with DEBUG_PREEMPT enabled.

Instead of disabling preemption on the caller side, rework
lppaca_shared_proc() to not take a pointer and instead directly access
the lppaca, bypassing any potential preemption checks.
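
A hedged sketch of the calling-convention change (illustration only, not
part of the patch; the new helper body appears in the diff below):

  /* Before: get_lppaca() goes through get_paca(), which under
   * CONFIG_DEBUG_PREEMPT calls debug_smp_processor_id() and warns
   * when the caller is preemptible. */
  shared = lppaca_shared_proc(get_lppaca());

  /* After: the helper reads local_paca->lppaca_ptr itself; since
   * every CPU's lppaca reports the same shared-processor state, it
   * does not matter which paca is used, and no preemption check
   * runs. */
  shared = lppaca_shared_proc();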

Fixes: f13c13a00512 ("powerpc: Stop using non-architected shared_proc field in lppaca")
Signed-off-by: Russell Currey 
[mpe: Rework to avoid needing a definition in paca.h and lppaca.h]
Signed-off-by: Michael Ellerman 
Link: https://msgid.link/20230808005317.20374-1-rus...@russell.cc
---
 arch/powerpc/include/asm/lppaca.h        | 11 +--
 arch/powerpc/platforms/pseries/lpar.c    | 10 +-
 arch/powerpc/platforms/pseries/lparcfg.c |  4 ++--
 arch/powerpc/platforms/pseries/setup.c   |  2 +-
 drivers/cpuidle/cpuidle-pseries.c        |  8 +---
 5 files changed, 14 insertions(+), 21 deletions(-)

v2: mpe, rework based on the preceding header untangling patches.

diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index b6a63fa0965f..61ec2447dabf 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * The lppaca is the "virtual processor area" registered with the hypervisor,
@@ -105,14 +106,20 @@ struct lppaca {
  */
 #define LPPACA_OLD_SHARED_PROC 2
 
-static inline bool lppaca_shared_proc(struct lppaca *l)
+#ifdef CONFIG_PPC_PSERIES
+/*
+ * All CPUs should have the same shared proc value, so directly access the PACA
+ * to avoid false positives from DEBUG_PREEMPT.
+ */
+static inline bool lppaca_shared_proc(void)
 {
+   struct lppaca *l = local_paca->lppaca_ptr;
+
if (!firmware_has_feature(FW_FEATURE_SPLPAR))
return false;
return !!(l->__old_status & LPPACA_OLD_SHARED_PROC);
 }
 
-#ifdef CONFIG_PPC_PSERIES
 #define get_lppaca()   (get_paca()->lppaca_ptr)
 #endif
 
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 27fb656bd6ba..f2cb62148f36 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -640,16 +640,8 @@ static const struct proc_ops vcpudispatch_stats_freq_proc_ops = {
 
 static int __init vcpudispatch_stats_procfs_init(void)
 {
-   /*
-* Avoid smp_processor_id while preemptible. All CPUs should have
-* the same value for lppaca_shared_proc.
-*/
-   preempt_disable();
-   if (!lppaca_shared_proc(get_lppaca())) {
-   preempt_enable();
+   if (!lppaca_shared_proc())
return 0;
-   }
-   preempt_enable();
 
        if (!proc_create("powerpc/vcpudispatch_stats", 0600, NULL,
                         &vcpudispatch_stats_proc_ops))
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index 8acc70509520..1c151d77e74b 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -206,7 +206,7 @@ static void parse_ppp_data(struct seq_file *m)
   ppp_data.active_system_procs);
 
/* pool related entries are appropriate for shared configs */
-   if (lppaca_shared_proc(get_lppaca())) {
+   if (lppaca_shared_proc()) {
unsigned long pool_idle_time, pool_procs;
 
seq_printf(m, "pool=%d\n", ppp_data.pool_num);
@@ -560,7 +560,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
   partition_potential_processors);
 
seq_printf(m, "shared_processor_mode=%d\n",
-  lppaca_shared_proc(get_lppaca()));
+  lppaca_shared_proc());
 
 #ifdef CONFIG_PPC_64S_HASH_MMU
if (!radix_enabled())
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index bb0a9aeb50f9..ecea85c74c43 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -849,7 +849,7 @@ static void __init pSeries_setup_arch(void)
if (firmware_has_feature(FW_FEATURE_LPAR)) {
vpa_init(boot_cpuid);
 
-   if (lppaca_shared_proc(get_lppaca())) {
+   if (lppaca_shared_proc()) {

[PATCH 2/4] powerpc/pseries: Move hcall_vphn() prototype into vphn.h

2023-08-22 Thread Michael Ellerman
Consolidate the two prototypes for hcall_vphn() into vphn.h.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/dtl.h    | 1 -
 arch/powerpc/include/asm/lppaca.h | 2 --
 arch/powerpc/include/asm/vphn.h   | 2 ++
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/dtl.h b/arch/powerpc/include/asm/dtl.h
index 4bcb9f9ac764..d6f43d149f8d 100644
--- a/arch/powerpc/include/asm/dtl.h
+++ b/arch/powerpc/include/asm/dtl.h
@@ -39,6 +39,5 @@ extern rwlock_t dtl_access_lock;
 
 extern void register_dtl_buffer(int cpu);
 extern void alloc_dtl_buffers(unsigned long *time_limit);
-extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity);
 
 #endif /* _ASM_POWERPC_DTL_H */
diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index 12159e5b6888..27f0421188ec 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -127,8 +127,6 @@ struct slb_shadow {
} save_area[SLB_NUM_BOLTED];
 } cacheline_aligned;
 
-extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity);
-
 #endif /* CONFIG_PPC_BOOK3S */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_LPPACA_H */
diff --git a/arch/powerpc/include/asm/vphn.h b/arch/powerpc/include/asm/vphn.h
--- a/arch/powerpc/include/asm/vphn.h
+++ b/arch/powerpc/include/asm/vphn.h
@@ -19,4 +19,6 @@
 #define VPHN_FLAG_VCPU 1
 #define VPHN_FLAG_PCPU 2
 
+long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity);
+
 #endif // _ASM_POWERPC_VPHN_H
-- 
2.41.0



[PATCH] ibmveth: Use dcbf rather than dcbfl

2023-08-22 Thread Michael Ellerman
When building for power4, newer binutils don't recognise the "dcbfl"
extended mnemonic.

dcbfl RA, RB is equivalent to dcbf RA, RB, 1: the trailing operand is the
instruction's L field, and L=1 selects the "local" flush variant that the
dcbfl mnemonic denotes.

Switch to "dcbf" to avoid the build error.

Signed-off-by: Michael Ellerman 
---
 drivers/net/ethernet/ibm/ibmveth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index 113fcb3e353e..832a2ae01950 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -203,7 +203,7 @@ static inline void ibmveth_flush_buffer(void *addr, unsigned long length)
unsigned long offset;
 
for (offset = 0; offset < length; offset += SMP_CACHE_BYTES)
-   asm("dcbfl %0,%1" :: "b" (addr), "r" (offset));
+   asm("dcbf %0,%1,1" :: "b" (addr), "r" (offset));
 }
 
 /* replenish the buffers for a pool.  note that we don't need to
-- 
2.40.1



Re: [PATCH] cxl: Drop unused detach_spa()

2023-08-22 Thread Andrew Donnellan
On Wed, 2023-08-23 at 14:48 +1000, Michael Ellerman wrote:
> Clang warns:
>   drivers/misc/cxl/native.c:272:20: error: unused function 'detach_spa' [-Werror,-Wunused-function]
> 
> It was created as part of some refactoring in commit 05155772f642 ("cxl:
> Allocate and release the SPA with the AFU"), but has never been called
> in its current form. Drop it.
> 
> Signed-off-by: Michael Ellerman 

Indeed it looks unused.

Acked-by: Andrew Donnellan 

-- 
Andrew Donnellan        OzLabs, ADL Canberra
a...@linux.ibm.com   IBM Australia Limited


[PATCH] cxl: Drop unused detach_spa()

2023-08-22 Thread Michael Ellerman
Clang warns:
  drivers/misc/cxl/native.c:272:20: error: unused function 'detach_spa' [-Werror,-Wunused-function]

It was created as part of some refactoring in commit 05155772f642 ("cxl:
Allocate and release the SPA with the AFU"), but has never been called
in its current form. Drop it.

Signed-off-by: Michael Ellerman 
---
 drivers/misc/cxl/native.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 50b0c44bb8d7..fbe16a6ab7ad 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -269,11 +269,6 @@ static void attach_spa(struct cxl_afu *afu)
cxl_p1n_write(afu, CXL_PSL_SPAP_An, spap);
 }
 
-static inline void detach_spa(struct cxl_afu *afu)
-{
-   cxl_p1n_write(afu, CXL_PSL_SPAP_An, 0);
-}
-
 void cxl_release_spa(struct cxl_afu *afu)
 {
if (afu->native->spa) {
-- 
2.41.0



Re: [PATCH 1/2] powerpc/powernv: Fix fortify source warnings in opal-prd.c

2023-08-22 Thread Mahesh J Salgaonkar
On 2023-08-22 00:28:19 Tue, Michael Ellerman wrote:
> As reported by Mahesh & Aneesh, opal_prd_msg_notifier() triggers a
> FORTIFY_SOURCE warning:
> 
>   memcpy: detected field-spanning write (size 32) of single field "&item->msg" at arch/powerpc/platforms/powernv/opal-prd.c:355 (size 4)
>   WARNING: CPU: 9 PID: 660 at arch/powerpc/platforms/powernv/opal-prd.c:355 opal_prd_msg_notifier+0x174/0x188 [opal_prd]
>   NIP opal_prd_msg_notifier+0x174/0x188 [opal_prd]
>   LR  opal_prd_msg_notifier+0x170/0x188 [opal_prd]
>   Call Trace:
> opal_prd_msg_notifier+0x170/0x188 [opal_prd] (unreliable)
> notifier_call_chain+0xc0/0x1b0
> atomic_notifier_call_chain+0x2c/0x40
> opal_message_notify+0xf4/0x2c0
> 
> This happens because the copy is targeting item->msg, which is only 4
> bytes in size, even though the enclosing item was allocated with extra
> space following the msg.
> 
> To fix the warning define struct opal_prd_msg with a union of the header
> and a flex array, and have the memcpy target the flex array.
> 
> Reported-by: Aneesh Kumar K.V 
> Reported-by: Mahesh Salgaonkar 
> Signed-off-by: Michael Ellerman 

Tested-by: Mahesh Salgaonkar 
Reviewed-by: Mahesh Salgaonkar 

Thanks,
-Mahesh

> ---
>  arch/powerpc/platforms/powernv/opal-prd.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
> index 113bdb151f68..40e26e9f318f 100644
> --- a/arch/powerpc/platforms/powernv/opal-prd.c
> +++ b/arch/powerpc/platforms/powernv/opal-prd.c
> @@ -24,13 +24,20 @@
>  #include 
>  
>  
> +struct opal_prd_msg {
> + union {
> + struct opal_prd_msg_header header;
> + DECLARE_FLEX_ARRAY(u8, data);
> + };
> +};
> +
>  /*
>   * The msg member must be at the end of the struct, as it's followed by the
>   * message data.
>   */
>  struct opal_prd_msg_queue_item {
> -    struct list_head            list;
> -    struct opal_prd_msg_header  msg;
> +    struct list_head            list;
> +    struct opal_prd_msg         msg;
>  };
>  
>  static struct device_node *prd_node;
> @@ -156,7 +163,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf,
>   int rc;
>  
>   /* we need at least a header's worth of data */
> - if (count < sizeof(item->msg))
> + if (count < sizeof(item->msg.header))
>   return -EINVAL;
>  
>   if (*ppos)
> @@ -186,7 +193,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf,
>   return -EINTR;
>   }
>  
> - size = be16_to_cpu(item->msg.size);
> + size = be16_to_cpu(item->msg.header.size);
>   if (size > count) {
>   err = -EINVAL;
>   goto err_requeue;
> @@ -352,7 +359,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
>   if (!item)
>   return -ENOMEM;
>  
> -    memcpy(&item->msg, msg->params, msg_size);
> +    memcpy(&item->msg.data, msg->params, msg_size);
>  
>      spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
>      list_add_tail(&item->list, &opal_prd_msg_queue);
> -- 
> 2.41.0
> 

-- 
Mahesh J Salgaonkar


Re: [PATCH v6 2/3] PCI/AER: Disable AER interrupt on suspend

2023-08-22 Thread Kai-Heng Feng
On Fri, Aug 11, 2023 at 4:00 PM Kai-Heng Feng
 wrote:
>
> On Thu, Aug 10, 2023 at 6:51 PM Bjorn Helgaas  wrote:
> >
> > On Thu, Aug 10, 2023 at 04:17:21PM +0800, Kai-Heng Feng wrote:
> > > On Thu, Aug 10, 2023 at 2:52 AM Bjorn Helgaas  wrote:
> > > > On Fri, Jul 21, 2023 at 11:58:24AM +0800, Kai-Heng Feng wrote:
> > > > > On Tue, Jul 18, 2023 at 7:17 PM Bjorn Helgaas  
> > > > > wrote:
> > > > > > On Fri, May 12, 2023 at 08:00:13AM +0800, Kai-Heng Feng wrote:
> > > > > > > PCIe services that share an IRQ with PME, such as AER or DPC,
> > > > > > > may cause a spurious wakeup on system suspend. To prevent this,
> > > > > > > disable the AER interrupt notification during the system suspend
> > > > > > > process.
> > > > > >
> > > > > > I see that in this particular BZ dmesg log, PME, AER, and DPC do 
> > > > > > share
> > > > > > the same IRQ, but I don't think this is true in general.
> > > > > >
> > > > > > Root Ports usually use MSI or MSI-X.  PME and hotplug events use the
> > > > > > Interrupt Message Number in the PCIe Capability, but AER uses the 
> > > > > > one
> > > > > > in the AER Root Error Status register, and DPC uses the one in the 
> > > > > > DPC
> > > > > > Capability register.  Those potentially correspond to three distinct
> > > > > > MSI/MSI-X vectors.
> > > > > >
> > > > > > I think this probably has nothing to do with the IRQ being *shared*,
> > > > > > but just that putting the downstream component into D3cold, where 
> > > > > > the
> > > > > > link state is L3, may cause the upstream component to log and 
> > > > > > signal a
> > > > > > link-related error as the link goes completely down.
> > > > >
> > > > > That's quite likely a better explanation than my wording.
> > > > > Assuming AER IRQ and PME IRQ are not shared, does system get woken up
> > > > > by AER IRQ?
> > > >
> > > > Rafael could answer this better than I can, but
> > > > Documentation/power/suspend-and-interrupts.rst says device interrupts
> > > > are generally disabled during suspend after the "late" phase of
> > > > suspending devices, i.e.,
> > > >
> > > >   dpm_suspend_noirq
> > > > suspend_device_irqs   <-- disable non-wakeup IRQs
> > > > dpm_noirq_suspend_devices
> > > >   ...
> > > > pci_pm_suspend_noirq  # (I assume)
> > > >   pci_prepare_to_sleep
> > > >
> > > > I think the downstream component would be put in D3cold by
> > > > pci_prepare_to_sleep(), so non-wakeup interrupts should be disabled by
> > > > then.
> > > >
> > > > I assume PME would generally *not* be disabled since it's needed for
> > > > wakeup, so I think any interrupt that shares the PME IRQ and occurs
> > > > during suspend may cause a spurious wakeup.
> > >
> > > Yes, that's the case here.
> > >
> > > > If so, it's exactly as you said at the beginning: AER/DPC/etc sharing
> > > > the PME IRQ may cause spurious wakeups, and we would have to disable
> > > > those other interrupts at the source, e.g., by clearing
> > > > PCI_ERR_ROOT_CMD_FATAL_EN etc (exactly as your series does).
> > >
> > > So is the series good to be merged now?
> >
> > If we merge as-is, won't we disable AER & DPC interrupts unnecessarily
> > in the case where the link goes to D3hot?  In that case, there's no
> > reason to expect interrupts related to the link going down, but things
> > like PTM messages still work, and they may cause errors that we should
> > know about.
>
> Because the issue can be observed on D3hot as well [0].
> The root port device [0] is power managed by ACPI, so I wonder if it's
> reasonable to disable AER & DPC for devices that power managed by
> firmware?

OK, I think the D3hot case is different to this one, so I'll work on the
next revision that only disables AER/DPC when the power is really off.

In addition to disabling the interrupt, is it reasonable to disable the
AER and DPC services completely, so that unwanted electrical noise won't
trigger a DPC reset?

Kai-Heng

> [0] https://bugzilla.kernel.org/show_bug.cgi?id=216295#c3
>
> Kai-Heng
>
> >
> > > > > > I don't think D0-D3hot should be relevant here because in all those
> > > > > > states, the link should be active because the downstream config 
> > > > > > space
> > > > > > remains accessible.  So I'm not sure if it's possible, but I wonder 
> > > > > > if
> > > > > > there's a more targeted place we could do this, e.g., in the path 
> > > > > > that
> > > > > > puts downstream devices in D3cold.
> > > > >
> > > > > Let me try to work on this.
> > > > >
> > > > > Kai-Heng
> > > > >
> > > > > >
> > > > > > > As Per PCIe Base Spec 5.0, section 5.2, titled "Link State Power 
> > > > > > > Management",
> > > > > > > TLP and DLLP transmission are disabled for a Link in L2/L3 Ready 
> > > > > > > (D3hot), L2
> > > > > > > (D3cold with aux power) and L3 (D3cold) states. So disabling the 
> > > > > > > AER
> > > > > > > notification during suspend and re-enabling them during the 
> > > > > > > resume process
> > > > > > > should not affect the basic functionality.
> > > > > > >

[PATCH RFC 1/2] powerpc/pseries: papr-vpd char driver for VPD retrieval

2023-08-22 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

PowerVM LPARs may retrieve Vital Product Data (VPD) for system
components using the ibm,get-vpd RTAS function.

We can expose this to user space with a /dev/papr-vpd character
device, where the programming model is:

  struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
  int devfd = open("/dev/papr-vpd", O_WRONLY);
  int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
  size_t size = lseek(vpdfd, 0, SEEK_END);
  char *buf = malloc(size);
  pread(vpdfd, buf, size, 0);
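
For illustration, a fuller client with error handling might look like
the hedged sketch below. It assumes only the uapi header added by this
patch; like the selftests in the next patch, it opens the device
O_RDONLY:

  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  #include <asm/papr-vpd.h>

  int main(void)
  {
          struct papr_location_code plc = { .str = "", }; /* all VPD */

          int devfd = open("/dev/papr-vpd", O_RDONLY);
          if (devfd < 0) {
                  perror("open");
                  return 1;
          }

          /* The ioctl returns a new fd whose contents are the VPD. */
          int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
          if (vpdfd < 0) {
                  perror("PAPR_VPD_CREATE_HANDLE");
                  return 1;
          }
          close(devfd); /* the handle outlives the device fd */

          /* lseek(SEEK_END) is how clients discover the buffer size. */
          off_t size = lseek(vpdfd, 0, SEEK_END);
          char *buf = malloc(size);
          if (!buf || pread(vpdfd, buf, size, 0) != size) {
                  perror("pread");
                  return 1;
          }

          fwrite(buf, 1, size, stdout);
          free(buf);
          close(vpdfd);
          return 0;
  }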

When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
the file contains the result of a complete ibm,get-vpd sequence. The
file contents are immutable from the POV of user space. To get a new
view of VPD, clients must create a new handle.

This design choice insulates user space from most of the complexities
that ibm,get-vpd brings:

* ibm,get-vpd must be called more than once to obtain complete
  results.
* Only one ibm,get-vpd call sequence should be in progress at a time;
  concurrent sequences will disrupt each other. Callers must have a
  protocol for serializing their use of the function.
* A call sequence in progress may receive a "VPD changed, try again"
  status, requiring the client to start over. (The driver does not yet
  handle this, but it should be easy to add.)

The memory required for the VPD buffers seems acceptable, around 20KB
for all VPD on one of my systems. And the value of the
/rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is
consistently 300KB across various systems I've checked.

I've implemented support for this new ABI in the rtas_get_vpd()
function in librtas, which the vpdupdate command currently uses to
populate its VPD database. I've verified that an unmodified vpdupdate
binary generates an identical database when using a librtas.so that
prefers the new ABI.

Likely remaining work:

* Handle RTAS call status -4 (VPD changed) during ibm,get-vpd call
  sequence.
* Prevent ibm,get-vpd calls via rtas(2) from disrupting ibm,get-vpd
  call sequences in this driver.
* (Maybe) implement a poll method for delivering notifications of
  potential changes to VPD, e.g. after a partition migration.

Questions, points for discussion:

* Am I allocating the ioctl numbers correctly?
* The only way to discover the size of a VPD buffer is
  lseek(SEEK_END). fstat() doesn't work for anonymous fds like
  this. Is this OK, or should the buffer length be discoverable some
  other way?

Signed-off-by: Nathan Lynch 
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
 arch/powerpc/include/uapi/asm/papr-vpd.h   |  29 ++
 arch/powerpc/platforms/pseries/Makefile    |   1 +
 arch/powerpc/platforms/pseries/papr-vpd.c  | 353 +
 4 files changed, 385 insertions(+)

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 4ea5b837399a..a950545bf7cd 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -349,6 +349,8 @@ Code  Seq#    Include File                              Comments

 0xB1  00-1F  PPPoX
+0xB2  00     arch/powerpc/include/uapi/asm/papr-vpd.h  powerpc/pseries VPD API
+
 0xB3  00     linux/mmc/ioctl.h
 0xB4  00-0F  linux/gpio.h
 0xB5  00-0F  uapi/linux/rpmsg.h

diff --git a/arch/powerpc/include/uapi/asm/papr-vpd.h b/arch/powerpc/include/uapi/asm/papr-vpd.h
new file mode 100644
index 000000000000..aa33217ad5de
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-vpd.h
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#ifndef _UAPI_PAPR_VPD_H_
+#define _UAPI_PAPR_VPD_H_
+
+#include 
+#include 
+
+struct papr_location_code {
+   /*
+* PAPR+ 12.3.2.4 Converged Location Code Rules - Length
+* Restrictions. 79 characters plus nul.
+*/
+   char str[80];
+};
+
+#define PAPR_VPD_IOCTL_BASE 0xb2
+
+#define PAPR_VPD_IO(nr) _IO(PAPR_VPD_IOCTL_BASE, nr)
+#define PAPR_VPD_IOR(nr, type)  _IOR(PAPR_VPD_IOCTL_BASE, nr, type)
+#define PAPR_VPD_IOW(nr, type)  _IOW(PAPR_VPD_IOCTL_BASE, nr, type)
+#define PAPR_VPD_IOWR(nr, type) _IOWR(PAPR_VPD_IOCTL_BASE, nr, type)
+
+/*
+ * ioctl for /dev/papr-vpd. Returns a VPD handle fd corresponding to
+ * the location code.
+ */
+#define PAPR_VPD_CREATE_HANDLE PAPR_VPD_IOW(0, struct papr_location_code)
+
+#endif /* _UAPI_PAPR_VPD_H_ */
diff --git 

[PATCH RFC 2/2] powerpc/selftests: add test for papr-vpd

2023-08-22 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Add selftests for /dev/papr-vpd, exercising the common expected use
cases:

* Retrieve all VPD by passing an empty location code.
* Retrieve the "system VPD" by passing a location code derived from DT
  root node properties, as done by the vpdupdate command.

The tests also verify that certain intended properties of the driver
hold:

* Passing an unterminated location code to PAPR_VPD_CREATE_HANDLE gets
  EINVAL.
* Passing a NULL location code pointer to PAPR_VPD_CREATE_HANDLE gets
  EFAULT.
* Closing the device node without first issuing a
  PAPR_VPD_CREATE_HANDLE command to it succeeds.
* Releasing a handle without first consuming any data from it
  succeeds.
* Re-reading the contents of a handle returns the same data as the
  first time.

Some minimal validation of the returned data is performed.

The tests are skipped on systems where the papr-vpd driver does not
initialize, making this useful only on PowerVM LPARs at this point.

Signed-off-by: Nathan Lynch 
---
 tools/testing/selftests/powerpc/Makefile   |   1 +
 .../testing/selftests/powerpc/papr_vpd/.gitignore  |   1 +
 tools/testing/selftests/powerpc/papr_vpd/Makefile  |  12 +
 .../testing/selftests/powerpc/papr_vpd/papr_vpd.c  | 351 +
 4 files changed, 365 insertions(+)

diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 49f2ad1793fd..7de972612786 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -32,6 +32,7 @@ SUB_DIRS = alignment  \
   vphn \
   math \
   papr_attributes  \
+  papr_vpd \
   ptrace   \
   security \
   mce
diff --git a/tools/testing/selftests/powerpc/papr_vpd/.gitignore b/tools/testing/selftests/powerpc/papr_vpd/.gitignore
new file mode 100644
index 000000000000..49285031a656
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_vpd/.gitignore
@@ -0,0 +1 @@
+/papr_vpd
diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile b/tools/testing/selftests/powerpc/papr_vpd/Makefile
new file mode 100644
index 000000000000..06b719703bfd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+noarg:
+   $(MAKE) -C ../
+
+TEST_GEN_PROGS := papr_vpd
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
+
+$(OUTPUT)/papr_vpd: CFLAGS += $(KHDR_INCLUDES)
diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c
new file mode 100644
index 000000000000..4102e06601db
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c
@@ -0,0 +1,351 @@
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "utils.h"
+
+#define DEVPATH "/dev/papr-vpd"
+
+static int dev_papr_vpd_open_close(void)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+   FAIL_IF(close(devfd) != 0);
+
+   return 0;
+}
+
+static int dev_papr_vpd_get_handle_all(void)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+   struct papr_location_code lc = { .str = "", };
+   off_t size;
+   int fd;
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+
+   errno = 0;
+   fd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &lc);
+   FAIL_IF(errno != 0);
+   FAIL_IF(fd < 0);
+
+   FAIL_IF(close(devfd) != 0);
+
+   size = lseek(fd, 0, SEEK_END);
+   FAIL_IF(size <= 0);
+
+   void *buf = malloc((size_t)size);
+   FAIL_IF(!buf);
+
+   ssize_t consumed = pread(fd, buf, size, 0);
+   FAIL_IF(consumed != size);
+
+   /* Ensure EOF */
+   FAIL_IF(read(fd, buf, size) != 0);
+   FAIL_IF(close(fd));
+
+   /* Verify that the buffer looks like VPD */
+   const char needle[] = "System VPD";
+   FAIL_IF(!memmem(buf, size, needle, strlen(needle)));
+
+   return 0;
+}
+
+static int dev_papr_vpd_get_handle_byte_at_a_time(void)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+   struct papr_location_code lc = { .str = "", };
+   int fd;
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+
+   errno = 0;
+   fd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &lc);
+   FAIL_IF(errno != 0);
+   FAIL_IF(fd < 0);
+
+   FAIL_IF(close(devfd) != 0);
+
+   size_t consumed = 0;
+   while (1) {
+   ssize_t res;
+   char c;
+
+   errno = 0;
+   res = read(fd, &c, sizeof(c));
+   FAIL_IF(res > sizeof(c));
+   FAIL_IF(res < 0);
+   

[PATCH RFC 0/2] powerpc/pseries: new character devices for RTAS functions

2023-08-22 Thread Nathan Lynch via B4 Relay
This is a proposal for adding chardev-based access to a select subset
of RTAS functions on the pseries platform.

The problem: important platform features are enabled on Linux VMs
through the powerpc-specific rtas() syscall in combination with
writeable mappings of /dev/mem. In typical usage, this is encapsulated
behind APIs provided by the librtas library. This paradigm is
incompatible with lockdown, which prohibits /dev/mem access.

The solution I'm working on is to add a small pseries-specific
"driver" for each functional area, exposing the relevant features to
user space in ways that are compatible with lockdown. In most of these
areas, I believe it's possible to change librtas to prefer the new
chardev interfaces without disrupting existing users.

I've broken down the affected functions into the following areas and
priorities:

High priority:
* VPD retrieval.
* System parameters: retrieval and update.

Medium priority:
* Platform dump retrieval.
* Light path diagnostics (get/set-dynamic-indicator,
  get-dynamic-sensor-state, get-indices).

Low priority (may never happen):
* Error injection: would have to be carefully restricted.
* Physical attestation: no known users.
* LPAR perftools: no known users.

Out of scope:
* DLPAR (configure-connector et al): involves device tree updates
  which must be handled entirely in-kernel for lockdown. This is the
  object of a separate effort.

See https://github.com/ibm-power-utilities/librtas/issues/29 for more
details.

In this RFC, I've included a single driver for VPD retrieval. Clients
use ioctl() to obtain a file descriptor-based handle for the VPD they
want. I think this could be a good model for the other areas too, but
I'd like to get opinions on it.

In the next iteration I expect to add a separate driver for system
parameters.

For reference, I floated a different approach for system parameters
here:

https://lore.kernel.org/linuxppc-dev/2022073458.130938-1-nath...@linux.ibm.com/

---
Nathan Lynch (2):
  powerpc/pseries: papr-vpd char driver for VPD retrieval
  powerpc/selftests: add test for papr-vpd

 Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
 arch/powerpc/include/uapi/asm/papr-vpd.h   |  29 ++
 arch/powerpc/platforms/pseries/Makefile    |   1 +
 arch/powerpc/platforms/pseries/papr-vpd.c  | 353 +
 tools/testing/selftests/powerpc/Makefile   |   1 +
 .../testing/selftests/powerpc/papr_vpd/.gitignore  |   1 +
 tools/testing/selftests/powerpc/papr_vpd/Makefile  |  12 +
 .../testing/selftests/powerpc/papr_vpd/papr_vpd.c  | 351 
 8 files changed, 750 insertions(+)
---
base-commit: d77497508a229529830850ba07e1e52596463d21
change-id: 20230817-papr-sys_rtas-vs-lockdown-5c54505db792

Best regards,
-- 
Nathan Lynch 



Re: linux-next: build failure after merge of the mm tree

2023-08-22 Thread Matthew Wilcox
On Mon, Aug 21, 2023 at 09:00:43PM -0700, Darrick J. Wong wrote:
> Please leave this ^^^ comment, because the need for TRACE_DEFINE_ENUM to
> make enums work in tracepoints is not at all obvious.
> 
> "order %u" to match the (non dev_t) style of the rest of the xfs
> tracepoints.

ACK, thanks.

Andrew, please add this -fix patch for "mm: Remove enum
page_entry_size".

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 1904eaf7a2e9..fd789e00dfd6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -802,9 +802,6 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating);
  * ring buffer.  Somehow this was only worth mentioning in the ftrace sample
  * code.
  */
-TRACE_DEFINE_ENUM(PMD_ORDER);
-TRACE_DEFINE_ENUM(PUD_ORDER);
-
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
 
@@ -823,13 +820,10 @@ TRACE_EVENT(xfs_filemap_fault,
__entry->order = order;
__entry->write_fault = write_fault;
),
-   TP_printk("dev %d:%d ino 0x%llx %s write_fault %d",
+   TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d",
  MAJOR(__entry->dev), MINOR(__entry->dev),
  __entry->ino,
- __print_symbolic(__entry->order,
-   { 0,"PTE" },
-   { PMD_ORDER,"PMD" },
-   { PUD_ORDER,"PUD" }),
+ __entry->order,
  __entry->write_fault)
 )
 


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Hugh Dickins
On Tue, 22 Aug 2023, Matthew Wilcox wrote:
> On Tue, Aug 22, 2023 at 11:34:19AM -0700, Hugh Dickins wrote:
> > (Yes, the locking is a bit confusing: but mainly for the unrelated reason,
> > that with the split locking configs, we never quite know whether this lock
> > is the same as that lock or not, and so have to be rather careful.)
> 
> Is it time to remove the PTE split locking config option?  I believe all
> supported architectures have at least two levels of page tables, so if we
> have split ptlocks, ptl and pml are always different from each other (it's
> just that on two level machines, pmd == pud == p4d == pgd).  With huge
> thread counts now being the norm, it's hard to see why anybody would want
> to support SMP and !SPLIT_PTE_PTLOCKS.  To quote the documentation ...
> 
>   Split page table lock for PTE tables is enabled compile-time if
>   CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS.
>   If split lock is disabled, all tables are guarded by mm->page_table_lock.
> 
> You can barely buy a wrist-watch without eight CPUs these days.

Whilst I'm still happy with my 0-CPU wrist-watch, I do think you're right:
that SPLIT_PTLOCK_CPUS business was really just a safety-valve for when
introducing split ptlock in the first place, 4 pulled out of a hat, and
the unsplit ptlock path quite under-tested.

But I'll leave it to someone else do the job of removing it whenever.

Hugh


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Jann Horn
On Tue, Aug 22, 2023 at 8:54 PM Hugh Dickins  wrote:
> But rather than reworking it, please let's just go with v1 for now.

Sounds good to me.


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Hugh Dickins
On Tue, 22 Aug 2023, Jann Horn wrote:
> On Tue, Aug 22, 2023 at 4:51 AM Hugh Dickins  wrote:
> > On Mon, 21 Aug 2023, Jann Horn wrote:
> > > On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins  wrote:
> > > > Just for this case, take the pmd_lock() two steps earlier: not because
> > > > it gives any protection against this case itself, but because ptlock
> > > > nests inside it, and it's the dropping of ptlock which let the bug in.
> > > > In other cases, continue to minimize the pmd_lock() hold time.
> > >
> > > Special-casing userfaultfd like this makes me a bit uncomfortable; but
> > > I also can't find anything other than userfaultfd that would insert
> > > pages into regions that are khugepaged-compatible, so I guess this
> > > works?
> >
> > I'm as sure as I can be that it's solely because userfaultfd breaks
> > the usual rules here (and in fairness, IIRC Andrea did ask my permission
> > before making it behave that way on shmem, COWing without a source page).
> >
> > Perhaps something else will want that same behaviour in future (it's
> > tempting, but difficult to guarantee correctness); for now, it is just
> > userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
> > expecting uffd to add more such exceptional modes in future).
> 
> Hm, yeah, sounds okay. (I guess we'd also run into this if we ever
> wanted to make it possible to reliably install PTE markers with
> madvise() or something like that, which might be nice for allowing
> userspace to create guard pages without unnecessary extra VMAs...)

I see the mailthread has taken inspiration from your comment there,
and veered off in that direction: but I'll ignore those futures.

> 
> > > I guess an alternative would be to use a spin_trylock() instead of the
> > > current pmd_lock(), and if that fails, temporarily drop the page table
> > > lock and then restart from step 2 with both locks held - and at that
> > > point the page table scan should be fast since we expect it to usually
> > > be empty.
> >
> > That's certainly a good idea, if collapse on userfaultfd_armed private
> > is anything of a common case (I doubt, but I don't know).  It may be a
> > better idea anyway (saving a drop and retake of ptlock).
> 
> I was thinking it also has the advantage that it would still perform
> okay if we got rid of the userfaultfd_armed() condition at some point
> - though I realize that designing too much for hypothetical future
> features is an antipattern.
> 
> > I gave it a try, expecting to end up with something that would lead
> > me to say "I tried it, but it didn't work out well"; but actually it
> > looks okay to me.  I wouldn't say I prefer it, but it seems reasonable,
> > and no more complicated (as Peter rightly observes) than the original.
> >
> > It's up to you and Peter, and whoever has strong feelings about it,
> > to choose between them: I don't mind (but I shall be sad if someone
> > demands that I indent that comment deeper - I'm not a fan of long
> > multi-line comments near column 80).
> 
> I prefer this version because it would make it easier to remove the
> "userfaultfd_armed()" check in the future if we have to, but I guess
> we could also always change it later if that becomes necessary, so I
> don't really have strong feelings on it at this point.

Thanks for considering them both, Jann.  I do think your trylock way,
as in v2, is in principle superior, and we may well have good reason
to switch over to it in future; but I find it slightly more confusing,
so will follow your and Peter's "no strong feelings" for now, and ask
Andrew please to take the original (implicit v1).

Overriding reason: I realized overnight that v2 is not quite correct:
I was clever enough to realize that nr_ptes needed to be reset to 0
to get the accounting right with a recheck pass, but not clever enough
to realize that resetting it to 0 there would likely skip the abort
path's flush_tlb_mm(mm), when we actually had cleared entries on the
first pass.  It needs a separate bool to decide the flush_tlb_mm(mm),
or it needs that (ridiculously minor!) step 3 to be moved down.

But rather than reworking it, please let's just go with v1 for now.

Thanks,
Hugh

Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Matthew Wilcox
On Tue, Aug 22, 2023 at 11:34:19AM -0700, Hugh Dickins wrote:
> (Yes, the locking is a bit confusing: but mainly for the unrelated reason,
> that with the split locking configs, we never quite know whether this lock
> is the same as that lock or not, and so have to be rather careful.)

Is it time to remove the PTE split locking config option?  I believe all
supported architectures have at least two levels of page tables, so if we
have split ptlocks, ptl and pml are always different from each other (it's
just that on two level machines, pmd == pud == p4d == pgd).  With huge
thread counts now being the norm, it's hard to see why anybody would want
to support SMP and !SPLIT_PTE_PTLOCKS.  To quote the documentation ...

  Split page table lock for PTE tables is enabled compile-time if
  CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS.
  If split lock is disabled, all tables are guarded by mm->page_table_lock.

You can barely buy a wrist-watch without eight CPUs these days.


Re: [PATCH v5 0/3 RESEND] sed-opal: keyrings, discovery, revert, key store

2023-08-22 Thread Jens Axboe


On Fri, 21 Jul 2023 16:15:31 -0500, gjo...@linux.vnet.ibm.com wrote:
> This patchset has gone through numerous rounds of review and
> all comments/suggestions have been addressed. The reviews have
> covered all relevant areas including reviews by block and keyring
> developers as well as the SED Opal maintainer. The last
> patchset submission has not solicited any responses in the
> six weeks since it was last distributed. The changes are
> generally useful and ready for inclusion.
> 
> [...]

Applied, thanks!

[1/3] block: sed-opal: Implement IOC_OPAL_DISCOVERY
  commit: 9fb10726ecc5145550180aec4fd0adf0a7b1d634
[2/3] block: sed-opal: Implement IOC_OPAL_REVERT_LSP
  commit: 5c82efc1aee8eb0919aa67a0d2559de5a326bd7c
[3/3] block: sed-opal: keyring support for SED keys
  commit: 3bfeb61256643281ac4be5b8a57e9d9da3db4335

Best regards,
-- 
Jens Axboe





Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread David Hildenbrand

On 22.08.23 17:30, Jann Horn wrote:
> On Tue, Aug 22, 2023 at 5:23 PM Matthew Wilcox  wrote:
> > On Tue, Aug 22, 2023 at 04:39:43PM +0200, Jann Horn wrote:
> > > > Perhaps something else will want that same behaviour in future (it's
> > > > tempting, but difficult to guarantee correctness); for now, it is just
> > > > userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
> > > > expecting uffd to add more such exceptional modes in future).
> > >
> > > Hm, yeah, sounds okay. (I guess we'd also run into this if we ever
> > > wanted to make it possible to reliably install PTE markers with
> > > madvise() or something like that, which might be nice for allowing
> > > userspace to create guard pages without unnecessary extra VMAs...)
> >
> > I don't know what a userspace API for this would look like, but I have
> > a dream of creating guard VMAs which only live in the maple tree and
> > don't require the allocation of a struct VMA.  Use some magic reserved
> > pointer value like XA_ZERO_ENTRY to represent them ... seems more
> > robust than putting a PTE marker in the page tables?
>
> Chrome currently uses a lot of VMAs for its heap, which I think are
> basically alternating PROT_NONE and PROT_READ|PROT_WRITE anonymous
> VMAs. Like this:
>
> [...]
>
> I was thinking if you used PTE markers as guards, you could maybe turn
> all that into more or less a single VMA?


I proposed the topic "A proper API for sparse memory mappings" for the 
bi-weekly MM meeting on September 20, that would also cover exactly that 
use case. :)


--
Cheers,

David / dhildenb



Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Jann Horn
On Tue, Aug 22, 2023 at 5:23 PM Matthew Wilcox  wrote:
> On Tue, Aug 22, 2023 at 04:39:43PM +0200, Jann Horn wrote:
> > > Perhaps something else will want that same behaviour in future (it's
> > > tempting, but difficult to guarantee correctness); for now, it is just
> > > userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
> > > expecting uffd to add more such exceptional modes in future).
> >
> > Hm, yeah, sounds okay. (I guess we'd also run into this if we ever
> > wanted to make it possible to reliably install PTE markers with
> > madvise() or something like that, which might be nice for allowing
> > userspace to create guard pages without unnecessary extra VMAs...)
>
> I don't know what a userspace API for this would look like, but I have
> a dream of creating guard VMAs which only live in the maple tree and
> don't require the allocation of a struct VMA.  Use some magic reserved
> pointer value like XA_ZERO_ENTRY to represent them ... seems more
> robust than putting a PTE marker in the page tables?

Chrome currently uses a lot of VMAs for its heap, which I think are
basically alternating PROT_NONE and PROT_READ|PROT_WRITE anonymous
VMAs. Like this:

[...]
3a10002cf000-3a10002d ---p  00:00 0
3a10002d-3a10002e6000 rw-p  00:00 0
3a10002e6000-3a10002e8000 ---p  00:00 0
3a10002e8000-3a10002f2000 rw-p  00:00 0
3a10002f2000-3a10002f4000 ---p  00:00 0
3a10002f4000-3a10002fb000 rw-p  00:00 0
3a10002fb000-3a10002fc000 ---p  00:00 0
3a10002fc000-3a1000303000 rw-p  00:00 0
3a1000303000-3a1000304000 ---p  00:00 0
3a1000304000-3a100031b000 rw-p  00:00 0
3a100031b000-3a100031c000 ---p  00:00 0
3a100031c000-3a1000326000 rw-p  00:00 0
3a1000326000-3a1000328000 ---p  00:00 0
3a1000328000-3a100033a000 rw-p  00:00 0
3a100033a000-3a100033c000 ---p  00:00 0
3a100033c000-3a100038b000 rw-p  00:00 0
3a100038b000-3a100038c000 ---p  00:00 0
3a100038c000-3a100039b000 rw-p  00:00 0
3a100039b000-3a100039c000 ---p  00:00 0
3a100039c000-3a10003af000 rw-p  00:00 0
3a10003af000-3a10003b ---p  00:00 0
3a10003b-3a10003e8000 rw-p  00:00 0
3a10003e8000-3a1000401000 ---p  00:00 0
3a1000401000-3a1000402000 rw-p  00:00 0
3a1000402000-3a100040c000 ---p  00:00 0
3a100040c000-3a100046f000 rw-p  00:00 0
3a100046f000-3a100047 ---p  00:00 0
3a100047-3a100047a000 rw-p  00:00 0
3a100047a000-3a100047c000 ---p  00:00 0
3a100047c000-3a1000492000 rw-p  00:00 0
3a1000492000-3a1000494000 ---p  00:00 0
3a1000494000-3a10004a2000 rw-p  00:00 0
3a10004a2000-3a10004a4000 ---p  00:00 0
3a10004a4000-3a10004b6000 rw-p  00:00 0
3a10004b6000-3a10004b8000 ---p  00:00 0
3a10004b8000-3a10004ea000 rw-p  00:00 0
3a10004ea000-3a10004ec000 ---p  00:00 0
3a10004ec000-3a10005f4000 rw-p  00:00 0
3a10005f4000-3a1000601000 ---p  00:00 0
3a1000601000-3a1000602000 rw-p  00:00 0
3a1000602000-3a1000604000 ---p  00:00 0
3a1000604000-3a100062b000 rw-p  00:00 0
3a100062b000-3a1000801000 ---p  00:00 0
[...]

I was thinking if you used PTE markers as guards, you could maybe turn
all that into more or less a single VMA?


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread David Hildenbrand

On 22.08.23 16:39, Jann Horn wrote:
> On Tue, Aug 22, 2023 at 4:51 AM Hugh Dickins  wrote:
> > On Mon, 21 Aug 2023, Jann Horn wrote:
> > > On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins  wrote:
> > > > Just for this case, take the pmd_lock() two steps earlier: not because
> > > > it gives any protection against this case itself, but because ptlock
> > > > nests inside it, and it's the dropping of ptlock which let the bug in.
> > > > In other cases, continue to minimize the pmd_lock() hold time.
> > >
> > > Special-casing userfaultfd like this makes me a bit uncomfortable; but
> > > I also can't find anything other than userfaultfd that would insert
> > > pages into regions that are khugepaged-compatible, so I guess this
> > > works?
> >
> > I'm as sure as I can be that it's solely because userfaultfd breaks
> > the usual rules here (and in fairness, IIRC Andrea did ask my permission
> > before making it behave that way on shmem, COWing without a source page).
> >
> > Perhaps something else will want that same behaviour in future (it's
> > tempting, but difficult to guarantee correctness); for now, it is just
> > userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
> > expecting uffd to add more such exceptional modes in future).
>
> Hm, yeah, sounds okay. (I guess we'd also run into this if we ever
> wanted to make it possible to reliably install PTE markers with
> madvise() or something like that, which might be nice for allowing
> userspace to create guard pages without unnecessary extra VMAs...)


I'm working on something similar that goes a bit further than just guard 
pages. It also installs PTE markers into page tables, inside existing 
large VMAs.


Initially, I'll only tackle anon VMAs, though.

--
Cheers,

David / dhildenb



Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Matthew Wilcox
On Tue, Aug 22, 2023 at 04:39:43PM +0200, Jann Horn wrote:
> > Perhaps something else will want that same behaviour in future (it's
> > tempting, but difficult to guarantee correctness); for now, it is just
> > userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
> > expecting uffd to add more such exceptional modes in future).
> 
> Hm, yeah, sounds okay. (I guess we'd also run into this if we ever
> wanted to make it possible to reliably install PTE markers with
> madvise() or something like that, which might be nice for allowing
> userspace to create guard pages without unnecessary extra VMAs...)

I don't know what a userspace API for this would look like, but I have
a dream of creating guard VMAs which only live in the maple tree and
don't require the allocation of a struct VMA.  Use some magic reserved
pointer value like XA_ZERO_ENTRY to represent them ... seems more
robust than putting a PTE marker in the page tables?
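
As a rough illustration of the shape of that idea — a sketch under heavy assumptions: a demo tree rather than mm->mm_mt, and an ordinary pointer as the marker, since a reserved entry like XA_ZERO_ENTRY would need its own plumbing through the VMA iterators:

#include <linux/maple_tree.h>
#include <linux/gfp.h>

static DEFINE_MTREE(demo_mt);

/* Stand-in marker object; its address is the "guard" entry. */
static int demo_guard_marker;

/* Mark [start, last] as a guard range without allocating a VMA. */
static int demo_mark_guard(unsigned long start, unsigned long last)
{
	return mtree_store_range(&demo_mt, start, last,
				 &demo_guard_marker, GFP_KERNEL);
}

/* Lookups then just compare against the marker's address. */
static bool demo_is_guard(unsigned long addr)
{
	return mtree_load(&demo_mt, addr) == &demo_guard_marker;
}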



Re: [PATCH v14 00/15] phy: Add support for Lynx 10G SerDes

2023-08-22 Thread Ioana Ciornei
On Mon, Aug 21, 2023 at 02:46:53PM -0400, Sean Anderson wrote:
> On 8/21/23 14:13, Ioana Ciornei wrote:
> > On Mon, Aug 21, 2023 at 01:45:44PM -0400, Sean Anderson wrote:
> >> Well, we have two pieces of information we need
> >> 
> >> - What values do we need to program in the PCCRs to select a particular
> >>   mode? This includes whether to e.g. set the KX bits.
> >> - Implied by the above, what protocols are supported on which lanes?
> >>   This is not strictly necessary, but will certainly solve a lot of
> >>   headscratching.
> >> 
> >> This information varies between different socs, and different serdes on
> >> the same socs. We can't really look at the RCW or the clocks and figure
> >> out what we need to program. So what are our options?
> >> 
> >> - We can have a separate compatible for each serdes on each SoC (e.g.
> >>   "fsl,lynx-10g-a"). This was rejected by the devicetree maintainers.

I previously took this statement at face value and didn't further
investigate. After a bit of digging through the first versions of this
patch set it's evident that you left out a big piece of information.

The devicetree maintainers have indeed rejected compatible strings of
the "fsl,<soc>-serdes-<N>" form, but they also suggested moving the
numbering to a property instead:

https://lore.kernel.org/all/db9d9455-37af-1616-8f7f-3d752e793...@linaro.org/

But instead of doing that, you chose to move all the different details
that vary between SerDes blocks/SoCs from the driver to the DTS. I don't
see that this was done in response to explicit feedback.

> >> - We can have one compatible for each SoC, and determine the serdes
> >>   based on the address. I would like to avoid this...
> > 
> > To me this really seems like a straightforward approach.
> 
> Indeed it would be straightforward, but what's the point of having a
> devicetree in the first place then? We could just go back to being a
> (non-dt) platform device.
> 

I am confused why you are now so adamant about having these details in
the DTS. Your first approach was to put them in the driver, not the DTS:

https://lore.kernel.org/netdev/20220628221404.1444200-5-sean.ander...@seco.com/

And this approach could still work now and get accepted by the device
tree maintainers. The only change that would be needed is to add a
property like "fsl,serdes-block-id = <1>".
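
For what it's worth, consuming such a property would be a one-liner on the driver side. A hedged sketch: the property name is the one suggested above, and the idea of using the id to index a per-SoC table kept in the driver is my assumption:

#include <linux/of.h>
#include <linux/device.h>

/* Sketch: fetch the DT-provided block id, which the driver could use
 * to pick a per-SoC config table entry instead of describing every
 * register value in the DTS. */
static int lynx_10g_block_id(struct device *dev, u32 *id)
{
	return of_property_read_u32(dev->of_node, "fsl,serdes-block-id", id);
}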

> >> - We can stick all the details which vary between serdes/socs into the
> >>   device tree. This is very flexible, since supporting new SoCs is
> >>   mostly a matter of adding a new compatible and writing a new
> >>   devicetree. On the other hand, if you have a bug in your devicetree,
> >>   it's not easy to fix it in the kernel.
> >> - Just don't support protocol switching. The 28G driver does this, which
> >>   is why it only has one compatible. However, supporting protocol
> >>   switching is a core goal of this driver, so dropping support is not an
> >>   option.
> >> 

Ioana


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Jann Horn
On Tue, Aug 22, 2023 at 4:51 AM Hugh Dickins  wrote:
> On Mon, 21 Aug 2023, Jann Horn wrote:
> > On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins  wrote:
> > > Just for this case, take the pmd_lock() two steps earlier: not because
> > > it gives any protection against this case itself, but because ptlock
> > > nests inside it, and it's the dropping of ptlock which let the bug in.
> > > In other cases, continue to minimize the pmd_lock() hold time.
> >
> > Special-casing userfaultfd like this makes me a bit uncomfortable; but
> > I also can't find anything other than userfaultfd that would insert
> > pages into regions that are khugepaged-compatible, so I guess this
> > works?
>
> I'm as sure as I can be that it's solely because userfaultfd breaks
> the usual rules here (and in fairness, IIRC Andrea did ask my permission
> before making it behave that way on shmem, COWing without a source page).
>
> Perhaps something else will want that same behaviour in future (it's
> tempting, but difficult to guarantee correctness); for now, it is just
> userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
> expecting uffd to add more such exceptional modes in future).

Hm, yeah, sounds okay. (I guess we'd also run into this if we ever
wanted to make it possible to reliably install PTE markers with
madvise() or something like that, which might be nice for allowing
userspace to create guard pages without unnecessary extra VMAs...)

> > I guess an alternative would be to use a spin_trylock() instead of the
> > current pmd_lock(), and if that fails, temporarily drop the page table
> > lock and then restart from step 2 with both locks held - and at that
> > point the page table scan should be fast since we expect it to usually
> > be empty.
>
> That's certainly a good idea, if collapse on userfaultfd_armed private
> is anything of a common case (I doubt, but I don't know).  It may be a
> better idea anyway (saving a drop and retake of ptlock).

I was thinking it also has the advantage that it would still perform
okay if we got rid of the userfaultfd_armed() condition at some point
- though I realize that designing too much for hypothetical future
features is an antipattern.

> I gave it a try, expecting to end up with something that would lead
> me to say "I tried it, but it didn't work out well"; but actually it
> looks okay to me.  I wouldn't say I prefer it, but it seems reasonable,
> and no more complicated (as Peter rightly observes) than the original.
>
> It's up to you and Peter, and whoever has strong feelings about it,
> to choose between them: I don't mind (but I shall be sad if someone
> demands that I indent that comment deeper - I'm not a fan of long
> multi-line comments near column 80).

I prefer this version because it would make it easier to remove the
"userfaultfd_armed()" check in the future if we have to, but I guess
we could also always change it later if that becomes necessary, so I
don't really have strong feelings on it at this point.


Re: [PATCH mm-unstable] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd

2023-08-22 Thread Peter Xu
Hi, Hugh, Jann,

On Mon, Aug 21, 2023 at 07:51:38PM -0700, Hugh Dickins wrote:
> On Mon, 21 Aug 2023, Jann Horn wrote:
> > On Mon, Aug 21, 2023 at 9:51 PM Hugh Dickins  wrote:
> > > Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private
> > > shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp()
> > > thought it had emptied: page lock on the huge page is enough to protect
> > > against WP faults (which find the PTE has been cleared), but not enough
> > > to protect against userfaultfd.  "BUG: Bad rss-counter state" followed.
> > >
> > > retract_page_tables() protects against this by checking !vma->anon_vma;
> > > but we know that MADV_COLLAPSE needs to be able to work on private shmem
> > > mappings, even those with an anon_vma prepared for another part of the
> > > mapping; and we know that MADV_COLLAPSE needs to work on shared shmem
> > > mappings which are userfaultfd_armed().  Whether it needs to work on
> > > private shmem mappings which are userfaultfd_armed(), I'm not so sure:
> > > but assume that it does.
> > 
> > I think we couldn't rely on anon_vma here anyway, since holding the
> > mmap_lock in read mode doesn't prevent concurrent creation of an
> > anon_vma?
> 
> We would have had to do the same as in retract_page_tables() (which
> doesn't even have mmap_lock for read): recheck !vma->anon_vma after
> finally acquiring ptlock.  But the !anon_vma limitation is certainly
> not acceptable here anyway.
> 
> > 
> > > Just for this case, take the pmd_lock() two steps earlier: not because
> > > it gives any protection against this case itself, but because ptlock
> > > nests inside it, and it's the dropping of ptlock which let the bug in.
> > > In other cases, continue to minimize the pmd_lock() hold time.
> > 
> > Special-casing userfaultfd like this makes me a bit uncomfortable; but
> > I also can't find anything other than userfaultfd that would insert
> > pages into regions that are khugepaged-compatible, so I guess this
> > works?
> 
> I'm as sure as I can be that it's solely because userfaultfd breaks
> the usual rules here (and in fairness, IIRC Andrea did ask my permission
> before making it behave that way on shmem, COWing without a source page).
> 
> Perhaps something else will want that same behaviour in future (it's
> tempting, but difficult to guarantee correctness); for now, it is just
> userfaultfd (but by saying "_armed" rather than "_missing", I'm half-
> expecting uffd to add more such exceptional modes in future).
> 
> > 
> > I guess an alternative would be to use a spin_trylock() instead of the
> > current pmd_lock(), and if that fails, temporarily drop the page table
> > lock and then restart from step 2 with both locks held - and at that
> > point the page table scan should be fast since we expect it to usually
> > be empty.
> 
> That's certainly a good idea, if collapse on userfaultfd_armed private
> is anything of a common case (I doubt, but I don't know).  It may be a
> better idea anyway (saving a drop and retake of ptlock).
> 
> I gave it a try, expecting to end up with something that would lead
> me to say "I tried it, but it didn't work out well"; but actually it
> looks okay to me.  I wouldn't say I prefer it, but it seems reasonable,
> and no more complicated (as Peter rightly observes) than the original.
> 
> It's up to you and Peter, and whoever has strong feelings about it,
> to choose between them: I don't mind (but I shall be sad if someone
> demands that I indent that comment deeper - I'm not a fan of long
> multi-line comments near column 80).

No strong opinion here, either.  Just one trivial comment/question below on
the new patch (if that will be preferred)..

> 
> 
> [PATCH mm-unstable v2] mm/khugepaged: fix collapse_pte_mapped_thp() versus 
> uffd
> 
> Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private
> shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp()
> thought it had emptied: page lock on the huge page is enough to protect
> against WP faults (which find the PTE has been cleared), but not enough
> to protect against userfaultfd.  "BUG: Bad rss-counter state" followed.
> 
> retract_page_tables() protects against this by checking !vma->anon_vma;
> but we know that MADV_COLLAPSE needs to be able to work on private shmem
> mappings, even those with an anon_vma prepared for another part of the
> mapping; and we know that MADV_COLLAPSE needs to work on shared shmem
> mappings which are userfaultfd_armed().  Whether it needs to work on
> private shmem mappings which are userfaultfd_armed(), I'm not so sure:
> but assume that it does.
> 
> Now trylock pmd lock without dropping ptlock (suggested by jannh): if
> that fails, drop and retake ptlock around taking pmd lock, and just in
> the uffd private case, go back to recheck and empty the page table.
> 
> Reported-by: Jann Horn 
> Closes: 
> 
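
The locking dance described in that commit message is the classic trylock-or-back-off pattern for taking an outer lock while already holding the inner one. A hedged sketch with illustrative names (not the actual patch):

#include <linux/mm.h>

/*
 * ptl (PTE lock) nests inside pml (PMD lock). We already hold ptl
 * and now want pml too, without ever holding them in the wrong
 * order. Returns true if ptl was dropped, in which case the caller
 * must rescan the page table for PTEs that (e.g. UFFDIO_COPY)
 * may have installed meanwhile.
 */
static bool lock_pmd_holding_ptl(struct mm_struct *mm, pmd_t *pmd,
				 spinlock_t *ptl)
{
	spinlock_t *pml = pmd_lockptr(mm, pmd);

	if (spin_trylock(pml))
		return false;		/* got both, ptl never dropped */

	spin_unlock(ptl);		/* back off... */
	spin_lock(pml);			/* ...take the outer lock first... */
	spin_lock(ptl);			/* ...then retake the inner lock */
	return true;
}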

Re: [PATCH v5 0/3 RESEND] sed-opal: keyrings, discovery, revert, key store

2023-08-22 Thread Jarkko Sakkinen
On Mon Aug 21, 2023 at 6:26 PM EEST, Greg Joyce wrote:
> On Wed, 2023-08-16 at 23:41 +0300, Jarkko Sakkinen wrote:
> > On Wed Aug 16, 2023 at 10:45 PM EEST, Greg Joyce wrote:
> > > It's been almost 4 weeks since the last resend and there haven't
> > > been
> > > any comments. Is there anything that needs to be changed for
> > > acceptance?
> > > 
> > > Thanks for your input.
> > > 
> > > Greg
> > > 
> > > On Fri, 2023-07-21 at 16:15 -0500, gjo...@linux.vnet.ibm.com wrote:
> > > > From: Greg Joyce 
> > > > 
> > > > This patchset has gone through numerous rounds of review and
> > > > all comments/suggestions have been addressed. The reviews have
> > > > covered all relevant areas including reviews by block and keyring
> > > > developers as well as the SED Opal maintainer. The last
> > > > patchset submission has not elicited any responses in the
> > > > six weeks since it was last distributed. The changes are
> > > > generally useful and ready for inclusion.
> > > > 
> > > > TCG SED Opal is a specification from The Trusted Computing Group
> > > > that allows self encrypting storage devices (SED) to be locked at
> > > > power on and require an authentication key to unlock the drive.
> > > > 
> > > > The current SED Opal implementation in the block driver
> > > > requires that authentication keys be provided in an ioctl
> > > > so that they can be presented to the underlying SED
> > > > capable drive. Currently, the key is typically entered by
> > > > a user with an application like sedutil or sedcli. While
> > > > this process works, it does not lend itself to automation
> > > > like unlock by a udev rule.
> > > > 
> > > > The SED block driver has been extended so it can alternatively
> > > > obtain a key from a sed-opal kernel keyring. The SED ioctls
> > > > will indicate the source of the key, either directly in the
> > > > ioctl data or from the keyring.
> > > > 
> > > > Two new SED ioctls have also been added. These are:
> > > >   1) IOC_OPAL_REVERT_LSP to revert LSP state
> > > >   2) IOC_OPAL_DISCOVERY to discover drive capabilities/state
> > > > 
> > > > change log v5:
> > > > - rebase to for-6.5/block
> > > > 
> > > > change log v4:
> > > > - rebase to 6.3-rc7
> > > > - replaced "255" magic number with U8_MAX
> > > > 
> > > > change log:
> > > > - rebase to 6.x
> > > > - added latest reviews
> > > > - removed platform functions for persistent key storage
> > > > - replaced key update logic with key_create_or_update()
> > > > - minor bracing and padding changes
> > > > - add error returns
> > > > - opal_key structure is application provided but kernel
> > > >   verified
> > > > - added brief description of TCG SED Opal
> > > > 
> > > > 
> > > > Greg Joyce (3):
> > > >   block: sed-opal: Implement IOC_OPAL_DISCOVERY
> > > >   block: sed-opal: Implement IOC_OPAL_REVERT_LSP
> > > >   block: sed-opal: keyring support for SED keys
> > > > 
> > > >  block/Kconfig |   2 +
> > > >  block/opal_proto.h|   4 +
> > > >  block/sed-opal.c  | 252
> > > > +-
> > > >  include/linux/sed-opal.h  |   5 +
> > > >  include/uapi/linux/sed-opal.h |  25 +++-
> > > >  5 files changed, 282 insertions(+), 6 deletions(-)
> > > > 
> > > > 
> > > > base-commit: 1341c7d2ccf42ed91aea80b8579d35bc1ea381e2
> > 
> > I can give this to all patches because they look good to me:
> > 
> > Acked-by: Jarkko Sakkinen 
> > 
> > ... but it should probably not go via my tree.
> > 
> > BR, Jarkko
>
> Thanks for the ack, Jarkko. Any thoughts on which tree it should go to?

I get from "scripts/get_maintainer.pl block/sed-opal.c | wl-copy"

Jonathan Derrick  (supporter:SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER)
Jens Axboe  (maintainer:BLOCK LAYER)
linux-bl...@vger.kernel.org (open list:SECURE ENCRYPTING DEVICE (SED) OPAL DRIVER)
linux-ker...@vger.kernel.org (open list)

You should probably add the corresponding maintainers and linux-block to
the loop. I suggest sending a new version of the patch set with my acks
added to the patches.

BR, Jarkko
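
For context on the automation angle from the cover letter: today the key travels in the ioctl payload, roughly like this hedged userspace sketch against the existing uapi (the exact field usage is my assumption; the series adds a way to say "take the key from the keyring instead"):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

static int opal_unlock_rw(const char *dev, const char *pw)
{
	struct opal_lock_unlock lk = { 0 };
	int fd, ret;

	fd = open(dev, O_RDWR);
	if (fd < 0)
		return -1;

	lk.l_state = OPAL_RW;
	lk.session.who = OPAL_ADMIN1;
	lk.session.opal_key.lr = 0;	/* global locking range */
	/* Sketch only: no bounds checking against OPAL_KEY_MAX. */
	lk.session.opal_key.key_len = strlen(pw);
	memcpy(lk.session.opal_key.key, pw, lk.session.opal_key.key_len);

	ret = ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lk);
	close(fd);
	return ret;
}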


Re: [PATCH rfc v2 05/10] powerpc: mm: use try_vma_locked_page_fault()

2023-08-22 Thread Kefeng Wang




On 2023/8/22 17:38, Christophe Leroy wrote:
> Le 21/08/2023 à 14:30, Kefeng Wang a écrit :
>> Use new try_vma_locked_page_fault() helper to simplify code.
>> No functional change intended.
>
> Does it really simplify the code? It's 32 insertions versus 34
> deletions, so it only removes 2 lines.

Yes, it is unfriendly for powerpc, as the arch's vma access check is
much more complex than on other arches.

> I don't like the struct vm_fault you are adding, because when it was
> four independent variables it was handled through local registers. Now
> that it is a struct it has to go via the stack, leading to unnecessary
> memory reads and writes. And going back and forth between architecture
> code and generic code may also hurt performance.

Because each arch uses different variables to check vma access, the
easy way was to add them into vmf; I don't see a better way.

> Did you do any performance analysis? Page faults are really a hot path
> when dealing with minor faults.

No, this is only build-tested and sent as an RFC to get feedback on the
conversion.

Thanks.

> Thanks
> Christophe



Signed-off-by: Kefeng Wang 
---
   arch/powerpc/mm/fault.c | 66 -
   1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b1723094d464..52f9546e020e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -391,6 +391,22 @@ static int page_fault_is_bad(unsigned long err)
   #define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S)
   #endif
   
+#ifdef CONFIG_PER_VMA_LOCK

+bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+   int is_exec = TRAP(vmf->regs) == INTERRUPT_INST_STORAGE;
+   int is_write = page_fault_is_write(vmf->fault_code);
+
+   if (unlikely(access_pkey_error(is_write, is_exec,
+   (vmf->fault_code & DSISR_KEYFAULT), vma)))
+   return true;
+
+   if (unlikely(access_error(is_write, is_exec, vma)))
+   return true;
+   return false;
+}
+#endif
+
   /*
* For 600- and 800-family processors, the error_code parameter is DSISR
* for a data fault, SRR1 for an instruction fault.
@@ -407,12 +423,18 @@ static int ___do_page_fault(struct pt_regs *regs, 
unsigned long address,
   {
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
-   unsigned int flags = FAULT_FLAG_DEFAULT;
int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
int is_user = user_mode(regs);
int is_write = page_fault_is_write(error_code);
vm_fault_t fault, major = 0;
bool kprobe_fault = kprobe_page_fault(regs, 11);
+   struct vm_fault vmf = {
+   .real_address = address,
+   .fault_code = error_code,
+   .regs = regs,
+   .flags = FAULT_FLAG_DEFAULT,
+   };
+
   
   	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))

return 0;
@@ -463,45 +485,21 @@ static int ___do_page_fault(struct pt_regs *regs, 
unsigned long address,
 * mmap_lock held
 */
if (is_user)
-   flags |= FAULT_FLAG_USER;
+   vmf.flags |= FAULT_FLAG_USER;
if (is_write)
-   flags |= FAULT_FLAG_WRITE;
+   vmf.flags |= FAULT_FLAG_WRITE;
if (is_exec)
-   flags |= FAULT_FLAG_INSTRUCTION;
+   vmf.flags |= FAULT_FLAG_INSTRUCTION;
   
-	if (!(flags & FAULT_FLAG_USER))

-   goto lock_mmap;
-
-   vma = lock_vma_under_rcu(mm, address);
-   if (!vma)
-   goto lock_mmap;
-
-   if (unlikely(access_pkey_error(is_write, is_exec,
-  (error_code & DSISR_KEYFAULT), vma))) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-
-   if (unlikely(access_error(is_write, is_exec, vma))) {
-   vma_end_read(vma);
-   goto lock_mmap;
-   }
-
-   fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, 
regs);
-   if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
-   vma_end_read(vma);
-
-   if (!(fault & VM_FAULT_RETRY)) {
-   count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+   fault = try_vma_locked_page_fault();
+   if (fault == VM_FAULT_NONE)
+   goto retry;
+   if (!(fault & VM_FAULT_RETRY))
goto done;
-   }
-   count_vm_vma_lock_event(VMA_LOCK_RETRY);
   
   	if (fault_signal_pending(fault, regs))

return user_mode(regs) ? 0 : SIGBUS;
   
-lock_mmap:

-
/* When running in the kernel we expect faults to occur only to
 * addresses in user space.  All other faults represent errors in the
 * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -528,7 +526,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned 
long address,
 * make sure we exit gracefully rather than endlessly redo the fault.

Re: linux-next: manual merge of the tty tree with the powerpc tree

2023-08-22 Thread Greg KH
On Fri, Aug 18, 2023 at 02:58:26PM +1000, Stephen Rothwell wrote:
> Hi all,
> 
> Today's linux-next merge of the tty tree got a conflict in:
> 
>   arch/powerpc/include/asm/fs_pd.h
> 
> between commits:
> 
>   e6e077cb2aa4 ("powerpc/include: Declare mpc8xx_immr in 8xx_immap.h")
>   fecc436a97af ("powerpc/include: Remove mpc8260.h and m82xx_pci.h")
>   fbbf4280dae4 ("powerpc/8xx: Remove immr_map() and immr_unmap()")
>   7768716d2f19 ("powerpc/cpm2: Remove cpm2_map() and cpm2_unmap()")
> 
> from the powerpc tree and commit:
> 
>   c2d6c1b4f034 ("serial: cpm_uart: Use get_baudrate() instead of 
> uart_baudrate()")
> 
> from the tty tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
> 
> Note that after all the above are applied, it looks like this file can
> be removed completely as nothing in the tree includes it any more.

Thanks for the notice, I'll let the ppc developers remove it as it's in
their tree.

thanks,

greg k-h


[PATCH AUTOSEL 6.1 9/9] powerpc/powermac: Use early_* IO variants in via_calibrate_decr()

2023-08-22 Thread Sasha Levin
From: Benjamin Gray 

[ Upstream commit 86582e6189dd8f9f52c25d46c70fe5d111da6345 ]

On a powermac platform, via the call path:

  start_kernel()
time_init()
  ppc_md.calibrate_decr() (pmac_calibrate_decr)
via_calibrate_decr()

ioremap() and iounmap() are called. The unmap can enable interrupts
unexpectedly (cond_resched() in vunmap_pmd_range()), which causes a
warning later in the boot sequence in start_kernel().

Use the early_* variants of these IO functions to prevent this.

The issue is pre-existing, but is surfaced by commit 721255b9826b
("genirq: Use a maple tree for interrupt descriptor management").

Signed-off-by: Benjamin Gray 
Reviewed-by: Christophe Leroy 
Signed-off-by: Michael Ellerman 
Link: https://msgid.link/20230706010816.72682-1-bg...@linux.ibm.com
Signed-off-by: Sasha Levin 
---
 arch/powerpc/platforms/powermac/time.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/time.c 
b/arch/powerpc/platforms/powermac/time.c
index 4c5790aff1b54..8633891b7aa58 100644
--- a/arch/powerpc/platforms/powermac/time.c
+++ b/arch/powerpc/platforms/powermac/time.c
@@ -26,8 +26,8 @@
 #include 
 #include 
 
+#include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -182,7 +182,7 @@ static int __init via_calibrate_decr(void)
return 0;
}
of_node_put(vias);
-   via = ioremap(rsrc.start, resource_size());
+   via = early_ioremap(rsrc.start, resource_size());
if (via == NULL) {
printk(KERN_ERR "Failed to map VIA for timer calibration !\n");
return 0;
@@ -207,7 +207,7 @@ static int __init via_calibrate_decr(void)
 
ppc_tb_freq = (dstart - dend) * 100 / 6;
 
-   iounmap(via);
+   early_iounmap((void *)via, resource_size(&rsrc));
 
return 1;
 }
-- 
2.40.1
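
As a general note on the API used above — a sketch, assuming the generic early_ioremap implementation and its usual header location: the early_* variants work before the normal ioremap machinery is up and, unlike iounmap(), the unmap side needs the size again:

#include <linux/init.h>
#include <linux/io.h>
#include <linux/ioport.h>
#include <asm/early_ioremap.h>	/* assumed location of the early_* prototypes */

static void __init peek_device(struct resource *rsrc)
{
	/* Usable before slab/vmalloc are up; no vmalloc page-table
	 * teardown that could cond_resched() and re-enable interrupts. */
	void __iomem *regs = early_ioremap(rsrc->start, resource_size(rsrc));

	if (!regs)
		return;

	/* ... read the device registers ... */

	/* Must be balanced before late boot, and needs the size. */
	early_iounmap(regs, resource_size(rsrc));
}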



[PATCH AUTOSEL 6.4 09/10] powerpc/powermac: Use early_* IO variants in via_calibrate_decr()

2023-08-22 Thread Sasha Levin
From: Benjamin Gray 

[ Upstream commit 86582e6189dd8f9f52c25d46c70fe5d111da6345 ]

On a powermac platform, via the call path:

  start_kernel()
time_init()
  ppc_md.calibrate_decr() (pmac_calibrate_decr)
via_calibrate_decr()

ioremap() and iounmap() are called. The unmap can enable interrupts
unexpectedly (cond_resched() in vunmap_pmd_range()), which causes a
warning later in the boot sequence in start_kernel().

Use the early_* variants of these IO functions to prevent this.

The issue is pre-existing, but is surfaced by commit 721255b9826b
("genirq: Use a maple tree for interrupt descriptor management").

Signed-off-by: Benjamin Gray 
Reviewed-by: Christophe Leroy 
Signed-off-by: Michael Ellerman 
Link: https://msgid.link/20230706010816.72682-1-bg...@linux.ibm.com
Signed-off-by: Sasha Levin 
---
 arch/powerpc/platforms/powermac/time.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/time.c 
b/arch/powerpc/platforms/powermac/time.c
index 4c5790aff1b54..8633891b7aa58 100644
--- a/arch/powerpc/platforms/powermac/time.c
+++ b/arch/powerpc/platforms/powermac/time.c
@@ -26,8 +26,8 @@
 #include 
 #include 
 
+#include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -182,7 +182,7 @@ static int __init via_calibrate_decr(void)
return 0;
}
of_node_put(vias);
-   via = ioremap(rsrc.start, resource_size());
+   via = early_ioremap(rsrc.start, resource_size());
if (via == NULL) {
printk(KERN_ERR "Failed to map VIA for timer calibration !\n");
return 0;
@@ -207,7 +207,7 @@ static int __init via_calibrate_decr(void)
 
ppc_tb_freq = (dstart - dend) * 100 / 6;
 
-   iounmap(via);
+   early_iounmap((void *)via, resource_size(&rsrc));
 
return 1;
 }
-- 
2.40.1



Re: [PATCH rfc v2 05/10] powerpc: mm: use try_vma_locked_page_fault()

2023-08-22 Thread Christophe Leroy


Le 21/08/2023 à 14:30, Kefeng Wang a écrit :
> Use new try_vma_locked_page_fault() helper to simplify code.
> No functional change intended.

Does it really simplify the code? It's 32 insertions versus 34
deletions, so it only removes 2 lines.

I don't like the struct vm_fault you are adding, because when it was
four independent variables it was handled through local registers. Now
that it is a struct it has to go via the stack, leading to unnecessary
memory reads and writes. And going back and forth between architecture
code and generic code may also hurt performance.

Did you do any performance analysis? Page faults are really a hot path
when dealing with minor faults.

Thanks
Christophe

> 
> Signed-off-by: Kefeng Wang 
> ---
>   arch/powerpc/mm/fault.c | 66 -
>   1 file changed, 32 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index b1723094d464..52f9546e020e 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -391,6 +391,22 @@ static int page_fault_is_bad(unsigned long err)
>   #define page_fault_is_bad(__err)((__err) & DSISR_BAD_FAULT_32S)
>   #endif
>   
> +#ifdef CONFIG_PER_VMA_LOCK
> +bool arch_vma_access_error(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> + int is_exec = TRAP(vmf->regs) == INTERRUPT_INST_STORAGE;
> + int is_write = page_fault_is_write(vmf->fault_code);
> +
> + if (unlikely(access_pkey_error(is_write, is_exec,
> + (vmf->fault_code & DSISR_KEYFAULT), vma)))
> + return true;
> +
> + if (unlikely(access_error(is_write, is_exec, vma)))
> + return true;
> + return false;
> +}
> +#endif
> +
>   /*
>* For 600- and 800-family processors, the error_code parameter is DSISR
>* for a data fault, SRR1 for an instruction fault.
> @@ -407,12 +423,18 @@ static int ___do_page_fault(struct pt_regs *regs, 
> unsigned long address,
>   {
>   struct vm_area_struct * vma;
>   struct mm_struct *mm = current->mm;
> - unsigned int flags = FAULT_FLAG_DEFAULT;
>   int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
>   int is_user = user_mode(regs);
>   int is_write = page_fault_is_write(error_code);
>   vm_fault_t fault, major = 0;
>   bool kprobe_fault = kprobe_page_fault(regs, 11);
> + struct vm_fault vmf = {
> + .real_address = address,
> + .fault_code = error_code,
> + .regs = regs,
> + .flags = FAULT_FLAG_DEFAULT,
> + };
> +
>   
>   if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
>   return 0;
> @@ -463,45 +485,21 @@ static int ___do_page_fault(struct pt_regs *regs, 
> unsigned long address,
>* mmap_lock held
>*/
>   if (is_user)
> - flags |= FAULT_FLAG_USER;
> + vmf.flags |= FAULT_FLAG_USER;
>   if (is_write)
> - flags |= FAULT_FLAG_WRITE;
> + vmf.flags |= FAULT_FLAG_WRITE;
>   if (is_exec)
> - flags |= FAULT_FLAG_INSTRUCTION;
> + vmf.flags |= FAULT_FLAG_INSTRUCTION;
>   
> - if (!(flags & FAULT_FLAG_USER))
> - goto lock_mmap;
> -
> - vma = lock_vma_under_rcu(mm, address);
> - if (!vma)
> - goto lock_mmap;
> -
> - if (unlikely(access_pkey_error(is_write, is_exec,
> -(error_code & DSISR_KEYFAULT), vma))) {
> - vma_end_read(vma);
> - goto lock_mmap;
> - }
> -
> - if (unlikely(access_error(is_write, is_exec, vma))) {
> - vma_end_read(vma);
> - goto lock_mmap;
> - }
> -
> - fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, 
> regs);
> - if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
> - vma_end_read(vma);
> -
> - if (!(fault & VM_FAULT_RETRY)) {
> - count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
> + fault = try_vma_locked_page_fault();
> + if (fault == VM_FAULT_NONE)
> + goto retry;
> + if (!(fault & VM_FAULT_RETRY))
>   goto done;
> - }
> - count_vm_vma_lock_event(VMA_LOCK_RETRY);
>   
>   if (fault_signal_pending(fault, regs))
>   return user_mode(regs) ? 0 : SIGBUS;
>   
> -lock_mmap:
> -
>   /* When running in the kernel we expect faults to occur only to
>* addresses in user space.  All other faults represent errors in the
>* kernel and should generate an OOPS.  Unfortunately, in the case of an
> @@ -528,7 +526,7 @@ static int ___do_page_fault(struct pt_regs *regs, 
> unsigned long address,
>* make sure we exit gracefully rather than endlessly redo
>* the fault.
>*/
> - fault = handle_mm_fault(vma, address, flags, regs);
> + fault = handle_mm_fault(vma, address, vmf.flags, regs);
>   
>   major |= fault & VM_FAULT_MAJOR;
>   
> @@ -544,7 +542,7 @@ static int 

Re: KASAN debug kernel fails to boot at early stage when CONFIG_SMP=y is set (kernel 6.5-rc5, PowerMac G4 3,6)

2023-08-22 Thread Christophe Leroy


Le 18/08/2023 à 18:23, Erhard Furtner a écrit :
> On Fri, 18 Aug 2023 15:47:38 +
> Christophe Leroy  wrote:
> 
>> I'm wondering if the problem is just linked to the kernel being built
>> with CONFIG_SMP or if it is the actual startup of a secondary CPU that
>> cause the freeze.
>>
>> Please leave the btext_unmap() in place because I think it is important
>> to keep it, and start the kernel with the following parameter:
>>
>> nr_cpus=1
> 
> With btext_unmap() back in place and nr_cpus=1 set, the freeze still happens 
> after the 1st btext_unmap:129 on cold boots:
> 
> [0.00] printk: bootconsole [udbg0] enabled
> [0.00] Total memory = 2048MB; using 4096kB for hash table
> [0.00] mapin_ram:125
> [0.00] mmu_mapin_ram:169 0 3000 140 200
> [0.00] __mmu_mapin_ram:146 0 140
> [0.00] __mmu_mapin_ram:155 140
> [0.00] __mmu_mapin_ram:146 140 3000
> [0.00] __mmu_mapin_ram:155 2000
> [0.00] __mapin_ram_chunk:107 2000 3000
> [0.00] __mapin_ram_chunk:117
> [0.00] mapin_ram:134
> [0.00] kasan_mmu_init:129
> [0.00] kasan_mmu_init:132 0
> [0.00] kasan_mmu_init:137
> [0.00] btext_unmap:129
> 


Thanks,

Can you replace the call to btext_unmap() with a call to btext_map() at 
the end of MMU_init()?

If that gives no interesting result, can you leave the call to 
btext_unmap() and add a call to btext_map() at the very beginning of 
function start_kernel() in init/main.c? (You may have to add an include 
of asm/btext.h.)

With that I hope we can see more stuff.

Christophe


Re: linux-next: build failure after merge of the mm tree

2023-08-22 Thread Michael Ellerman
Matthew Wilcox  writes:
> On Tue, Aug 22, 2023 at 09:55:37AM +1000, Stephen Rothwell wrote:
>> In file included from include/trace/trace_events.h:27,
>>  from include/trace/define_trace.h:102,
>>  from fs/xfs/xfs_trace.h:4428,
>>  from fs/xfs/xfs_trace.c:45:
>> include/linux/pgtable.h:8:25: error: initializer element is not constant
>> 8 | #define PMD_ORDER   (PMD_SHIFT - PAGE_SHIFT)
>
> Ummm.  PowerPC doesn't have a compile-time constant PMD size?

Yeah. The joys of supporting two MMUs with different supported page
sizes in a single kernel binary.

> arch/powerpc/include/asm/book3s/64/pgtable.h:#define PMD_SHIFT  (PAGE_SHIFT + 
> PTE_INDEX_SIZE)
> arch/powerpc/include/asm/book3s/64/pgtable.h:#define PTE_INDEX_SIZE  
> __pte_index_size
>
> That's really annoying.  I'll try to work around it.

Sorry, thanks.

cheers
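
The failure mode is easy to reproduce in isolation; a minimal sketch (the defines mirror the ones quoted above; the exact type of __pte_index_size is an assumption):

#define PAGE_SHIFT	12
extern unsigned long __pte_index_size;		/* set during early boot */
#define PTE_INDEX_SIZE	__pte_index_size
#define PMD_SHIFT	(PAGE_SHIFT + PTE_INDEX_SIZE)
#define PMD_ORDER	(PMD_SHIFT - PAGE_SHIFT)

/* Fine inside a function, where it's an ordinary runtime expression: */
unsigned long pmd_order(void) { return PMD_ORDER; }

/* But not in a static initializer, which must be a constant
 * expression; this is the tracepoint failure above: */
static const unsigned long order = PMD_ORDER;	/* error: initializer element is not constant */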


Re: [PATCH v6 02/14] x86/kexec: refactor for kernel/Kconfig.kexec

2023-08-22 Thread Michal Suchánek
Hello,

On Thu, Jul 13, 2023 at 07:13:57PM +0800, Leizhen (ThunderTown) wrote:
> 
> 
> On 2023/7/13 0:15, Eric DeVolder wrote:
> > The kexec and crash kernel options are provided in the common
> > kernel/Kconfig.kexec. Utilize the common options and provide
> > the ARCH_SUPPORTS_ and ARCH_SELECTS_ entries to recreate the
> > equivalent set of KEXEC and CRASH options.
> > 
> > Signed-off-by: Eric DeVolder 
> > ---
> >  arch/x86/Kconfig | 92 ++--
> >  1 file changed, 19 insertions(+), 73 deletions(-)
> > 
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 7422db409770..9767a343f7c2 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -2040,88 +2040,34 @@ config EFI_RUNTIME_MAP
> >  
> >  source "kernel/Kconfig.hz"
> >  
> > -config KEXEC
> > -   bool "kexec system call"
> > -   select KEXEC_CORE
> > -   help
> > - kexec is a system call that implements the ability to shutdown your
> > - current kernel, and to start another kernel.  It is like a reboot
> > - but it is independent of the system firmware.   And like a reboot
> > - you can start any kernel with it, not just Linux.
> > -
> > - The name comes from the similarity to the exec system call.
> > -
> > - It is an ongoing process to be certain the hardware in a machine
> > - is properly shutdown, so do not be surprised if this code does not
> > - initially work for you.  As of this writing the exact hardware
> > - interface is strongly in flux, so no good recommendation can be
> > - made.
> > -
> > -config KEXEC_FILE
> > -   bool "kexec file based system call"
> > -   select KEXEC_CORE
> > -   select HAVE_IMA_KEXEC if IMA
> > -   depends on X86_64
> > -   depends on CRYPTO=y
> > -   depends on CRYPTO_SHA256=y
> > -   help
> > - This is new version of kexec system call. This system call is
> > - file based and takes file descriptors as system call argument
> > - for kernel and initramfs as opposed to list of segments as
> > - accepted by previous system call.
> > +config ARCH_SUPPORTS_KEXEC
> > +   def_bool y
> 
> In v5, Joel Fernandes seems to suggest you change it to the following form:

It's unfortunate that the suggestion did not make it to the mailing list.

> In arch/Kconfig:
> +config ARCH_SUPPORTS_KEXEC
> + bool
> 
> In arch/x86/Kconfig:
> config X86
>   ... ...
> + select ARCH_SUPPORTS_KEXEC
> 
> In arch/arm64/Kconfig:
> config ARM64
>   ... ...
> + select ARCH_SUPPORTS_KEXEC if PM_SLEEP_SMP

Which might work for this case

> 
> etc..
> 
> You can refer to ARCH_HAS_DEBUG_VIRTUAL.
> 
> >  
> > -config ARCH_HAS_KEXEC_PURGATORY
> > -   def_bool KEXEC_FILE
> > +config ARCH_SUPPORTS_KEXEC_FILE
> > +   def_bool X86_64 && CRYPTO && CRYPTO_SHA256
> >  
> > -config KEXEC_SIG
> > -   bool "Verify kernel signature during kexec_file_load() syscall"
> > +config ARCH_SELECTS_KEXEC_FILE
> > +   def_bool y
> > depends on KEXEC_FILE
> > -   help
> > +   select HAVE_IMA_KEXEC if IMA

but not this case, at least not this trivially.

So for consistency it looks better to keep it as is.

Thanks

Michal

> >  
> > - This option makes the kexec_file_load() syscall check for a valid
> > - signature of the kernel image.  The image can still be loaded without
> > - a valid signature unless you also enable KEXEC_SIG_FORCE, though if
> > - there's a signature that we can check, then it must be valid.
> > +config ARCH_HAS_KEXEC_PURGATORY
> > +   def_bool KEXEC_FILE
> >  
> > - In addition to this option, you need to enable signature
> > - verification for the corresponding kernel image type being
> > - loaded in order for this to work.
> > +config ARCH_SUPPORTS_KEXEC_SIG
> > +   def_bool y
> >  
> > -config KEXEC_SIG_FORCE
> > -   bool "Require a valid signature in kexec_file_load() syscall"
> > -   depends on KEXEC_SIG
> > -   help
> > - This option makes kernel signature verification mandatory for
> > - the kexec_file_load() syscall.
> > +config ARCH_SUPPORTS_KEXEC_SIG_FORCE
> > +   def_bool y
> >  
> > -config KEXEC_BZIMAGE_VERIFY_SIG
> > -   bool "Enable bzImage signature verification support"
> > -   depends on KEXEC_SIG
> > -   depends on SIGNED_PE_FILE_VERIFICATION
> > -   select SYSTEM_TRUSTED_KEYRING
> > -   help
> > - Enable bzImage signature verification support.
> > +config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
> > +   def_bool y
> >  
> > -config CRASH_DUMP
> > -   bool "kernel crash dumps"
> > -   depends on X86_64 || (X86_32 && HIGHMEM)
> > -   help
> > - Generate crash dump after being started by kexec.
> > - This should be normally only set in special crash dump kernels
> > - which are loaded in the main kernel with kexec-tools into
> > - a specially reserved region and then later executed after
> > - a crash by kdump/kexec. The crash dump kernel must be compiled
> > - to a memory address not used by the main kernel or BIOS using
> > - 

[PATCH] powerpc/85xx: Mark some functions static and add missing includes to fix no previous prototype error

2023-08-22 Thread Christophe Leroy
corenet{32/64}_smp_defconfig leads to:

  CC  arch/powerpc/sysdev/ehv_pic.o
arch/powerpc/sysdev/ehv_pic.c:45:6: error: no previous prototype for 
'ehv_pic_unmask_irq' [-Werror=missing-prototypes]
   45 | void ehv_pic_unmask_irq(struct irq_data *d)
  |  ^~
arch/powerpc/sysdev/ehv_pic.c:52:6: error: no previous prototype for 
'ehv_pic_mask_irq' [-Werror=missing-prototypes]
   52 | void ehv_pic_mask_irq(struct irq_data *d)
  |  ^~~~
arch/powerpc/sysdev/ehv_pic.c:59:6: error: no previous prototype for 
'ehv_pic_end_irq' [-Werror=missing-prototypes]
   59 | void ehv_pic_end_irq(struct irq_data *d)
  |  ^~~
arch/powerpc/sysdev/ehv_pic.c:66:6: error: no previous prototype for 
'ehv_pic_direct_end_irq' [-Werror=missing-prototypes]
   66 | void ehv_pic_direct_end_irq(struct irq_data *d)
  |  ^~
arch/powerpc/sysdev/ehv_pic.c:71:5: error: no previous prototype for 
'ehv_pic_set_affinity' [-Werror=missing-prototypes]
   71 | int ehv_pic_set_affinity(struct irq_data *d, const struct cpumask *dest,
  | ^~~~
arch/powerpc/sysdev/ehv_pic.c:112:5: error: no previous prototype for 
'ehv_pic_set_irq_type' [-Werror=missing-prototypes]
  112 | int ehv_pic_set_irq_type(struct irq_data *d, unsigned int flow_type)
  | ^~~~
  CC  arch/powerpc/sysdev/fsl_rio.o
arch/powerpc/sysdev/fsl_rio.c:102:5: error: no previous prototype for 
'fsl_rio_mcheck_exception' [-Werror=missing-prototypes]
  102 | int fsl_rio_mcheck_exception(struct pt_regs *regs)
  | ^~~~
arch/powerpc/sysdev/fsl_rio.c:306:5: error: no previous prototype for 
'fsl_map_inb_mem' [-Werror=missing-prototypes]
  306 | int fsl_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart,
  | ^~~
arch/powerpc/sysdev/fsl_rio.c:357:6: error: no previous prototype for 
'fsl_unmap_inb_mem' [-Werror=missing-prototypes]
  357 | void fsl_unmap_inb_mem(struct rio_mport *mport, dma_addr_t lstart)
  |  ^
arch/powerpc/sysdev/fsl_rio.c:445:5: error: no previous prototype for 
'fsl_rio_setup' [-Werror=missing-prototypes]
  445 | int fsl_rio_setup(struct platform_device *dev)
  | ^
  CC  arch/powerpc/sysdev/fsl_rmu.o
arch/powerpc/sysdev/fsl_rmu.c:362:6: error: no previous prototype for 
'msg_unit_error_handler' [-Werror=missing-prototypes]
  362 | void msg_unit_error_handler(void)
  |  ^~
  CC  arch/powerpc/platforms/85xx/corenet_generic.o
arch/powerpc/platforms/85xx/corenet_generic.c:33:13: error: no previous 
prototype for 'corenet_gen_pic_init' [-Werror=missing-prototypes]
   33 | void __init corenet_gen_pic_init(void)
  | ^~~~
arch/powerpc/platforms/85xx/corenet_generic.c:51:13: error: no previous 
prototype for 'corenet_gen_setup_arch' [-Werror=missing-prototypes]
   51 | void __init corenet_gen_setup_arch(void)
  | ^~
arch/powerpc/platforms/85xx/corenet_generic.c:104:12: error: no previous 
prototype for 'corenet_gen_publish_devices' [-Werror=missing-prototypes]
  104 | int __init corenet_gen_publish_devices(void)
  |^~~
  CC  arch/powerpc/platforms/85xx/qemu_e500.o
arch/powerpc/platforms/85xx/qemu_e500.c:28:13: error: no previous prototype for 
'qemu_e500_pic_init' [-Werror=missing-prototypes]
   28 | void __init qemu_e500_pic_init(void)
  | ^~
  CC  arch/powerpc/kernel/pmc.o
arch/powerpc/kernel/pmc.c:78:6: error: no previous prototype for 
'power4_enable_pmcs' [-Werror=missing-prototypes]
   78 | void power4_enable_pmcs(void)
  |  ^~

Signed-off-by: Christophe Leroy 
Cc: Arnd Bergmann 
---
 arch/powerpc/kernel/pmc.c |  2 +-
 arch/powerpc/platforms/85xx/corenet_generic.c |  6 +++---
 arch/powerpc/platforms/85xx/qemu_e500.c   |  2 +-
 arch/powerpc/sysdev/ehv_pic.c | 12 ++--
 arch/powerpc/sysdev/fsl_rio.c |  9 +
 arch/powerpc/sysdev/fsl_rmu.c |  2 +-
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c
index 15414c8a2837..9fabb4d9235e 100644
--- a/arch/powerpc/kernel/pmc.c
+++ b/arch/powerpc/kernel/pmc.c
@@ -74,7 +74,7 @@ void release_pmc_hardware(void)
 }
 EXPORT_SYMBOL_GPL(release_pmc_hardware);
 
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_BOOK3S_64
 void power4_enable_pmcs(void)
 {
unsigned long hid0;
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c 
b/arch/powerpc/platforms/85xx/corenet_generic.c
index bfde391c42f4..645fcca77cde 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -30,7 +30,7 @@
 #include "smp.h"
 #include "mpc85xx.h"
 
-void __init corenet_gen_pic_init(void)
+static void __init corenet_gen_pic_init(void)

[PATCH] powerpc/64e: Fix circular dependency with CONFIG_SMP disabled

2023-08-22 Thread Christophe Leroy
asm/percpu.h includes asm/paca.h, which needs struct tlb_core_data,
which is defined in mmu-e500.h.

asm/percpu.h is included from asm/mmu.h in a #ifdef CONFIG_PPC_E500
block, before the inclusion of mmu-e500.h.

To fix that, move the inclusion of asm/percpu.h into mmu-e500.h,
after the definition of struct tlb_core_data.

Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202308220708.nrf5auae-...@intel.com/
Closes: 
https://lore.kernel.org/oe-kbuild-all/202308220857.ufq2oaxm-...@intel.com/
Closes: 
https://lore.kernel.org/oe-kbuild-all/202308221055.lw3uzjil-...@intel.com/
Fixes: 3a24ea0df83e ("powerpc/kuap: Use ASM feature fixups instead of static 
branches")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu.h | 5 -
 arch/powerpc/include/asm/nohash/mmu-e500.h | 3 +++
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 82af2e2c5eca..52cc25864a1b 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -144,11 +144,6 @@
 
 typedef pte_t *pgtable_t;
 
-#ifdef CONFIG_PPC_E500
-#include <asm/percpu.h>
-DECLARE_PER_CPU(int, next_tlbcam_idx);
-#endif
-
 enum {
MMU_FTRS_POSSIBLE =
 #if defined(CONFIG_PPC_BOOK3S_604)
diff --git a/arch/powerpc/include/asm/nohash/mmu-e500.h 
b/arch/powerpc/include/asm/nohash/mmu-e500.h
index e43a418d3ccd..6ddced0415cb 100644
--- a/arch/powerpc/include/asm/nohash/mmu-e500.h
+++ b/arch/powerpc/include/asm/nohash/mmu-e500.h
@@ -319,6 +319,9 @@ extern int book3e_htw_mode;
 
 #endif
 
+#include <asm/percpu.h>
+DECLARE_PER_CPU(int, next_tlbcam_idx);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */
-- 
2.41.0