[RFC PATCH 12/12] selftests/fpu: Allow building on other architectures

2023-12-07 Thread Samuel Holland
Now that ARCH_HAS_KERNEL_FPU_SUPPORT provides a common way to compile
and run floating-point code, this test is no longer x86-specific.

Signed-off-by: Samuel Holland 
---

 lib/Kconfig.debug   |  2 +-
 lib/Makefile| 25 ++---
 lib/test_fpu_glue.c |  5 -
 3 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cc7d53d9dc01..bbab0b054e09 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2933,7 +2933,7 @@ config TEST_FREE_PAGES
 
 config TEST_FPU
tristate "Test floating point operations in kernel space"
-   depends on X86 && !KCOV_INSTRUMENT_ALL
+   depends on ARCH_HAS_KERNEL_FPU_SUPPORT && !KCOV_INSTRUMENT_ALL
help
  Enable this option to add /sys/kernel/debug/selftest_helpers/test_fpu
  which will trigger a sequence of floating point operations. This is 
used
diff --git a/lib/Makefile b/lib/Makefile
index e7cbd54944a2..b9f28558c9bd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -109,31 +109,10 @@ CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE)
 obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o
 obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o
 
-#
-# CFLAGS for compiling floating point code inside the kernel. x86/Makefile 
turns
-# off the generation of FPU/SSE* instructions for kernel proper but FPU_FLAGS
-# get appended last to CFLAGS and thus override those previous compiler 
options.
-#
-FPU_CFLAGS := -msse -msse2
-ifdef CONFIG_CC_IS_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
-#
-# The "-msse" in the first argument is there so that the
-# -mpreferred-stack-boundary=3 build error:
-#
-#  -mpreferred-stack-boundary=3 is not between 4 and 12
-#
-# can be triggered. Otherwise gcc doesn't complain.
-FPU_CFLAGS += -mhard-float
-FPU_CFLAGS += $(call cc-option,-msse 
-mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
-endif
-
 obj-$(CONFIG_TEST_FPU) += test_fpu.o
 test_fpu-y := test_fpu_glue.o test_fpu_impl.o
-CFLAGS_test_fpu_impl.o += $(FPU_CFLAGS)
+CFLAGS_test_fpu_impl.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_test_fpu_impl.o += $(CC_FLAGS_NO_FPU)
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
diff --git a/lib/test_fpu_glue.c b/lib/test_fpu_glue.c
index 2761b51117b0..2e0b4027a5e3 100644
--- a/lib/test_fpu_glue.c
+++ b/lib/test_fpu_glue.c
@@ -17,7 +17,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 int test_fpu(void);
 
@@ -38,6 +38,9 @@ static struct dentry *selftest_dir;
 
 static int __init test_fpu_init(void)
 {
+   if (!kernel_fpu_available())
+   return -EINVAL;
+
selftest_dir = debugfs_create_dir("selftest_helpers", NULL);
if (!selftest_dir)
return -ENOMEM;
-- 
2.42.0



[RFC PATCH 10/12] drm/amd/display: Use ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-07 Thread Samuel Holland
Now that all previously-supported architectures select
ARCH_HAS_KERNEL_FPU_SUPPORT, this code can depend on that symbol instead
of the existing list of architectures. It can also take advantage of the
common kernel-mode FPU API and method of adjusting CFLAGS.

Signed-off-by: Samuel Holland 
---

 drivers/gpu/drm/amd/display/Kconfig   |  2 +-
 .../gpu/drm/amd/display/amdgpu_dm/dc_fpu.c| 33 +
 drivers/gpu/drm/amd/display/dc/dml/Makefile   | 36 ++-
 drivers/gpu/drm/amd/display/dc/dml2/Makefile  | 36 ++-
 4 files changed, 6 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/Kconfig 
b/drivers/gpu/drm/amd/display/Kconfig
index 901d1961b739..5fcd4f778dc3 100644
--- a/drivers/gpu/drm/amd/display/Kconfig
+++ b/drivers/gpu/drm/amd/display/Kconfig
@@ -8,7 +8,7 @@ config DRM_AMD_DC
depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64
select SND_HDA_COMPONENT if SND_HDA_CORE
# !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752
-   select DRM_AMD_DC_FP if (X86 || LOONGARCH || (PPC64 && ALTIVEC) || 
(ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG))
+   select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || 
!CC_IS_CLANG)
help
  Choose this option if you want to use the new display engine
  support for AMDGPU. This adds required support for Vega and
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
index 4ae4720535a5..b64f917174ca 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
@@ -26,16 +26,7 @@
 
 #include "dc_trace.h"
 
-#if defined(CONFIG_X86)
-#include 
-#elif defined(CONFIG_PPC64)
-#include 
-#include 
-#elif defined(CONFIG_ARM64)
-#include 
-#elif defined(CONFIG_LOONGARCH)
 #include 
-#endif
 
 /**
  * DOC: DC FPU manipulation overview
@@ -87,20 +78,9 @@ void dc_fpu_begin(const char *function_name, const int line)
WARN_ON_ONCE(!in_task());
preempt_disable();
depth = __this_cpu_inc_return(fpu_recursion_depth);
-
if (depth == 1) {
-#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
+   BUG_ON(!kernel_fpu_available());
kernel_fpu_begin();
-#elif defined(CONFIG_PPC64)
-   if (cpu_has_feature(CPU_FTR_VSX_COMP))
-   enable_kernel_vsx();
-   else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP))
-   enable_kernel_altivec();
-   else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
-   enable_kernel_fp();
-#elif defined(CONFIG_ARM64)
-   kernel_neon_begin();
-#endif
}
 
TRACE_DCN_FPU(true, function_name, line, depth);
@@ -122,18 +102,7 @@ void dc_fpu_end(const char *function_name, const int line)
 
depth = __this_cpu_dec_return(fpu_recursion_depth);
if (depth == 0) {
-#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
kernel_fpu_end();
-#elif defined(CONFIG_PPC64)
-   if (cpu_has_feature(CPU_FTR_VSX_COMP))
-   disable_kernel_vsx();
-   else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP))
-   disable_kernel_altivec();
-   else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
-   disable_kernel_fp();
-#elif defined(CONFIG_ARM64)
-   kernel_neon_end();
-#endif
} else {
WARN_ON_ONCE(depth < 0);
}
diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile 
b/drivers/gpu/drm/amd/display/dc/dml/Makefile
index ea7d60f9a9b4..5aad0f572ba3 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
+++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
@@ -25,40 +25,8 @@
 # It provides the general basic services required by other DAL
 # subcomponents.
 
-ifdef CONFIG_X86
-dml_ccflags-$(CONFIG_CC_IS_GCC) := -mhard-float
-dml_ccflags := $(dml_ccflags-y) -msse
-endif
-
-ifdef CONFIG_PPC64
-dml_ccflags := -mhard-float -maltivec
-endif
-
-ifdef CONFIG_ARM64
-dml_rcflags := -mgeneral-regs-only
-endif
-
-ifdef CONFIG_LOONGARCH
-dml_ccflags := -mfpu=64
-dml_rcflags := -msoft-float
-endif
-
-ifdef CONFIG_CC_IS_GCC
-ifneq ($(call gcc-min-version, 70100),y)
-IS_OLD_GCC = 1
-endif
-endif
-
-ifdef CONFIG_X86
-ifdef IS_OLD_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-dml_ccflags += -mpreferred-stack-boundary=4
-else
-dml_ccflags += -msse2
-endif
-endif
+dml_ccflags := $(CC_FLAGS_FPU)
+dml_rcflags := $(CC_FLAGS_NO_FPU)
 
 ifneq ($(CONFIG_FRAME_WARN),0)
 frame_warn_flag := -Wframe-larger-than=2048
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/Makefile 
b/drivers/gpu/drm/amd/display/dc/dml2/Makefile
index acff3449b8d7..4f6c804a26ad 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/Makefile
+++ 

[RFC PATCH 11/12] selftests/fpu: Move FP code to a separate translation unit

2023-12-07 Thread Samuel Holland
This ensures no compiler-generated floating-point code can appear
outside kernel_fpu_{begin,end}() sections, and some architectures
enforce this separation.

Signed-off-by: Samuel Holland 
---

 lib/Makefile|  3 ++-
 lib/{test_fpu.c => test_fpu_glue.c} | 32 +-
 lib/test_fpu_impl.c | 35 +
 3 files changed, 38 insertions(+), 32 deletions(-)
 rename lib/{test_fpu.c => test_fpu_glue.c} (71%)
 create mode 100644 lib/test_fpu_impl.c

diff --git a/lib/Makefile b/lib/Makefile
index 6b09731d8e61..e7cbd54944a2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -132,7 +132,8 @@ FPU_CFLAGS += $(call cc-option,-msse 
-mpreferred-stack-boundary=3,-mpreferred-st
 endif
 
 obj-$(CONFIG_TEST_FPU) += test_fpu.o
-CFLAGS_test_fpu.o += $(FPU_CFLAGS)
+test_fpu-y := test_fpu_glue.o test_fpu_impl.o
+CFLAGS_test_fpu_impl.o += $(FPU_CFLAGS)
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
diff --git a/lib/test_fpu.c b/lib/test_fpu_glue.c
similarity index 71%
rename from lib/test_fpu.c
rename to lib/test_fpu_glue.c
index e82db19fed84..2761b51117b0 100644
--- a/lib/test_fpu.c
+++ b/lib/test_fpu_glue.c
@@ -19,37 +19,7 @@
 #include 
 #include 
 
-static int test_fpu(void)
-{
-   /*
-* This sequence of operations tests that rounding mode is
-* to nearest and that denormal numbers are supported.
-* Volatile variables are used to avoid compiler optimizing
-* the calculations away.
-*/
-   volatile double a, b, c, d, e, f, g;
-
-   a = 4.0;
-   b = 1e-15;
-   c = 1e-310;
-
-   /* Sets precision flag */
-   d = a + b;
-
-   /* Result depends on rounding mode */
-   e = a + b / 2;
-
-   /* Denormal and very large values */
-   f = b / c;
-
-   /* Depends on denormal support */
-   g = a + c * f;
-
-   if (d > a && e > a && g > a)
-   return 0;
-   else
-   return -EINVAL;
-}
+int test_fpu(void);
 
 static int test_fpu_get(void *data, u64 *val)
 {
diff --git a/lib/test_fpu_impl.c b/lib/test_fpu_impl.c
new file mode 100644
index ..2ff01980bc22
--- /dev/null
+++ b/lib/test_fpu_impl.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include 
+
+int test_fpu(void)
+{
+   /*
+* This sequence of operations tests that rounding mode is
+* to nearest and that denormal numbers are supported.
+* Volatile variables are used to avoid compiler optimizing
+* the calculations away.
+*/
+   volatile double a, b, c, d, e, f, g;
+
+   a = 4.0;
+   b = 1e-15;
+   c = 1e-310;
+
+   /* Sets precision flag */
+   d = a + b;
+
+   /* Result depends on rounding mode */
+   e = a + b / 2;
+
+   /* Denormal and very large values */
+   f = b / c;
+
+   /* Depends on denormal support */
+   g = a + c * f;
+
+   if (d > a && e > a && g > a)
+   return 0;
+   else
+   return -EINVAL;
+}
-- 
2.42.0



[RFC PATCH 09/12] riscv: Add support for kernel-mode FPU

2023-12-07 Thread Samuel Holland
This is motivated by the amdgpu DRM driver, which needs floating-point
code to support recent hardware. That code is not performance-critical,
so only provide a minimal non-preemptible implementation for now.

Use a similar trick as ARM to force placing floating-point code in a
separate translation unit, so it is not possible for compiler-generated
floating-point code to appear outside kernel_fpu_{begin,end}().

Signed-off-by: Samuel Holland 
---

 arch/riscv/Kconfig  |  1 +
 arch/riscv/Makefile |  3 +++
 arch/riscv/include/asm/fpu.h| 26 ++
 arch/riscv/kernel/Makefile  |  1 +
 arch/riscv/kernel/kernel_mode_fpu.c | 28 
 5 files changed, 59 insertions(+)
 create mode 100644 arch/riscv/include/asm/fpu.h
 create mode 100644 arch/riscv/kernel/kernel_mode_fpu.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 95a2a06acc6a..cf0967928e6d 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -27,6 +27,7 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if FPU
select ARCH_HAS_MMIOWB
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index a74be78678eb..2e719c369210 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -81,6 +81,9 @@ KBUILD_CFLAGS += -march=$(shell echo $(riscv-march-y) | sed 
-E 's/(rv32ima|rv64i
 
 KBUILD_AFLAGS += -march=$(riscv-march-y)
 
+# For C code built with floating-point support, exclude V but keep F and D.
+CC_FLAGS_FPU  := -march=$(shell echo $(riscv-march-y) | sed -E 
's/(rv32ima|rv64ima)([^v_]*)v?/\1\2/')
+
 KBUILD_CFLAGS += -mno-save-restore
 KBUILD_CFLAGS += -DCONFIG_PAGE_OFFSET=$(CONFIG_PAGE_OFFSET)
 
diff --git a/arch/riscv/include/asm/fpu.h b/arch/riscv/include/asm/fpu.h
new file mode 100644
index ..8cd027acc015
--- /dev/null
+++ b/arch/riscv/include/asm/fpu.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_RISCV_FPU_H
+#define _ASM_RISCV_FPU_H
+
+#include 
+
+#define kernel_fpu_available() has_fpu()
+
+#ifdef __riscv_f
+
+#define kernel_fpu_begin() \
+   static_assert(false, "floating-point code must use a separate 
translation unit")
+#define kernel_fpu_end() kernel_fpu_begin()
+
+#else
+
+void kernel_fpu_begin(void);
+void kernel_fpu_end(void);
+
+#endif
+
+#endif /* ! _ASM_RISCV_FPU_H */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index fee22a3d1b53..662c483e338d 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_MMU) += vdso.o vdso/
 
 obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
 obj-$(CONFIG_FPU)  += fpu.o
+obj-$(CONFIG_FPU)  += kernel_mode_fpu.o
 obj-$(CONFIG_RISCV_ISA_V)  += vector.o
 obj-$(CONFIG_SMP)  += smpboot.o
 obj-$(CONFIG_SMP)  += smp.o
diff --git a/arch/riscv/kernel/kernel_mode_fpu.c 
b/arch/riscv/kernel/kernel_mode_fpu.c
new file mode 100644
index ..9b2024cc056b
--- /dev/null
+++ b/arch/riscv/kernel/kernel_mode_fpu.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+void kernel_fpu_begin(void)
+{
+   preempt_disable();
+   fstate_save(current, task_pt_regs(current));
+   csr_set(CSR_SSTATUS, SR_FS);
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+   csr_clear(CSR_SSTATUS, SR_FS);
+   fstate_restore(current, task_pt_regs(current));
+   preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
-- 
2.42.0



[RFC PATCH 08/12] x86: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-07 Thread Samuel Holland
x86 already provides kernel_fpu_begin() and kernel_fpu_end(), but in a
different header. Add a wrapper header, and export the CFLAGS
adjustments as found in lib/Makefile.

Signed-off-by: Samuel Holland 
---

 arch/x86/Kconfig   |  1 +
 arch/x86/Makefile  | 20 
 arch/x86/include/asm/fpu.h | 13 +
 3 files changed, 34 insertions(+)
 create mode 100644 arch/x86/include/asm/fpu.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3762f41bb092..1fe7f2d8d017 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -81,6 +81,7 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOVif X86_64
+   select ARCH_HAS_KERNEL_FPU_SUPPORT
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1a068de12a56..71576c8dbe79 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,6 +70,26 @@ export BITS
 KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
 KBUILD_RUSTFLAGS += 
-Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
 
+#
+# CFLAGS for compiling floating point code inside the kernel.
+#
+CC_FLAGS_FPU := -msse -msse2
+ifdef CONFIG_CC_IS_GCC
+# Stack alignment mismatch, proceed with caution.
+# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
+# (8B stack alignment).
+# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+# The "-msse" in the first argument is there so that the
+# -mpreferred-stack-boundary=3 build error:
+#
+#  -mpreferred-stack-boundary=3 is not between 4 and 12
+#
+# can be triggered. Otherwise gcc doesn't complain.
+CC_FLAGS_FPU += -mhard-float
+CC_FLAGS_FPU += $(call cc-option,-msse 
-mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
+endif
+
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)
 #
 # Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h
new file mode 100644
index ..b2743fe19339
--- /dev/null
+++ b/arch/x86/include/asm/fpu.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+#include 
+
+#define kernel_fpu_available() true
+
+#endif /* ! _ASM_X86_FPU_H */
-- 
2.42.0



[RFC PATCH 07/12] powerpc: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-07 Thread Samuel Holland
PowerPC provides an equivalent to the common kernel-mode FPU API, but in
a different header and using different function names. The PowerPC API
also requires a non-preemptible context. Add a wrapper header, and
export the CFLAGS adjustments.

Signed-off-by: Samuel Holland 
---

 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/Makefile  |  5 -
 arch/powerpc/include/asm/fpu.h | 28 
 3 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/fpu.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6f105ee4f3cf..e96cb5b7c571 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT  if PPC_FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEMREMAP_COMPAT_ALIGN   if PPC_64S_HASH_MMU
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index f19dbaa1d541..2d5f21baf6ff 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -142,6 +142,9 @@ CFLAGS-$(CONFIG_PPC32)  += $(call cc-option, 
$(MULTIPLEWORD))
 
 CFLAGS-$(CONFIG_PPC32) += $(call cc-option,-mno-readonly-in-sdata)
 
+CC_FLAGS_FPU   := $(call cc-option,-mhard-float)
+CC_FLAGS_NO_FPU+= $(call cc-option,-msoft-float)
+
 ifdef CONFIG_FUNCTION_TRACER
 ifdef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY
 KBUILD_CPPFLAGS+= -DCC_USING_PATCHABLE_FUNCTION_ENTRY
@@ -163,7 +166,7 @@ asinstr := $(call as-instr,lis 
9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
 
 KBUILD_CPPFLAGS+= -I $(srctree)/arch/$(ARCH) $(asinstr)
 KBUILD_AFLAGS  += $(AFLAGS-y)
-KBUILD_CFLAGS  += $(call cc-option,-msoft-float)
+KBUILD_CFLAGS  += $(CC_FLAGS_NO_FPU)
 KBUILD_CFLAGS  += $(CFLAGS-y)
 CPP= $(CC) -E $(KBUILD_CFLAGS)
 
diff --git a/arch/powerpc/include/asm/fpu.h b/arch/powerpc/include/asm/fpu.h
new file mode 100644
index ..ca584e4bc40f
--- /dev/null
+++ b/arch/powerpc/include/asm/fpu.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_POWERPC_FPU_H
+#define _ASM_POWERPC_FPU_H
+
+#include 
+
+#include 
+#include 
+
+#define kernel_fpu_available() (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
+
+static inline void kernel_fpu_begin(void)
+{
+   preempt_disable();
+   enable_kernel_fp();
+}
+
+static inline void kernel_fpu_end(void)
+{
+   disable_kernel_fp();
+   preempt_enable();
+}
+
+#endif /* ! _ASM_POWERPC_FPU_H */
-- 
2.42.0



[RFC PATCH 05/12] lib/raid6: Use CC_FLAGS_FPU for NEON CFLAGS

2023-12-07 Thread Samuel Holland
Now that CC_FLAGS_FPU is exported and can be used anywhere in the source
tree, use it instead of duplicating the flags here.

Signed-off-by: Samuel Holland 
---

 lib/raid6/Makefile | 31 ---
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 1c5420ff254e..309fea97efc6 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -33,25 +33,6 @@ CFLAGS_REMOVE_vpermxor8.o += -msoft-float
 endif
 endif
 
-# The GCC option -ffreestanding is required in order to compile code containing
-# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
-ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
-NEON_FLAGS := -ffreestanding
-# Enable 
-NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include)
-ifeq ($(ARCH),arm)
-NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
-endif
-CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
-ifeq ($(ARCH),arm64)
-CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
-endif
-endif
-
 quiet_cmd_unroll = UNROLL  $@
   cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@
 
@@ -75,10 +56,14 @@ targets += vpermxor1.c vpermxor2.c vpermxor4.c vpermxor8.c
 $(obj)/vpermxor%.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
 
-CFLAGS_neon1.o += $(NEON_FLAGS)
-CFLAGS_neon2.o += $(NEON_FLAGS)
-CFLAGS_neon4.o += $(NEON_FLAGS)
-CFLAGS_neon8.o += $(NEON_FLAGS)
+CFLAGS_neon1.o += $(CC_FLAGS_FPU)
+CFLAGS_neon2.o += $(CC_FLAGS_FPU)
+CFLAGS_neon4.o += $(CC_FLAGS_FPU)
+CFLAGS_neon8.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_neon1.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon2.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon4.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon8.o += $(CC_FLAGS_NO_FPU)
 targets += neon1.c neon2.c neon4.c neon8.c
 $(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
-- 
2.42.0



[RFC PATCH 06/12] LoongArch: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-07 Thread Samuel Holland
LoongArch already provides kernel_fpu_begin() and kernel_fpu_end() in
asm/fpu.h, so it only needs to add kernel_fpu_available() and export
the CFLAGS adjustments.

Signed-off-by: Samuel Holland 
---

 arch/loongarch/Kconfig   | 1 +
 arch/loongarch/Makefile  | 5 -
 arch/loongarch/include/asm/fpu.h | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index ee123820a476..65d4475565b8 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -15,6 +15,7 @@ config LOONGARCH
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if CPU_HAS_FPU
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_SPECIAL
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index 204b94b2e6aa..f5c4f7e921db 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -25,6 +25,9 @@ endif
 32bit-emul = elf32loongarch
 64bit-emul = elf64loongarch
 
+CC_FLAGS_FPU   := -mfpu=64
+CC_FLAGS_NO_FPU:= -msoft-float
+
 ifdef CONFIG_DYNAMIC_FTRACE
 KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
 CC_FLAGS_FTRACE := -fpatchable-function-entry=2
@@ -46,7 +49,7 @@ ld-emul   = $(64bit-emul)
 cflags-y   += -mabi=lp64s
 endif
 
-cflags-y   += -pipe -msoft-float
+cflags-y   += -pipe $(CC_FLAGS_NO_FPU)
 LDFLAGS_vmlinux+= -static -n -nostdlib
 
 # When the assembler supports explicit relocation hint, we must use it.
diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h
index c2d8962fda00..3177674228f8 100644
--- a/arch/loongarch/include/asm/fpu.h
+++ b/arch/loongarch/include/asm/fpu.h
@@ -21,6 +21,7 @@
 
 struct sigcontext;
 
+#define kernel_fpu_available() cpu_has_fpu
 extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
 
-- 
2.42.0



[RFC PATCH 04/12] arm64: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-07 Thread Samuel Holland
arm64 provides an equivalent to the common kernel-mode FPU API, but in a
different header and using different function names. Add a wrapper
header, and export CFLAGS adjustments as found in lib/raid6/Makefile.

Signed-off-by: Samuel Holland 
---

 arch/arm64/Kconfig   |  1 +
 arch/arm64/Makefile  |  9 -
 arch/arm64/include/asm/fpu.h | 17 +
 3 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/include/asm/fpu.h

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b071a00425d..485ac389ac11 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -30,6 +30,7 @@ config ARM64
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 9a2d3723cd0f..4a65f24c7998 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -36,7 +36,14 @@ ifeq ($(CONFIG_BROKEN_GAS_INST),y)
 $(warning Detected assembler with broken .inst; disassembly will be unreliable)
 endif
 
-KBUILD_CFLAGS  += -mgeneral-regs-only  \
+# The GCC option -ffreestanding is required in order to compile code containing
+# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
+CC_FLAGS_FPU   := -ffreestanding
+# Enable 
+CC_FLAGS_FPU   += -isystem $(shell $(CC) -print-file-name=include)
+CC_FLAGS_NO_FPU:= -mgeneral-regs-only
+
+KBUILD_CFLAGS  += $(CC_FLAGS_NO_FPU) \
   $(compat_vdso) $(cc_has_k_constraint)
 KBUILD_CFLAGS  += $(call cc-disable-warning, psabi)
 KBUILD_AFLAGS  += $(compat_vdso)
diff --git a/arch/arm64/include/asm/fpu.h b/arch/arm64/include/asm/fpu.h
new file mode 100644
index ..664c0a192ab1
--- /dev/null
+++ b/arch/arm64/include/asm/fpu.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * linux/arch/arm64/include/asm/fpu.h
+ *
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef __ASM_FPU_H
+#define __ASM_FPU_H
+
+#include 
+
+#define kernel_fpu_available() cpu_has_neon()
+#define kernel_fpu_begin() kernel_neon_begin()
+#define kernel_fpu_end()   kernel_neon_end()
+
+#endif /* ! __ASM_FPU_H */
-- 
2.42.0



[RFC PATCH 03/12] ARM: crypto: Use CC_FLAGS_FPU for NEON CFLAGS

2023-12-07 Thread Samuel Holland
Now that CC_FLAGS_FPU is exported and can be used anywhere in the source
tree, use it instead of duplicating the flags here.

Signed-off-by: Samuel Holland 
---

 arch/arm/lib/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 650404be6768..0ca5aae1bcc3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -40,8 +40,7 @@ $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:  $(obj)/csumpartialcopygeneric.S
 
 ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
-  NEON_FLAGS   := -march=armv7-a -mfloat-abi=softfp -mfpu=neon
-  CFLAGS_xor-neon.o+= $(NEON_FLAGS)
+  CFLAGS_xor-neon.o+= $(CC_FLAGS_FPU)
   obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
 endif
 
-- 
2.42.0



[RFC PATCH 02/12] ARM: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-07 Thread Samuel Holland
ARM provides an equivalent to the common kernel-mode FPU API, but in a
different header and using different function names. Add a wrapper
header, and export CFLAGS adjustments as found in lib/raid6/Makefile.

Signed-off-by: Samuel Holland 
---

 arch/arm/Kconfig   |  1 +
 arch/arm/Makefile  |  7 +++
 arch/arm/include/asm/fpu.h | 17 +
 3 files changed, 25 insertions(+)
 create mode 100644 arch/arm/include/asm/fpu.h

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f8567e95f98b..92e21a4a2903 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -14,6 +14,7 @@ config ARM
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 5ba42f69f8ce..1dd860dba5f5 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -130,6 +130,13 @@ endif
 # Accept old syntax despite ".syntax unified"
 AFLAGS_NOWARN  :=$(call 
as-option,-Wa$(comma)-mno-warn-deprecated,-Wa$(comma)-W)
 
+# The GCC option -ffreestanding is required in order to compile code containing
+# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
+CC_FLAGS_FPU   := -ffreestanding
+# Enable 
+CC_FLAGS_FPU   += -isystem $(shell $(CC) -print-file-name=include)
+CC_FLAGS_FPU   += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+
 ifeq ($(CONFIG_THUMB2_KERNEL),y)
 CFLAGS_ISA :=-Wa,-mimplicit-it=always $(AFLAGS_NOWARN)
 AFLAGS_ISA :=$(CFLAGS_ISA) -Wa$(comma)-mthumb
diff --git a/arch/arm/include/asm/fpu.h b/arch/arm/include/asm/fpu.h
new file mode 100644
index ..d01ca06e700a
--- /dev/null
+++ b/arch/arm/include/asm/fpu.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * linux/arch/arm/include/asm/fpu.h
+ *
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef __ASM_FPU_H
+#define __ASM_FPU_H
+
+#include 
+
+#define kernel_fpu_available() cpu_has_neon()
+#define kernel_fpu_begin() kernel_neon_begin()
+#define kernel_fpu_end()   kernel_neon_end()
+
+#endif /* ! __ASM_FPU_H */
-- 
2.42.0



[RFC PATCH 00/12] Unified cross-architecture kernel-mode FPU API

2023-12-07 Thread Samuel Holland
This series supersedes my earier RISC-V specific series[1].

This series unifies the kernel-mode FPU API across several architectures
by wrapping the existing functions (where needed) in consistently-named
functions placed in a consistent header location, with mostly the same
semantics: they can be called from preemptible or non-preemptible task
context, and are not assumed to be reentrant. Architectures are also
expected to provide CFLAGS adjustments for compiling FPU-dependent code.
For the moment, SIMD/vector units are out of scope for this common API.

This allows us to remove the ifdeffery and duplicated Makefile logic at
each FPU user. It then implements the common API on RISC-V, and converts
a couple of users to the new API: the AMDGPU DRM driver, and the FPU
self test.

The underlying goal of this series is to allow using newer AMD GPUs
(e.g. Navi) on RISC-V boards such as SiFive's HiFive Unmatched. Those
GPUs need CONFIG_DRM_AMD_DC_FP to initialize, which requires kernel-mode
FPU support.

[1]: 
https://lore.kernel.org/linux-riscv/20231122030621.3759313-1-samuel.holl...@sifive.com/


Samuel Holland (12):
  arch: Add ARCH_HAS_KERNEL_FPU_SUPPORT
  ARM: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  ARM: crypto: Use CC_FLAGS_FPU for NEON CFLAGS
  arm64: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  lib/raid6: Use CC_FLAGS_FPU for NEON CFLAGS
  LoongArch: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  powerpc: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  x86: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  riscv: Add support for kernel-mode FPU
  drm/amd/display: Use ARCH_HAS_KERNEL_FPU_SUPPORT
  selftests/fpu: Move FP code to a separate translation unit
  selftests/fpu: Allow building on other architectures

 Makefile  |  4 ++
 arch/Kconfig  |  9 +
 arch/arm/Kconfig  |  1 +
 arch/arm/Makefile |  7 
 arch/arm/include/asm/fpu.h| 17 +
 arch/arm/lib/Makefile |  3 +-
 arch/arm64/Kconfig|  1 +
 arch/arm64/Makefile   |  9 -
 arch/arm64/include/asm/fpu.h  | 17 +
 arch/loongarch/Kconfig|  1 +
 arch/loongarch/Makefile   |  5 ++-
 arch/loongarch/include/asm/fpu.h  |  1 +
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/Makefile |  5 ++-
 arch/powerpc/include/asm/fpu.h| 28 ++
 arch/riscv/Kconfig|  1 +
 arch/riscv/Makefile   |  3 ++
 arch/riscv/include/asm/fpu.h  | 26 +
 arch/riscv/kernel/Makefile|  1 +
 arch/riscv/kernel/kernel_mode_fpu.c   | 28 ++
 arch/x86/Kconfig  |  1 +
 arch/x86/Makefile | 20 ++
 arch/x86/include/asm/fpu.h| 13 +++
 drivers/gpu/drm/amd/display/Kconfig   |  2 +-
 .../gpu/drm/amd/display/amdgpu_dm/dc_fpu.c| 33 +
 drivers/gpu/drm/amd/display/dc/dml/Makefile   | 36 +-
 drivers/gpu/drm/amd/display/dc/dml2/Makefile  | 36 +-
 lib/Kconfig.debug |  2 +-
 lib/Makefile  | 26 ++---
 lib/raid6/Makefile| 31 
 lib/{test_fpu.c => test_fpu_glue.c}   | 37 +++
 lib/test_fpu_impl.c   | 35 ++
 32 files changed, 255 insertions(+), 185 deletions(-)
 create mode 100644 arch/arm/include/asm/fpu.h
 create mode 100644 arch/arm64/include/asm/fpu.h
 create mode 100644 arch/powerpc/include/asm/fpu.h
 create mode 100644 arch/riscv/include/asm/fpu.h
 create mode 100644 arch/riscv/kernel/kernel_mode_fpu.c
 create mode 100644 arch/x86/include/asm/fpu.h
 rename lib/{test_fpu.c => test_fpu_glue.c} (71%)
 create mode 100644 lib/test_fpu_impl.c

-- 
2.42.0



[RFC PATCH 01/12] arch: Add ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-07 Thread Samuel Holland
Several architectures provide an API to enable the FPU and run
floating-point SIMD code in kernel space. However, the function names,
header locations, and semantics are inconsistent across architectures,
and FPU support may be gated behind other Kconfig options.

Provide a standard way for architectures to declare that kernel space
FPU support is available. Architectures selecting this option must
implement what is currently the most common API (kernel_fpu_begin() and
kernel_fpu_end(), plus a new function kernel_fpu_available()) and
provide the appropriate CFLAGS for compiling floating-point C code.

Suggested-by: Christoph Hellwig 
Signed-off-by: Samuel Holland 
---

 Makefile | 4 
 arch/Kconfig | 9 +
 2 files changed, 13 insertions(+)

diff --git a/Makefile b/Makefile
index 511b5616aa41..e65c186cf2c9 100644
--- a/Makefile
+++ b/Makefile
@@ -969,6 +969,10 @@ KBUILD_CFLAGS  += $(CC_FLAGS_CFI)
 export CC_FLAGS_CFI
 endif
 
+# Architectures can define flags to add/remove for floating-point support
+export CC_FLAGS_FPU
+export CC_FLAGS_NO_FPU
+
 ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0)
 KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT)
 endif
diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..6df834e18e9c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1478,6 +1478,15 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG
  address translations. Page table walkers that clear the accessed bit
  may use this capability to reduce their search space.
 
+config ARCH_HAS_KERNEL_FPU_SUPPORT
+   bool
+   help
+ An architecture should select this option if it supports running
+ floating-point code in kernel space. It must export the functions
+ kernel_fpu_available(), kernel_fpu_begin(), and kernel_fpu_end() from
+ , and define CC_FLAGS_FPU and/or CC_FLAGS_NO_FPU as
+ necessary in its Makefile.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
-- 
2.42.0



Re: [PATCH 3/3] drm/amd/display: Support DRM_AMD_DC_FP on RISC-V

2023-12-07 Thread Samuel Holland
Hi Christoph,

On 2023-11-22 2:40 AM, Christoph Hellwig wrote:
>> -select DRM_AMD_DC_FP if (X86 || LOONGARCH || (PPC64 && ALTIVEC) || 
>> (ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG))
>> +select DRM_AMD_DC_FP if ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG
>> +select DRM_AMD_DC_FP if PPC64 && ALTIVEC
>> +select DRM_AMD_DC_FP if RISCV && FPU
>> +select DRM_AMD_DC_FP if LOONGARCH || X86
> 
> This really is a mess.  Can you add a ARCH_HAS_KERNEL_FPU_SUPPORT
> symbol that all architetures that have it select instead, and them
> make DRM_AMD_DC_FP depend on it?

Yes, I have done this for v2, which I will send shortly.

>> -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
>> +#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) || 
>> defined(CONFIG_RISCV)
>>  kernel_fpu_begin();
>>  #elif defined(CONFIG_PPC64)
>>  if (cpu_has_feature(CPU_FTR_VSX_COMP))
>> @@ -122,7 +124,7 @@ void dc_fpu_end(const char *function_name, const int 
>> line)
>>  
>>  depth = __this_cpu_dec_return(fpu_recursion_depth);
>>  if (depth == 0) {
>> -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
>> +#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) || 
>> defined(CONFIG_RISCV)
>>  kernel_fpu_end();
>>  #elif defined(CONFIG_PPC64)
>>  if (cpu_has_feature(CPU_FTR_VSX_COMP))
> 
> And then this mess can go away.  We'll need to decide if we want to
> cover all the in-kernel vector support as part of it, which would
> seem reasonable to me, or have a separate generic kernel_vector_begin
> with it's own option.

I think we may want to keep vector separate for performance on architectures
with separate FP and vector register files. For now, I have limited my changes
to FPU support only, which means I have removed VSX/Altivec from here; the
AMDGPU code doesn't need Altivec anyway.

>> diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile 
>> b/drivers/gpu/drm/amd/display/dc/dml/Makefile
>> index ea7d60f9a9b4..5c8f840ef323 100644
>> --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
>> +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
>> @@ -43,6 +43,12 @@ dml_ccflags := -mfpu=64
>>  dml_rcflags := -msoft-float
>>  endif
>>  
>> +ifdef CONFIG_RISCV
>> +include $(srctree)/arch/riscv/Makefile.isa
>> +# Remove V from the ISA string, like in arch/riscv/Makefile, but keep F and 
>> D.
>> +dml_ccflags := -march=$(shell echo $(riscv-march-y) | sed -E 
>> 's/(rv32ima|rv64ima)([^v_]*)v?/\1\2/')
>> +endif
>> +
>>  ifdef CONFIG_CC_IS_GCC
>>  ifneq ($(call gcc-min-version, 70100),y)
>>  IS_OLD_GCC = 1
> 
> And this is again not really something we should be doing.
> Instead we need a generic way in Kconfig to enable FPU support
> for an object file or set of, that the arch support can hook
> into.

I've included this in v2 as well.

> Btw, I'm also really worried about folks using the FPU instructions
> outside the kernel_fpu_begin/end windows in general (not directly
> related to the RISC-V support).  Can we have objecttool checks
> for that similar to only allowing the unsafe uaccess in the
> uaccess begin/end pairs?

ARM partially enforces this at compile time: it disallows calling
kernel_neon_begin() inside a translation unit that has NEON enabled. That
doesn't prevent the programmer from calling a FPU-enabled function from outside
a begin/end section, but it does prevent the compiler from generating unexpected
FPU usage behind your back. I implemented this same functionality for RISC-V.

Actually tracking all possibly-FPU-tainted functions and their call sites is
probably possible, but a much larger task.

Regards,
Samuel



[PATCH v5 09/13] powerpc/pseries: Add papr-vpd character driver for VPD retrieval

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

PowerVM LPARs may retrieve Vital Product Data (VPD) for system
components using the ibm,get-vpd RTAS function.

We can expose this to user space with a /dev/papr-vpd character
device, where the programming model is:

  struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
  int devfd = open("/dev/papr-vpd", O_RDONLY);
  int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, );
  size_t size = lseek(vpdfd, 0, SEEK_END);
  char *buf = malloc(size);
  pread(devfd, buf, size, 0);

When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
the file contains the result of a complete ibm,get-vpd sequence. The
file contents are immutable from the POV of user space. To get a new
view of the VPD, the client must create a new handle.

This design choice insulates user space from most of the complexities
that ibm,get-vpd brings:

* ibm,get-vpd must be called more than once to obtain complete
  results.

* Only one ibm,get-vpd call sequence should be in progress at a time;
  interleaved sequences will disrupt each other. Callers must have a
  protocol for serializing their use of the function.

* A call sequence in progress may receive a "VPD changed, try again"
  status, requiring the client to abandon the sequence and start
  over.

The memory required for the VPD buffers seems acceptable, around 20KB
for all VPD on one of my systems. And the value of the
/rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is
consistently 300KB across various systems I've checked.

I've implemented support for this new ABI in the rtas_get_vpd()
function in librtas, which the vpdupdate command currently uses to
populate its VPD database. I've verified that an unmodified vpdupdate
binary generates an identical database when using a librtas.so that
prefers the new ABI.

Along with the papr-vpd.h header exposed to user space, this
introduces a common papr-miscdev.h uapi header to share a base ioctl
ID with similar drivers to come.

Tested-by: Michal Suchánek 
Signed-off-by: Nathan Lynch 
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
 arch/powerpc/include/uapi/asm/papr-miscdev.h   |   9 +
 arch/powerpc/include/uapi/asm/papr-vpd.h   |  22 +
 arch/powerpc/platforms/pseries/Makefile|   1 +
 arch/powerpc/platforms/pseries/papr-vpd.c  | 541 +
 5 files changed, 575 insertions(+)

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst 
b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 4ea5b837399a..a950545bf7cd 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -349,6 +349,8 @@ Code  Seq#Include File  
 Comments
  

 0xB1  00-1F  PPPoX
  

+0xB2  00 arch/powerpc/include/uapi/asm/papr-vpd.h
powerpc/pseries VPD API
+ 

 0xB3  00 linux/mmc/ioctl.h
 0xB4  00-0F  linux/gpio.h

 0xB5  00-0F  uapi/linux/rpmsg.h  

diff --git a/arch/powerpc/include/uapi/asm/papr-miscdev.h 
b/arch/powerpc/include/uapi/asm/papr-miscdev.h
new file mode 100644
index ..49a2a270b7f3
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-miscdev.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_PAPR_MISCDEV_H_
+#define _UAPI_PAPR_MISCDEV_H_
+
+enum {
+   PAPR_MISCDEV_IOC_ID = 0xb2,
+};
+
+#endif /* _UAPI_PAPR_MISCDEV_H_ */
diff --git a/arch/powerpc/include/uapi/asm/papr-vpd.h 
b/arch/powerpc/include/uapi/asm/papr-vpd.h
new file mode 100644
index ..1c88e87cb420
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-vpd.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_PAPR_VPD_H_
+#define _UAPI_PAPR_VPD_H_
+
+#include 
+#include 
+
+struct papr_location_code {
+   /*
+* PAPR+ v2.13 12.3.2.4 Converged Location Code Rules - Length
+* Restrictions. 79 characters plus nul.
+*/
+   char str[80];
+};
+
+/*
+ * ioctl for /dev/papr-vpd. Returns a VPD handle fd corresponding to
+ * the location code.
+ */
+#define PAPR_VPD_IOC_CREATE_HANDLE _IOW(PAPR_MISCDEV_IOC_ID, 0, struct 
papr_location_code)
+
+#endif /* _UAPI_PAPR_VPD_H_ */
diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index 1476c5e4433c..f936962a2946 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -4,6 +4,7 @@ 

[PATCH v5 12/13] powerpc/selftests: Add test for papr-vpd

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Add selftests for /dev/papr-vpd, exercising the common expected use
cases:

* Retrieve all VPD by passing an empty location code.
* Retrieve the "system VPD" by passing a location code derived from DT
  root node properties, as done by the vpdupdate command.

The tests also verify that certain intended properties of the driver
hold:

* Passing an unterminated location code to PAPR_VPD_CREATE_HANDLE gets
  EINVAL.
* Passing a NULL location code pointer to PAPR_VPD_CREATE_HANDLE gets
  EFAULT.
* Closing the device node without first issuing a
  PAPR_VPD_CREATE_HANDLE command to it succeeds.
* Releasing a handle without first consuming any data from it
  succeeds.
* Re-reading the contents of a handle returns the same data as the
  first time.

Some minimal validation of the returned data is performed.

The tests are skipped on systems where the papr-vpd driver does not
initialize, making this useful only on PowerVM LPARs at this point.

Signed-off-by: Nathan Lynch 
---
 tools/testing/selftests/powerpc/Makefile   |   1 +
 .../testing/selftests/powerpc/papr_vpd/.gitignore  |   1 +
 tools/testing/selftests/powerpc/papr_vpd/Makefile  |  12 +
 .../testing/selftests/powerpc/papr_vpd/papr_vpd.c  | 352 +
 4 files changed, 366 insertions(+)

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 7ea42fa02eab..05fc68d446c2 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -32,6 +32,7 @@ SUB_DIRS = alignment  \
   vphn \
   math \
   papr_attributes  \
+  papr_vpd \
   ptrace   \
   security \
   mce
diff --git a/tools/testing/selftests/powerpc/papr_vpd/.gitignore 
b/tools/testing/selftests/powerpc/papr_vpd/.gitignore
new file mode 100644
index ..49285031a656
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_vpd/.gitignore
@@ -0,0 +1 @@
+/papr_vpd
diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile 
b/tools/testing/selftests/powerpc/papr_vpd/Makefile
new file mode 100644
index ..06b719703bfd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+noarg:
+   $(MAKE) -C ../
+
+TEST_GEN_PROGS := papr_vpd
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
+
+$(OUTPUT)/papr_vpd: CFLAGS += $(KHDR_INCLUDES)
diff --git a/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c 
b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c
new file mode 100644
index ..98cbb9109ee6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_vpd/papr_vpd.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "utils.h"
+
+#define DEVPATH "/dev/papr-vpd"
+
+static int dev_papr_vpd_open_close(void)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+   FAIL_IF(close(devfd) != 0);
+
+   return 0;
+}
+
+static int dev_papr_vpd_get_handle_all(void)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+   struct papr_location_code lc = { .str = "", };
+   off_t size;
+   int fd;
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+
+   errno = 0;
+   fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, );
+   FAIL_IF(errno != 0);
+   FAIL_IF(fd < 0);
+
+   FAIL_IF(close(devfd) != 0);
+
+   size = lseek(fd, 0, SEEK_END);
+   FAIL_IF(size <= 0);
+
+   void *buf = malloc((size_t)size);
+   FAIL_IF(!buf);
+
+   ssize_t consumed = pread(fd, buf, size, 0);
+   FAIL_IF(consumed != size);
+
+   /* Ensure EOF */
+   FAIL_IF(read(fd, buf, size) != 0);
+   FAIL_IF(close(fd));
+
+   /* Verify that the buffer looks like VPD */
+   static const char needle[] = "System VPD";
+   FAIL_IF(!memmem(buf, size, needle, strlen(needle)));
+
+   return 0;
+}
+
+static int dev_papr_vpd_get_handle_byte_at_a_time(void)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+   struct papr_location_code lc = { .str = "", };
+   int fd;
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+
+   errno = 0;
+   fd = ioctl(devfd, PAPR_VPD_IOC_CREATE_HANDLE, );
+   FAIL_IF(errno != 0);
+   FAIL_IF(fd < 0);
+
+   FAIL_IF(close(devfd) != 0);
+
+   size_t consumed = 0;
+   while (1) {
+   ssize_t res;
+   char c;
+
+   errno = 0;
+   res = read(fd, , sizeof(c));
+   FAIL_IF(res > sizeof(c));
+ 

[PATCH v5 13/13] powerpc/selftests: Add test for papr-sysparm

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Consistently testing system parameter access is a bit difficult by
nature -- the set of parameters available depends on the model and
system configuration, and updating a parameter should be considered a
destructive operation reserved for the admin.

So we validate some of the error paths and retrieve the SPLPAR
characteristics string, but not much else.

Signed-off-by: Nathan Lynch 
---
 tools/testing/selftests/powerpc/Makefile   |   1 +
 .../selftests/powerpc/papr_sysparm/.gitignore  |   1 +
 .../selftests/powerpc/papr_sysparm/Makefile|  12 ++
 .../selftests/powerpc/papr_sysparm/papr_sysparm.c  | 165 +
 4 files changed, 179 insertions(+)

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 05fc68d446c2..c376151982c4 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -33,6 +33,7 @@ SUB_DIRS = alignment  \
   math \
   papr_attributes  \
   papr_vpd \
+  papr_sysparm \
   ptrace   \
   security \
   mce
diff --git a/tools/testing/selftests/powerpc/papr_sysparm/.gitignore 
b/tools/testing/selftests/powerpc/papr_sysparm/.gitignore
new file mode 100644
index ..f2a69bf59d40
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_sysparm/.gitignore
@@ -0,0 +1 @@
+/papr_sysparm
diff --git a/tools/testing/selftests/powerpc/papr_sysparm/Makefile 
b/tools/testing/selftests/powerpc/papr_sysparm/Makefile
new file mode 100644
index ..7f79e437634a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_sysparm/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+noarg:
+   $(MAKE) -C ../
+
+TEST_GEN_PROGS := papr_sysparm
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
+
+$(OUTPUT)/papr_sysparm: CFLAGS += $(KHDR_INCLUDES)
diff --git a/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c 
b/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c
new file mode 100644
index ..9d4850c25aed
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_sysparm/papr_sysparm.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+
+#define DEVPATH "/dev/papr-sysparm"
+
+static int open_close(void)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+   FAIL_IF(close(devfd) != 0);
+
+   return 0;
+}
+
+static int get_splpar(void)
+{
+   struct papr_sysparm_io_block sp = {
+   .parameter = 20, // SPLPAR characteristics
+   };
+   const int devfd = open(DEVPATH, O_RDONLY);
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+   FAIL_IF(ioctl(devfd, PAPR_SYSPARM_IOC_GET, ) != 0);
+   FAIL_IF(sp.length == 0);
+   FAIL_IF(sp.length > sizeof(sp.data));
+   FAIL_IF(close(devfd) != 0);
+
+   return 0;
+}
+
+static int get_bad_parameter(void)
+{
+   struct papr_sysparm_io_block sp = {
+   .parameter = UINT32_MAX, // there are only ~60 specified 
parameters
+   };
+   const int devfd = open(DEVPATH, O_RDONLY);
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+
+   // Ensure expected error
+   FAIL_IF(ioctl(devfd, PAPR_SYSPARM_IOC_GET, ) != -1);
+   FAIL_IF(errno != EOPNOTSUPP);
+
+   // Ensure the buffer is unchanged
+   FAIL_IF(sp.length != 0);
+   for (size_t i = 0; i < ARRAY_SIZE(sp.data); ++i)
+   FAIL_IF(sp.data[i] != 0);
+
+   FAIL_IF(close(devfd) != 0);
+
+   return 0;
+}
+
+static int check_efault_common(unsigned long cmd)
+{
+   const int devfd = open(DEVPATH, O_RDONLY);
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+
+   // Ensure expected error
+   FAIL_IF(ioctl(devfd, cmd, NULL) != -1);
+   FAIL_IF(errno != EFAULT);
+
+   FAIL_IF(close(devfd) != 0);
+
+   return 0;
+}
+
+static int check_efault_get(void)
+{
+   return check_efault_common(PAPR_SYSPARM_IOC_GET);
+}
+
+static int check_efault_set(void)
+{
+   return check_efault_common(PAPR_SYSPARM_IOC_SET);
+}
+
+static int set_hmc0(void)
+{
+   struct papr_sysparm_io_block sp = {
+   .parameter = 0, // HMC0, not a settable parameter
+   };
+   const int devfd = open(DEVPATH, O_RDONLY);
+
+   SKIP_IF_MSG(devfd < 0 && errno == ENOENT,
+   DEVPATH " not present");
+
+   FAIL_IF(devfd < 0);
+
+   // Ensure expected error
+   FAIL_IF(ioctl(devfd, 

[PATCH v5 00/13] powerpc/pseries: New character devices for system parameters and VPD

2023-12-07 Thread Nathan Lynch via B4 Relay
Add character devices that expose PAPR-specific system parameters and
VPD to user space.

The problem: important platform features are enabled on Linux VMs
through the powerpc-specific rtas() syscall in combination with
writeable mappings of /dev/mem. In typical usage, this is encapsulated
behind APIs provided by the librtas library. This paradigm is
incompatible with lockdown, which prohibits /dev/mem access. It also
is too low-level in many cases: a single logical operation may require
multiple sys_rtas() calls in succession to complete. This carries the
risk that a process may exit while leaving an operation unfinished. It
also means that callers must coordinate their use of the syscall for
functions that cannot tolerate multiple concurrent clients, such as
ibm,get-vpd.

These are the general problems, but it's difficult to formulate a
similarly general solution in the form of a single replacement ABI for
sys_rtas(). Instead, each platform facility we expose to user space
needs a specific interface that forms the kernel-user interactions at
a higher level than individual RTAS calls.

I've identified system parameter support as a high priority for this
effort, since the software that communicates with the management
console relies on it and therefore does not work at all with lockdown
enabled on current kernels. VPD retrieval is also important (for
licensing/entitlement things I think?), and serves as a good initial
example of encapsulating sequence-based RTAS calls. So this series
proposes a model for incrementally solving these issues by introducing
a small pseries-specific "driver" for each of these platform
functions. The new drivers expose these facilities to user space in
ways that are compatible with lockdown, and they require no
coordination between their clients.

In preparation, per-function mutexes are added to the core RTAS code
to serialize access to sequence-based RTAS functions. These prevent
kernel-based sequences from interfering with each other, and they
prevent sys_rtas() users from disrupting kernel-based users. The RTAS
core enforces correct lock usage when lockdep is enabled.

Both drivers could potentially support poll() methods to notify
clients of changes to parameters or VPD that happen due to partition
migration and other events. But that should be safe to leave for
later, assuming there's any interest.

I have made changes to librtas to prefer the new interfaces and
verified that existing clients work correctly with the new code. A
draft PR for that work is here:

https://github.com/ibm-power-utilities/librtas/pull/36

The user-space ABI has not changed since v1 of this series.

I expect to propose at least one more small driver in this style for
platform dump retrieval in a separate submission in the future. Other
facilities may follow as needs are identified.

---
Changes in v5:
- Add to the front of the queue another fix for a latent bug where
  sys_rtas() users can trigger a spurious warning backtrace.
- Merge "powerpc/uapi: Export papr-miscdev.h header" into "Add
  papr-vpd character driver for VPD retrieval" so we don't temporarily
  expose the PAPR_MISCDEV_IOC_ID constant via UAPI without also
  updating the ioctl-number documentation. (Michael Ellerman)
- Drop rtas_function_{un}lock() in favor of more explicit use of the new
  per-function mutexes; make rtas_ibm_get_vpd_lock extern for use by
  papr-vpd.
- Assert that rtas_ibm_get_vpd_lock is held in rtas_ibm_get_vpd().
- Add example usage to "powerpc/rtas: Facilitate high-level call
  sequences". (Aneesh Kumar K.V)
- Drop the now-unnecessary 04/13 "powerpc/rtas: Factor out function
  descriptor lookup".
- Include document version when citing the PAPR+ specification
  throughout. (Michael Ellerman)
- Add missing include directives to papr-vpd and papr-sysparm. (Michal
  Suchánek)
- Fix spurious testcase failure in environments without a working
  ibm,set-system-parameter RTAS function. (Michael Ellerman)
- Link to v4: 
https://lore.kernel.org/r/20231117-papr-sys_rtas-vs-lockdown-v4-0-b794d8cb8...@linux.ibm.com

Changes in v4:
- Fix latent issue in rtas_token_to_function() which causes boot-time
  crashes.
- More small preparatory changes: a function table iterator and
  additional symbolic constants for RTAS function return values.
- Use symbolic constants for ibm,get-vpd statuses in papr-vpd.c.
- Add commentary to papr_vpd_ioc_create_handle() explaining choice to
  retrieve all VPD at file handle creation time instead of deferring
  it to the read handler.
- Rebase on current powerpc/next.
- Link to v3: 
https://lore.kernel.org/r/20231025-papr-sys_rtas-vs-lockdown-v3-0-5eb04559e...@linux.ibm.com

Changes in v3:
- Add new rtas_function_lock()/unlock() APIs and convert existing code
  to use them.
- Convert papr-vpd to use rtas_function_lock()/unlock() instead of
  having sys_rtas() obtain a driver-private mutex.
- Rebase on current powerpc/next.
- Link to v2: 

[PATCH v5 03/13] powerpc/rtas: Fall back to linear search on failed token->function lookup

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Enabling any of the powerpc:rtas_* tracepoints at boot is likely to
result in an oops on RTAS platforms. For example, booting a QEMU
pseries model with 'trace_event=powerpc:rtas_input' in the command
line leads to:

  BUG: Kernel NULL pointer dereference on read at 0x0008
  Oops: Kernel access of bad area, sig: 7 [#1]
  NIP [c004231c] do_enter_rtas+0x1bc/0x460
  LR [c004231c] do_enter_rtas+0x1bc/0x460
  Call Trace:
do_enter_rtas+0x1bc/0x460 (unreliable)
rtas_call+0x22c/0x4a0
rtas_get_boot_time+0x80/0x14c
read_persistent_clock64+0x124/0x150
read_persistent_wall_and_boot_offset+0x28/0x58
timekeeping_init+0x70/0x348
start_kernel+0xa0c/0xc1c
start_here_common+0x1c/0x20

(This is preceded by a warning for the failed lookup in
rtas_token_to_function().)

This happens when __do_enter_rtas_trace() attempts a token to function
descriptor lookup before the xarray containing the mappings has been
set up.

Fall back to linear scan of the table if rtas_token_to_function_xarray
is empty.

Fixes: 24098f580e2b ("powerpc/rtas: add tracepoints around RTAS entry")
Reviewed-by: Aneesh Kumar K.V (IBM) 
Signed-off-by: Nathan Lynch 
---
 arch/powerpc/kernel/rtas.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index ae9b10c954a1..f60a8e7bd5ed 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -572,11 +572,21 @@ static const struct rtas_function 
*rtas_token_to_function(s32 token)
return NULL;
 
func = rtas_token_to_function_untrusted(token);
+   if (func)
+   return func;
+   /*
+* Fall back to linear scan in case the reverse mapping hasn't
+* been initialized yet.
+*/
+   if (xa_empty(_token_to_function_xarray)) {
+   for_each_rtas_function(func) {
+   if (func->token == token)
+   return func;
+   }
+   }
 
-   if (WARN_ONCE(!func, "unexpected failed lookup for token %d", token))
-   return NULL;
-
-   return func;
+   WARN_ONCE(true, "unexpected failed lookup for token %d", token);
+   return NULL;
 }
 
 /* This is here deliberately so it's only used in this file */

-- 
2.41.0



[PATCH v5 05/13] powerpc/rtas: Move token validation from block_rtas_call() to sys_rtas()

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

The rtas system call handler sys_rtas() delegates certain input
validation steps to a helper function: block_rtas_call(). One of these
steps ensures that the user-supplied token value maps to a known RTAS
function. This is done by performing a "reverse" token-to-function
lookup via rtas_token_to_function_untrusted() to obtain an
rtas_function object.

In changes to come, sys_rtas() itself will need the function
descriptor for the token. To prepare:

* Move the lookup and validation up into sys_rtas() and pass the
  resulting rtas_function pointer to block_rtas_call(), which is
  otherwise unconcerned with the token value.

* Change block_rtas_call() to report the RTAS function name instead of
  the token value on validation failures, since it can now rely on
  having a valid function descriptor.

One behavior change is that sys_rtas() now silently errors out when
passed a bad token, before calling block_rtas_call(). So we will no
longer log "RTAS call blocked - exploit attempt?" on invalid
tokens. This is consistent with how sys_rtas() currently handles other
"metadata" (nargs and nret), while block_rtas_call() is primarily
concerned with validating the arguments to be passed to specific RTAS
functions.

Signed-off-by: Nathan Lynch 
---
 arch/powerpc/kernel/rtas.c | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index f60a8e7bd5ed..ca5bb0b994ac 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1738,24 +1738,18 @@ static bool in_rmo_buf(u32 base, u32 end)
end < (rtas_rmo_buf + RTAS_USER_REGION_SIZE);
 }
 
-static bool block_rtas_call(int token, int nargs,
+static bool block_rtas_call(const struct rtas_function *func, int nargs,
struct rtas_args *args)
 {
-   const struct rtas_function *func;
const struct rtas_filter *f;
-   const bool is_platform_dump = token == 
rtas_function_token(RTAS_FN_IBM_PLATFORM_DUMP);
-   const bool is_config_conn = token == 
rtas_function_token(RTAS_FN_IBM_CONFIGURE_CONNECTOR);
+   const bool is_platform_dump =
+   func == _function_table[RTAS_FNIDX__IBM_PLATFORM_DUMP];
+   const bool is_config_conn =
+   func == 
_function_table[RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR];
u32 base, size, end;
 
/*
-* If this token doesn't correspond to a function the kernel
-* understands, you're not allowed to call it.
-*/
-   func = rtas_token_to_function_untrusted(token);
-   if (!func)
-   goto err;
-   /*
-* And only functions with filters attached are allowed.
+* Only functions with filters attached are allowed.
 */
f = func->filter;
if (!f)
@@ -1812,14 +1806,15 @@ static bool block_rtas_call(int token, int nargs,
return false;
 err:
pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n");
-   pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n",
-  token, nargs, current->comm);
+   pr_err_ratelimited("sys_rtas: %s nargs=%d (called by %s)\n",
+  func->name, nargs, current->comm);
return true;
 }
 
 /* We assume to be passed big endian arguments */
 SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
 {
+   const struct rtas_function *func;
struct pin_cookie cookie;
struct rtas_args args;
unsigned long flags;
@@ -1849,13 +1844,18 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
   nargs * sizeof(rtas_arg_t)) != 0)
return -EFAULT;
 
-   if (token == RTAS_UNKNOWN_SERVICE)
+   /*
+* If this token doesn't correspond to a function the kernel
+* understands, you're not allowed to call it.
+*/
+   func = rtas_token_to_function_untrusted(token);
+   if (!func)
return -EINVAL;
 
args.rets = [nargs];
memset(args.rets, 0, nret * sizeof(rtas_arg_t));
 
-   if (block_rtas_call(token, nargs, ))
+   if (block_rtas_call(func, nargs, ))
return -EINVAL;
 
if (token_is_restricted_errinjct(token)) {

-- 
2.41.0



[PATCH v5 11/13] powerpc/pseries/papr-sysparm: Expose character device to user space

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Until now the papr_sysparm APIs have been kernel-internal. But user
space needs access to PAPR system parameters too. The only method
available to user space today to get or set system parameters is using
sys_rtas() and /dev/mem to pass RTAS-addressable buffers between user
space and firmware. This is incompatible with lockdown and should be
deprecated.

So provide an alternative ABI to user space in the form of a
/dev/papr-sysparm character device with just two ioctl commands (get
and set). The data payloads involved are small enough to fit in the
ioctl argument buffer, making the code relatively simple.

Exposing the system parameters through sysfs has been considered but
it would be too awkward:

* The kernel currently does not have to contain an exhaustive list of
  defined system parameters. This is a convenient property to maintain
  because we don't have to update the kernel whenever a new parameter
  is added to PAPR. Exporting a named attribute in sysfs for each
  parameter would negate this.

* Some system parameters are text-based and some are not.

* Retrieval of at least one system parameter requires input data,
  which a simple read-oriented interface can't support.

Signed-off-by: Nathan Lynch 
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
 arch/powerpc/include/asm/papr-sysparm.h|  17 ++-
 arch/powerpc/include/uapi/asm/papr-sysparm.h   |  58 
 arch/powerpc/platforms/pseries/papr-sysparm.c  | 155 -
 4 files changed, 224 insertions(+), 8 deletions(-)

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst 
b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a950545bf7cd..d8b6cb1a3636 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -351,6 +351,8 @@ Code  Seq#Include File  
 Comments
  

 0xB2  00 arch/powerpc/include/uapi/asm/papr-vpd.h
powerpc/pseries VPD API
  

+0xB2  01-02  arch/powerpc/include/uapi/asm/papr-sysparm.h
powerpc/pseries system parameter API
+ 

 0xB3  00 linux/mmc/ioctl.h
 0xB4  00-0F  linux/gpio.h

 0xB5  00-0F  uapi/linux/rpmsg.h  

diff --git a/arch/powerpc/include/asm/papr-sysparm.h 
b/arch/powerpc/include/asm/papr-sysparm.h
index f5fdbd8ae9db..0dbbff59101d 100644
--- a/arch/powerpc/include/asm/papr-sysparm.h
+++ b/arch/powerpc/include/asm/papr-sysparm.h
@@ -2,8 +2,10 @@
 #ifndef _ASM_POWERPC_PAPR_SYSPARM_H
 #define _ASM_POWERPC_PAPR_SYSPARM_H
 
+#include 
+
 typedef struct {
-   const u32 token;
+   u32 token;
 } papr_sysparm_t;
 
 #define mk_papr_sysparm(x_) ((papr_sysparm_t){ .token = x_, })
@@ -20,11 +22,14 @@ typedef struct {
 #define PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRSmk_papr_sysparm(50)
 #define PAPR_SYSPARM_LPAR_NAME mk_papr_sysparm(55)
 
-enum {
-   PAPR_SYSPARM_MAX_INPUT  = 1024,
-   PAPR_SYSPARM_MAX_OUTPUT = 4000,
-};
-
+/**
+ * struct papr_sysparm_buf - RTAS work area layout for system parameter 
functions.
+ *
+ * This is the memory layout of the buffers passed to/from
+ * ibm,get-system-parameter and ibm,set-system-parameter. It is
+ * distinct from the papr_sysparm_io_block structure that is passed
+ * between user space and the kernel.
+ */
 struct papr_sysparm_buf {
__be16 len;
char val[PAPR_SYSPARM_MAX_OUTPUT];
diff --git a/arch/powerpc/include/uapi/asm/papr-sysparm.h 
b/arch/powerpc/include/uapi/asm/papr-sysparm.h
new file mode 100644
index ..9f9a0f267ea5
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-sysparm.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_PAPR_SYSPARM_H_
+#define _UAPI_PAPR_SYSPARM_H_
+
+#include 
+#include 
+#include 
+
+enum {
+   PAPR_SYSPARM_MAX_INPUT  = 1024,
+   PAPR_SYSPARM_MAX_OUTPUT = 4000,
+};
+
+struct papr_sysparm_io_block {
+   __u32 parameter;
+   __u16 length;
+   char data[PAPR_SYSPARM_MAX_OUTPUT];
+};
+
+/**
+ * PAPR_SYSPARM_IOC_GET - Retrieve the value of a PAPR system parameter.
+ *
+ * Uses _IOWR because of one corner case: Retrieving the value of the
+ * "OS Service Entitlement Status" parameter (60) requires the caller
+ * to supply input data (a date string) in the buffer passed to
+ * firmware. So the @length and @data of the incoming
+ * papr_sysparm_io_block are always used to initialize the work area
+ * supplied to ibm,get-system-parameter. No other parameters are known
+ * to 

[PATCH v5 07/13] powerpc/rtas: Serialize firmware activation sequences

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Use rtas_ibm_activate_firmware_lock to prevent interleaving call
sequences of the ibm,activate-firmware RTAS function, which typically
requires multiple calls to complete the update. While the spec does
not specifically prohibit interleaved sequences, there's almost
certainly no advantage to allowing them.

Reviewed-by: Aneesh Kumar K.V (IBM) 
Signed-off-by: Nathan Lynch 
---
 arch/powerpc/kernel/rtas.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 4d28983e8b1d..72f6b5a402dd 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1734,10 +1734,14 @@ void rtas_activate_firmware(void)
return;
}
 
+   mutex_lock(_ibm_activate_firmware_lock);
+
do {
fwrc = rtas_call(token, 0, 1, NULL);
} while (rtas_busy_delay(fwrc));
 
+   mutex_unlock(_ibm_activate_firmware_lock);
+
if (fwrc)
pr_err("ibm,activate-firmware failed (%i)\n", fwrc);
 }

-- 
2.41.0



[PATCH v5 01/13] powerpc/rtas: Avoid warning on invalid token argument to sys_rtas()

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

rtas_token_to_function() WARNs when passed an invalid token; it's
meant to catch bugs in kernel-based users of RTAS functions. However,
user space controls the token value passed to rtas_token_to_function()
by block_rtas_call(), so user space with sufficient privilege to use
sys_rtas() can trigger the warnings at will:

  unexpected failed lookup for token 2048
  WARNING: CPU: 20 PID: 2247 at arch/powerpc/kernel/rtas.c:556
rtas_token_to_function+0xfc/0x110
  ...
  NIP rtas_token_to_function+0xfc/0x110
  LR  rtas_token_to_function+0xf8/0x110
  Call Trace:
rtas_token_to_function+0xf8/0x110 (unreliable)
sys_rtas+0x188/0x880
system_call_exception+0x268/0x530
system_call_common+0x160/0x2c4

It's desirable to continue warning on bogus tokens in
rtas_token_to_function(). Currently it is used to look up RTAS
function descriptors when tracing, where we know there has to have
been a successful descriptor lookup by different means already, and it
would be a serious inconsistency for the reverse lookup to fail.

So instead of weakening rtas_token_to_function()'s contract by
removing the warnings, introduce rtas_token_to_function_untrusted(),
which has no opinion on failed lookups. Convert block_rtas_call() and
rtas_token_to_function() to use it.

Fixes: 8252b88294d2 ("powerpc/rtas: improve function information lookups")
Signed-off-by: Nathan Lynch 
---
 arch/powerpc/kernel/rtas.c | 19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index c49f078382a9..ce37dc9860ef 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -544,6 +544,21 @@ static int __init rtas_token_to_function_xarray_init(void)
 }
 arch_initcall(rtas_token_to_function_xarray_init);
 
+/*
+ * For use by sys_rtas(), where the token value is provided by user
+ * space and we don't want to warn on failed lookups.
+ */
+static const struct rtas_function *rtas_token_to_function_untrusted(s32 token)
+{
+   return xa_load(_token_to_function_xarray, token);
+}
+
+/*
+ * Reverse lookup for deriving the function descriptor from a
+ * known-good token value in contexts where the former is not already
+ * available. @token must be valid, e.g. derived from the result of a
+ * prior lookup against the function table.
+ */
 static const struct rtas_function *rtas_token_to_function(s32 token)
 {
const struct rtas_function *func;
@@ -551,7 +566,7 @@ static const struct rtas_function 
*rtas_token_to_function(s32 token)
if (WARN_ONCE(token < 0, "invalid token %d", token))
return NULL;
 
-   func = xa_load(_token_to_function_xarray, token);
+   func = rtas_token_to_function_untrusted(token);
 
if (WARN_ONCE(!func, "unexpected failed lookup for token %d", token))
return NULL;
@@ -1721,7 +1736,7 @@ static bool block_rtas_call(int token, int nargs,
 * If this token doesn't correspond to a function the kernel
 * understands, you're not allowed to call it.
 */
-   func = rtas_token_to_function(token);
+   func = rtas_token_to_function_untrusted(token);
if (!func)
goto err;
/*

-- 
2.41.0



[PATCH v5 02/13] powerpc/rtas: Add for_each_rtas_function() iterator

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Add a convenience macro for iterating over every element of the
internal function table and convert the one site that can use it. An
additional user of the macro is anticipated in changes to follow.

Reviewed-by: Aneesh Kumar K.V (IBM) 
Signed-off-by: Nathan Lynch 
---
 arch/powerpc/kernel/rtas.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index ce37dc9860ef..ae9b10c954a1 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -454,6 +454,11 @@ static struct rtas_function rtas_function_table[] 
__ro_after_init = {
},
 };
 
+#define for_each_rtas_function(funcp)   \
+   for (funcp = _function_table[0];   \
+funcp < _function_table[ARRAY_SIZE(rtas_function_table)]; \
+++funcp)
+
 /*
  * Nearly all RTAS calls need to be serialized. All uses of the
  * default rtas_args block must hold rtas_lock.
@@ -525,10 +530,10 @@ static DEFINE_XARRAY(rtas_token_to_function_xarray);
 
 static int __init rtas_token_to_function_xarray_init(void)
 {
+   const struct rtas_function *func;
int err = 0;
 
-   for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) {
-   const struct rtas_function *func = _function_table[i];
+   for_each_rtas_function(func) {
const s32 token = func->token;
 
if (token == RTAS_UNKNOWN_SERVICE)

-- 
2.41.0



[PATCH v5 10/13] powerpc/pseries/papr-sysparm: Validate buffer object lengths

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

The ability to get and set system parameters will be exposed to user
space, so let's get a little more strict about malformed
papr_sysparm_buf objects.

* Create accessors for the length field of struct papr_sysparm_buf.
  The length is always stored in MSB order and this is better than
  spreading the necessary conversions all over.

* Reject attempts to submit invalid buffers to RTAS.

* Warn if RTAS returns a buffer with an invalid length, clamping the
  returned length to a safe value that won't overrun the buffer.

These are meant as precautionary measures to mitigate both firmware
and kernel bugs in this area, should they arise, but I am not aware of
any.

Signed-off-by: Nathan Lynch 
---
 arch/powerpc/platforms/pseries/papr-sysparm.c | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/papr-sysparm.c 
b/arch/powerpc/platforms/pseries/papr-sysparm.c
index fedc61599e6c..a1e7aeac7416 100644
--- a/arch/powerpc/platforms/pseries/papr-sysparm.c
+++ b/arch/powerpc/platforms/pseries/papr-sysparm.c
@@ -23,6 +23,46 @@ void papr_sysparm_buf_free(struct papr_sysparm_buf *buf)
kfree(buf);
 }
 
+static size_t papr_sysparm_buf_get_length(const struct papr_sysparm_buf *buf)
+{
+   return be16_to_cpu(buf->len);
+}
+
+static void papr_sysparm_buf_set_length(struct papr_sysparm_buf *buf, size_t 
length)
+{
+   WARN_ONCE(length > sizeof(buf->val),
+ "bogus length %zu, clamping to safe value", length);
+   length = min(sizeof(buf->val), length);
+   buf->len = cpu_to_be16(length);
+}
+
+/*
+ * For use on buffers returned from ibm,get-system-parameter before
+ * returning them to callers. Ensures the encoded length of valid data
+ * cannot overrun buf->val[].
+ */
+static void papr_sysparm_buf_clamp_length(struct papr_sysparm_buf *buf)
+{
+   papr_sysparm_buf_set_length(buf, papr_sysparm_buf_get_length(buf));
+}
+
+/*
+ * Perform some basic diligence on the system parameter buffer before
+ * submitting it to RTAS.
+ */
+static bool papr_sysparm_buf_can_submit(const struct papr_sysparm_buf *buf)
+{
+   /*
+* Firmware ought to reject buffer lengths that exceed the
+* maximum specified in PAPR, but there's no reason for the
+* kernel to allow them either.
+*/
+   if (papr_sysparm_buf_get_length(buf) > sizeof(buf->val))
+   return false;
+
+   return true;
+}
+
 /**
  * papr_sysparm_get() - Retrieve the value of a PAPR system parameter.
  * @param: PAPR system parameter token as described in
@@ -63,6 +103,9 @@ int papr_sysparm_get(papr_sysparm_t param, struct 
papr_sysparm_buf *buf)
if (token == RTAS_UNKNOWN_SERVICE)
return -ENOENT;
 
+   if (!papr_sysparm_buf_can_submit(buf))
+   return -EINVAL;
+
work_area = rtas_work_area_alloc(sizeof(*buf));
 
memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf));
@@ -77,6 +120,7 @@ int papr_sysparm_get(papr_sysparm_t param, struct 
papr_sysparm_buf *buf)
case 0:
ret = 0;
memcpy(buf, rtas_work_area_raw_buf(work_area), sizeof(*buf));
+   papr_sysparm_buf_clamp_length(buf);
break;
case -3: /* parameter not implemented */
ret = -EOPNOTSUPP;
@@ -115,6 +159,9 @@ int papr_sysparm_set(papr_sysparm_t param, const struct 
papr_sysparm_buf *buf)
if (token == RTAS_UNKNOWN_SERVICE)
return -ENOENT;
 
+   if (!papr_sysparm_buf_can_submit(buf))
+   return -EINVAL;
+
work_area = rtas_work_area_alloc(sizeof(*buf));
 
memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf));

-- 
2.41.0



[PATCH v5 08/13] powerpc/rtas: Warn if per-function lock isn't held

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

If the function descriptor has a populated lock member, then callers
are required to hold it across calls. Now that the firmware activation
sequence is appropriately guarded, we can warn when the requirement
isn't satisfied.

__do_enter_rtas_trace() gets reorganized a bit as a result of
performing the function descriptor lookup unconditionally now.

Reviewed-by: Aneesh Kumar K.V (IBM) 
Signed-off-by: Nathan Lynch 
---
 arch/powerpc/kernel/rtas.c | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 72f6b5a402dd..7e793b503e29 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -671,28 +671,25 @@ static void __do_enter_rtas(struct rtas_args *args)
 
 static void __do_enter_rtas_trace(struct rtas_args *args)
 {
-   const char *name = NULL;
+   const struct rtas_function *func = 
rtas_token_to_function(be32_to_cpu(args->token));
 
-   if (args == _args)
-   lockdep_assert_held(_lock);
/*
-* If the tracepoints that consume the function name aren't
-* active, avoid the lookup.
+* If there is a per-function lock, it must be held by the
+* caller.
 */
-   if ((trace_rtas_input_enabled() || trace_rtas_output_enabled())) {
-   const s32 token = be32_to_cpu(args->token);
-   const struct rtas_function *func = 
rtas_token_to_function(token);
+   if (func->lock)
+   lockdep_assert_held(func->lock);
 
-   name = func->name;
-   }
+   if (args == _args)
+   lockdep_assert_held(_lock);
 
-   trace_rtas_input(args, name);
+   trace_rtas_input(args, func->name);
trace_rtas_ll_entry(args);
 
__do_enter_rtas(args);
 
trace_rtas_ll_exit(args);
-   trace_rtas_output(args, name);
+   trace_rtas_output(args, func->name);
 }
 
 static void do_enter_rtas(struct rtas_args *args)

-- 
2.41.0



[PATCH v5 04/13] powerpc/rtas: Add function return status constants

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

Not all of the generic RTAS function statuses specified in PAPR have
symbolic constants and descriptions in rtas.h. Fix this, providing a
little more background, slightly updating the existing wording, and
improving the formatting.

Reviewed-by: Aneesh Kumar K.V (IBM) 
Signed-off-by: Nathan Lynch 
---
 arch/powerpc/include/asm/rtas.h | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index a7110ed52e25..08d19e6904f7 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -201,12 +201,25 @@ typedef struct {
 /* Memory set aside for sys_rtas to use with calls that need a work area. */
 #define RTAS_USER_REGION_SIZE (64 * 1024)
 
-/* RTAS return status codes */
-#define RTAS_HARDWARE_ERROR-1/* Hardware Error */
-#define RTAS_BUSY  -2/* RTAS Busy */
-#define RTAS_INVALID_PARAMETER -3/* Invalid indicator/domain/sensor etc. */
-#define RTAS_EXTENDED_DELAY_MIN9900
-#define RTAS_EXTENDED_DELAY_MAX9905
+/*
+ * Common RTAS function return values, derived from the table "RTAS
+ * Status Word Values" in PAPR+ v2.13 7.2.8: "Return Codes". If a
+ * function can return a value in this table then generally it has the
+ * meaning listed here. More extended commentary in the documentation
+ * for rtas_call().
+ *
+ * RTAS functions may use negative and positive numbers not in this
+ * set for function-specific error and success conditions,
+ * respectively.
+ */
+#define RTAS_SUCCESS 0 /* Success. */
+#define RTAS_HARDWARE_ERROR -1 /* Hardware or other unspecified 
error. */
+#define RTAS_BUSY   -2 /* Retry immediately. */
+#define RTAS_INVALID_PARAMETER  -3 /* Invalid indicator/domain/sensor 
etc. */
+#define RTAS_UNEXPECTED_STATE_CHANGE-7 /* Seems limited to EEH and slot 
reset. */
+#define RTAS_EXTENDED_DELAY_MIN   9900 /* Retry after delaying for ~1ms. */
+#define RTAS_EXTENDED_DELAY_MAX   9905 /* Retry after delaying for ~100s. 
*/
+#define RTAS_ML_ISOLATION_ERROR  -9000 /* Multi-level isolation error. */
 
 /* statuses specific to ibm,suspend-me */
 #define RTAS_SUSPEND_ABORTED 9000 /* Suspension aborted */

-- 
2.41.0



[PATCH v5 06/13] powerpc/rtas: Facilitate high-level call sequences

2023-12-07 Thread Nathan Lynch via B4 Relay
From: Nathan Lynch 

On RTAS platforms there is a general restriction that the OS must not
enter RTAS on more than one CPU at a time. This low-level
serialization requirement is satisfied by holding a spin
lock (rtas_lock) across most RTAS function invocations.

However, some pseries RTAS functions require multiple successive calls
to complete a logical operation. Beginning a new call sequence for such a
function may disrupt any other sequences of that function already in
progress. Safe and reliable use of these functions effectively
requires higher-level serialization beyond what is already done at the
level of RTAS entry and exit.

Where a sequence-based RTAS function is invoked only through
sys_rtas(), with no in-kernel users, there is no issue as far as the
kernel is concerned. User space is responsible for appropriately
serializing its call sequences. (Whether user space code actually
takes measures to prevent sequence interleaving is another matter.)
Examples of such functions currently include ibm,platform-dump and
ibm,get-vpd.

But where a sequence-based RTAS function has both user space and
in-kernel uesrs, there is a hazard. Even if the in-kernel call sites
of such a function serialize their sequences correctly, a user of
sys_rtas() can invoke the same function at any time, potentially
disrupting a sequence in progress.

So in order to prevent disruption of kernel-based RTAS call sequences,
they must serialize not only with themselves but also with sys_rtas()
users, somehow. Preferably without adding more function-specific hacks
to sys_rtas(). This is a prerequisite for adding an in-kernel call
sequence of ibm,get-vpd, which is in a change to follow.

Note that it has never been feasible for the kernel to prevent
sys_rtas()-based sequences from being disrupted because control
returns to user space on every call. sys_rtas()-based users of these
functions have always been, and continue to be, responsible for
coordinating their call sequences with other users, even those which
may invoke the RTAS functions through less direct means than
sys_rtas(). This is an unavoidable consequence of exposing
sequence-based RTAS functions through sys_rtas().

* Add an optional mutex member to struct rtas_function.

* Statically define a mutex for each RTAS function with known call
  sequence serialization requirements, and assign its address to the
  .lock member of the corresponding function table entry, along with
  justifying commentary.

* In sys_rtas(), if the table entry for the RTAS function being
  called has a populated lock member, acquire it before taking
  rtas_lock and entering RTAS.

* Kernel-based RTAS call sequences are expected to access the
  appropriate mutex explicitly by name. For example, a user of the
  ibm,activate-firmware RTAS function would do:

int token = rtas_function_token(RTAS_FN_IBM_ACTIVATE_FIRMWARE);
int fwrc;

mutex_lock(_ibm_activate_firmware_lock);

do {
fwrc = rtas_call(token, 0, 1, NULL);
} while (rtas_busy_delay(fwrc));

mutex_unlock(_ibm_activate_firmware_lock);

There should be no perceivable change introduced here except that
concurrent callers of the same RTAS function via sys_rtas() may block
on a mutex instead of spinning on rtas_lock.

Signed-off-by: Nathan Lynch 
---
 arch/powerpc/include/asm/rtas.h |  3 ++
 arch/powerpc/kernel/rtas.c  | 83 +
 2 files changed, 86 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 08d19e6904f7..9bb2210c8d44 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -3,6 +3,7 @@
 #define _POWERPC_RTAS_H
 #ifdef __KERNEL__
 
+#include 
 #include 
 #include 
 #include 
@@ -512,6 +513,8 @@ extern char rtas_data_buf[RTAS_DATA_BUF_SIZE];
 /* RMO buffer reserved for user-space RTAS use */
 extern unsigned long rtas_rmo_buf;
 
+extern struct mutex rtas_ibm_get_vpd_lock;
+
 #define GLOBAL_INTERRUPT_QUEUE 9005
 
 /**
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index ca5bb0b994ac..4d28983e8b1d 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -70,14 +71,33 @@ struct rtas_filter {
  *ppc64le, and we want to keep it that way. It does
  *not make sense for this to be set when @filter
  *is NULL.
+ * @lock: Pointer to an optional dedicated per-function mutex. This
+ *should be set for functions that require multiple calls in
+ *sequence to complete a single operation, and such sequences
+ *will disrupt each other if allowed to interleave. Users of
+ *this function are required to hold the associated lock for
+ *the duration of the call sequence. Add an explanatory
+ *comment to the function 

Re: [PATCH] drm/amdgpu: drop the long-double-128 powerpc check/hack

2023-12-07 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 31/03/2023 à 12:53, Michael Ellerman a écrit :
>> "Daniel Kolesa"  writes:
>>> Commit c653c591789b ("drm/amdgpu: Re-enable DCN for 64-bit powerpc")
>>> introduced this check as a workaround for the driver not building
>>> with toolchains that default to 64-bit long double.
>> ...
>>> In mainline, this work is now fully done, so this check is fully
>>> redundant and does not do anything except preventing AMDGPU DC
>>> from being built on systems such as those using musl libc. The
>>> last piece of work to enable this was commit c92b7fe0d92a
>>> ("drm/amd/display: move remaining FPU code to dml folder")
>>> and this has since been backported to 6.1 stable (in 6.1.7).
>>>
>>> Relevant issue: https://gitlab.freedesktop.org/drm/amd/-/issues/2288
>> 
>> I looked to pick this up for 6.3 but was still seeing build errors with
>> some compilers. I assumed that was due to some fixes coming in
>> linux-next that I didn't have.
>> 
>> But applying the patch on v6.3-rc4 I still see build errors. This is
>> building allyesconfig with the kernel.org GCC 12.2.0 / binutils 2.39
>> toolchain:
>> 
>>powerpc64le-linux-gnu-ld: 
>> drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.o uses hard float, 
>> arch/powerpc/lib/test_emulate_step.o uses soft float
>>powerpc64le-linux-gnu-ld: failed to merge target specific data of file 
>> drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.o
>> 
>> etc.
>> 
>> All the conflicts are between test_emulate_step.o and some file in 
>> drivers/gpu/drm/amd/display/dc/dml.
>> 
>> So even with all the hard-float code isolated in the dml folder, we
>> still hit build errors, because allyesconfig wants to link those
>> hard-float using objects with soft-float objects from elsewhere in the
>> kernel.
>> 
>> It seems like the only workable fix is to force the kernel build to use
>> 128-bit long double. I'll send a patch doing that.
>> 
>
> Commit 78f0929884d4 ("powerpc/64: Always build with 128-bit long 
> double") I guess ?

Yes.

> Let's drop this patch from patchwork then.

Thanks.

cheers


Re: (subset) [PATCH 00/22] -Wmissing-prototype warning fixes

2023-12-07 Thread Michael Ellerman
On Wed, 08 Nov 2023 13:58:21 +0100, Arnd Bergmann wrote:
> I slightly dropped the ball on this since last sending the series in
> August, but a number of warning fixes have made it into the kernel in
> the meantime, both from my earlier submission and from architecture
> maintainers.
> 
> I have none patches that remain from the previous submission, with
> two of them reworked according to comments. The additional patches
> are from more testing across architectures and configurations that
> I had previously missed.
> 
> [...]

Applied to powerpc/next.

[17/22] powerpc: ps3: move udbg_shutdown_ps3gelic prototype

https://git.kernel.org/powerpc/c/04c40eed3f7ac48ddaf20104489510e743a53c47
[18/22] powerpc: pasemi: mark pas_shutdown() static

https://git.kernel.org/powerpc/c/0c9a768de64d24e38e27652b8c273725ccc31916
[19/22] powerpc: powermac: mark smp_psurge_{give,take}_timebase static

https://git.kernel.org/powerpc/c/afb36ac386783d2ef2ed839293c03fd06f470be0

cheers


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread Waiman Long



On 12/6/23 16:02, Waiman Long wrote:

On 12/6/23 14:55, Hans de Goede wrote:

Hi,

On 12/6/23 19:58, George Stark wrote:

Hello Hans

Thanks for the review.

On 12/6/23 18:01, Hans de Goede wrote:

Hi George,

On 12/4/23 19:05, George Stark wrote:

Using of devm API leads to certain order of releasing resources.
So all dependent resources which are not devm-wrapped should be 
deleted

with respect to devm-release order. Mutex is one of such objects that
often is bound to other resources and has no own devm wrapping.
Since mutex_destroy() actually does nothing in non-debug builds
frequently calling mutex_destroy() is just ignored which is safe 
for now

but wrong formally and can lead to a problem if mutex_destroy() is
extended so introduce devm_mutex_init().

Signed-off-by: George Stark 
---
   include/linux/devm-helpers.h | 18 ++
   1 file changed, 18 insertions(+)

diff --git a/include/linux/devm-helpers.h 
b/include/linux/devm-helpers.h

index 74891802200d..2f56e476776f 100644
--- a/include/linux/devm-helpers.h
+++ b/include/linux/devm-helpers.h
@@ -76,4 +76,22 @@ static inline int devm_work_autocancel(struct 
device *dev,

   return devm_add_action(dev, devm_work_drop, w);
   }
   +static inline void devm_mutex_release(void *res)
+{
+    mutex_destroy(res);
+}
+
+/**
+ * devm_mutex_init - Resource-managed mutex initialization
+ * @dev:    Device which lifetime work is bound to
+ * @lock:    Pointer to a mutex
+ *
+ * Initialize mutex which is automatically destroyed when driver 
is detached.

+ */
+static inline int devm_mutex_init(struct device *dev, struct 
mutex *lock)

+{
+    mutex_init(lock);
+    return devm_add_action_or_reset(dev, devm_mutex_release, lock);
+}
+
   #endif

mutex_destroy() only actually does anything if CONFIG_DEBUG_MUTEXES
is set, otherwise it is an empty inline-stub.

Adding a devres resource to the device just to call an empty inline
stub which is a no-op seems like a waste of resources. IMHO it
would be better to change this to:

static inline int devm_mutex_init(struct device *dev, struct mutex 
*lock)

{
 mutex_init(lock);
#ifdef CONFIG_DEBUG_MUTEXES
 return devm_add_action_or_reset(dev, devm_mutex_release, lock);
#else
 return 0;
#endif
}

To avoid the unnecessary devres allocation when
CONFIG_DEBUG_MUTEXES is not set.
Honestly saying I don't like unnecessary devres allocation either 
but the proposed approach has its own price:


1) we'll have more than one place with branching if mutex_destroy is 
empty or not using  indirect condition. If suddenly mutex_destroy is 
extended for non-debug code (in upstream branch or e.g. by someone 
for local debug) than there'll be a problem.


2) If mutex_destroy is empty or not depends on CONFIG_PREEMPT_RT 
option too. When CONFIG_PREEMPT_RT is on mutex_destroy is always empty.


As I see it only the mutex interface (mutex.h) has to say definitely 
if mutex_destroy must be called. Probably we could add some define 
to include/linux/mutex.h,like IS_MUTEX_DESTROY_REQUIRED and declare 
it near mutex_destroy definition itself.
That (a  IS_MUTEX_DESTROY_REQUIRED define) is an interesting idea. 
Lets see for v3 if the mutex maintainers will accept that and if not 
then I guess we will just need to live with the unnecessary devres 
allocation.


The purpose of calling mutex_destroy() is to mark a mutex as being 
destroyed so that any subsequent call to mutex_lock/unlock will cause 
a warning to be printed when CONFIG_DEBUG_MUTEXES is defined. I would 
not say that mutex_destroy() is required. Rather it is a nice to have 
for catching programming error.


OTOH, one thing that we can probably do in mutex.h is something like

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a33aa9eb9fc3..7db7862de3f1 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -83,6 +83,9 @@ struct mutex {

 extern void mutex_destroy(struct mutex *lock);

+/* mutex_destroy() is a real function, not a NOP */
+#define mutex_destroy  mutex_destroy
+
 #else



Now in some devm files, you can use the absense/presence of 
mutex_destroy macro to decide on what to do.


Cheers,
Longman



Re: [PATCH v1 2/4] of: Reimplement of_machine_is_compatible() using of_machine_compatible_match()

2023-12-07 Thread Rob Herring
On Wed, Dec 06, 2023 at 05:13:33PM +0100, Christophe Leroy wrote:
> of_machine_compatible_match() works with a table of strings.
> of_machine_is_compatible() is a simplier version with only one string.
> 
> Re-implement of_machine_is_compatible() by setting a table of strings
> with a single string then using of_machine_compatible_match().
> 
> Suggested-by: Rob Herring 
> Signed-off-by: Christophe Leroy 
> ---
>  drivers/of/base.c  | 22 +-
>  include/linux/of.h | 15 ++-
>  2 files changed, 15 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index 9020be2eb4d5..73c3a754bad1 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -414,27 +414,7 @@ bool of_machine_compatible_match(const char *const 
> *compats)
>  
>   return rc != 0;
>  }
> -
> -/**
> - * of_machine_is_compatible - Test root of device tree for a given 
> compatible value
> - * @compat: compatible string to look for in root node's compatible property.
> - *
> - * Return: A positive integer if the root node has the given value in its
> - * compatible property.
> - */
> -int of_machine_is_compatible(const char *compat)
> -{
> - struct device_node *root;
> - int rc = 0;
> -
> - root = of_find_node_by_path("/");
> - if (root) {
> - rc = of_device_is_compatible(root, compat);
> - of_node_put(root);
> - }
> - return rc;
> -}
> -EXPORT_SYMBOL(of_machine_is_compatible);
> +EXPORT_SYMBOL(of_machine_compatible_match);
>  
>  /**
>   *  __of_device_is_available - check if a device is available for use
> diff --git a/include/linux/of.h b/include/linux/of.h
> index e3418babc203..a0f70be5007f 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -402,9 +402,22 @@ extern void of_alias_scan(void * (*dt_alloc)(u64 size, 
> u64 align));
>  extern int of_alias_get_id(struct device_node *np, const char *stem);
>  extern int of_alias_get_highest_id(const char *stem);
>  
> -extern int of_machine_is_compatible(const char *compat);
>  bool of_machine_compatible_match(const char *const *compats);
>  
> +/**
> + * of_machine_is_compatible - Test root of device tree for a given 
> compatible value
> + * @compat: compatible string to look for in root node's compatible property.
> + *
> + * Return: A positive integer if the root node has the given value in its
> + * compatible property.

There is a subtle change that we really only return true/false instead 
of a score. In a quick scan, I don't see any callers caring. Lots of 
places we could use a list of compatibles instead though. Maybe this 
should just return a bool instead of int? Either way:

Reviewed-by: Rob Herring 


Re: [PATCH v1 1/4] of: Add of_machine_compatible_match()

2023-12-07 Thread Rob Herring


On Wed, 06 Dec 2023 17:13:32 +0100, Christophe Leroy wrote:
> From: Michael Ellerman 
> 
> We have of_machine_is_compatible() to check if a machine is compatible
> with a single compatible string. However some code is able to support
> multiple compatible boards, and so wants to check for one of many
> compatible strings.
> 
> So add of_machine_compatible_match() which takes a NULL terminated
> array of compatible strings to check against the root node's
> compatible property.
> 
> Compared to an open coded match this is slightly more self
> documenting, and also avoids the caller needing to juggle the root
> node either directly or via of_find_node_by_path().
> 
> Signed-off-by: Michael Ellerman 
> Signed-off-by: Christophe Leroy 
> ---
>  drivers/of/base.c  | 21 +
>  include/linux/of.h |  6 ++
>  2 files changed, 27 insertions(+)
> 

Reviewed-by: Rob Herring 



Re: [PATCH v1 4/4] powerpc: Stop using of_root

2023-12-07 Thread Rob Herring
On Wed, Dec 06, 2023 at 05:13:35PM +0100, Christophe Leroy wrote:
> Replace all usages of of_root by of_find_node_by_path("/")
> 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/kernel/secure_boot.c|  8 ++--
>  arch/powerpc/kexec/ranges.c  |  8 +---
>  arch/powerpc/mm/drmem.c  | 10 +-
>  arch/powerpc/mm/numa.c   |  6 --
>  arch/powerpc/platforms/52xx/efika.c  |  4 +++-
>  arch/powerpc/platforms/pasemi/pci.c  |  4 +++-
>  arch/powerpc/platforms/pseries/lparcfg.c |  6 +-
>  arch/powerpc/platforms/pseries/setup.c   | 12 +---
>  8 files changed, 40 insertions(+), 18 deletions(-)

Reviewed-by: Rob Herring 


Re: [PATCH v1 3/4] powerpc/machdep: Define 'compatibles' property in ppc_md and use it

2023-12-07 Thread Rob Herring
On Wed, Dec 06, 2023 at 05:13:34PM +0100, Christophe Leroy wrote:
> Most probe functions that do not use the 'compatible' string do
> nothing else than checking whether the machine is compatible with
> one of the strings in a NULL terminated table of strings.
> 
> Define that table of strings in ppc_md structure and check it directly
> from probe_machine() instead of using ppc_md.probe() for that.
> 
> Keep checking in ppc_md.probe() only for more complex probing.
> 
> All .compatible could be replaced with a single element NULL
> terminated list but that's not worth the churn. Can be do incrementaly
> in follow-up patches.
> 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/machdep.h|  1 +
>  arch/powerpc/kernel/setup-common.c|  2 ++
>  arch/powerpc/platforms/40x/ppc40x_simple.c|  9 +++--
>  arch/powerpc/platforms/512x/mpc512x_generic.c |  4 +---
>  arch/powerpc/platforms/52xx/lite5200.c| 10 +-
>  arch/powerpc/platforms/52xx/mpc5200_simple.c  | 10 +-
>  arch/powerpc/platforms/83xx/mpc830x_rdb.c | 10 +-
>  arch/powerpc/platforms/83xx/mpc831x_rdb.c | 10 +-
>  arch/powerpc/platforms/83xx/mpc837x_rdb.c | 10 +-
>  arch/powerpc/platforms/85xx/corenet_generic.c |  2 +-
>  arch/powerpc/platforms/85xx/tqm85xx.c | 10 +-
>  11 files changed, 14 insertions(+), 64 deletions(-)

> -/*
> - * Called very early, MMU is off, device-tree isn't unflattened
> - */

Certainly an out of date comment as the unflattened API was being 
called.

Reviewed-by: Rob Herring 


Re: [PATCH RFC/RFT 3/4] riscv: Stop emitting preventive sfence.vma for new userspace mappings

2023-12-07 Thread Christophe Leroy
The subject says "riscv:" but it changes core part and several arch. 
Maybe this commit should be split in two commits, one for API changes 
that changes flush_tlb_fix_spurious_fault() to 
flush_tlb_fix_spurious_write_fault() and adds 
flush_tlb_fix_spurious_read_fault() including the change in memory.c, 
then a second patch with the changes to riscv.

Le 07/12/2023 à 16:03, Alexandre Ghiti a écrit :
> The preventive sfence.vma were emitted because new mappings must be made
> visible to the page table walker, either the uarch caches invalid
> entries or not.
> 
> Actually, there is no need to preventively sfence.vma on new mappings for
> userspace, this should be handled only in the page fault path.
> 
> This allows to drastically reduce the number of sfence.vma emitted:
> 
> * Ubuntu boot to login:
> Before: ~630k sfence.vma
> After:  ~200k sfence.vma
> 
> * ltp - mmapstress01
> Before: ~45k
> After:  ~6.3k
> 
> * lmbench - lat_pagefault
> Before: ~665k
> After:   832 (!)
> 
> * lmbench - lat_mmap
> Before: ~546k
> After:   718 (!)
> 
> The only issue with the removal of sfence.vma in update_mmu_cache() is
> that on uarchs that cache invalid entries, those won't be invalidated
> until the process takes a fault: so that's an additional fault in those
> cases.
> 
> Signed-off-by: Alexandre Ghiti 
> ---
>   arch/arm64/include/asm/pgtable.h  |  2 +-
>   arch/mips/include/asm/pgtable.h   |  6 +--
>   arch/powerpc/include/asm/book3s/64/tlbflush.h |  8 ++--
>   arch/riscv/include/asm/pgtable.h  | 43 +++
>   include/linux/pgtable.h   |  8 +++-
>   mm/memory.c   | 12 +-
>   6 files changed, 48 insertions(+), 31 deletions(-)

Did you forget mm/pgtable-generic.c ?

> 
> diff --git a/arch/arm64/include/asm/pgtable.h 
> b/arch/arm64/include/asm/pgtable.h
> index 7f7d9b1df4e5..728f25f529a5 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void)
>* fault on one CPU which has been handled concurrently by another CPU
>* does not need to perform additional invalidation.
>*/
> -#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
> +#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) do { } while 
> (0)

Why do you need to do that change ? Nothing is explained about that in 
the commit message.

>   
>   /*
>* ZERO_PAGE is a global shared page that is always zero: used
> diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
> index 430b208c0130..84439fe6ed29 100644
> --- a/arch/mips/include/asm/pgtable.h
> +++ b/arch/mips/include/asm/pgtable.h
> @@ -478,9 +478,9 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
>   return __pgprot(prot);
>   }
>   
> -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> - unsigned long address,
> - pte_t *ptep)
> +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct 
> *vma,
> +   unsigned long address,
> +   pte_t *ptep)
>   {
>   }
>   
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
> b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 1950c1b825b4..7166d56f90db 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -128,10 +128,10 @@ static inline void flush_tlb_page(struct vm_area_struct 
> *vma,
>   #define flush_tlb_page(vma, addr)   local_flush_tlb_page(vma, addr)
>   #endif /* CONFIG_SMP */
>   
> -#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
> -static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> - unsigned long address,
> - pte_t *ptep)
> +#define flush_tlb_fix_spurious_write_fault flush_tlb_fix_spurious_write_fault
> +static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct 
> *vma,
> +   unsigned long address,
> +   pte_t *ptep)
>   {
>   /*
>* Book3S 64 does not require spurious fault flushes because the PTE
> diff --git a/arch/riscv/include/asm/pgtable.h 
> b/arch/riscv/include/asm/pgtable.h
> index b2ba3f79cfe9..89aa5650f104 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -472,28 +472,20 @@ static inline void update_mmu_cache_range(struct 
> vm_fault *vmf,
>   struct vm_area_struct *vma, unsigned long address,
>   pte_t *ptep, unsigned int nr)
>   {
> - /*
> -  * The kernel assumes that TLBs don't cache invalid entries, but
> -  * in 

Re: [PATCH v4 10/12] KVM: x86: never write to memory from kvm_vcpu_check_block()

2023-12-07 Thread Sean Christopherson
On Wed, Dec 06, 2023, Jim Mattson wrote:
> kvm_vcpu_check_block() is called while not in TASK_RUNNING, and therefore
> it cannot sleep.  Writing to guest memory is therefore forbidden, but it
> can happen on AMD processors if kvm_check_nested_events() causes a vmexit.
> 
> Fortunately, all events that are caught by kvm_check_nested_events() are
> also recognized by kvm_vcpu_has_events() through vendor callbacks such as
> kvm_x86_interrupt_allowed() or kvm_x86_ops.nested_ops->has_events(), so
> remove the call and postpone the actual processing to vcpu_block().
> 
> Opportunistically honor the return of kvm_check_nested_events().  KVM
> punted on the check in kvm_vcpu_running() because the only error path is
> if vmx_complete_nested_posted_interrupt() fails, in which case KVM exits
> to userspace with "internal error" i.e. the VM is likely dead anyways so
> it wasn't worth overloading the return of kvm_vcpu_running().
> 
> Add the check mostly so that KVM is consistent with itself; the return of
> the call via kvm_apic_accept_events()=>kvm_check_nested_events() that
> immediately follows  _is_ checked.
> 
> Reported-by: Maxim Levitsky 
> Signed-off-by: Paolo Bonzini 
> [sean: check and handle return of kvm_check_nested_events()]
> Signed-off-by: Sean Christopherson 
> ---
>  arch/x86/kvm/x86.c | 14 +++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index dcc675d4e44b..8aeacbc2bff9 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10815,6 +10815,17 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
>   return 1;
>   }
>  
> + /*
> +  * Evaluate nested events before exiting the halted state.  This allows
> +  * the halt state to be recorded properly in the VMCS12's activity
> +  * state field (AMD does not have a similar field and a VM-Exit always
> +  * causes a spurious wakeup from HLT).
> +  */
> + if (is_guest_mode(vcpu)) {
> + if (kvm_check_nested_events(vcpu) < 0)
> + return 0;
> + }
> +
>   if (kvm_apic_accept_events(vcpu) < 0)
>   return 0;
>   switch(vcpu->arch.mp_state) {
> @@ -10837,9 +10848,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
>  
>  static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
>  {
> - if (is_guest_mode(vcpu))
> - kvm_check_nested_events(vcpu);
> -
>   return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
>   !vcpu->arch.apf.halted);
>  }
> 
> This commit breaks delivery of a (virtualized) posted interrupt from
> an L1 vCPU to a halted L2 vCPU.
> 
> Looking back at commit e6c67d8cf117 ("KVM: nVMX: Wake blocked vCPU in
> guest-mode if pending interrupt in virtual APICv"), Liran wrote:
> 
> Note that this also handles the case of nested posted-interrupt by the
> fact RVI is updated in vmx_complete_nested_posted_interrupt() which is
> called from kvm_vcpu_check_block() -> kvm_arch_vcpu_runnable() ->
> kvm_vcpu_running() -> vmx_check_nested_events() ->
> vmx_complete_nested_posted_interrupt().
> 
> Clearly, that is no longer the case.

Doh.  We got the less obvious cases and missed the obvious one.

Ugh, and we also missed a related mess in kvm_guest_apic_has_interrupt().  That
thing should really be folded into vmx_has_nested_events().

Good gravy.  And vmx_interrupt_blocked() does the wrong thing because that
specifically checks if L1 interrupts are blocked.

Compile tested only, and definitely needs to be chunked into multiple patches,
but I think something like this mess?

---
 arch/x86/include/asm/kvm-x86-ops.h |  1 -
 arch/x86/include/asm/kvm_host.h|  1 -
 arch/x86/kvm/lapic.c   | 20 ++---
 arch/x86/kvm/lapic.h   | 12 ++
 arch/x86/kvm/vmx/nested.c  | 36 --
 arch/x86/kvm/vmx/vmx.c | 34 
 arch/x86/kvm/vmx/vmx.h |  1 +
 arch/x86/kvm/x86.c | 10 +
 8 files changed, 59 insertions(+), 56 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h 
b/arch/x86/include/asm/kvm-x86-ops.h
index 378ed944b849..6f81774c1dd0 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -85,7 +85,6 @@ KVM_X86_OP_OPTIONAL(update_cr8_intercept)
 KVM_X86_OP(refresh_apicv_exec_ctrl)
 KVM_X86_OP_OPTIONAL(hwapic_irr_update)
 KVM_X86_OP_OPTIONAL(hwapic_isr_update)
-KVM_X86_OP_OPTIONAL_RET0(guest_apic_has_interrupt)
 KVM_X86_OP_OPTIONAL(load_eoi_exitmap)
 KVM_X86_OP_OPTIONAL(set_virtual_apic_mode)
 KVM_X86_OP_OPTIONAL(set_apic_access_page_addr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c8c7e2475a18..fc1466035a8c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1685,7 +1685,6 @@ struct kvm_x86_ops {
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
  

Re: [PATCH RFC/RFT 2/4] riscv: Add a runtime detection of invalid TLB entries caching

2023-12-07 Thread Christophe Leroy


Le 07/12/2023 à 16:03, Alexandre Ghiti a écrit :
> This mechanism allows to completely bypass the sfence.vma introduced by
> the previous commit for uarchs that do not cache invalid TLB entries.
> 
> Signed-off-by: Alexandre Ghiti 
> ---
>   arch/riscv/mm/init.c | 124 +++
>   1 file changed, 124 insertions(+)
> 
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 379403de6c6f..2e854613740c 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -56,6 +56,8 @@ bool pgtable_l5_enabled = IS_ENABLED(CONFIG_64BIT) && 
> !IS_ENABLED(CONFIG_XIP_KER
>   EXPORT_SYMBOL(pgtable_l4_enabled);
>   EXPORT_SYMBOL(pgtable_l5_enabled);
>   
> +bool tlb_caching_invalid_entries;
> +
>   phys_addr_t phys_ram_base __ro_after_init;
>   EXPORT_SYMBOL(phys_ram_base);
>   
> @@ -750,6 +752,18 @@ static void __init disable_pgtable_l4(void)
>   satp_mode = SATP_MODE_39;
>   }
>   
> +static void __init enable_pgtable_l5(void)
> +{
> + pgtable_l5_enabled = true;
> + satp_mode = SATP_MODE_57;
> +}
> +
> +static void __init enable_pgtable_l4(void)
> +{
> + pgtable_l4_enabled = true;
> + satp_mode = SATP_MODE_48;
> +}
> +
>   static int __init print_no4lvl(char *p)
>   {
>   pr_info("Disabled 4-level and 5-level paging");
> @@ -826,6 +840,112 @@ static __init void set_satp_mode(uintptr_t dtb_pa)
>   memset(early_pud, 0, PAGE_SIZE);
>   memset(early_pmd, 0, PAGE_SIZE);
>   }
> +
> +/* Determine at runtime if the uarch caches invalid TLB entries */
> +static __init void set_tlb_caching_invalid_entries(void)
> +{
> +#define NR_RETRIES_CACHING_INVALID_ENTRIES   50

Looks odd to have macros nested in the middle of a function.

> + uintptr_t set_tlb_caching_invalid_entries_pmd = ((unsigned 
> long)set_tlb_caching_invalid_entries) & PMD_MASK;
> + // TODO the test_addr as defined below could go into another pud...
> + uintptr_t test_addr = set_tlb_caching_invalid_entries_pmd + 2 * 
> PMD_SIZE;
> + pmd_t valid_pmd;
> + u64 satp;
> + int i = 0;
> +
> + /* To ease the page table creation */
> + disable_pgtable_l5();
> + disable_pgtable_l4();
> +
> + /* Establish a mapping for set_tlb_caching_invalid_entries() in sv39 */
> + create_pgd_mapping(early_pg_dir,
> +set_tlb_caching_invalid_entries_pmd,
> +(uintptr_t)early_pmd,
> +PGDIR_SIZE, PAGE_TABLE);
> +
> + /* Handle the case where set_tlb_caching_invalid_entries straddles 2 
> PMDs */
> + create_pmd_mapping(early_pmd,
> +set_tlb_caching_invalid_entries_pmd,
> +set_tlb_caching_invalid_entries_pmd,
> +PMD_SIZE, PAGE_KERNEL_EXEC);
> + create_pmd_mapping(early_pmd,
> +set_tlb_caching_invalid_entries_pmd + PMD_SIZE,
> +set_tlb_caching_invalid_entries_pmd + PMD_SIZE,
> +PMD_SIZE, PAGE_KERNEL_EXEC);
> +
> + /* Establish an invalid mapping */
> + create_pmd_mapping(early_pmd, test_addr, 0, PMD_SIZE, __pgprot(0));
> +
> + /* Precompute the valid pmd here because the mapping for pfn_pmd() 
> won't exist */
> + valid_pmd = pfn_pmd(PFN_DOWN(set_tlb_caching_invalid_entries_pmd), 
> PAGE_KERNEL);
> +
> + local_flush_tlb_all();
> + satp = PFN_DOWN((uintptr_t)_pg_dir) | SATP_MODE_39;
> + csr_write(CSR_SATP, satp);
> +
> + /*
> +  * Set stvec to after the trapping access, access this invalid mapping
> +  * and legitimately trap
> +  */
> + // TODO: Should I save the previous stvec?
> +#define ASM_STR(x)   __ASM_STR(x)

Looks odd to have macros nested in the middle of a function.


> + asm volatile(
> + "la a0, 1f  \n"
> + "csrw " ASM_STR(CSR_TVEC) ", a0 \n"
> + "ld a0, 0(%0)   \n"
> + ".align 2   \n"
> + "1: \n"
> + :
> + : "r" (test_addr)
> + : "a0"
> + );
> +
> + /* Now establish a valid mapping to check if the invalid one is cached 
> */
> + early_pmd[pmd_index(test_addr)] = valid_pmd;
> +
> + /*
> +  * Access the valid mapping multiple times: indeed, we can't use
> +  * sfence.vma as a barrier to make sure the cpu did not reorder accesses
> +  * so we may trap even if the uarch does not cache invalid entries. By
> +  * trying a few times, we make sure that those uarchs will see the right
> +  * mapping at some point.
> +  */
> +
> + i = NR_RETRIES_CACHING_INVALID_ENTRIES;
> +
> +#define ASM_STR(x)   __ASM_STR(x)

Deplicate define ?

> + asm_volatile_goto(
> + "la a0, 1f  \n"
> + "csrw " ASM_STR(CSR_TVEC) ", a0 \n"
> + ".align 2 

Re: [PATCH RFC/RFT 1/4] riscv: Stop emitting preventive sfence.vma for new vmalloc mappings

2023-12-07 Thread Christophe Leroy


Le 07/12/2023 à 16:03, Alexandre Ghiti a écrit :
> In 6.5, we removed the vmalloc fault path because that can't work (see
> [1] [2]). Then in order to make sure that new page table entries were
> seen by the page table walker, we had to preventively emit a sfence.vma
> on all harts [3] but this solution is very costly since it relies on IPI.
> 
> And even there, we could end up in a loop of vmalloc faults if a vmalloc
> allocation is done in the IPI path (for example if it is traced, see
> [4]), which could result in a kernel stack overflow.
> 
> Those preventive sfence.vma needed to be emitted because:
> 
> - if the uarch caches invalid entries, the new mapping may not be
>observed by the page table walker and an invalidation may be needed.
> - if the uarch does not cache invalid entries, a reordered access
>could "miss" the new mapping and traps: in that case, we would actually
>only need to retry the access, no sfence.vma is required.
> 
> So this patch removes those preventive sfence.vma and actually handles
> the possible (and unlikely) exceptions. And since the kernel stacks
> mappings lie in the vmalloc area, this handling must be done very early
> when the trap is taken, at the very beginning of handle_exception: this
> also rules out the vmalloc allocations in the fault path.
> 
> Note that for now, we emit a sfence.vma even for uarchs that do not
> cache invalid entries as we have no means to know that: that will be
> fixed in the next patch.
> 
> Link: 
> https://lore.kernel.org/linux-riscv/20230531093817.665799-1-bj...@kernel.org/ 
> [1]
> Link: 
> https://lore.kernel.org/linux-riscv/20230801090927.2018653-1-dy...@andestech.com
>  [2]
> Link: 
> https://lore.kernel.org/linux-riscv/20230725132246.817726-1-alexgh...@rivosinc.com/
>  [3]
> Link: https://lore.kernel.org/lkml/20200508144043.13893-1-j...@8bytes.org/ [4]
> Signed-off-by: Alexandre Ghiti 
> ---
>   arch/riscv/include/asm/cacheflush.h  | 19 +-
>   arch/riscv/include/asm/thread_info.h |  5 ++
>   arch/riscv/kernel/asm-offsets.c  |  5 ++
>   arch/riscv/kernel/entry.S| 94 
>   arch/riscv/mm/init.c |  2 +
>   5 files changed, 124 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/riscv/include/asm/cacheflush.h 
> b/arch/riscv/include/asm/cacheflush.h
> index 3cb53c4df27c..a916cbc69d47 100644
> --- a/arch/riscv/include/asm/cacheflush.h
> +++ b/arch/riscv/include/asm/cacheflush.h
> @@ -37,7 +37,24 @@ static inline void flush_dcache_page(struct page *page)
>   flush_icache_mm(vma->vm_mm, 0)
>   
>   #ifdef CONFIG_64BIT
> -#define flush_cache_vmap(start, end) flush_tlb_kernel_range(start, end)
> +extern u64 new_vmalloc[];

Can you have the table size here ? Would help GCC static analysis for 
boundary checking.

> +extern char _end[];
> +#define flush_cache_vmap flush_cache_vmap
> +static inline void flush_cache_vmap(unsigned long start, unsigned long end)
> +{
> + if ((start < VMALLOC_END && end > VMALLOC_START) ||
> + (start < MODULES_END && end > MODULES_VADDR)) {

Can you use is_vmalloc_or_module_addr() instead ?


> + int i;
> +
> + /*
> +  * We don't care if concurrently a cpu resets this value since
> +  * the only place this can happen is in handle_exception() where
> +  * an sfence.vma is emitted.
> +  */
> + for (i = 0; i < NR_CPUS / sizeof(u64) + 1; ++i)

Use ARRAY_SIZE() ?

> + new_vmalloc[i] = -1ULL;
> + }
> +}
>   #endif
>   
>   #ifndef CONFIG_SMP
> diff --git a/arch/riscv/include/asm/thread_info.h 
> b/arch/riscv/include/asm/thread_info.h
> index 1833beb00489..8fe12fa6c329 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -60,6 +60,11 @@ struct thread_info {
>   longuser_sp;/* User stack pointer */
>   int cpu;
>   unsigned long   syscall_work;   /* SYSCALL_WORK_ flags */
> + /*
> +  * Used in handle_exception() to save a0, a1 and a2 before knowing if we
> +  * can access the kernel stack.
> +  */
> + unsigned long   a0, a1, a2;
>   };
>   
>   /*
> diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
> index d6a75aac1d27..340c1c84560d 100644
> --- a/arch/riscv/kernel/asm-offsets.c
> +++ b/arch/riscv/kernel/asm-offsets.c
> @@ -34,10 +34,15 @@ void asm_offsets(void)
>   OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
>   OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
>   OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
> +
> + OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
>   OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
>   OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
>   OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
>   OFFSET(TASK_TI_USER_SP, task_struct, 

[PATCH RFC/RFT 4/4] TEMP: riscv: Add debugfs interface to retrieve #sfence.vma

2023-12-07 Thread Alexandre Ghiti
This is useful for testing/benchmarking.

Signed-off-by: Alexandre Ghiti 
---
 arch/riscv/include/asm/pgtable.h  |  6 --
 arch/riscv/include/asm/tlbflush.h |  4 
 arch/riscv/kernel/sbi.c   | 12 
 arch/riscv/mm/tlbflush.c  | 17 +
 4 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 89aa5650f104..b0855a620cfd 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -550,7 +550,7 @@ static inline int ptep_set_access_flags(struct 
vm_area_struct *vma,
return false;
 }
 
-extern u64 nr_sfence_vma_handle_exception;
+extern u64 nr_sfence_vma_spurious_read;
 extern bool tlb_caching_invalid_entries;
 
 #define flush_tlb_fix_spurious_read_fault flush_tlb_fix_spurious_read_fault
@@ -558,8 +558,10 @@ static inline void 
flush_tlb_fix_spurious_read_fault(struct vm_area_struct *vma,
 unsigned long address,
 pte_t *ptep)
 {
-   if (tlb_caching_invalid_entries)
+   if (tlb_caching_invalid_entries) {
+   __sync_fetch_and_add(_sfence_vma_spurious_read, 1UL);
flush_tlb_page(vma, address);
+   }
 }
 
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
diff --git a/arch/riscv/include/asm/tlbflush.h 
b/arch/riscv/include/asm/tlbflush.h
index a09196f8de68..f419ec9d2207 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -14,14 +14,18 @@
 #ifdef CONFIG_MMU
 extern unsigned long asid_mask;
 
+extern u64 nr_sfence_vma, nr_sfence_vma_all, nr_sfence_vma_all_asid;
+
 static inline void local_flush_tlb_all(void)
 {
+   __sync_fetch_and_add(_sfence_vma_all, 1UL);
__asm__ __volatile__ ("sfence.vma" : : : "memory");
 }
 
 /* Flush one page from local TLB */
 static inline void local_flush_tlb_page(unsigned long addr)
 {
+   __sync_fetch_and_add(_sfence_vma, 1UL);
ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) 
: "memory"));
 }
 #else /* CONFIG_MMU */
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index c672c8ba9a2a..ac1617759583 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -376,6 +376,8 @@ int sbi_remote_fence_i(const struct cpumask *cpu_mask)
 }
 EXPORT_SYMBOL(sbi_remote_fence_i);
 
+extern u64 nr_sfence_vma, nr_sfence_vma_all, nr_sfence_vma_all_asid;
+
 /**
  * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote
  *  harts for the specified virtual address range.
@@ -389,6 +391,11 @@ int sbi_remote_sfence_vma(const struct cpumask *cpu_mask,
   unsigned long start,
   unsigned long size)
 {
+   if (size == (unsigned long)-1)
+   __sync_fetch_and_add(_sfence_vma_all, 1UL);
+   else
+   __sync_fetch_and_add(_sfence_vma, ALIGN(size, PAGE_SIZE) / 
PAGE_SIZE);
+
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
cpu_mask, start, size, 0, 0);
 }
@@ -410,6 +417,11 @@ int sbi_remote_sfence_vma_asid(const struct cpumask 
*cpu_mask,
unsigned long size,
unsigned long asid)
 {
+   if (size == (unsigned long)-1)
+   __sync_fetch_and_add(_sfence_vma_all_asid, 1UL);
+   else
+   __sync_fetch_and_add(_sfence_vma, ALIGN(size, PAGE_SIZE) / 
PAGE_SIZE);
+
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID,
cpu_mask, start, size, asid, 0);
 }
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index 77be59aadc73..75a3e2dff16a 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -3,11 +3,16 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
+u64 nr_sfence_vma, nr_sfence_vma_all, nr_sfence_vma_all_asid,
+   nr_sfence_vma_handle_exception, nr_sfence_vma_spurious_read;
+
 static inline void local_flush_tlb_all_asid(unsigned long asid)
 {
+   __sync_fetch_and_add(_sfence_vma_all_asid, 1);
__asm__ __volatile__ ("sfence.vma x0, %0"
:
: "r" (asid)
@@ -17,6 +22,7 @@ static inline void local_flush_tlb_all_asid(unsigned long 
asid)
 static inline void local_flush_tlb_page_asid(unsigned long addr,
unsigned long asid)
 {
+   __sync_fetch_and_add(_sfence_vma, 1);
__asm__ __volatile__ ("sfence.vma %0, %1"
:
: "r" (addr), "r" (asid)
@@ -149,3 +155,14 @@ void flush_pmd_tlb_range(struct vm_area_struct *vma, 
unsigned long start,
__flush_tlb_range(vma->vm_mm, start, end - start, PMD_SIZE);
 }
 #endif
+
+static int debugfs_nr_sfence_vma(void)
+{
+   debugfs_create_u64("nr_sfence_vma", 0444, NULL, _sfence_vma);
+   

[PATCH RFC/RFT 3/4] riscv: Stop emitting preventive sfence.vma for new userspace mappings

2023-12-07 Thread Alexandre Ghiti
The preventive sfence.vma were emitted because new mappings must be made
visible to the page table walker, either the uarch caches invalid
entries or not.

Actually, there is no need to preventively sfence.vma on new mappings for
userspace, this should be handled only in the page fault path.

This allows to drastically reduce the number of sfence.vma emitted:

* Ubuntu boot to login:
Before: ~630k sfence.vma
After:  ~200k sfence.vma

* ltp - mmapstress01
Before: ~45k
After:  ~6.3k

* lmbench - lat_pagefault
Before: ~665k
After:   832 (!)

* lmbench - lat_mmap
Before: ~546k
After:   718 (!)

The only issue with the removal of sfence.vma in update_mmu_cache() is
that on uarchs that cache invalid entries, those won't be invalidated
until the process takes a fault: so that's an additional fault in those
cases.

Signed-off-by: Alexandre Ghiti 
---
 arch/arm64/include/asm/pgtable.h  |  2 +-
 arch/mips/include/asm/pgtable.h   |  6 +--
 arch/powerpc/include/asm/book3s/64/tlbflush.h |  8 ++--
 arch/riscv/include/asm/pgtable.h  | 43 +++
 include/linux/pgtable.h   |  8 +++-
 mm/memory.c   | 12 +-
 6 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 7f7d9b1df4e5..728f25f529a5 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -57,7 +57,7 @@ static inline bool arch_thp_swp_supported(void)
  * fault on one CPU which has been handled concurrently by another CPU
  * does not need to perform additional invalidation.
  */
-#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
+#define flush_tlb_fix_spurious_write_fault(vma, address, ptep) do { } while (0)
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 430b208c0130..84439fe6ed29 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -478,9 +478,9 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
return __pgprot(prot);
 }
 
-static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
-   unsigned long address,
-   pte_t *ptep)
+static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct 
*vma,
+ unsigned long address,
+ pte_t *ptep)
 {
 }
 
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 1950c1b825b4..7166d56f90db 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -128,10 +128,10 @@ static inline void flush_tlb_page(struct vm_area_struct 
*vma,
 #define flush_tlb_page(vma, addr)  local_flush_tlb_page(vma, addr)
 #endif /* CONFIG_SMP */
 
-#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
-static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
-   unsigned long address,
-   pte_t *ptep)
+#define flush_tlb_fix_spurious_write_fault flush_tlb_fix_spurious_write_fault
+static inline void flush_tlb_fix_spurious_write_fault(struct vm_area_struct 
*vma,
+ unsigned long address,
+ pte_t *ptep)
 {
/*
 * Book3S 64 does not require spurious fault flushes because the PTE
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index b2ba3f79cfe9..89aa5650f104 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -472,28 +472,20 @@ static inline void update_mmu_cache_range(struct vm_fault 
*vmf,
struct vm_area_struct *vma, unsigned long address,
pte_t *ptep, unsigned int nr)
 {
-   /*
-* The kernel assumes that TLBs don't cache invalid entries, but
-* in RISC-V, SFENCE.VMA specifies an ordering constraint, not a
-* cache flush; it is necessary even after writing invalid entries.
-* Relying on flush_tlb_fix_spurious_fault would suffice, but
-* the extra traps reduce performance.  So, eagerly SFENCE.VMA.
-*/
-   while (nr--)
-   local_flush_tlb_page(address + nr * PAGE_SIZE);
 }
 #define update_mmu_cache(vma, addr, ptep) \
update_mmu_cache_range(NULL, vma, addr, ptep, 1)
 
 #define __HAVE_ARCH_UPDATE_MMU_TLB
-#define update_mmu_tlb update_mmu_cache
+static inline void update_mmu_tlb(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+   flush_tlb_range(vma, address, address + 

[PATCH RFC/RFT 2/4] riscv: Add a runtime detection of invalid TLB entries caching

2023-12-07 Thread Alexandre Ghiti
This mechanism allows to completely bypass the sfence.vma introduced by
the previous commit for uarchs that do not cache invalid TLB entries.

Signed-off-by: Alexandre Ghiti 
---
 arch/riscv/mm/init.c | 124 +++
 1 file changed, 124 insertions(+)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 379403de6c6f..2e854613740c 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -56,6 +56,8 @@ bool pgtable_l5_enabled = IS_ENABLED(CONFIG_64BIT) && 
!IS_ENABLED(CONFIG_XIP_KER
 EXPORT_SYMBOL(pgtable_l4_enabled);
 EXPORT_SYMBOL(pgtable_l5_enabled);
 
+bool tlb_caching_invalid_entries;
+
 phys_addr_t phys_ram_base __ro_after_init;
 EXPORT_SYMBOL(phys_ram_base);
 
@@ -750,6 +752,18 @@ static void __init disable_pgtable_l4(void)
satp_mode = SATP_MODE_39;
 }
 
+static void __init enable_pgtable_l5(void)
+{
+   pgtable_l5_enabled = true;
+   satp_mode = SATP_MODE_57;
+}
+
+static void __init enable_pgtable_l4(void)
+{
+   pgtable_l4_enabled = true;
+   satp_mode = SATP_MODE_48;
+}
+
 static int __init print_no4lvl(char *p)
 {
pr_info("Disabled 4-level and 5-level paging");
@@ -826,6 +840,112 @@ static __init void set_satp_mode(uintptr_t dtb_pa)
memset(early_pud, 0, PAGE_SIZE);
memset(early_pmd, 0, PAGE_SIZE);
 }
+
+/* Determine at runtime if the uarch caches invalid TLB entries */
+static __init void set_tlb_caching_invalid_entries(void)
+{
+#define NR_RETRIES_CACHING_INVALID_ENTRIES 50
+   uintptr_t set_tlb_caching_invalid_entries_pmd = ((unsigned 
long)set_tlb_caching_invalid_entries) & PMD_MASK;
+   // TODO the test_addr as defined below could go into another pud...
+   uintptr_t test_addr = set_tlb_caching_invalid_entries_pmd + 2 * 
PMD_SIZE;
+   pmd_t valid_pmd;
+   u64 satp;
+   int i = 0;
+
+   /* To ease the page table creation */
+   disable_pgtable_l5();
+   disable_pgtable_l4();
+
+   /* Establish a mapping for set_tlb_caching_invalid_entries() in sv39 */
+   create_pgd_mapping(early_pg_dir,
+  set_tlb_caching_invalid_entries_pmd,
+  (uintptr_t)early_pmd,
+  PGDIR_SIZE, PAGE_TABLE);
+
+   /* Handle the case where set_tlb_caching_invalid_entries straddles 2 
PMDs */
+   create_pmd_mapping(early_pmd,
+  set_tlb_caching_invalid_entries_pmd,
+  set_tlb_caching_invalid_entries_pmd,
+  PMD_SIZE, PAGE_KERNEL_EXEC);
+   create_pmd_mapping(early_pmd,
+  set_tlb_caching_invalid_entries_pmd + PMD_SIZE,
+  set_tlb_caching_invalid_entries_pmd + PMD_SIZE,
+  PMD_SIZE, PAGE_KERNEL_EXEC);
+
+   /* Establish an invalid mapping */
+   create_pmd_mapping(early_pmd, test_addr, 0, PMD_SIZE, __pgprot(0));
+
+   /* Precompute the valid pmd here because the mapping for pfn_pmd() 
won't exist */
+   valid_pmd = pfn_pmd(PFN_DOWN(set_tlb_caching_invalid_entries_pmd), 
PAGE_KERNEL);
+
+   local_flush_tlb_all();
+   satp = PFN_DOWN((uintptr_t)_pg_dir) | SATP_MODE_39;
+   csr_write(CSR_SATP, satp);
+
+   /*
+* Set stvec to after the trapping access, access this invalid mapping
+* and legitimately trap
+*/
+   // TODO: Should I save the previous stvec?
+#define ASM_STR(x) __ASM_STR(x)
+   asm volatile(
+   "la a0, 1f  \n"
+   "csrw " ASM_STR(CSR_TVEC) ", a0 \n"
+   "ld a0, 0(%0)   \n"
+   ".align 2   \n"
+   "1: \n"
+   :
+   : "r" (test_addr)
+   : "a0"
+   );
+
+   /* Now establish a valid mapping to check if the invalid one is cached 
*/
+   early_pmd[pmd_index(test_addr)] = valid_pmd;
+
+   /*
+* Access the valid mapping multiple times: indeed, we can't use
+* sfence.vma as a barrier to make sure the cpu did not reorder accesses
+* so we may trap even if the uarch does not cache invalid entries. By
+* trying a few times, we make sure that those uarchs will see the right
+* mapping at some point.
+*/
+
+   i = NR_RETRIES_CACHING_INVALID_ENTRIES;
+
+#define ASM_STR(x) __ASM_STR(x)
+   asm_volatile_goto(
+   "la a0, 1f  \n"
+   "csrw " ASM_STR(CSR_TVEC) ", a0 \n"
+   ".align 2   \n"
+   "1: \n"
+   "addi %0, %0, -1\n"
+   "blt %0, zero, %l[caching_invalid_entries]  \n"
+   "ld a0, 0(%1)   \n"
+   :
+   

[PATCH RFC/RFT 1/4] riscv: Stop emitting preventive sfence.vma for new vmalloc mappings

2023-12-07 Thread Alexandre Ghiti
In 6.5, we removed the vmalloc fault path because that can't work (see
[1] [2]). Then in order to make sure that new page table entries were
seen by the page table walker, we had to preventively emit a sfence.vma
on all harts [3] but this solution is very costly since it relies on IPI.

And even there, we could end up in a loop of vmalloc faults if a vmalloc
allocation is done in the IPI path (for example if it is traced, see
[4]), which could result in a kernel stack overflow.

Those preventive sfence.vma needed to be emitted because:

- if the uarch caches invalid entries, the new mapping may not be
  observed by the page table walker and an invalidation may be needed.
- if the uarch does not cache invalid entries, a reordered access
  could "miss" the new mapping and traps: in that case, we would actually
  only need to retry the access, no sfence.vma is required.

So this patch removes those preventive sfence.vma and actually handles
the possible (and unlikely) exceptions. And since the kernel stacks
mappings lie in the vmalloc area, this handling must be done very early
when the trap is taken, at the very beginning of handle_exception: this
also rules out the vmalloc allocations in the fault path.

Note that for now, we emit a sfence.vma even for uarchs that do not
cache invalid entries as we have no means to know that: that will be
fixed in the next patch.

Link: 
https://lore.kernel.org/linux-riscv/20230531093817.665799-1-bj...@kernel.org/ 
[1]
Link: 
https://lore.kernel.org/linux-riscv/20230801090927.2018653-1-dy...@andestech.com
 [2]
Link: 
https://lore.kernel.org/linux-riscv/20230725132246.817726-1-alexgh...@rivosinc.com/
 [3]
Link: https://lore.kernel.org/lkml/20200508144043.13893-1-j...@8bytes.org/ [4]
Signed-off-by: Alexandre Ghiti 
---
 arch/riscv/include/asm/cacheflush.h  | 19 +-
 arch/riscv/include/asm/thread_info.h |  5 ++
 arch/riscv/kernel/asm-offsets.c  |  5 ++
 arch/riscv/kernel/entry.S| 94 
 arch/riscv/mm/init.c |  2 +
 5 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/arch/riscv/include/asm/cacheflush.h 
b/arch/riscv/include/asm/cacheflush.h
index 3cb53c4df27c..a916cbc69d47 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -37,7 +37,24 @@ static inline void flush_dcache_page(struct page *page)
flush_icache_mm(vma->vm_mm, 0)
 
 #ifdef CONFIG_64BIT
-#define flush_cache_vmap(start, end)   flush_tlb_kernel_range(start, end)
+extern u64 new_vmalloc[];
+extern char _end[];
+#define flush_cache_vmap flush_cache_vmap
+static inline void flush_cache_vmap(unsigned long start, unsigned long end)
+{
+   if ((start < VMALLOC_END && end > VMALLOC_START) ||
+   (start < MODULES_END && end > MODULES_VADDR)) {
+   int i;
+
+   /*
+* We don't care if concurrently a cpu resets this value since
+* the only place this can happen is in handle_exception() where
+* an sfence.vma is emitted.
+*/
+   for (i = 0; i < NR_CPUS / sizeof(u64) + 1; ++i)
+   new_vmalloc[i] = -1ULL;
+   }
+}
 #endif
 
 #ifndef CONFIG_SMP
diff --git a/arch/riscv/include/asm/thread_info.h 
b/arch/riscv/include/asm/thread_info.h
index 1833beb00489..8fe12fa6c329 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -60,6 +60,11 @@ struct thread_info {
longuser_sp;/* User stack pointer */
int cpu;
unsigned long   syscall_work;   /* SYSCALL_WORK_ flags */
+   /*
+* Used in handle_exception() to save a0, a1 and a2 before knowing if we
+* can access the kernel stack.
+*/
+   unsigned long   a0, a1, a2;
 };
 
 /*
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index d6a75aac1d27..340c1c84560d 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -34,10 +34,15 @@ void asm_offsets(void)
OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+
+   OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
OFFSET(TASK_TI_FLAGS, task_struct, thread_info.flags);
OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp);
+   OFFSET(TASK_TI_A0, task_struct, thread_info.a0);
+   OFFSET(TASK_TI_A1, task_struct, thread_info.a1);
+   OFFSET(TASK_TI_A2, task_struct, thread_info.a2);
 
OFFSET(TASK_THREAD_F0,  task_struct, thread.fstate.f[0]);
OFFSET(TASK_THREAD_F1,  task_struct, thread.fstate.f[1]);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S

[PATCH RFC/RFT 0/4] Remove preventive sfence.vma

2023-12-07 Thread Alexandre Ghiti
In RISC-V, after a new mapping is established, a sfence.vma needs to be
emitted for different reasons:

- if the uarch caches invalid entries, we need to invalidate it otherwise
  we would trap on this invalid entry,
- if the uarch does not cache invalid entries, a reordered access could fail
  to see the new mapping and then trap (sfence.vma acts as a fence).

We can actually avoid emitting those (mostly) useless and costly sfence.vma
by handling the traps instead:

- for new kernel mappings: only vmalloc mappings need to be taken care of,
  other new mapping are rare and already emit the required sfence.vma if
  needed.
  That must be achieved very early in the exception path as explained in
  patch 1, and this also fixes our fragile way of dealing with vmalloc faults.

- for new user mappings: that can be handled in the page fault path as done
  in patch 3.

Patch 2 is certainly a TEMP patch which allows to detect at runtime if a
uarch caches invalid TLB entries.

Patch 4 is a TEMP patch which allows to expose through debugfs the different
sfence.vma that are emitted, which can be used for benchmarking.

On our uarch that does not cache invalid entries and a 6.5 kernel, the
gains are measurable:

* Kernel boot:  6%
* ltp - mmapstress01:   8%
* lmbench - lat_pagefault:  20%
* lmbench - lat_mmap:   5%

On uarchs that cache invalid entries, the results are more mitigated and
need to be explored more thoroughly (if anyone is interested!): that can
be explained by the extra page faults, which depending on "how much" the
uarch caches invalid entries, could kill the benefits of removing the
preventive sfence.vma.

Ved Shanbhogue has prepared a new extension to be used by uarchs that do
not cache invalid entries, which will certainly be used instead of patch 2.

Thanks to Ved and Matt Evans for triggering the discussion that led to
this patchset!

That's an RFC, so please don't mind the checkpatch warnings and dirty
comments. It applies on 6.6.

Any feedback, test or relevant benchmark are welcome :)

Alexandre Ghiti (4):
  riscv: Stop emitting preventive sfence.vma for new vmalloc mappings
  riscv: Add a runtime detection of invalid TLB entries caching
  riscv: Stop emitting preventive sfence.vma for new userspace mappings
  TEMP: riscv: Add debugfs interface to retrieve #sfence.vma

 arch/arm64/include/asm/pgtable.h  |   2 +-
 arch/mips/include/asm/pgtable.h   |   6 +-
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +-
 arch/riscv/include/asm/cacheflush.h   |  19 ++-
 arch/riscv/include/asm/pgtable.h  |  45 ---
 arch/riscv/include/asm/thread_info.h  |   5 +
 arch/riscv/include/asm/tlbflush.h |   4 +
 arch/riscv/kernel/asm-offsets.c   |   5 +
 arch/riscv/kernel/entry.S |  94 +
 arch/riscv/kernel/sbi.c   |  12 ++
 arch/riscv/mm/init.c  | 126 ++
 arch/riscv/mm/tlbflush.c  |  17 +++
 include/linux/pgtable.h   |   8 +-
 mm/memory.c   |  12 +-
 14 files changed, 331 insertions(+), 32 deletions(-)

-- 
2.39.2



Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread George Stark




On 12/7/23 16:01, Christophe Leroy wrote:



Le 07/12/2023 à 13:51, George Stark a écrit :



On 12/7/23 15:28, Christophe Leroy wrote:



Le 07/12/2023 à 13:02, Andy Shevchenko a écrit :

On Thu, Dec 7, 2023 at 1:23 AM George Stark
 wrote:

On 12/7/23 01:37, Christophe Leroy wrote:

Le 06/12/2023 à 23:14, Christophe Leroy a écrit :


...


Looking at it closer, I have the feeling that you want to do
similar to
devm_gpio_request() in linux/gpio.h :

In linux/mutex.h, add a prototype for devm_mutex_init() when
CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
Then define devm_mutex_init() in kernel/locking/mutex-debug.c


Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
we wouldn't have to include whole "linux/device.h" into mutex.h, only
add forward declaration of struct device;


In case you place it into a C-file. Otherwise you need a header for
the API and that is not acceptable for mutex.h.



Right, that's the reason why I'm suggesting to define devm_mutex_init()
in kernel/locking/mutex-debug.c.

In linux/mutex.h, you define a stub for when CONFIG_DEBUG_MUTEXES is not
set, and the prototype of devm_mutex_init() when CONFIG_DEBUG_MUTEXES is
set.


Something like this:

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a33aa9eb9fc3..4a6041a7fd44 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -21,6 +21,8 @@
   #include 
   #include 

+struct device;
+
   #ifdef CONFIG_DEBUG_LOCK_ALLOC
   # define __DEP_MAP_MUTEX_INITIALIZER(lockname)    \
   , .dep_map = {    \
@@ -127,6 +129,20 @@ extern void __mutex_init(struct mutex *lock, const
char *name,
    */
   extern bool mutex_is_locked(struct mutex *lock);

+#ifdef CONFIG_DEBUG_MUTEXES


There is already a CONFIG_DEBUG_MUTEXES block, can you re-use it ?


those CONFIG_DEBUG_MUTEXES blockd are declared before mutex_init macro :(




+
+extern int devm_mutex_init(struct device *dev, struct mutex *lock);


'extern' is pointless and deprecated for function prototypes.
I know the kernel is full of them, but it is not a good reason to add
new ones.


Ok

Sure I will send this patch in the right way and then we could have 
proper review but firstly I'd like to hear from Andy and mutex.h's 
maintainers is it acceptable at all?





+
+#else
+
+static inline int devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+    mutex_init(lock);
+    return 0;
+}
+
+#endif
+
   #else /* !CONFIG_PREEMPT_RT */
   /*
    * Preempt-RT variant based on rtmutexes.
@@ -169,6 +185,13 @@ do {    \
   \
   __mutex_init((mutex), #mutex, &__key);    \
   } while (0)
+
+static inline int devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+    mutex_init(lock);
+    return 0;
+}
+
   #endif /* CONFIG_PREEMPT_RT */

   /*
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index bc8abb8549d2..d50dfa06e82c 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -19,6 +19,7 @@
   #include 
   #include 
   #include 
+#include 

   #include "mutex.h"

@@ -104,3 +105,25 @@ void mutex_destroy(struct mutex *lock)
   }

   EXPORT_SYMBOL_GPL(mutex_destroy);
+
+static void devm_mutex_release(void *res)
+{
+    mutex_destroy(res);
+}
+
+/**
+ * devm_mutex_init - Resource-managed mutex initialization
+ * @dev:    Device which lifetime mutex is bound to
+ * @lock:    Pointer to a mutex
+ *
+ * Initialize mutex which is automatically destroyed when the driver is
detached.
+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+int devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+    mutex_init(lock);
+    return devm_add_action_or_reset(dev, devm_mutex_release, lock);
+}
+
+EXPORT_SYMBOL_GPL(devm_mutex_init);
\ No newline at end of file




--
Best regards
George


Re: [PATCH] drm/amdgpu: drop the long-double-128 powerpc check/hack

2023-12-07 Thread Christophe Leroy


Le 31/03/2023 à 12:53, Michael Ellerman a écrit :
> "Daniel Kolesa"  writes:
>> Commit c653c591789b ("drm/amdgpu: Re-enable DCN for 64-bit powerpc")
>> introduced this check as a workaround for the driver not building
>> with toolchains that default to 64-bit long double.
> ...
>> In mainline, this work is now fully done, so this check is fully
>> redundant and does not do anything except preventing AMDGPU DC
>> from being built on systems such as those using musl libc. The
>> last piece of work to enable this was commit c92b7fe0d92a
>> ("drm/amd/display: move remaining FPU code to dml folder")
>> and this has since been backported to 6.1 stable (in 6.1.7).
>>
>> Relevant issue: https://gitlab.freedesktop.org/drm/amd/-/issues/2288
> 
> I looked to pick this up for 6.3 but was still seeing build errors with
> some compilers. I assumed that was due to some fixes coming in
> linux-next that I didn't have.
> 
> But applying the patch on v6.3-rc4 I still see build errors. This is
> building allyesconfig with the kernel.org GCC 12.2.0 / binutils 2.39
> toolchain:
> 
>powerpc64le-linux-gnu-ld: 
> drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.o uses hard float, 
> arch/powerpc/lib/test_emulate_step.o uses soft float
>powerpc64le-linux-gnu-ld: failed to merge target specific data of file 
> drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.o
> 
> etc.
> 
> All the conflicts are between test_emulate_step.o and some file in 
> drivers/gpu/drm/amd/display/dc/dml.
> 
> So even with all the hard-float code isolated in the dml folder, we
> still hit build errors, because allyesconfig wants to link those
> hard-float using objects with soft-float objects from elsewhere in the
> kernel.
> 
> It seems like the only workable fix is to force the kernel build to use
> 128-bit long double. I'll send a patch doing that.
> 

Commit 78f0929884d4 ("powerpc/64: Always build with 128-bit long 
double") I guess ?

Let's drop this patch from patchwork then.


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread Christophe Leroy


Le 07/12/2023 à 13:51, George Stark a écrit :
> 
> 
> On 12/7/23 15:28, Christophe Leroy wrote:
>>
>>
>> Le 07/12/2023 à 13:02, Andy Shevchenko a écrit :
>>> On Thu, Dec 7, 2023 at 1:23 AM George Stark 
>>>  wrote:
 On 12/7/23 01:37, Christophe Leroy wrote:
> Le 06/12/2023 à 23:14, Christophe Leroy a écrit :
>>>
>>> ...
>>>
> Looking at it closer, I have the feeling that you want to do 
> similar to
> devm_gpio_request() in linux/gpio.h :
>
> In linux/mutex.h, add a prototype for devm_mutex_init() when
> CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
> Then define devm_mutex_init() in kernel/locking/mutex-debug.c

 Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
 we wouldn't have to include whole "linux/device.h" into mutex.h, only
 add forward declaration of struct device;
>>>
>>> In case you place it into a C-file. Otherwise you need a header for
>>> the API and that is not acceptable for mutex.h.
>>>
>>
>> Right, that's the reason why I'm suggesting to define devm_mutex_init()
>> in kernel/locking/mutex-debug.c.
>>
>> In linux/mutex.h, you define a stub for when CONFIG_DEBUG_MUTEXES is not
>> set, and the prototype of devm_mutex_init() when CONFIG_DEBUG_MUTEXES is
>> set.
> 
> Something like this:
> 
> diff --git a/include/linux/mutex.h b/include/linux/mutex.h
> index a33aa9eb9fc3..4a6041a7fd44 100644
> --- a/include/linux/mutex.h
> +++ b/include/linux/mutex.h
> @@ -21,6 +21,8 @@
>   #include 
>   #include 
> 
> +struct device;
> +
>   #ifdef CONFIG_DEBUG_LOCK_ALLOC
>   # define __DEP_MAP_MUTEX_INITIALIZER(lockname)    \
>   , .dep_map = {    \
> @@ -127,6 +129,20 @@ extern void __mutex_init(struct mutex *lock, const 
> char *name,
>    */
>   extern bool mutex_is_locked(struct mutex *lock);
> 
> +#ifdef CONFIG_DEBUG_MUTEXES

There is already a CONFIG_DEBUG_MUTEXES block, can you re-use it ?

> +
> +extern int devm_mutex_init(struct device *dev, struct mutex *lock);

'extern' is pointless and deprecated for function prototypes.
I know the kernel is full of them, but it is not a good reason to add 
new ones.

> +
> +#else
> +
> +static inline int devm_mutex_init(struct device *dev, struct mutex *lock)
> +{
> +    mutex_init(lock);
> +    return 0;
> +}
> +
> +#endif
> +
>   #else /* !CONFIG_PREEMPT_RT */
>   /*
>    * Preempt-RT variant based on rtmutexes.
> @@ -169,6 +185,13 @@ do {    \
>   \
>   __mutex_init((mutex), #mutex, &__key);    \
>   } while (0)
> +
> +static inline int devm_mutex_init(struct device *dev, struct mutex *lock)
> +{
> +    mutex_init(lock);
> +    return 0;
> +}
> +
>   #endif /* CONFIG_PREEMPT_RT */
> 
>   /*
> diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
> index bc8abb8549d2..d50dfa06e82c 100644
> --- a/kernel/locking/mutex-debug.c
> +++ b/kernel/locking/mutex-debug.c
> @@ -19,6 +19,7 @@
>   #include 
>   #include 
>   #include 
> +#include 
> 
>   #include "mutex.h"
> 
> @@ -104,3 +105,25 @@ void mutex_destroy(struct mutex *lock)
>   }
> 
>   EXPORT_SYMBOL_GPL(mutex_destroy);
> +
> +static void devm_mutex_release(void *res)
> +{
> +    mutex_destroy(res);
> +}
> +
> +/**
> + * devm_mutex_init - Resource-managed mutex initialization
> + * @dev:    Device which lifetime mutex is bound to
> + * @lock:    Pointer to a mutex
> + *
> + * Initialize mutex which is automatically destroyed when the driver is 
> detached.
> + *
> + * Returns: 0 on success or a negative error code on failure.
> + */
> +int devm_mutex_init(struct device *dev, struct mutex *lock)
> +{
> +    mutex_init(lock);
> +    return devm_add_action_or_reset(dev, devm_mutex_release, lock);
> +}
> +
> +EXPORT_SYMBOL_GPL(devm_mutex_init);
> \ No newline at end of file
> 
> 


Re: [PATCH] powerpc/ftrace: Fix stack teardown in ftrace_no_trace

2023-12-07 Thread Michael Ellerman
On Thu, 30 Nov 2023 12:29:47 +0530, Naveen N Rao wrote:
> Commit 41a506ef71eb ("powerpc/ftrace: Create a dummy stackframe to fix
> stack unwind") added use of a new stack frame on ftrace entry to fix
> stack unwind. However, the commit missed updating the offset used while
> tearing down the ftrace stack when ftrace is disabled. Fix the same.
> 
> In addition, the commit missed saving the correct stack pointer in
> pt_regs. Update the same.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/ftrace: Fix stack teardown in ftrace_no_trace
  https://git.kernel.org/powerpc/c/4b3338aaa74d7d4ec5b6734dc298f0db94ec83d2

cheers


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread George Stark




On 12/7/23 15:28, Christophe Leroy wrote:



Le 07/12/2023 à 13:02, Andy Shevchenko a écrit :

On Thu, Dec 7, 2023 at 1:23 AM George Stark  wrote:

On 12/7/23 01:37, Christophe Leroy wrote:

Le 06/12/2023 à 23:14, Christophe Leroy a écrit :


...


Looking at it closer, I have the feeling that you want to do similar to
devm_gpio_request() in linux/gpio.h :

In linux/mutex.h, add a prototype for devm_mutex_init() when
CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
Then define devm_mutex_init() in kernel/locking/mutex-debug.c


Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
we wouldn't have to include whole "linux/device.h" into mutex.h, only
add forward declaration of struct device;


In case you place it into a C-file. Otherwise you need a header for
the API and that is not acceptable for mutex.h.



Right, that's the reason why I'm suggesting to define devm_mutex_init()
in kernel/locking/mutex-debug.c.

In linux/mutex.h, you define a stub for when CONFIG_DEBUG_MUTEXES is not
set, and the prototype of devm_mutex_init() when CONFIG_DEBUG_MUTEXES is
set.


Something like this:

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a33aa9eb9fc3..4a6041a7fd44 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -21,6 +21,8 @@
 #include 
 #include 

+struct device;
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
, .dep_map = {  \
@@ -127,6 +129,20 @@ extern void __mutex_init(struct mutex *lock, const 
char *name,

  */
 extern bool mutex_is_locked(struct mutex *lock);

+#ifdef CONFIG_DEBUG_MUTEXES
+
+extern int devm_mutex_init(struct device *dev, struct mutex *lock);
+
+#else
+
+static inline int devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+   mutex_init(lock);
+   return 0;
+}
+
+#endif
+
 #else /* !CONFIG_PREEMPT_RT */
 /*
  * Preempt-RT variant based on rtmutexes.
@@ -169,6 +185,13 @@ do {   
\
\
__mutex_init((mutex), #mutex, &__key);  \
 } while (0)
+
+static inline int devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+   mutex_init(lock);
+   return 0;
+}
+
 #endif /* CONFIG_PREEMPT_RT */

 /*
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index bc8abb8549d2..d50dfa06e82c 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 

 #include "mutex.h"

@@ -104,3 +105,25 @@ void mutex_destroy(struct mutex *lock)
 }

 EXPORT_SYMBOL_GPL(mutex_destroy);
+
+static void devm_mutex_release(void *res)
+{
+   mutex_destroy(res);
+}
+
+/**
+ * devm_mutex_init - Resource-managed mutex initialization
+ * @dev:   Device which lifetime mutex is bound to
+ * @lock:  Pointer to a mutex
+ *
+ * Initialize mutex which is automatically destroyed when the driver is 
detached.

+ *
+ * Returns: 0 on success or a negative error code on failure.
+ */
+int devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+   mutex_init(lock);
+   return devm_add_action_or_reset(dev, devm_mutex_release, lock);
+}
+
+EXPORT_SYMBOL_GPL(devm_mutex_init);
\ No newline at end of file


--
Best regards
George


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread Andy Shevchenko
On Thu, Dec 7, 2023 at 2:31 PM Christophe Leroy
 wrote:
> Le 07/12/2023 à 12:59, Andy Shevchenko a écrit :
> > On Thu, Dec 7, 2023 at 1:23 AM George Stark  
> > wrote:
> >> On 12/7/23 01:37, Christophe Leroy wrote:
> >>> Le 06/12/2023 à 23:14, Christophe Leroy a écrit :
>  Le 06/12/2023 à 19:58, George Stark a écrit :
> > On 12/6/23 18:01, Hans de Goede wrote:
> >> On 12/4/23 19:05, George Stark wrote:

...

> >> mutex_destroy() only actually does anything if CONFIG_DEBUG_MUTEXES
> >> is set, otherwise it is an empty inline-stub.
> >>
> >> Adding a devres resource to the device just to call an empty inline
> >> stub which is a no-op seems like a waste of resources. IMHO it
> >> would be better to change this to:
> >>
> >> static inline int devm_mutex_init(struct device *dev, struct mutex
> >> *lock)
> >> {
> >> mutex_init(lock);
> >> #ifdef CONFIG_DEBUG_MUTEXES
> >> return devm_add_action_or_reset(dev, devm_mutex_release, lock);

 (1)

> >> #else
> >> return 0;
> >> #endif
> >> }
> >>
> >> To avoid the unnecessary devres allocation when
> >> CONFIG_DEBUG_MUTEXES is not set.
> >
> > Honestly saying I don't like unnecessary devres allocation either but
> > the proposed approach has its own price:
> >
> > 1) we'll have more than one place with branching if mutex_destroy is
> > empty or not using  indirect condition. If suddenly mutex_destroy is
> > extended for non-debug code (in upstream branch or e.g. by someone for
> > local debug) than there'll be a problem.
> >
> > 2) If mutex_destroy is empty or not depends on CONFIG_PREEMPT_RT option
> > too. When CONFIG_PREEMPT_RT is on mutex_destroy is always empty.
> >
> > As I see it only the mutex interface (mutex.h) has to say definitely if
> > mutex_destroy must be called. Probably we could add some define to
> > include/linux/mutex.h,like IS_MUTEX_DESTROY_REQUIRED and declare it near
> > mutex_destroy definition itself.
> >
> > I tried to put devm_mutex_init itself in mutex.h and it could've helped
> > too but it's not the place for devm API.
> >
> 
>  What do you mean by "it's not the place for devm API" ?
> 
>  If you do a 'grep devm_ include/linux/' you'll find devm_ functions in
>  almost 100 .h files. Why wouldn't mutex.h be the place for
>  devm_mutex_init() ?
> >> mutex.h's maintainers believe so.
> >>
> >> https://lore.kernel.org/lkml/070c174c-057a-46de-ae8e-836e9e20e...@salutedevices.com/T/#mb42e1d7760816b0cedd3130e08f29690496b5ac2
> >>>
> >>> Looking at it closer, I have the feeling that you want to do similar to
> >>> devm_gpio_request() in linux/gpio.h :
> >>>
> >>> In linux/mutex.h, add a prototype for devm_mutex_init() when
> >>> CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
> >>> Then define devm_mutex_init() in kernel/locking/mutex-debug.c
> >>
> >> Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
> >> we wouldn't have to include whole "linux/device.h" into mutex.h, only
> >> add forward declaration of struct device;
> >>
> >>> Wouldn't that work ?
> >
> > No. It will require inclusion of device.h (which is a twisted hell
> > from the header perspective) into mutex.h. Completely unappreciated
> > move.
>
> I see no reason for including device.h, I think a forward declaration of
> struct device would be enough, as done in linux/gpio.h
>
> Am I missing something ?

Yes, see (1) above. If you want to have it in the header, you must
provide an API, which is located in device.h. The idea about
mutex-debug.c is interesting, but the file naming and the devm_*() API
for _initing_ the mutex seems confusing.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH] powerpc/lib: Validate size for vector operations

2023-12-07 Thread Michael Ellerman
On Thu, 23 Nov 2023 12:47:05 +0530, Naveen N Rao wrote:
> Some of the fp/vmx code in sstep.c assume a certain maximum size for the
> instructions being emulated. The size of those operations however is
> determined separately in analyse_instr().
> 
> Add a check to validate the assumption on the maximum size of the
> operations, so as to prevent any unintended kernel stack corruption.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/lib: Validate size for vector operations
  https://git.kernel.org/powerpc/c/8f9abaa6d7de0a70fc68acaedce290c1f96e2e59

cheers


Re: [PATCH] misc: ocxl: link: Remove unnecessary (void*) conversions

2023-12-07 Thread Michael Ellerman
On Mon, 13 Nov 2023 09:45:33 +0800, Li zeming wrote:
> The link pointer does not need to cast the type.
> 
> 

Applied to powerpc/next.

[1/1] misc: ocxl: link: Remove unnecessary (void*) conversions
  https://git.kernel.org/powerpc/c/220f3ced8e42b1efe9c6b84778fb0c77c0c56611

cheers


Re: [PATCH] misc: ocxl: context: Remove unnecessary (void*) conversions

2023-12-07 Thread Michael Ellerman
On Mon, 13 Nov 2023 09:15:43 +0800, Li zeming wrote:
> The ctx pointer does not need to cast the type.
> 
> 

Applied to powerpc/next.

[1/1] misc: ocxl: context: Remove unnecessary (void*) conversions
  https://git.kernel.org/powerpc/c/82d30723d58fccbd2d7d707fab7649b541fafa1b

cheers


Re: [PATCH] misc: ocxl: afu_irq: Remove unnecessary (void*) conversions

2023-12-07 Thread Michael Ellerman
On Mon, 13 Nov 2023 09:22:02 +0800, Li zeming wrote:
> The irq pointer does not need to cast the type.
> 
> 

Applied to powerpc/next.

[1/1] misc: ocxl: afu_irq: Remove unnecessary (void*) conversions
  https://git.kernel.org/powerpc/c/84ba5d3675e23e6fa824a2268c5b6a04b52dde4d

cheers


Re: [PATCH] powerpc/44x: select I2C for CURRITUCK

2023-12-07 Thread Michael Ellerman
On Thu, 30 Nov 2023 21:51:59 -0800, Randy Dunlap wrote:
> Fix build errors when CURRITUCK=y and I2C is not builtin (=m or is
> not set). Fixes these build errors:
> 
> powerpc-linux-ld: arch/powerpc/platforms/44x/ppc476.o: in function 
> `avr_halt_system':
> ppc476.c:(.text+0x58): undefined reference to `i2c_smbus_write_byte_data'
> powerpc-linux-ld: arch/powerpc/platforms/44x/ppc476.o: in function 
> `ppc47x_device_probe':
> ppc476.c:(.init.text+0x18): undefined reference to `i2c_register_driver'
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/44x: select I2C for CURRITUCK
  https://git.kernel.org/powerpc/c/4a74197b65e69c46fe6e53f7df2f4d6ce9ffe012

cheers


Re: [PATCH] powerpc/rtas_pci: rename and properly expose config access APIs

2023-12-07 Thread Michael Ellerman
On Mon, 27 Nov 2023 18:40:09 -0600, Nathan Lynch wrote:
> The rtas_read_config() and rtas_write_config() functions in
> kernel/rtas_pci.c have external linkage and two users in arch/powerpc:
> the rtas_pci code itself and the pseries platform's "enhanced error
> handling" (EEH) support code.
> 
> The prototypes for these functions in asm/ppc-pci.h have until now
> been guarded by CONFIG_EEH since the only external caller is the
> pseries EEH code. However, this presumably has always generated
> warnings when built with !CONFIG_EEH and -Wmissing-prototypes:
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/rtas_pci: rename and properly expose config access APIs
  https://git.kernel.org/powerpc/c/9be4feb768b86c25da336a6c0f3e3caefd16f1e4

cheers


Re: [PATCH 0/7] powerpc/rtas: Trivial, coding style, and kernel-doc fixes

2023-12-07 Thread Michael Ellerman
On Mon, 06 Nov 2023 07:42:52 -0600, Nathan Lynch wrote:
> * Fix recently introduced kernel-doc warnings.
> * Make minor coding style adjustments for readability.
> * Remove rtas_service_present() and an old call_rtas() declaration.
> * Move a pseries-specific function prototype to pseries code.
> 

Patches 3-7 applied to powerpc/next.

[3/7] powerpc/rtas: Drop declaration of undefined call_rtas() function
  https://git.kernel.org/powerpc/c/981d1c997fbc5e193b282f3a325a0230bf697363
[4/7] powerpc/rtas: Remove unused rtas_service_present()
  https://git.kernel.org/powerpc/c/1d8faf1f41b550eb7ab7ac841ebd70f205840dde
[5/7] powerpc/rtas: Move post_mobility_fixup() declaration to pseries
  https://git.kernel.org/powerpc/c/010862d235c9fab4f0f9dd169efc72df94110758
[6/7] powerpc/rtas: Remove trailing space
  https://git.kernel.org/powerpc/c/19773eda86e289526b7f08fa56c92e75cd7796f6
[7/7] powerpc/rtas: Remove 'extern' from function declarations in rtas.h
  https://git.kernel.org/powerpc/c/646477fc47905157a8440cdc45aad22901b5b3ce

cheers


Re: [PATCH 0/3] powerpc/pseries/memhp: Fix minor bugs and improve error logging

2023-12-07 Thread Michael Ellerman
On Tue, 14 Nov 2023 11:01:52 -0600, Nathan Lynch wrote:
> This includes a fix for an array bounds read overrun that can be
> triggered when debug messages are enabled.
> 

Patches 1 and 3 applied to powerpc/next.

[1/3] powerpc/pseries/memhp: Fix access beyond end of drmem array
  https://git.kernel.org/powerpc/c/bd68ffce69f6cf8ddd3a3c32549d1d2275e49fc5
[3/3] powerpc/pseries/memhp: Log more error conditions in add path
  https://git.kernel.org/powerpc/c/27951e1d8274e9f9a2925b069e4492939a3f2099

cheers


Re: [PATCH] powerpc: Remove orphaned reg_a2.h

2023-12-07 Thread Michael Ellerman
On Mon, 13 Nov 2023 15:39:47 +1100, Michael Ellerman wrote:
> Commit fb5a515704d7 ("powerpc: Remove platforms/wsp and associated
> pieces") removed the A2 CPU support, but missed removal of reg_a2.h.
> 
> None of the defines contained in it are used, with the exception of the
> SPRN_TEN* values, but they are also defined in reg_booke.h.
> 
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc: Remove orphaned reg_a2.h
  https://git.kernel.org/powerpc/c/6f2a9e0e0ae5fb0697dd1660ede7e609be25ff6f

cheers


Re: [PATCH] powerpc: Make cpu_spec __ro_after_init

2023-12-07 Thread Michael Ellerman
On Wed, 25 Oct 2023 12:24:52 +1100, Michael Ellerman wrote:
> The cpu_spec is a struct holding various information about the CPU the
> kernel is executing on. It's populated early in boot and must not change
> after that.
> 
> In particular the cpu_features and mmu_features hold the set of
> discovered CPU/MMU features and are used to set static keys for each
> feature, and do binary patching of assembly. So any change to the
> cpu_features/mmu_features later in boot will not be reflected in
> the state of the static keys or patched code.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc: Make cpu_spec __ro_after_init
  https://git.kernel.org/powerpc/c/98eb30fe4c69a9b602f29e406317c49b5580352a

cheers


Re: [PATCH] powerpc/lib: Avoid array bounds warnings in vec ops

2023-12-07 Thread Michael Ellerman
On Tue, 21 Nov 2023 10:54:36 +1100, Michael Ellerman wrote:
> Building with GCC 13 (which has -array-bounds enabled) there are several
> warnings in sstep.c along the lines of:
> 
>   In function ‘do_byte_reverse’,
>   inlined from ‘do_vec_load’ at arch/powerpc/lib/sstep.c:691:3,
>   inlined from ‘emulate_loadstore’ at arch/powerpc/lib/sstep.c:3439:9:
>   arch/powerpc/lib/sstep.c:289:23: error: array subscript 2 is outside array 
> bounds of ‘u8[16]’ {aka ‘unsigned char[16]’} [-Werror=array-bounds=]
> 289 | up[2] = byterev_8(up[1]);
> | ~~^~
>   arch/powerpc/lib/sstep.c: In function ‘emulate_loadstore’:
>   arch/powerpc/lib/sstep.c:681:11: note: at offset 16 into object ‘u’ of size 
> 16
> 681 | } u = {};
> |   ^
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/lib: Avoid array bounds warnings in vec ops
  https://git.kernel.org/powerpc/c/df99da19c6c24ab65052ae1bc0904f99069478d9

cheers


Re: [PATCH] powerpc/configs/64s: Enable CONFIG_MEM_SOFT_DIRTY

2023-12-07 Thread Michael Ellerman
On Tue, 24 Oct 2023 22:27:25 +1100, Michael Ellerman wrote:
> Enable CONFIG_MEM_SOFT_DIRTY to get some test coverage. Distros enable
> it, and it has been broken previously. See commit 66b2ca086210
> ("powerpc/64s/radix: Fix soft dirty tracking").
> 
> 

Applied to powerpc/next.

[1/1] powerpc/configs/64s: Enable CONFIG_MEM_SOFT_DIRTY
  https://git.kernel.org/powerpc/c/183bc0c640c785a710885a10b614193f114fe760

cheers


Re: [PATCH] powerpc/32: Drop unused grackle_set_stg()

2023-12-07 Thread Michael Ellerman
On Mon, 13 Nov 2023 16:19:29 +1100, Michael Ellerman wrote:
> The call to grackle_set_stg() ("Store Gathering") has always been inside
> an #ifdef 0, since the code was first merged in v2.3.43pre7.
> 
> Apparently it was suspected of causing problems on some hardware so was
> disabled. No one has ever proved otherwise so drop the code as unused
> for now.
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc/32: Drop unused grackle_set_stg()
  https://git.kernel.org/powerpc/c/c8a1634145c23a5a979a7166a12b99871812a6ab

cheers


Re: [PATCH 1/5] powerpc/suspend: Add prototype for do_after_copyback()

2023-12-07 Thread Michael Ellerman
On Thu, 30 Nov 2023 00:19:15 +1100, Michael Ellerman wrote:
> With HIBERNATION=y the build breaks with:
> 
>   arch/powerpc/kernel/swsusp_64.c:14:6: error: no previous prototype for 
> ‘do_after_copyback’ [-Werror=missing-prototypes]
>   14 | void do_after_copyback(void)
>  |  ^
> 
> do_after_copyback() is only called from asm, so there is no prototype,
> nor any header where it makes sense to place one. Just add a prototype
> in the C file to fix the build error.
> 
> [...]

Applied to powerpc/next.

[1/5] powerpc/suspend: Add prototype for do_after_copyback()
  https://git.kernel.org/powerpc/c/360f051d82ee0cc580edfffe9e8c0b93011ab86d
[2/5] powerpc/512x: Make pdm360ng_init() static
  https://git.kernel.org/powerpc/c/24afc61990de29dd47be7642c196a173f6cc21fc
[3/5] powerpc/512x: Fix missing prototype warnings
  https://git.kernel.org/powerpc/c/10feb8f9612239b665815807e950bcd999a75dd2
[4/5] powerpc/44x: Make ppc44x_idle_init() static
  https://git.kernel.org/powerpc/c/b90ad501715f2feb1b0bf97aa700adb39c78deb3
[5/5] powerpc/64s: Fix CONFIG_NUMA=n build
  https://git.kernel.org/powerpc/c/ede66cd22441820cbd399936bf84fdc4294bc7fa

cheers


Re: [PATCH 1/2] powerpc/mm: Fix build failures due to arch_reserved_kernel_pages()

2023-12-07 Thread Michael Ellerman
On Thu, 30 Nov 2023 22:44:32 +1100, Michael Ellerman wrote:
> With NUMA=n and FA_DUMP=y or PRESERVE_FA_DUMP=y the build fails with:
> 
>   arch/powerpc/kernel/fadump.c:1739:22: error: no previous prototype for 
> ‘arch_reserved_kernel_pages’ [-Werror=missing-prototypes]
>   1739 | unsigned long __init arch_reserved_kernel_pages(void)
>|  ^~
> 
> The prototype for arch_reserved_kernel_pages() is in include/linux/mm.h,
> but it's guarded by __HAVE_ARCH_RESERVED_KERNEL_PAGES. The powerpc
> headers define __HAVE_ARCH_RESERVED_KERNEL_PAGES in asm/mmzone.h, which
> is not included into the generic headers when NUMA=n.
> 
> [...]

Applied to powerpc/next.

[1/2] powerpc/mm: Fix build failures due to arch_reserved_kernel_pages()
  https://git.kernel.org/powerpc/c/d8c3f243d4db24675b653f0568bb65dae34e6455
[2/2] powerpc: Fix build error due to is_valid_bugaddr()
  https://git.kernel.org/powerpc/c/f8d3555355653848082c351fa90775214fb8a4fa

cheers


Re: [PATCH v2] powerpc: Add PVN support for HeXin C2000 processor

2023-12-07 Thread Michael Ellerman
On Wed, 29 Nov 2023 15:58:45 +0800, Zhao Ke wrote:
> HeXin Tech Co. has applied for a new PVN from the OpenPower Community
> for its new processor C2000. The OpenPower has assigned a new PVN
> and this newly assigned PVN is 0x0066, add pvr register related
> support for this PVN.
> 
> 

Applied to powerpc/next.

[1/1] powerpc: Add PVN support for HeXin C2000 processor
  https://git.kernel.org/powerpc/c/e12d8e2602d2bcd26022eff3e2519d25925e760c

cheers


Re: [PATCH] powerpc: add crtsavres.o to always-y instead of extra-y

2023-12-07 Thread Michael Ellerman
On Tue, 21 Nov 2023 08:23:32 +0900, Masahiro Yamada wrote:
> crtsavres.o is linked to modules. However, as explained in commit
> d0e628cd817f ("kbuild: doc: clarify the difference between extra-y
> and always-y"), 'make modules' does not build extra-y.
> 
> For example, the following command fails:
> 
>   $ make ARCH=powerpc LLVM=1 KBUILD_MODPOST_WARN=1 mrproper ps3_defconfig 
> modules
> [snip]
> LD [M]  arch/powerpc/platforms/cell/spufs/spufs.ko
>   ld.lld: error: cannot open arch/powerpc/lib/crtsavres.o: No such file or 
> directory
>   make[3]: *** [scripts/Makefile.modfinal:56: 
> arch/powerpc/platforms/cell/spufs/spufs.ko] Error 1
>   make[2]: *** [Makefile:1844: modules] Error 2
>   make[1]: *** [/home/masahiro/workspace/linux-kbuild/Makefile:350: 
> __build_one_by_one] Error 2
>   make: *** [Makefile:234: __sub-make] Error 2
> 
> [...]

Applied to powerpc/next.

[1/1] powerpc: add crtsavres.o to always-y instead of extra-y
  https://git.kernel.org/powerpc/c/1b1e38002648819c04773647d5242990e2824264

cheers


Re: [PATCH] misc: ocxl: main: Remove unnecessary ‘0’ values from rc

2023-12-07 Thread Michael Ellerman
On Mon, 13 Nov 2023 09:52:29 +0800, Li kunyu wrote:
> rc is assigned first, so it does not need to initialize the assignment.
> 
> 

Applied to powerpc/next.

[1/1] misc: ocxl: main: Remove unnecessary ‘0’ values from rc
  https://git.kernel.org/powerpc/c/29685ea5754f04c84ad443fd7c6869c68f636c26

cheers


Re: [PATCH] powerpc/85xx: Fix typo in code comment

2023-12-07 Thread Michael Ellerman
On Fri, 24 Nov 2023 11:02:37 +0100, Dario Binacchi wrote:
> s/singals/signals/
> 
> 

Applied to powerpc/next.

[1/1] powerpc/85xx: Fix typo in code comment
  https://git.kernel.org/powerpc/c/a9e1e4d6e8c77c732e8084b03bae0c78cafdceb0

cheers


Re: [PATCH] powerpc/xics: Check return value of kasprintf in icp_native_map_one_cpu

2023-12-07 Thread Michael Ellerman
On Wed, 22 Nov 2023 11:06:51 +0800, Kunwu Chan wrote:
> kasprintf() returns a pointer to dynamically allocated memory
> which can be NULL upon failure. Ensure the allocation was successful
> by checking the pointer validity.
> 
> 

Applied to powerpc/next.

[1/1] powerpc/xics: Check return value of kasprintf in icp_native_map_one_cpu
  https://git.kernel.org/powerpc/c/45b1ba7e5d1f6881050d558baf9bc74a2ae13930

cheers


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread Christophe Leroy


Le 07/12/2023 à 12:59, Andy Shevchenko a écrit :
> On Thu, Dec 7, 2023 at 1:23 AM George Stark  wrote:
>> On 12/7/23 01:37, Christophe Leroy wrote:
>>> Le 06/12/2023 à 23:14, Christophe Leroy a écrit :
 Le 06/12/2023 à 19:58, George Stark a écrit :
> On 12/6/23 18:01, Hans de Goede wrote:
>> On 12/4/23 19:05, George Stark wrote:
> 
> ...
> 
>> mutex_destroy() only actually does anything if CONFIG_DEBUG_MUTEXES
>> is set, otherwise it is an empty inline-stub.
>>
>> Adding a devres resource to the device just to call an empty inline
>> stub which is a no-op seems like a waste of resources. IMHO it
>> would be better to change this to:
>>
>> static inline int devm_mutex_init(struct device *dev, struct mutex
>> *lock)
>> {
>> mutex_init(lock);
>> #ifdef CONFIG_DEBUG_MUTEXES
>> return devm_add_action_or_reset(dev, devm_mutex_release, lock);
>> #else
>> return 0;
>> #endif
>> }
>>
>> To avoid the unnecessary devres allocation when
>> CONFIG_DEBUG_MUTEXES is not set.
>
> Honestly saying I don't like unnecessary devres allocation either but
> the proposed approach has its own price:
>
> 1) we'll have more than one place with branching if mutex_destroy is
> empty or not using  indirect condition. If suddenly mutex_destroy is
> extended for non-debug code (in upstream branch or e.g. by someone for
> local debug) than there'll be a problem.
>
> 2) If mutex_destroy is empty or not depends on CONFIG_PREEMPT_RT option
> too. When CONFIG_PREEMPT_RT is on mutex_destroy is always empty.
>
> As I see it only the mutex interface (mutex.h) has to say definitely if
> mutex_destroy must be called. Probably we could add some define to
> include/linux/mutex.h,like IS_MUTEX_DESTROY_REQUIRED and declare it near
> mutex_destroy definition itself.
>
> I tried to put devm_mutex_init itself in mutex.h and it could've helped
> too but it's not the place for devm API.
>

 What do you mean by "it's not the place for devm API" ?

 If you do a 'grep devm_ include/linux/' you'll find devm_ functions in
 almost 100 .h files. Why wouldn't mutex.h be the place for
 devm_mutex_init() ?
>> mutex.h's maintainers believe so.
>>
>> https://lore.kernel.org/lkml/070c174c-057a-46de-ae8e-836e9e20e...@salutedevices.com/T/#mb42e1d7760816b0cedd3130e08f29690496b5ac2
>>>
>>> Looking at it closer, I have the feeling that you want to do similar to
>>> devm_gpio_request() in linux/gpio.h :
>>>
>>> In linux/mutex.h, add a prototype for devm_mutex_init() when
>>> CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
>>> Then define devm_mutex_init() in kernel/locking/mutex-debug.c
>>
>> Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
>> we wouldn't have to include whole "linux/device.h" into mutex.h, only
>> add forward declaration of struct device;
>>
>>> Wouldn't that work ?
> 
> No. It will require inclusion of device.h (which is a twisted hell
> from the header perspective) into mutex.h. Completely unappreciated
> move.
> 

I see no reason for including device.h, I think a forward declaration of 
struct device would be enough, as done in linux/gpio.h

Am I missing something ?


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread Christophe Leroy


Le 07/12/2023 à 13:02, Andy Shevchenko a écrit :
> On Thu, Dec 7, 2023 at 1:23 AM George Stark  wrote:
>> On 12/7/23 01:37, Christophe Leroy wrote:
>>> Le 06/12/2023 à 23:14, Christophe Leroy a écrit :
> 
> ...
> 
>>> Looking at it closer, I have the feeling that you want to do similar to
>>> devm_gpio_request() in linux/gpio.h :
>>>
>>> In linux/mutex.h, add a prototype for devm_mutex_init() when
>>> CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
>>> Then define devm_mutex_init() in kernel/locking/mutex-debug.c
>>
>> Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
>> we wouldn't have to include whole "linux/device.h" into mutex.h, only
>> add forward declaration of struct device;
> 
> In case you place it into a C-file. Otherwise you need a header for
> the API and that is not acceptable for mutex.h.
> 

Right, that's the reason why I'm suggesting to define devm_mutex_init() 
in kernel/locking/mutex-debug.c.

In linux/mutex.h, you define a stub for when CONFIG_DEBUG_MUTEXES is not 
set, and the prototype of devm_mutex_init() when CONFIG_DEBUG_MUTEXES is 
set.


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread Andy Shevchenko
On Thu, Dec 7, 2023 at 1:23 AM George Stark  wrote:
> On 12/7/23 01:37, Christophe Leroy wrote:
> > Le 06/12/2023 à 23:14, Christophe Leroy a écrit :

...

> > Looking at it closer, I have the feeling that you want to do similar to
> > devm_gpio_request() in linux/gpio.h :
> >
> > In linux/mutex.h, add a prototype for devm_mutex_init() when
> > CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
> > Then define devm_mutex_init() in kernel/locking/mutex-debug.c
>
> Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
> we wouldn't have to include whole "linux/device.h" into mutex.h, only
> add forward declaration of struct device;

In case you place it into a C-file. Otherwise you need a header for
the API and that is not acceptable for mutex.h.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH v2 01/10] devm-helpers: introduce devm_mutex_init

2023-12-07 Thread Andy Shevchenko
On Thu, Dec 7, 2023 at 1:23 AM George Stark  wrote:
> On 12/7/23 01:37, Christophe Leroy wrote:
> > Le 06/12/2023 à 23:14, Christophe Leroy a écrit :
> >> Le 06/12/2023 à 19:58, George Stark a écrit :
> >>> On 12/6/23 18:01, Hans de Goede wrote:
>  On 12/4/23 19:05, George Stark wrote:

...

>  mutex_destroy() only actually does anything if CONFIG_DEBUG_MUTEXES
>  is set, otherwise it is an empty inline-stub.
> 
>  Adding a devres resource to the device just to call an empty inline
>  stub which is a no-op seems like a waste of resources. IMHO it
>  would be better to change this to:
> 
>  static inline int devm_mutex_init(struct device *dev, struct mutex
>  *lock)
>  {
> mutex_init(lock);
>  #ifdef CONFIG_DEBUG_MUTEXES
> return devm_add_action_or_reset(dev, devm_mutex_release, lock);
>  #else
> return 0;
>  #endif
>  }
> 
>  To avoid the unnecessary devres allocation when
>  CONFIG_DEBUG_MUTEXES is not set.
> >>>
> >>> Honestly saying I don't like unnecessary devres allocation either but
> >>> the proposed approach has its own price:
> >>>
> >>> 1) we'll have more than one place with branching if mutex_destroy is
> >>> empty or not using  indirect condition. If suddenly mutex_destroy is
> >>> extended for non-debug code (in upstream branch or e.g. by someone for
> >>> local debug) than there'll be a problem.
> >>>
> >>> 2) If mutex_destroy is empty or not depends on CONFIG_PREEMPT_RT option
> >>> too. When CONFIG_PREEMPT_RT is on mutex_destroy is always empty.
> >>>
> >>> As I see it only the mutex interface (mutex.h) has to say definitely if
> >>> mutex_destroy must be called. Probably we could add some define to
> >>> include/linux/mutex.h,like IS_MUTEX_DESTROY_REQUIRED and declare it near
> >>> mutex_destroy definition itself.
> >>>
> >>> I tried to put devm_mutex_init itself in mutex.h and it could've helped
> >>> too but it's not the place for devm API.
> >>>
> >>
> >> What do you mean by "it's not the place for devm API" ?
> >>
> >> If you do a 'grep devm_ include/linux/' you'll find devm_ functions in
> >> almost 100 .h files. Why wouldn't mutex.h be the place for
> >> devm_mutex_init() ?
> mutex.h's maintainers believe so.
>
> https://lore.kernel.org/lkml/070c174c-057a-46de-ae8e-836e9e20e...@salutedevices.com/T/#mb42e1d7760816b0cedd3130e08f29690496b5ac2
> >
> > Looking at it closer, I have the feeling that you want to do similar to
> > devm_gpio_request() in linux/gpio.h :
> >
> > In linux/mutex.h, add a prototype for devm_mutex_init() when
> > CONFIG_DEBUG_MUTEXES is defined and an empty static inline otherwise.
> > Then define devm_mutex_init() in kernel/locking/mutex-debug.c
>
> Yes, this would be almost perfect decision. BTW just as in linux/gpio.h
> we wouldn't have to include whole "linux/device.h" into mutex.h, only
> add forward declaration of struct device;
>
> > Wouldn't that work ?

No. It will require inclusion of device.h (which is a twisted hell
from the header perspective) into mutex.h. Completely unappreciated
move.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH v2 00/35] bitops: add atomic find_bit() operations

2023-12-07 Thread Jan Kara
On Tue 05-12-23 21:22:59, Yury Norov wrote:
> On Mon, Dec 04, 2023 at 07:51:01PM +0100, Jan Kara wrote:
> > > This series is a result of discussion [1]. All find_bit() functions imply
> > > exclusive access to the bitmaps. However, KCSAN reports quite a number
> > > of warnings related to find_bit() API. Some of them are not pointing
> > > to real bugs because in many situations people intentionally allow
> > > concurrent bitmap operations.
> > > 
> > > If so, find_bit() can be annotated such that KCSAN will ignore it:
> > > 
> > > bit = data_race(find_first_bit(bitmap, nbits));
> > 
> > No, this is not a correct thing to do. If concurrent bitmap changes can
> > happen, find_first_bit() as it is currently implemented isn't ever a safe
> > choice because it can call __ffs(0) which is dangerous as you properly note
> > above. I proposed adding READ_ONCE() into find_first_bit() / find_next_bit()
> > implementation to fix this issue but you disliked that. So other option we
> > have is adding find_first_bit() and find_next_bit() variants that take
> > volatile 'addr' and we have to use these in code like xas_find_chunk()
> > which cannot be converted to your new helpers.
> 
> Here is some examples when concurrent operations with plain find_bit()
> are acceptable:
> 
>  - two threads running find_*_bit(): safe wrt ffs(0) and returns correct
>value, because underlying bitmap is unchanged;
>  - find_next_bit() in parallel with set or clear_bit(), when modifying
>a bit prior to the start bit to search: safe and correct;
>  - find_first_bit() in parallel with set_bit(): safe, but may return wrong
>bit number;
>  - find_first_zero_bit() in parallel with clear_bit(): same as above.
> 
> In last 2 cases find_bit() may not return a correct bit number, but
> it may be OK if caller requires any (not exactly first) set or clear
> bit, correspondingly.
> 
> In such cases, KCSAN may be safely silenced.

True - but these are special cases. In particular the case in xas_find_chunk()
is not any of these special cases. It is using find_next_bit() which is can
be racing with clear_bit(). So what are your plans for such usecase?

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 01/12] KVM: PPC: Book3S HV nestedv2: Invalidate RPT before deleting a guest

2023-12-07 Thread IBM
Vaibhav Jain  writes:

> From: Jordan Niethe 
>
> An L0 must invalidate the L2's RPT during H_GUEST_DELETE if this has not
> already been done. This is a slow operation that means H_GUEST_DELETE
> must return H_BUSY multiple times before completing. Invalidating the
> tables before deleting the guest so there is less work for the L0 to do.
>
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/kvm_book3s.h | 1 +
>  arch/powerpc/kvm/book3s_hv.c  | 6 --
>  arch/powerpc/kvm/book3s_hv_nested.c   | 2 +-
>  3 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
> b/arch/powerpc/include/asm/kvm_book3s.h
> index 4f527d09c92b..a37736ed3728 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -302,6 +302,7 @@ void kvmhv_nested_exit(void);
>  void kvmhv_vm_nested_init(struct kvm *kvm);
>  long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
>  long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu);
> +void kvmhv_flush_lpid(u64 lpid);
>  void kvmhv_set_ptbl_entry(u64 lpid, u64 dw0, u64 dw1);
>  void kvmhv_release_all_nested(struct kvm *kvm);
>  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 1ed6ec140701..5543e8490cd9 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5691,10 +5691,12 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
>   kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
>   }
>  
> - if (kvmhv_is_nestedv2())
> + if (kvmhv_is_nestedv2()) {
> + kvmhv_flush_lpid(kvm->arch.lpid);
>   plpar_guest_delete(0, kvm->arch.lpid);
>

I am not sure I follow the optimization here. I would expect the
hypervisor to kill all the translation caches as part of guest_delete.
What is the benefit of doing a lpid flush outside the guest delete?

> - else
> + } else {
>   kvmppc_free_lpid(kvm->arch.lpid);
> + }
>  
>   kvmppc_free_pimap(kvm);
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
> b/arch/powerpc/kvm/book3s_hv_nested.c
> index 3b658b8696bc..5c375ec1a3c6 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -503,7 +503,7 @@ void kvmhv_nested_exit(void)
>   }
>  }
>  
> -static void kvmhv_flush_lpid(u64 lpid)
> +void kvmhv_flush_lpid(u64 lpid)
>  {
>   long rc;
>  
> -- 
> 2.42.0