[PATCH v4 4/9] drm/i915/perf: open access for CAP_SYS_PERFMON privileged process
Open access to i915_perf monitoring for CAP_SYS_PERFMON privileged processes. For backward compatibility reasons access to i915_perf subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure i915_perf monitoring is discouraged with respect to CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- drivers/gpu/drm/i915/i915_perf.c | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index e42b86827d6b..e2697f8d04de 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -2748,10 +2748,10 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv, /* Similar to perf's kernel.perf_paranoid_cpu sysctl option * we check a dev.i915.perf_stream_paranoid sysctl option * to determine if it's ok to access system wide OA counters -* without CAP_SYS_ADMIN privileges. +* without CAP_SYS_PERFMON or CAP_SYS_ADMIN privileges. */ if (privileged_op && - i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) { + i915_perf_stream_paranoid && !perfmon_capable()) { DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n"); ret = -EACCES; goto err_ctx; @@ -2939,9 +2939,8 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, } else oa_freq_hz = 0; - if (oa_freq_hz > i915_oa_max_sample_rate && - !capable(CAP_SYS_ADMIN)) { - DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n", + if (oa_freq_hz > i915_oa_max_sample_rate && !perfmon_capable()) { + DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without CAP_SYS_PERFMON or CAP_SYS_ADMIN privileges\n", i915_oa_max_sample_rate); return -EACCES; } @@ -3328,7 +3327,7 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data, return -EINVAL; } - if (i915_perf_stream_paranoid && 
!capable(CAP_SYS_ADMIN)) { + if (i915_perf_stream_paranoid && !perfmon_capable()) { DRM_DEBUG("Insufficient privileges to add i915 OA config\n"); return -EACCES; } @@ -3474,7 +3473,7 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data, return -ENOTSUPP; } - if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) { + if (i915_perf_stream_paranoid && !perfmon_capable()) { DRM_DEBUG("Insufficient privileges to remove i915 OA config\n"); return -EACCES; } -- 2.20.1
Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))
Linus Torvalds writes: > On Tue, Dec 17, 2019 at 10:04 AM Linus Torvalds > wrote: >> >> Let me think about it. > > How about we just get rid of the union entirely, and just use > 'unsigned long' or 'unsigned long long' depending on the size. > > Something like the attached patch - it still requires that it be an > arithmetic type, but now because of the final cast. > > But it might still be a cast to a volatile type, of course. Then the > result will be volatile, but at least now READ_ONCE() won't be taking > the address of a volatile variable on the stack - does that at least > fix some of the horrible code generation. Hmm? Yes it seems to fix it for me. There's no unnecessary stack protector gunk, and no store/load to the stack variable. This is my previous example of ext4_resize_begin(), hacked to use a copy of the generic version of test_and_set_bit_lock(), which in turn was hacked to use a local version of your READ_ONCE(). c0534390 : c0534390: 19 01 4c 3c addis r2,r12,281 c0534394: 70 c3 42 38 addir2,r2,-15504 c0534398: a6 02 08 7c mflrr0 c053439c: 4d 98 b3 4b bl c006dbe8 <_mcount> c05343a0: a6 02 08 7c mflrr0 c05343a4: f8 ff e1 fb std r31,-8(r1) c05343a8: f0 ff c1 fb std r30,-16(r1) c05343ac: 78 1b 7f 7c mr r31,r3 c05343b0: 18 00 60 38 li r3,24 c05343b4: 10 00 01 f8 std r0,16(r1) c05343b8: 91 ff 21 f8 stdur1,-112(r1) c05343bc: 98 03 df eb ld r30,920(r31) c05343c0: d9 d3 c0 4b bl c0141798 c05343c4: 00 00 00 60 nop c05343c8: 00 00 a3 2f cmpdi cr7,r3,0 c05343cc: a4 00 9e 41 beq cr7,c0534470 c05343d0: 98 03 3f e9 ld r9,920(r31) c05343d4: 60 00 5e e9 ld r10,96(r30) c05343d8: 54 00 fe 80 lwz r7,84(r30) c05343dc: 68 00 09 e9 ld r8,104(r9) c05343e0: 18 00 4a e9 ld r10,24(r10) c05343e4: 14 00 08 81 lwz r8,20(r8) c05343e8: 36 3c 4a 7d srd r10,r10,r7 c05343ec: 00 40 aa 7f cmpdcr7,r10,r8 c05343f0: b8 00 9e 40 bne cr7,c05344a8 c05343f4: a0 00 49 a1 lhz r10,160(r9) c05343f8: 02 00 4a 71 andi. 
r10,r10,2 c05343fc: 84 00 82 40 bne c0534480 c0534400: 30 02 49 e9 ld r10,560(r9) # simple load of EXT4_SB(sb)->s_ext4_flags c0534404: 01 00 4a 71 andi. r10,r10,1 c0534408: 48 00 82 40 bne c0534450 c053440c: 30 02 e9 38 addir7,r9,560 c0534410: 01 00 00 39 li r8,1 c0534414: a8 38 40 7d ldarx r10,0,r7 c0534418: 78 53 06 7d or r6,r8,r10 c053441c: ad 39 c0 7c stdcx. r6,0,r7 c0534420: f4 ff c2 40 bne-c0534414 c0534424: 2c 01 00 4c isync c0534428: 01 00 49 71 andi. r9,r10,1 c053442c: 00 00 60 38 li r3,0 c0534430: 20 00 82 40 bne c0534450 c0534434: 70 00 21 38 addir1,r1,112 c0534438: 10 00 01 e8 ld r0,16(r1) c053443c: f0 ff c1 eb ld r30,-16(r1) c0534440: f8 ff e1 eb ld r31,-8(r1) c053: a6 03 08 7c mtlrr0 c0534448: 20 00 80 4e blr c053444c: 00 00 00 60 nop c0534450: 70 00 21 38 addir1,r1,112 c0534454: f0 ff 60 38 li r3,-16 c0534458: 10 00 01 e8 ld r0,16(r1) c053445c: f0 ff c1 eb ld r30,-16(r1) c0534460: f8 ff e1 eb ld r31,-8(r1) c0534464: a6 03 08 7c mtlrr0 c0534468: 20 00 80 4e blr c053446c: 00 00 00 60 nop c0534470: ff ff 60 38 li r3,-1 c0534474: c0 ff ff 4b b c0534434 c0534478: 00 00 00 60 nop c053447c: 00 00 00 60 nop c0534480: 8a ff c2 3c addis r6,r2,-118 c0534484: 74 ff 82 3c addis r4,r2,-140 c0534488: 78 fb e3 7f mr r3,r31 c053448c: 7c 00 a0 38 li r5,124 c0534490: a8 75 c6 38 addir6,r6,30120 c0534494: f8 0b 84 38 addir4,r4,3064 c0534498:
[PATCH v4 9/9] drivers/oprofile: open access for CAP_SYS_PERFMON privileged process
Open access to monitoring for CAP_SYS_PERFMON privileged processes. For backward compatibility reasons access to the monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure monitoring is discouraged with respect to CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- drivers/oprofile/event_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 12ea4a4ad607..6c9edc8bbc95 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -113,7 +113,7 @@ static int event_buffer_open(struct inode *inode, struct file *file) { int err = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (test_and_set_bit_lock(0, &buffer_opened)) -- 2.20.1
[Bug 205889] CONFIG_PPC_85xx with CONFIG_CORENET_GENERIC outputs uImage instead of zImage
https://bugzilla.kernel.org/show_bug.cgi?id=205889 Michael Ellerman (mich...@ellerman.id.au) changed: What|Removed |Added CC||mich...@ellerman.id.au --- Comment #1 from Michael Ellerman (mich...@ellerman.id.au) --- I think that's working as designed, if not expected. CONFIG_CORENET_GENERIC selects CONFIG_DEFAULT_UIMAGE, which tells arch/powerpc/boot/Makefile to build a uImage. The zImage is just a hardlink to the uImage. You can see that using the -i flag to ls, eg: $ ls -il arch/powerpc/boot/{uImage,zImage} 1300656 -rw-rw-r-- 2 michael michael 5356164 Dec 18 22:19 arch/powerpc/boot/uImage 1300656 -rw-rw-r-- 2 michael michael 5356164 Dec 18 22:19 arch/powerpc/boot/zImage Notice the inode is the same, and the link count is 2. I think the logic behind the link is that people might have scripts to copy zImage somewhere to boot it, and we didn't want to break that just because the target of the build is a uImage. -- You are receiving this mail because: You are watching the assignee of the bug.
[PATCH v15 13/23] selftests/vm/pkeys: Introduce powerpc support
From: Ram Pai This makes use of the abstractions added earlier and introduces support for powerpc. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-helpers.h| 2 + tools/testing/selftests/vm/pkey-powerpc.h| 93 +++ tools/testing/selftests/vm/protection_keys.c | 269 ++- 3 files changed, 236 insertions(+), 128 deletions(-) create mode 100644 tools/testing/selftests/vm/pkey-powerpc.h diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index c929cc09b3cc..5de06d3a81de 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -79,6 +79,8 @@ void expected_pkey_fault(int pkey); #if defined(__i386__) || defined(__x86_64__) /* arch */ #include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" #else /* arch */ #error Architecture not supported #endif /* arch */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h new file mode 100644 index ..5ced9c58fa4a --- /dev/null +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE0x2 +#endif + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K 3 /* pkey-0, pkey-1 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define 
HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE (1UL << 16) +#define pkey_reg_t u64 +#define PKEY_REG_FMT "%016lx" + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline pkey_reg_t __read_pkey_reg(void) +{ + pkey_reg_t pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(pkey_reg_t pkey_reg) +{ + pkey_reg_t eax = pkey_reg; + + dprintf4("%s() changing "PKEY_REG_FMT" to "PKEY_REG_FMT"\n", +__func__, __read_pkey_reg(), pkey_reg); + + asm volatile("mtspr 0xd, %0" : : "r" ((unsigned long)(eax)) : "memory"); + + dprintf4("%s() pkey register after changing "PKEY_REG_FMT" to " + PKEY_REG_FMT"\n", __func__, __read_pkey_reg(), + pkey_reg); +} + +static inline int cpu_has_pku(void) +{ + return 1; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + return NR_RESERVED_PKEYS_64K; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* +* powerpc does not allow userspace to change permissions of exec-only +* keys since those keys are not allocated by userspace. The signal +* handler wont be able to reset the permissions, which means the code +* will infinitely continue to segfault here. 
+*/ + return; +} + +/* 8-bytes of instruction * 16384bytes = 1 page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index dbc2918bec13..1920bca84def 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -168,6 +168,125 @@ void dump_mem(void *dumpme, int len_bytes) } } +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + pkey_reg_t pkey_reg = __read_pkey_reg(); + + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: "PKEY_REG_FMT"\n", __func__, pkey_reg); + + return (u32) get_pkey_bits(pkey_reg, pkey); +} + +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + pkey_reg_t old_pkey_reg = __read_pkey_reg(); + pkey_reg_t new_pkey_reg; + + /* make sure that 'rights' only contains the bits we expect: */ +
[PATCH v15 17/23] selftests/vm/pkeys: Associate key on a mapped page and detect write violation
From: Ram Pai Detect write-violation on a page to which write-disabled key is associated much after the page is mapped. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Acked-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 12 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index f63b9f55b3a7..ff207b765afd 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1001,6 +1001,17 @@ void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, expected_pkey_fault(pkey); } +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); @@ -1409,6 +1420,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_access_disabled_region, test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, test_kernel_write_of_access_disabled_region, test_kernel_write_of_write_disabled_region, -- 2.17.1
Re: [RFC PATCH 1/2] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush
On Wed, Dec 18, 2019 at 10:52:53AM +0530, Aneesh Kumar K.V wrote: > Upstream ppc64 is broken after the commit: a46cc7a90fd8 > ("powerpc/mm/radix: Improve TLB/PWC flushes"). > > Also the patches are not adding any extra TLBI on either radix or hash. > > Considering we need to backport this to stable and other distributions, > how about we do this early patches in your series before the Kconfig rename? > This should enable stable to pick them up with less dependencies. OK I suppose. Will you send a new series?
[PATCH v4 2/9] perf/core: open access for CAP_SYS_PERFMON privileged process
Open access to perf_events monitoring for CAP_SYS_PERFMON privileged processes. For backward compatibility reasons access to perf_events subsystem remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged with respect to CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- include/linux/perf_event.h | 6 +++--- kernel/events/core.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 34c7c6910026..f46acd69425f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1285,7 +1285,7 @@ static inline int perf_is_paranoid(void) static inline int perf_allow_kernel(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_KERNEL); @@ -1293,7 +1293,7 @@ static inline int perf_allow_kernel(struct perf_event_attr *attr) static inline int perf_allow_cpu(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); @@ -1301,7 +1301,7 @@ static inline int perf_allow_cpu(struct perf_event_attr *attr) static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { - if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN)) + if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); diff --git a/kernel/events/core.c b/kernel/events/core.c index 059ee7116008..d9db414f2197 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9056,7 +9056,7 @@ static int perf_kprobe_event_init(struct perf_event *event) if (event->attr.type != perf_kprobe.type) return -ENOENT; - if 
(!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -9116,7 +9116,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* @@ -11157,7 +11157,7 @@ SYSCALL_DEFINE5(perf_event_open, } if (attr.namespaces) { - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; } -- 2.20.1
Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))
On 12.12.19 21:49, Linus Torvalds wrote: > On Thu, Dec 12, 2019 at 11:34 AM Will Deacon wrote: >> >> The root of my concern in all of this, and what started me looking at it in >> the first place, is the interaction with 'typeof()'. Inheriting 'volatile' >> for a pointer means that local variables in macros declared using typeof() >> suddenly start generating *hideous* code, particularly when pointless stack >> spills get stackprotector all excited. > > Yeah, removing volatile can be a bit annoying. > > For the particular case of the bitops, though, it's not an issue. > Since you know the type there, you can just cast it. > > And if we had the rule that READ_ONCE() was an arithmetic type, you could do > > typeof(0+(*p)) __var; > > since you might as well get the integer promotion anyway (on the > non-volatile result). > > But that doesn't work with structures or unions, of course. We do have a READ_ONCE on the following union in s390 code. union ipte_control { unsigned long val; struct { unsigned long k : 1; unsigned long kh : 31; unsigned long kg : 32; }; }; In fact this one was the original failure case why we change ACCESS_ONCE. see arch/s390/kvm/gaccess.c
Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))
On Wed, Dec 18, 2019 at 11:22:05AM +0100, Christian Borntraeger wrote: > On 12.12.19 21:49, Linus Torvalds wrote: > > On Thu, Dec 12, 2019 at 11:34 AM Will Deacon wrote: > >> > >> The root of my concern in all of this, and what started me looking at it in > >> the first place, is the interaction with 'typeof()'. Inheriting 'volatile' > >> for a pointer means that local variables in macros declared using typeof() > >> suddenly start generating *hideous* code, particularly when pointless stack > >> spills get stackprotector all excited. > > > > Yeah, removing volatile can be a bit annoying. > > > > For the particular case of the bitops, though, it's not an issue. > > Since you know the type there, you can just cast it. > > > > And if we had the rule that READ_ONCE() was an arithmetic type, you could do > > > > typeof(0+(*p)) __var; > > > > since you might as well get the integer promotion anyway (on the > > non-volatile result). > > > > But that doesn't work with structures or unions, of course. > > We do have a READ_ONCE on the following union in s390 code. > > union ipte_control { > unsigned long val; > struct { > unsigned long k : 1; > unsigned long kh : 31; > unsigned long kg : 32; > }; > }; > > > In fact this one was the original failure case why we change ACCESS_ONCE. > > see arch/s390/kvm/gaccess.c Thanks. I think we should be ok just using the 'val' field instead of the whole union but, then again, when bitfields are involved who knows what the compiler might do. I thought we usually shied away from using them to mirror hardware structures like this? Will
[Bug 205885] [Bisected] BUG: KASAN: null-ptr-deref in strncpy+0x3c/0x60
https://bugzilla.kernel.org/show_bug.cgi?id=205885 Michael Ellerman (mich...@ellerman.id.au) changed: What|Removed |Added Status|NEW |RESOLVED CC||mich...@ellerman.id.au Resolution|--- |PATCH_ALREADY_AVAILABLE --- Comment #7 from Michael Ellerman (mich...@ellerman.id.au) --- This is fixed upstream: https://git.kernel.org/torvalds/c/7de7de7ca0ae0fc70515ee3154af33af75edae2c -- You are receiving this mail because: You are watching the assignee of the bug.
Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush
On 12/18/19 2:47 PM, Peter Zijlstra wrote: On Wed, Dec 18, 2019 at 11:05:29AM +0530, Aneesh Kumar K.V wrote: From: Peter Zijlstra Architectures for which we have hardware walkers of Linux page table should flush TLB on mmu gather batch allocation failures and batch flush. Some architectures like POWER supports multiple translation modes (hash and radix) nohash, hash and radix in fact :-) and in the case of POWER only radix translation mode needs the above TLBI. This is because for hash translation mode kernel wants to avoid this extra flush since there are no hardware walkers of linux page table. With radix translation, the hardware also walks linux page table and with that, kernel needs to make sure to TLB invalidate page walk cache before page table pages are freed. More details in commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE") Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes") Signed-off-by: Peter Zijlstra (Intel) --- diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index b2c0be93929d..7f3a8b902325 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -26,6 +26,17 @@ #define tlb_flush tlb_flush extern void tlb_flush(struct mmu_gather *tlb); +/* + * book3s: + * Hash does not use the linux page-tables, so we can avoid + * the TLB invalidate for page-table freeing, Radix otoh does use the + * page-tables and needs the TLBI. + * + * nohash: + * We still do TLB invalidate in the __pte_free_tlb routine before we + * add the page table pages to mmu gather table batch. I'm a little confused though; if nohash is a software TLB fill, why do you need a TLBI for tables? nohash (AKA book3e) has different mmu modes. I don't follow all the details w.r.t book3e. Paul or Michael might be able to explain the need for table flush with book3e. Documentation/powerpc/cpu-families.rst shows different hardware assist TLB fill support. 
What I wanted to convey with the above comment way we handle only radix translation mode with tlb_needs_table_invalidate() check. Other translations (hash, different variants of book3e) are all good because of the reason outlined above. -aneesh
Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush
"Aneesh Kumar K.V" writes: > On 12/18/19 2:47 PM, Peter Zijlstra wrote: >> On Wed, Dec 18, 2019 at 11:05:29AM +0530, Aneesh Kumar K.V wrote: >>> From: Peter Zijlstra >>> >>> Architectures for which we have hardware walkers of Linux page table should >>> flush TLB on mmu gather batch allocation failures and batch flush. Some >>> architectures like POWER supports multiple translation modes (hash and >>> radix) >> >> nohash, hash and radix in fact :-) >> >>> and in the case of POWER only radix translation mode needs the above TLBI. >> >>> This is because for hash translation mode kernel wants to avoid this extra >>> flush since there are no hardware walkers of linux page table. With radix >>> translation, the hardware also walks linux page table and with that, kernel >>> needs to make sure to TLB invalidate page walk cache before page table >>> pages are >>> freed. >>> >>> More details in >>> commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for >>> RCU_TABLE_FREE") >>> >>> Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes") >>> Signed-off-by: Peter Zijlstra (Intel) >> Signed-off-by: Aneesh Kumar K.V >>> --- >> >>> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h >>> index b2c0be93929d..7f3a8b902325 100644 >>> --- a/arch/powerpc/include/asm/tlb.h >>> +++ b/arch/powerpc/include/asm/tlb.h >>> @@ -26,6 +26,17 @@ >>> >>> #define tlb_flush tlb_flush >>> extern void tlb_flush(struct mmu_gather *tlb); >>> +/* >>> + * book3s: >>> + * Hash does not use the linux page-tables, so we can avoid >>> + * the TLB invalidate for page-table freeing, Radix otoh does use the >>> + * page-tables and needs the TLBI. >>> + * >>> + * nohash: >>> + * We still do TLB invalidate in the __pte_free_tlb routine before we >>> + * add the page table pages to mmu gather table batch. >> >> I'm a little confused though; if nohash is a software TLB fill, why do >> you need a TLBI for tables? 
>> > > nohash (AKA book3e) has different mmu modes. I don't follow all the > details w.r.t book3e. Paul or Michael might be able to explain the need > for table flush with book3e. Some of the Book3E CPUs have a partial hardware table walker. The IBM one (A2) did, before we ripped that support out. And the Freescale (NXP) e6500 does, see eg: 28efc35fe68d ("powerpc/e6500: TLB miss handler with hardware tablewalk support") They only support walking one level IIRC, ie. you can create a TLB entry that points to a PTE page, and the hardware will dereference that to get a PTE and load that into the TLB. cheers
[PATCH v4 7/9] parisc/perf: open access for CAP_SYS_PERFMON privileged process
Open access to monitoring for CAP_SYS_PERFMON privileged processes. For backward compatibility reasons access to the monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure monitoring is discouraged with respect to CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- arch/parisc/kernel/perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 676683641d00..c4208d027794 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -300,7 +300,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf, else return -EFAULT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; if (count != sizeof(uint32_t)) -- 2.20.1
Re: [PATCH 1/2] powerpc/pseries/svm: Don't access some SPRs
Sukadev Bhattiprolu writes: > Ultravisor disables some CPU features like EBB and BHRB in the HFSCR > for secure virtual machines (SVMs). If the SVMs attempt to access > related registers, they will get a Program Interrupt. > > Use macros/wrappers to skip accessing EBB and BHRB registers in secure > VMs. I continue to dislike this approach. The result is code that at a glance looks like it's doing one thing, ie. reading or writing an SPR, but is in fact doing nothing. It's confusing for readers. It also slows down all these already slow register accesses further, by inserting an mfmsr() in front of every single one. > diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h > index b3cbb1136bce..026eb20f6d13 100644 > --- a/arch/powerpc/include/asm/reg.h > +++ b/arch/powerpc/include/asm/reg.h > @@ -1379,6 +1379,41 @@ static inline void msr_check_and_clear(unsigned long > bits) > __msr_check_and_clear(bits); > } > > +#ifdef CONFIG_PPC_SVM > +/* > + * Move from some "restricted" sprs. > + * Secure VMs should not access some registers as the related features > + * are disabled in the CPU. If an SVM is attempting read from the given > + * SPR, return 0. Otherwise behave like a normal mfspr. > + */ > +#define mfspr_r(rn) \ > +({ \ > + unsigned long rval = 0ULL; \ > + \ > + if (!(mfmsr() & MSR_S)) \ > + asm volatile("mfspr %0," __stringify(rn)\ > + : "=r" (rval)); \ > + rval; \ > +}) > + > +/* > + * Move to some "restricted" sprs. > + * Secure VMs should not access some registers as the related features > + * are disabled in the CPU. If an SVM is attempting write to the given > + * SPR, ignore the write. Otherwise behave like a normal mtspr. 
> + */ > +#define mtspr_r(rn, v) \ > +({ \ > + if (!(mfmsr() & MSR_S)) \ > + asm volatile("mtspr " __stringify(rn) ",%0" : \ > + : "r" ((unsigned long)(v)) \ > + : "memory"); \ > +}) > +#else > +#define mfspr_r mfspr > +#define mtspr_r mtspr > +#endif > + > #ifdef __powerpc64__ > #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) > #define mftb() ({unsigned long rval; > \ > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c > index 639ceae7da9d..9a691452ea3b 100644 > --- a/arch/powerpc/kernel/process.c > +++ b/arch/powerpc/kernel/process.c > @@ -1059,9 +1059,9 @@ static inline void save_sprs(struct thread_struct *t) > t->dscr = mfspr(SPRN_DSCR); > > if (cpu_has_feature(CPU_FTR_ARCH_207S)) { > - t->bescr = mfspr(SPRN_BESCR); > - t->ebbhr = mfspr(SPRN_EBBHR); > - t->ebbrr = mfspr(SPRN_EBBRR); > + t->bescr = mfspr_r(SPRN_BESCR); > + t->ebbhr = mfspr_r(SPRN_EBBHR); > + t->ebbrr = mfspr_r(SPRN_EBBRR); eg. here. This is the fast path of context switch. That expands to: if (!(mfmsr() & MSR_S)) asm volatile("mfspr %0, SPRN_BESCR" : "=r" (rval)); if (!(mfmsr() & MSR_S)) asm volatile("mfspr %0, SPRN_EBBHR" : "=r" (rval)); if (!(mfmsr() & MSR_S)) asm volatile("mfspr %0, SPRN_EBBRR" : "=r" (rval)); If the Ultravisor is going to disable EBB and BHRB then we need new CPU_FTR bits for those, and the code that accesses those registers needs to be put behind cpu_has_feature(EBB) etc. cheers
[PATCH v4 6/9] powerpc/perf: open access for CAP_SYS_PERFMON privileged process
Open access to monitoring for CAP_SYS_PERFMON privileged processes. For backward compatibility reasons access to the monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure monitoring is discouraged with respect to CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- arch/powerpc/perf/imc-pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index cb50a9e1fd2d..e837717492e4 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -898,7 +898,7 @@ static int thread_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* Sampling not supported */ @@ -1307,7 +1307,7 @@ static int trace_imc_event_init(struct perf_event *event) if (event->attr.type != event->pmu->type) return -ENOENT; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EACCES; /* Return if this is a couting event */ -- 2.20.1
Re: [PATCH] powerpc/setup_64: use DEFINE_DEBUGFS_ATTRIBUTE to define fops_rfi_flush
Chen Zhou writes: > Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE for > debugfs files. > > Semantic patch information: > Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file() > imposes some significant overhead as compared to > DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe(). I know you didn't write this text, but these change logs are not great. It doesn't really explain why you're doing it. And it is alarming that you're converting *to* a function with "unsafe" in the name. The commit that added the script: 5103068eaca2 ("debugfs, coccinelle: check for obsolete DEFINE_SIMPLE_ATTRIBUTE() usage") Has a bit more explanation. Maybe something like this: In order to protect against file removal races, debugfs files created via debugfs_create_file() are wrapped by a struct file_operations at their opening. If the original struct file_operations is known to be safe against removal races already, the proxy creation may be bypassed by creating the files using DEFINE_DEBUGFS_ATTRIBUTE() and debugfs_create_file_unsafe(). The part that's not explained is why this file is "known to be safe against removal races already"? It also seems this conversion will make the file no longer seekable, because DEFINE_SIMPLE_ATTRIBUTE() uses generic_file_llseek() whereas DEFINE_DEBUGFS_ATTRIBUTE() uses no_llseek. That is probably fine, but should be mentioned. 
cheers > Signed-off-by: Chen Zhou > --- > arch/powerpc/kernel/setup_64.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c > index 6104917..4b9fbb2 100644 > --- a/arch/powerpc/kernel/setup_64.c > +++ b/arch/powerpc/kernel/setup_64.c > @@ -956,11 +956,11 @@ static int rfi_flush_get(void *data, u64 *val) > return 0; > } > > -DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, > "%llu\n"); > +DEFINE_DEBUGFS_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, > "%llu\n"); > > static __init int rfi_flush_debugfs_init(void) > { > - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, > &fops_rfi_flush); > + debugfs_create_file_unsafe("rfi_flush", 0600, powerpc_debugfs_root, > NULL, &fops_rfi_flush); > return 0; > } > device_initcall(rfi_flush_debugfs_init); > -- > 2.7.4
Re: [PATCH v3 3/3] powerpc: Book3S 64-bit "heavyweight" KASAN support
On 12/18/2019 04:32 AM, Daniel Axtens wrote: Daniel Axtens writes: Hi Christophe, I'm working through your feedback, thank you. Regarding this one: --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2081,7 +2081,14 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) /* * See if this is an exception frame. * We look for the "regshere" marker in the current frame. +* +* KASAN may complain about this. If it is an exception frame, +* we won't have unpoisoned the stack in asm when we set the +* exception marker. If it's not an exception frame, who knows +* how things are laid out - the shadow could be in any state +* at all. Just disable KASAN reporting for now. */ + kasan_disable_current(); if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE) && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) @@ -2091,6 +2098,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) regs->trap, (void *)regs->nip, (void *)lr); firstframe = 1; } + kasan_enable_current(); If this is really a concern for all targets including PPC32, should it be a separate patch with a Fixes: tag to be applied back in stable as well ? I've managed to repro this by commening out the kasan_disable/enable lines, and just booting in qemu without a disk attached: sudo qemu-system-ppc64 -accel kvm -m 2G -M pseries -cpu power9 -kernel ./vmlinux -nographic -chardev stdio,id=charserial0,mux=on -device spapr-vty,chardev=charserial0,reg=0x3000 -mon chardev=charserial0,mode=readline -nodefaults -smp 2 ... 
[0.210740] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(0,0) [0.210789] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12 [0.210844] Call Trace: [0.210866] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154 (unreliable) [0.210915] [c0006a483a00] [c025411c] panic+0x258/0x59c [0.210958] [c0006a483aa0] [c24870b0] mount_block_root+0x648/0x7ac [0.211005] == [0.211054] BUG: KASAN: stack-out-of-bounds in show_stack+0x438/0x580 [0.211095] Read of size 8 at addr c0006a483b00 by task swapper/0/1 [0.211134] [0.211152] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12 [0.211207] Call Trace: [0.211225] [c0006a483680] [c1f74f48] dump_stack+0xfc/0x154 (unreliable) [0.211274] [c0006a4836d0] [c08f877c] print_address_description.isra.10+0x7c/0x470 [0.211330] [c0006a483760] [c08f8e7c] __kasan_report+0x1bc/0x244 [0.211380] [c0006a483830] [c08f6eb8] kasan_report+0x18/0x30 [0.211422] [c0006a483850] [c08fa5d4] __asan_report_load8_noabort+0x24/0x40 [0.211471] [c0006a483870] [c003d448] show_stack+0x438/0x580 [0.211512] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154 [0.211553] [c0006a483a00] [c025411c] panic+0x258/0x59c [0.211595] [c0006a483aa0] [c24870b0] mount_block_root+0x648/0x7ac [0.211644] [c0006a483be0] [c2487784] prepare_namespace+0x1ec/0x240 [0.211694] [c0006a483c60] [c248669c] kernel_init_freeable+0x7f4/0x870 [0.211745] [c0006a483da0] [c0011f30] kernel_init+0x3c/0x15c [0.211787] [c0006a483e20] [c000bebc] ret_from_kernel_thread+0x5c/0x80 [0.211834] [0.211851] Allocated by task 0: [0.211878] save_stack+0x2c/0xe0 [0.211904] __kasan_kmalloc.isra.16+0x11c/0x150 [0.211937] kmem_cache_alloc_node+0x114/0x3b0 [0.211971] copy_process+0x5b8/0x6410 [0.211996] _do_fork+0x130/0xbf0 [0.212022] kernel_thread+0xdc/0x130 [0.212047] rest_init+0x44/0x184 [0.212072] start_kernel+0x77c/0x7dc [0.212098] start_here_common+0x1c/0x20 [0.212122] [0.212139] Freed by task 0: [0.212163] (stack is not 
available) [0.212187] [0.212205] The buggy address belongs to the object at c0006a48 [0.212205] which belongs to the cache thread_stack of size 16384 [0.212285] The buggy address is located 15104 bytes inside of [0.212285] 16384-byte region [c0006a48, c0006a484000) [0.212356] The buggy address belongs to the page: [0.212391] page:c00c001a9200 refcount:1 mapcount:0 mapping:c0006a019e00 index:0x0 compound_mapcount: 0 [0.212455] raw: 00710200 5deadbeef100 5deadbeef122 c0006a019e00 [0.212504] raw: 00100010 0001
[PATCH v15 21/23] selftests/vm/pkeys: Fix number of reserved powerpc pkeys
From: "Desnes A. Nunes do Rosario" The number of reserved pkeys in a PowerNV environment is different from that on PowerVM or KVM. Tested on PowerVM and PowerNV environments. Signed-off-by: "Desnes A. Nunes do Rosario" Signed-off-by: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-powerpc.h | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index f7a20bd07870..a70577045074 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -28,7 +28,10 @@ #define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey and 24 other keys that cannot be represented in the PTE */ -#define NR_RESERVED_PKEYS_64K 3 /* pkey-0, pkey-1 and exec-only-pkey */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, +pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, +pkey-31 and exec-only key */ #define PKEY_BITS_PER_PKEY 2 #define HPAGE_SIZE (1UL << 24) #define PAGE_SIZE (1UL << 16) @@ -69,12 +72,27 @@ static inline int cpu_has_pkeys(void) return 1; } +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + static inline int get_arch_reserved_keys(void) { if (sysconf(_SC_PAGESIZE) == 4096) return NR_RESERVED_PKEYS_4K; else - return NR_RESERVED_PKEYS_64K; + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; } void expect_fault_on_read_execonly_key(void *p1, int pkey) -- 2.17.1
[PATCH v4 3/9] perf tool: extend Perf tool with CAP_SYS_PERFMON capability support
Extend error messages to mention CAP_SYS_PERFMON capability as an option to substitute CAP_SYS_ADMIN capability for secure system performance monitoring and observability operations. Make perf_event_paranoid_check() and __cmd_ftrace() to be aware of CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- tools/perf/builtin-ftrace.c | 5 +++-- tools/perf/design.txt | 3 ++- tools/perf/util/cap.h | 4 tools/perf/util/evsel.c | 10 +- tools/perf/util/util.c | 1 + 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index d5adc417a4ca..8096e9b5f4f9 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -284,10 +284,11 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv) .events = POLLIN, }; - if (!perf_cap__capable(CAP_SYS_ADMIN)) { + if (!(perf_cap__capable(CAP_SYS_PERFMON) || + perf_cap__capable(CAP_SYS_ADMIN))) { pr_err("ftrace only works for %s!\n", #ifdef HAVE_LIBCAP_SUPPORT - "users with the SYS_ADMIN capability" + "users with the CAP_SYS_PERFMON or CAP_SYS_ADMIN capability" #else "root" #endif diff --git a/tools/perf/design.txt b/tools/perf/design.txt index 0453ba26cdbd..71755b3e1303 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -258,7 +258,8 @@ gets schedule to. Per task counters can be created by any user, for their own tasks. A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts -all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. +all events on CPU-x. Per CPU counters need CAP_SYS_PERFMON or +CAP_SYS_ADMIN privilege. The 'flags' parameter is currently unused and must be zero. 
diff --git a/tools/perf/util/cap.h b/tools/perf/util/cap.h index 051dc590ceee..0f79fbf6638b 100644 --- a/tools/perf/util/cap.h +++ b/tools/perf/util/cap.h @@ -29,4 +29,8 @@ static inline bool perf_cap__capable(int cap __maybe_unused) #define CAP_SYSLOG 34 #endif +#ifndef CAP_SYS_PERFMON +#define CAP_SYS_PERFMON 38 +#endif + #endif /* __PERF_CAP_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index f4dea055b080..3a46325e3702 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2468,14 +2468,14 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target, "You may not have permission to collect %sstats.\n\n" "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" "which controls use of the performance events system by\n" -"unprivileged users (without CAP_SYS_ADMIN).\n\n" +"unprivileged users (without CAP_SYS_PERFMON or CAP_SYS_ADMIN).\n\n" "The current value is %d:\n\n" " -1: Allow use of (almost) all events by all users\n" " Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n" -">= 0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN\n" -" Disallow raw tracepoint access by users without CAP_SYS_ADMIN\n" -">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" -">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN\n\n" +">= 0: Disallow ftrace function tracepoint by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n" +" Disallow raw tracepoint access by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n" +">= 1: Disallow CPU event access by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n" +">= 2: Disallow kernel profiling by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n\n" "To make this setting permanent, edit /etc/sysctl.conf too, e.g.:\n\n" " kernel.perf_event_paranoid = -1\n" , target->system_wide ? 
"system-wide " : "", diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 969ae560dad9..9981db0d8d09 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -272,6 +272,7 @@ int perf_event_paranoid(void) bool perf_event_paranoid_check(int max_level) { return perf_cap__capable(CAP_SYS_ADMIN) || + perf_cap__capable(CAP_SYS_PERFMON) || perf_event_paranoid() <= max_level; } -- 2.20.1
[PATCH v4 5/9] trace/bpf_trace: open access for CAP_SYS_PERFMON privileged process
Open access to bpf_trace monitoring for CAP_SYS_PERFMON privileged processes. For backward compatibility reasons access to bpf_trace monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure bpf_trace monitoring is discouraged with respect to CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 44bd08f2443b..bafe21ac6d92 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1272,7 +1272,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) u32 *ids, prog_cnt, ids_len; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!perfmon_capable()) return -EPERM; if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; -- 2.20.1
[PATCH v4 8/9] drivers/perf: open access for CAP_SYS_PERFMON privileged process
Open access to monitoring for CAP_SYS_PERFMON privileged processes. For backward compatibility reasons access to the monitoring remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure monitoring is discouraged with respect to CAP_SYS_PERFMON capability. Signed-off-by: Alexey Budankov --- drivers/perf/arm_spe_pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 4e4984a55cd1..5dff81bc3324 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -274,7 +274,7 @@ static u64 arm_spe_event_to_pmscr(struct perf_event *event) if (!attr->exclude_kernel) reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT); - if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && capable(CAP_SYS_ADMIN)) + if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable()) reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT); return reg; @@ -700,7 +700,7 @@ static int arm_spe_pmu_event_init(struct perf_event *event) return -EOPNOTSUPP; reg = arm_spe_event_to_pmscr(event); - if (!capable(CAP_SYS_ADMIN) && + if (!perfmon_capable() && (reg & (BIT(SYS_PMSCR_EL1_PA_SHIFT) | BIT(SYS_PMSCR_EL1_CX_SHIFT) | BIT(SYS_PMSCR_EL1_PCT_SHIFT -- 2.20.1
[PATCH v17 06/23] powerpc: mm: Add p?d_leaf() definitions
walk_page_range() is going to be allowed to walk page tables other than those of user space. For this it needs to know when it has reached a 'leaf' entry in the page tables. This information is provided by the p?d_leaf() functions/macros. For powerpc p?d_is_leaf() functions already exist. Export them using the new p?d_leaf() name. CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Michael Ellerman CC: linuxppc-dev@lists.ozlabs.org CC: kvm-...@vger.kernel.org Signed-off-by: Steven Price --- arch/powerpc/include/asm/book3s/64/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b01624e5c467..201a69e6a355 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1355,18 +1355,21 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va * Like pmd_huge() and pmd_large(), but works regardless of config options */ #define pmd_is_leaf pmd_is_leaf +#define pmd_leaf pmd_is_leaf static inline bool pmd_is_leaf(pmd_t pmd) { return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } #define pud_is_leaf pud_is_leaf +#define pud_leaf pud_is_leaf static inline bool pud_is_leaf(pud_t pud) { return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); } #define pgd_is_leaf pgd_is_leaf +#define pgd_leaf pgd_is_leaf static inline bool pgd_is_leaf(pgd_t pgd) { return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); -- 2.20.1
Re: [PATCH 05/18] powerpc sstep: Prepare to support prefixed instructions
Jordan Niethe writes: > Currently all instructions are a single word long. A future ISA version > will include prefixed instructions which have a double word length. The > functions used for analysing and emulating instructions need to be > modified so that they can handle these new instruction types. > > A prefixed instruction is a word prefix followed by a word suffix. All > prefixes uniquely have the primary op-code 1. Suffixes may be valid word > instructions or instructions that only exist as suffixes. > > In handling prefixed instructions it will be convenient to treat the > suffix and prefix as separate words. To facilitate this modify > analyse_instr() and emulate_step() to take a take a suffix as a > parameter. For word instructions it does not matter what is passed in > here - it will be ignored. > > We also define a new flag, PREFIXED, to be used in instruction_op:type. > This flag will indicate when emulating an analysed instruction if the > NIP should be advanced by word length or double word length. > > The callers of analyse_instr() and emulate_step() will need their own > changes to be able to support prefixed instructions. For now modify them > to pass in 0 as a suffix. > > Note that at this point no prefixed instructions are emulated or > analysed - this is just making it possible to do so. 
> > Signed-off-by: Jordan Niethe > --- > arch/powerpc/include/asm/ppc-opcode.h | 3 +++ > arch/powerpc/include/asm/sstep.h | 8 +-- > arch/powerpc/include/asm/uaccess.h| 30 +++ > arch/powerpc/kernel/align.c | 2 +- > arch/powerpc/kernel/hw_breakpoint.c | 4 ++-- > arch/powerpc/kernel/kprobes.c | 2 +- > arch/powerpc/kernel/mce_power.c | 2 +- > arch/powerpc/kernel/optprobes.c | 2 +- > arch/powerpc/kernel/uprobes.c | 2 +- > arch/powerpc/kvm/emulate_loadstore.c | 2 +- > arch/powerpc/lib/sstep.c | 12 ++- > arch/powerpc/lib/test_emulate_step.c | 30 +-- > arch/powerpc/xmon/xmon.c | 4 ++-- > 13 files changed, 71 insertions(+), 32 deletions(-) > > diff --git a/arch/powerpc/include/asm/ppc-opcode.h > b/arch/powerpc/include/asm/ppc-opcode.h > index c1df75edde44..a1dfa4bdd22f 100644 > --- a/arch/powerpc/include/asm/ppc-opcode.h > +++ b/arch/powerpc/include/asm/ppc-opcode.h > @@ -377,6 +377,9 @@ > #define PPC_INST_VCMPEQUD0x10c7 > #define PPC_INST_VCMPEQUB0x1006 > > +/* macro to check if a word is a prefix */ > +#define IS_PREFIX(x) (((x) >> 26) == 1) > + > /* macros to insert fields into opcodes */ > #define ___PPC_RA(a) (((a) & 0x1f) << 16) > #define ___PPC_RB(b) (((b) & 0x1f) << 11) > diff --git a/arch/powerpc/include/asm/sstep.h > b/arch/powerpc/include/asm/sstep.h > index 769f055509c9..6d4cb602e231 100644 > --- a/arch/powerpc/include/asm/sstep.h > +++ b/arch/powerpc/include/asm/sstep.h > @@ -89,6 +89,9 @@ enum instruction_type { > #define VSX_LDLEFT 4 /* load VSX register from left */ > #define VSX_CHECK_VEC8 /* check MSR_VEC not MSR_VSX for reg >= > 32 */ > > +/* Prefixed flag, ORed in with type */ > +#define PREFIXED 0x800 > + > /* Size field in type word */ > #define SIZE(n) ((n) << 12) > #define GETSIZE(w) ((w) >> 12) > @@ -132,7 +135,7 @@ union vsx_reg { > * otherwise. 
> */ > extern int analyse_instr(struct instruction_op *op, const struct pt_regs > *regs, > - unsigned int instr); > + unsigned int instr, unsigned int sufx); > I'm not saying this is necessarily better, but did you consider: - making instr 64 bits and using masking and shifting macros to get the prefix and suffix? - defining an instruction type/struct/union/whatever that contains both halves in one object? I'm happy to be told that it ends up being way, way uglier/worse/etc, but I just thought I'd ask. Regards, Daniel > /* > * Emulate an instruction that can be executed just by updating > @@ -149,7 +152,8 @@ void emulate_update_regs(struct pt_regs *reg, struct > instruction_op *op); > * 0 if it could not be emulated, or -1 for an instruction that > * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.). > */ > -extern int emulate_step(struct pt_regs *regs, unsigned int instr); > +extern int emulate_step(struct pt_regs *regs, unsigned int instr, > + unsigned int sufx); > > /* > * Emulate a load or store instruction by reading/writing the > diff --git a/arch/powerpc/include/asm/uaccess.h > b/arch/powerpc/include/asm/uaccess.h > index 15002b51ff18..bc585399e0c7 100644 > --- a/arch/powerpc/include/asm/uaccess.h > +++ b/arch/powerpc/include/asm/uaccess.h > @@ -423,4 +423,34 @@ extern long __copy_from_user_flushcache(void *dst, const > void __user *src, > extern void memcpy_page_flushcache(char *to, struct page *page, size_t > offset, > size_t len); > > +/* > + * When
Re: [PATCH v11 06/25] mm: fix get_user_pages_remote()'s handling of FOLL_LONGTERM
On Mon, Dec 16, 2019 at 02:25:18PM -0800, John Hubbard wrote: > As it says in the updated comment in gup.c: current FOLL_LONGTERM > behavior is incompatible with FAULT_FLAG_ALLOW_RETRY because of the > FS DAX check requirement on vmas. > > However, the corresponding restriction in get_user_pages_remote() was > slightly stricter than is actually required: it forbade all > FOLL_LONGTERM callers, but we can actually allow FOLL_LONGTERM callers > that do not set the "locked" arg. > > Update the code and comments to loosen the restriction, allowing > FOLL_LONGTERM in some cases. > > Also, copy the DAX check ("if a VMA is DAX, don't allow long term > pinning") from the VFIO call site, all the way into the internals > of get_user_pages_remote() and __gup_longterm_locked(). That is: > get_user_pages_remote() calls __gup_longterm_locked(), which in turn > calls check_dax_vmas(). This check will then be removed from the VFIO > call site in a subsequent patch. > > Thanks to Jason Gunthorpe for pointing out a clean way to fix this, > and to Dan Williams for helping clarify the DAX refactoring. > > Tested-by: Alex Williamson > Acked-by: Alex Williamson > Reviewed-by: Jason Gunthorpe > Reviewed-by: Ira Weiny > Suggested-by: Jason Gunthorpe > Cc: Dan Williams > Cc: Jerome Glisse > Signed-off-by: John Hubbard > --- > mm/gup.c | 27 ++- > 1 file changed, 22 insertions(+), 5 deletions(-) > > diff --git a/mm/gup.c b/mm/gup.c > index 3ecce297a47f..c0c56888e7cc 100644 > --- a/mm/gup.c > +++ b/mm/gup.c > @@ -29,6 +29,13 @@ struct follow_page_context { > unsigned int page_mask; > }; > > +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, > + struct mm_struct *mm, > + unsigned long start, > + unsigned long nr_pages, > + struct page **pages, > + struct vm_area_struct **vmas, > + unsigned int flags); Any particular reason for the forward declaration? Maybe move get_user_pages_remote() down? 
> /* > * Return the compound head page with ref appropriately incremented, > * or NULL if that failed. > @@ -1179,13 +1186,23 @@ long get_user_pages_remote(struct task_struct *tsk, > struct mm_struct *mm, > struct vm_area_struct **vmas, int *locked) > { > /* > - * FIXME: Current FOLL_LONGTERM behavior is incompatible with > + * Parts of FOLL_LONGTERM behavior are incompatible with >* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on > - * vmas. As there are no users of this flag in this call we simply > - * disallow this option for now. > + * vmas. However, this only comes up if locked is set, and there are > + * callers that do request FOLL_LONGTERM, but do not set locked. So, > + * allow what we can. >*/ > - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) > - return -EINVAL; > + if (gup_flags & FOLL_LONGTERM) { > + if (WARN_ON_ONCE(locked)) > + return -EINVAL; > + /* > + * This will check the vmas (even if our vmas arg is NULL) > + * and return -ENOTSUPP if DAX isn't allowed in this case: > + */ > + return __gup_longterm_locked(tsk, mm, start, nr_pages, pages, > + vmas, gup_flags | FOLL_TOUCH | > + FOLL_REMOTE); > + } > > return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, > locked, > -- > 2.24.1 > -- Kirill A. Shutemov
Re: [PATCH v6 05/10] mm/memory_hotplug: Shrink zones when offlining memory
On 01.12.19 00:21, Andrew Morton wrote: > On Sun, 27 Oct 2019 23:45:52 +0100 David Hildenbrand wrote: > >> I think I just found an issue with try_offline_node(). >> try_offline_node() is pretty much broken already (touches garbage >> memmaps and will not considers mixed NIDs within sections), however, >> relies on the node span to look for memory sections to probe. So it >> seems to rely on the nodes getting shrunk when removing memory, not when >> offlining. >> >> As we shrink the node span when offlining now and not when removing, >> this can go wrong once we offline the last memory block of the node and >> offline the last CPU. We could still have memory around that we could >> re-online, however, the node would already be offline. Unlikely, but >> possible. >> >> Note that the same is also broken without this patch in case memory is >> never onlined. The "pfn_to_nid(pfn) != nid" can easily succeed on the >> garbage memmap, resulting in no memory being detected as belonging to >> the node. Also, resize_pgdat_range() is called when onlining memory, not >> when adding it. :/ Oh this is so broken :) >> >> The right fix is probably to walk over all memory blocks that could >> exist and test if they belong to the nid (if offline, check the >> block->nid, if online check all pageblocks). A fix we can then move in >> front of this patch. >> >> Will look into this this week. > > And this series shows almost no sign of having been reviewed. I'll hold > it over for 5.6. > Hi Andrew, any chance we can get the (now at least reviewed - thx Oscar) fix in patch #5 into 5.5? (I want to do the final stable backports for the uninitialized memmap stuff) -- Thanks, David / dhildenb
Re: [PATCH v11 01/25] mm/gup: factor out duplicate code from four routines
On Mon, Dec 16, 2019 at 02:25:13PM -0800, John Hubbard wrote: > +static void put_compound_head(struct page *page, int refs) > +{ > + /* Do a get_page() first, in case refs == page->_refcount */ > + get_page(page); > + page_ref_sub(page, refs); > + put_page(page); > +} It's not terribly efficient. Maybe something like: VM_BUG_ON_PAGE(page_ref_count(page) < ref, page); if (refs > 2) page_ref_sub(page, refs - 1); put_page(page); ? -- Kirill A. Shutemov
Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages
On Mon, Dec 16, 2019 at 02:25:16PM -0800, John Hubbard wrote: > An upcoming patch changes and complicates the refcounting and > especially the "put page" aspects of it. In order to keep > everything clean, refactor the devmap page release routines: > > * Rename put_devmap_managed_page() to page_is_devmap_managed(), > and limit the functionality to "read only": return a bool, > with no side effects. > > * Add a new routine, put_devmap_managed_page(), to handle checking > what kind of page it is, and what kind of refcount handling it > requires. > > * Rename __put_devmap_managed_page() to free_devmap_managed_page(), > and limit the functionality to unconditionally freeing a devmap > page. What the reason to separate put_devmap_managed_page() from free_devmap_managed_page() if free_devmap_managed_page() has exacly one caller? Is it preparation for the next patches? > This is originally based on a separate patch by Ira Weiny, which > applied to an early version of the put_user_page() experiments. > Since then, Jérôme Glisse suggested the refactoring described above. 
> > Cc: Christoph Hellwig > Suggested-by: Jérôme Glisse > Reviewed-by: Dan Williams > Reviewed-by: Jan Kara > Signed-off-by: Ira Weiny > Signed-off-by: John Hubbard > --- > include/linux/mm.h | 17 + > mm/memremap.c | 16 ++-- > mm/swap.c | 24 > 3 files changed, 39 insertions(+), 18 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index c97ea3b694e6..77a4df06c8a7 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -952,9 +952,10 @@ static inline bool is_zone_device_page(const struct page > *page) > #endif > > #ifdef CONFIG_DEV_PAGEMAP_OPS > -void __put_devmap_managed_page(struct page *page); > +void free_devmap_managed_page(struct page *page); > DECLARE_STATIC_KEY_FALSE(devmap_managed_key); > -static inline bool put_devmap_managed_page(struct page *page) > + > +static inline bool page_is_devmap_managed(struct page *page) > { > if (!static_branch_unlikely(&devmap_managed_key)) > return false; > @@ -963,7 +964,6 @@ static inline bool put_devmap_managed_page(struct page > *page) > switch (page->pgmap->type) { > case MEMORY_DEVICE_PRIVATE: > case MEMORY_DEVICE_FS_DAX: > - __put_devmap_managed_page(page); > return true; > default: > break; > @@ -971,7 +971,14 @@ static inline bool put_devmap_managed_page(struct page > *page) > return false; > } > > +bool put_devmap_managed_page(struct page *page); > + > #else /* CONFIG_DEV_PAGEMAP_OPS */ > +static inline bool page_is_devmap_managed(struct page *page) > +{ > + return false; > +} > + > static inline bool put_devmap_managed_page(struct page *page) > { > return false; > @@ -1028,8 +1035,10 @@ static inline void put_page(struct page *page) >* need to inform the device driver through callback. See >* include/linux/memremap.h and HMM for details. >*/ > - if (put_devmap_managed_page(page)) > + if (page_is_devmap_managed(page)) { > + put_devmap_managed_page(page); put_devmap_managed_page() has yet another page_is_devmap_managed() check inside. It looks strange. 
> return; > + } > > if (put_page_testzero(page)) > __put_page(page); > diff --git a/mm/memremap.c b/mm/memremap.c > index e899fa876a62..2ba773859031 100644 > --- a/mm/memremap.c > +++ b/mm/memremap.c > @@ -411,20 +411,8 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, > EXPORT_SYMBOL_GPL(get_dev_pagemap); > > #ifdef CONFIG_DEV_PAGEMAP_OPS > -void __put_devmap_managed_page(struct page *page) > +void free_devmap_managed_page(struct page *page) > { > - int count = page_ref_dec_return(page); > - > - /* still busy */ > - if (count > 1) > - return; > - > - /* only triggered by the dev_pagemap shutdown path */ > - if (count == 0) { > - __put_page(page); > - return; > - } > - > /* notify page idle for dax */ > if (!is_device_private_page(page)) { > wake_up_var(>_refcount); > @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page) > page->mapping = NULL; > page->pgmap->ops->page_free(page); > } > -EXPORT_SYMBOL(__put_devmap_managed_page); > +EXPORT_SYMBOL(free_devmap_managed_page); > #endif /* CONFIG_DEV_PAGEMAP_OPS */ > diff --git a/mm/swap.c b/mm/swap.c > index 5341ae93861f..49f7c2eea0ba 100644 > --- a/mm/swap.c > +++ b/mm/swap.c > @@ -1102,3 +1102,27 @@ void __init swap_setup(void) >* _really_ don't want to cluster much more >*/ > } > + > +#ifdef CONFIG_DEV_PAGEMAP_OPS > +bool put_devmap_managed_page(struct page *page) > +{ > + bool is_devmap = page_is_devmap_managed(page); > + > + if (is_devmap) { Reversing the condition would save you an indentation level. > + int count =
Re: [PATCH 18/18] powerpc/fault: Use analyse_instr() to check for store with updates to sp
Jordan Niethe writes: > A user-mode access to an address a long way below the stack pointer is > only valid if the instruction is one that would update the stack pointer > to the address accessed. This is checked by directly looking at the > instructions op-code. As a result is does not take into account prefixed > instructions. Instead of looking at the instruction our self, use > analyse_instr() determine if this a store instruction that will update > the stack pointer. > > Something to note is that there currently are not any store with update > prefixed instructions. Actually there is no plan for prefixed > update-form loads and stores. So this patch is probably not needed but > it might be preferable to use analyse_instr() rather than open coding > the test anyway. Yes please. I was looking through this code recently and was horrified. This improves things a lot and I think is justification enough as-is. Regards, Daniel > > Signed-off-by: Jordan Niethe > --- > arch/powerpc/mm/fault.c | 39 +++ > 1 file changed, 11 insertions(+), 28 deletions(-) > > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c > index b5047f9b5dec..cb78b3ca1800 100644 > --- a/arch/powerpc/mm/fault.c > +++ b/arch/powerpc/mm/fault.c > @@ -41,37 +41,17 @@ > #include > #include > #include > +#include > > /* > * Check whether the instruction inst is a store using > * an update addressing form which will update r1. 
> */ > -static bool store_updates_sp(unsigned int inst) > +static bool store_updates_sp(struct instruction_op *op) > { > - /* check for 1 in the rA field */ > - if (((inst >> 16) & 0x1f) != 1) > - return false; > - /* check major opcode */ > - switch (inst >> 26) { > - case OP_STWU: > - case OP_STBU: > - case OP_STHU: > - case OP_STFSU: > - case OP_STFDU: > - return true; > - case OP_STD:/* std or stdu */ > - return (inst & 3) == 1; > - case OP_31: > - /* check minor opcode */ > - switch ((inst >> 1) & 0x3ff) { > - case OP_31_XOP_STDUX: > - case OP_31_XOP_STWUX: > - case OP_31_XOP_STBUX: > - case OP_31_XOP_STHUX: > - case OP_31_XOP_STFSUX: > - case OP_31_XOP_STFDUX: > + if (GETTYPE(op->type) == STORE) { > + if ((op->type & UPDATE) && (op->update_reg == 1)) > return true; > - } > } > return false; > } > @@ -278,14 +258,17 @@ static bool bad_stack_expansion(struct pt_regs *regs, > unsigned long address, > > if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && > access_ok(nip, sizeof(*nip))) { > - unsigned int inst; > + unsigned int inst, sufx; > + struct instruction_op op; > int res; > > pagefault_disable(); > - res = __get_user_inatomic(inst, nip); > + res = __get_user_instr_inatomic(inst, sufx, nip); > pagefault_enable(); > - if (!res) > - return !store_updates_sp(inst); > + if (!res) { > + analyse_instr(&op, uregs, inst, sufx); > + return !store_updates_sp(&op); > + } > *must_retry = true; > } > return true; > -- > 2.20.1
Re: [PATCH 08/18] powerpc sstep: Add support for prefixed VSX load/stores
Jordan Niethe writes: > This adds emulation support for the following prefixed VSX load/stores: > * Prefixed Load VSX Scalar Doubleword (plxsd) > * Prefixed Load VSX Scalar Single-Precision (plxssp) > * Prefixed Load VSX Vector [0|1] (plxv, plxv0, plxv1) > * Prefixed Store VSX Scalar Doubleword (pstxsd) > * Prefixed Store VSX Scalar Single-Precision (pstxssp) > * Prefixed Store VSX Vector [0|1] (pstxv, pstxv0, pstxv1) > > Signed-off-by: Jordan Niethe Take this with a grain of salt, but I would prbably squish the 3 load/store patches into one. Part of my hesitation is that I think you also need some sstep tests for these new instructions - if they massively bloat the patches I might keep them as separate patches. I'd also like to see a test for your next patch. Regards, Daniel > --- > arch/powerpc/lib/sstep.c | 42 > 1 file changed, 42 insertions(+) > > diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c > index 9113b9a21ae9..9ae8d177b67f 100644 > --- a/arch/powerpc/lib/sstep.c > +++ b/arch/powerpc/lib/sstep.c > @@ -2713,6 +2713,48 @@ int analyse_instr(struct instruction_op *op, const > struct pt_regs *regs, > case 41:/* plwa */ > op->type = MKOP(LOAD, PREFIXED | SIGNEXT, 4); > break; > + case 42:/* plxsd */ > + op->reg = rd + 32; > + op->type = MKOP(LOAD_VSX, PREFIXED, 8); > + op->element_size = 8; > + op->vsx_flags = VSX_CHECK_VEC; > + break; > + case 43:/* plxssp */ > + op->reg = rd + 32; > + op->type = MKOP(LOAD_VSX, PREFIXED, 4); > + op->element_size = 8; > + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; > + break; > + case 46:/* pstxsd */ > + op->reg = rd + 32; > + op->type = MKOP(STORE_VSX, PREFIXED, 8); > + op->element_size = 8; > + op->vsx_flags = VSX_CHECK_VEC; > + break; > + case 47:/* pstxssp */ > + op->reg = rd + 32; > + op->type = MKOP(STORE_VSX, PREFIXED, 4); > + op->element_size = 8; > + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC; > + break; > + case 51:/* plxv1 */ > + op->reg += 32; > + > + /* fallthru */ > + case 50:/* plxv0 */ > + 
op->type = MKOP(LOAD_VSX, PREFIXED, 16); > + op->element_size = 16; > + op->vsx_flags = VSX_CHECK_VEC; > + break; > + case 55:/* pstxv1 */ > + op->reg = rd + 32; > + > + /* fallthru */ > + case 54:/* pstxv0 */ > + op->type = MKOP(STORE_VSX, PREFIXED, 16); > + op->element_size = 16; > + op->vsx_flags = VSX_CHECK_VEC; > + break; > case 56:/* plq */ > op->type = MKOP(LOAD, PREFIXED, 16); > break; > -- > 2.20.1
Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush
On Thu, Dec 19, 2019 at 12:13:48AM +1100, Michael Ellerman wrote: > >> I'm a little confused though; if nohash is a software TLB fill, why do > >> you need a TLBI for tables? > >> > > > > nohash (AKA book3e) has different mmu modes. I don't follow all the > > details w.r.t book3e. Paul or Michael might be able to explain the need > > for table flush with book3e. > > Some of the Book3E CPUs have a partial hardware table walker. The IBM one (A2) > did, before we ripped that support out. And the Freescale (NXP) e6500 > does, see eg: > > 28efc35fe68d ("powerpc/e6500: TLB miss handler with hardware tablewalk > support") > > They only support walking one level IIRC, ie. you can create a TLB entry > that points to a PTE page, and the hardware will dereference that to get > a PTE and load that into the TLB. Shiny!, all the embedded goodness. Thanks for the info.
[PATCH v15 14/23] selftests/vm/pkeys: Fix assertion in test_pkey_alloc_exhaust()
From: Ram Pai Some pkeys which are valid on the hardware are reserved and not available for application use. These keys cannot be allocated. test_pkey_alloc_exhaust() tries to account for these and has an assertion which validates if all available pkeys have been exhaustively allocated. However, the expression that is currently used is only valid for x86. On powerpc, a pkey is additionally reserved as compared to x86. Hence, the assertion is made to use an arch-specific helper to get the correct count of reserved pkeys. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 1920bca84def..8d90cfe2c9bd 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1152,6 +1152,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) dprintf3("%s()::%d\n", __func__, __LINE__); /* +* On x86: * There are 16 pkeys supported in hardware. Three are * allocated by the time we get here: * 1. The default key (0) @@ -1159,8 +1160,16 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) * 3. One allocated by the test code and passed in via * 'pkey' to this function. * Ensure that we can allocate at least another 13 (16-3). +* +* On powerpc: +* There are either 5 or 32 pkeys supported in hardware +* depending on the page size (4K or 64K). Four are +* allocated by the time we get here. This includes +* pkey-0, pkey-1, exec-only pkey and the one allocated +* by the test code. +* Ensure that we can allocate the remaining. */ - pkey_assert(i >= NR_PKEYS-3); + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); -- 2.17.1
[PATCH v15 18/23] selftests/vm/pkeys: Detect write violation on a mapped access-denied-key page
From: Ram Pai Detect write-violation on a page to which access-disabled key is associated much after the page is mapped. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Acked-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 13 + 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index ff207b765afd..176625ded549 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1026,6 +1026,18 @@ void test_write_of_access_disabled_region(int *ptr, u16 pkey) *ptr = __LINE__; expected_pkey_fault(pkey); } + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; @@ -1422,6 +1434,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_write_of_write_disabled_region, test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, test_kernel_write_of_access_disabled_region, test_kernel_write_of_write_disabled_region, test_kernel_gup_of_access_disabled_region, -- 2.17.1
[PATCH v15 20/23] selftests/vm/pkeys: Test correct behaviour of pkey-0
From: Ram Pai Ensure that pkey-0 is allocated on start and that it can be attached dynamically in various modes, without failures. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 53 1 file changed, 53 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 5faf52ba7b8f..af577151d378 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -963,6 +963,58 @@ __attribute__((noinline)) int read_ptr(int *ptr) return *ptr; } +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* +* This is a bit of a hack. But mprotect() requires +* huge-page-aligned sizes when operating on hugetlbfs. +* So, make sure that we use something that's a multiple +* of a huge page when we can. 
+*/ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1447,6 +1499,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, }; void run_tests_once(void) -- 2.17.1
Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush
On Wed, Dec 18, 2019 at 11:05:29AM +0530, Aneesh Kumar K.V wrote: > From: Peter Zijlstra > > Architectures for which we have hardware walkers of Linux page table should > flush TLB on mmu gather batch allocation failures and batch flush. Some > architectures like POWER supports multiple translation modes (hash and radix) nohash, hash and radix in fact :-) > and in the case of POWER only radix translation mode needs the above TLBI. > This is because for hash translation mode kernel wants to avoid this extra > flush since there are no hardware walkers of linux page table. With radix > translation, the hardware also walks linux page table and with that, kernel > needs to make sure to TLB invalidate page walk cache before page table pages > are > freed. > > More details in > commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for > RCU_TABLE_FREE") > > Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes") > Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V > --- > diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h > index b2c0be93929d..7f3a8b902325 100644 > --- a/arch/powerpc/include/asm/tlb.h > +++ b/arch/powerpc/include/asm/tlb.h > @@ -26,6 +26,17 @@ > > #define tlb_flush tlb_flush > extern void tlb_flush(struct mmu_gather *tlb); > +/* > + * book3s: > + * Hash does not use the linux page-tables, so we can avoid > + * the TLB invalidate for page-table freeing, Radix otoh does use the > + * page-tables and needs the TLBI. > + * > + * nohash: > + * We still do TLB invalidate in the __pte_free_tlb routine before we > + * add the page table pages to mmu gather table batch. I'm a little confused though; if nohash is a software TLB fill, why do you need a TLBI for tables? > + */ > +#define tlb_needs_table_invalidate() radix_enabled() > > /* Get the generic bits... */ > #include
Re: [PATCH v2 3/3] asm-generic/tlb: Avoid potential double flush
On Wed, Dec 18, 2019 at 11:05:30AM +0530, Aneesh Kumar K.V wrote: > --- a/include/asm-generic/tlb.h > +++ b/include/asm-generic/tlb.h > @@ -402,7 +402,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct > vm_area_struct *vma) { } > > static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) > { > - if (!tlb->end) > + /* > + * Anything calling __tlb_adjust_range() also sets at least one of > + * these bits. > + */ > + if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || > + tlb->cleared_puds || tlb->cleared_p4ds)) > return; FWIW I looked at the GCC generated assembly output for this (x86_64) and it did a single load and mask as expected.
[PATCH v15 12/23] selftests/vm/pkeys: Introduce generic pkey abstractions
From: Ram Pai This introduces some generic abstractions and provides the corresponding architecture-specfic implementations for these abstractions. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-helpers.h| 12 tools/testing/selftests/vm/pkey-x86.h| 15 +++ tools/testing/selftests/vm/protection_keys.c | 8 ++-- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index bd90a49a3229..c929cc09b3cc 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -74,6 +74,9 @@ extern void abort_hooks(void); } \ } while (0) +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); + #if defined(__i386__) || defined(__x86_64__) /* arch */ #include "pkey-x86.h" #else /* arch */ @@ -173,4 +176,13 @@ static inline void __pkey_write_allow(int pkey, int do_allow_write) #define __stringify_1(x...) #x #define __stringify(x...) 
__stringify_1(x) +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return >si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + #endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index 4937f48f77cc..bb2d4dfcacd5 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -42,6 +42,7 @@ #endif #define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ #define PKEY_BITS_PER_PKEY 2 #define HPAGE_SIZE (1UL<<21) #define PAGE_SIZE 4096 @@ -160,4 +161,18 @@ int pkey_reg_xstate_offset(void) return xstate_offset; } +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + #endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 2327cfa0b836..dbc2918bec13 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1306,9 +1306,7 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pkey_fault(pkey); + expect_fault_on_read_execonly_key(p1, pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1335,9 +1333,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(); do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) 
contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pkey_fault(UNKNOWN_PKEY); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. Should clear the -- 2.17.1
[PATCH v15 19/23] selftests/vm/pkeys: Introduce a sub-page allocator
From: Ram Pai This introduces a new allocator that allocates 4K hardware pages to back 64K linux pages. This allocator is available only on powerpc. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-helpers.h| 6 + tools/testing/selftests/vm/pkey-powerpc.h| 25 tools/testing/selftests/vm/pkey-x86.h| 5 tools/testing/selftests/vm/protection_keys.c | 1 + 4 files changed, 37 insertions(+) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 43299435f24c..6a03faf1ef44 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -28,6 +28,9 @@ extern int dprint_in_signal; extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; +extern int test_nr; +extern int iteration_nr; + #ifdef __GNUC__ __attribute__((format(printf, 1, 2))) #endif @@ -78,6 +81,9 @@ __attribute__((noinline)) int read_ptr(int *ptr); void expected_pkey_fault(int pkey); int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); #if defined(__i386__) || defined(__x86_64__) /* arch */ #include "pkey-x86.h" diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index a43d7be85a27..f7a20bd07870 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -91,4 +91,29 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey) /* 8-bytes of instruction * 16384bytes = 1 page */ #define __page_o_noops() asm(".rept 16384 ; nop; .endr") +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + 
pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + #endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index e5fdee39a7d8..87b85f60b43f 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -175,4 +175,9 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey) expected_pkey_fault(pkey); } +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + #endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 176625ded549..5faf52ba7b8f 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -844,6 +844,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb /* can not do direct with the pkey_mprotect() API: -- 2.17.1
[PATCH v15 09/23] selftests/vm/pkeys: Fix assertion in pkey_disable_set/clear()
From: Ram Pai In some cases, a pkey's bits need not necessarily change in a way that the value of the pkey register increases when performing a pkey_disable_set() or decreases when performing a pkey_disable_clear(). For example, on powerpc, if a pkey's current state is PKEY_DISABLE_ACCESS and we perform a pkey_write_disable() on it, the bits still remain the same. We will observe something similar when the pkey's current state is 0 and a pkey_access_enable() is performed on it. Either case would cause some assertions to fail. This fixes the problem. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 9a6c95b220cc..fbee0b061851 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -399,7 +399,7 @@ void pkey_disable_set(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n", __func__, pkey, read_pkey_reg()); if (flags) - pkey_assert(read_pkey_reg() > orig_pkey_reg); + pkey_assert(read_pkey_reg() >= orig_pkey_reg); dprintf1("END<---%s(%d, 0x%x)\n", __func__, pkey, flags); } @@ -430,7 +430,7 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n", __func__, pkey, read_pkey_reg()); if (flags) - assert(read_pkey_reg() < orig_pkey_reg); + assert(read_pkey_reg() <= orig_pkey_reg); } void pkey_write_allow(int pkey) -- 2.17.1
[PATCH v15 10/23] selftests/vm/pkeys: Fix alloc_random_pkey() to make it really random
From: Ram Pai alloc_random_pkey() was allocating the same pkey every time. Not all pkeys were getting tested. This fixes it. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Acked-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index fbee0b061851..d3c13283bbd0 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -24,6 +24,7 @@ #define _GNU_SOURCE #include #include +#include #include #include #include @@ -545,10 +546,10 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; for (i = 0; i < max_nr_pkey_allocs; i++) { int new_pkey = alloc_pkey(); if (new_pkey < 0) -- 2.17.1
[PATCH v15 22/23] selftests/vm/pkeys: Override access right definitions on powerpc
From: Ram Pai Some platforms hardcode the x86 values for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE such as those in: /usr/include/bits/mman-shared.h. This overrides the definitions with correct values for powerpc. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-powerpc.h | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index a70577045074..3cd8e03fd640 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -16,11 +16,13 @@ #define fpregs fp_regs #define si_pkey_offset 0x20 -#ifndef PKEY_DISABLE_ACCESS +#ifdef PKEY_DISABLE_ACCESS +#undef PKEY_DISABLE_ACCESS # define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ #endif -#ifndef PKEY_DISABLE_WRITE +#ifdef PKEY_DISABLE_WRITE +#undef PKEY_DISABLE_WRITE # define PKEY_DISABLE_WRITE0x2 #endif -- 2.17.1
[PATCH v15 02/23] selftests/vm/pkeys: Rename all references to pkru to a generic name
From: Ram Pai This renames pkru references to "pkey_reg" or "pkey" based on the usage. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Reviewed-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-helpers.h| 85 +++ tools/testing/selftests/vm/protection_keys.c | 240 ++- 2 files changed, 170 insertions(+), 155 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 254e5436bdd9..d5779be4793f 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -14,7 +14,7 @@ #include #define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 +#define PKEY_BITS_PER_PKEY 2 #ifndef DEBUG_LEVEL #define DEBUG_LEVEL 0 @@ -53,85 +53,88 @@ static inline void sigsafe_printf(const char *format, ...) #define dprintf3(args...) dprintf_level(3, args) #define dprintf4(args...) dprintf_level(4, args) -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) +extern unsigned int shadow_pkey_reg; +static inline unsigned int __read_pkey_reg(void) { unsigned int eax, edx; unsigned int ecx = 0; - unsigned int pkru; + unsigned int pkey_reg; asm volatile(".byte 0x0f,0x01,0xee\n\t" : "=a" (eax), "=d" (edx) : "c" (ecx)); - pkru = eax; - return pkru; + pkey_reg = eax; + return pkey_reg; } -static inline unsigned int _rdpkru(int line) +static inline unsigned int _read_pkey_reg(int line) { - unsigned int pkru = __rdpkru(); + unsigned int pkey_reg = __read_pkey_reg(); - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); + dprintf4("read_pkey_reg(line=%d) pkey_reg: %x shadow: %x\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); - return pkru; + return pkey_reg; } -#define rdpkru() _rdpkru(__LINE__) +#define read_pkey_reg() _read_pkey_reg(__LINE__) -static inline void __wrpkru(unsigned int pkru) +static inline void 
__write_pkey_reg(unsigned int pkey_reg) { - unsigned int eax = pkru; + unsigned int eax = pkey_reg; unsigned int ecx = 0; unsigned int edx = 0; - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + dprintf4("%s() changing %08x to %08x\n", __func__, + __read_pkey_reg(), pkey_reg); asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); + assert(pkey_reg == __read_pkey_reg()); } -static inline void wrpkru(unsigned int pkru) +static inline void write_pkey_reg(unsigned int pkey_reg) { - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); + dprintf4("%s() changing %08x to %08x\n", __func__, + __read_pkey_reg(), pkey_reg); /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%08x) pkey_reg: %08x\n", __func__, + pkey_reg, __read_pkey_reg()); } /* * These are technically racy. since something could - * change PKRU between the read and the write. + * change PKEY register between the read and the write. 
*/ static inline void __pkey_access_allow(int pkey, int do_allow) { - unsigned int pkru = rdpkru(); + unsigned int pkey_reg = read_pkey_reg(); int bit = pkey * 2; if (do_allow) - pkru &= (1<>>>===SIGSEGV\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; @@ -289,19 +289,19 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) */ fpregs += 0x70; #endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)([pkru_offset]); + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)([pkey_reg_offset]); dprintf1("siginfo: %p\n", si); dprintf1(" fpregs: %p\n", fpregs); /* -* If we got a PKRU fault, we *HAVE* to have at least one bit set in +* If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. */ - dprintf1("pkru_xstate_offset: %d\n",
[PATCH v15 16/23] selftests/vm/pkeys: Associate key on a mapped page and detect access violation
From: Ram Pai Detect access-violation on a page to which access-disabled key is associated much after the page is mapped. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Acked-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 7bceb98662c1..f63b9f55b3a7 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -983,6 +983,24 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) dprintf1("*ptr: %d\n", ptr_contents); expected_pkey_fault(pkey); } + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); @@ -1389,6 +1407,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, test_write_of_access_disabled_region, test_kernel_write_of_access_disabled_region, -- 2.17.1
[PATCH v15 23/23] selftests: vm: pkeys: Use the correct page size on powerpc
Both 4K and 64K pages are supported on powerpc. Parts of the selftest code perform alignment computations based on the PAGE_SIZE macro which is currently hardcoded to 64K for powerpc. This causes some test failures on kernels configured with 4K page size. This problem is solved by determining the correct page size during the build process rather than hardcoding it in the header file. cc: Dave Hansen cc: Florian Weimer cc: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/Makefile | 4 tools/testing/selftests/vm/pkey-powerpc.h | 1 - 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 4e9c741be6af..ada3a67eaac6 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -4,6 +4,10 @@ uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/') CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) +ifneq (,$(filter $(ARCH), ppc64 ppc64le)) +protection_keys: EXTRA_CFLAGS += -DPAGE_SIZE=$(shell getconf PAGESIZE) +endif + LDLIBS = -lrt TEST_GEN_FILES = compaction_test TEST_GEN_FILES += gup_benchmark diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index 3cd8e03fd640..07fa9f529014 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -36,7 +36,6 @@ pkey-31 and exec-only key */ #define PKEY_BITS_PER_PKEY 2 #define HPAGE_SIZE (1UL << 24) -#define PAGE_SIZE (1UL << 16) #define pkey_reg_t u64 #define PKEY_REG_FMT "%016lx" -- 2.17.1
[PATCH v15 00/24] selftests, powerpc, x86: Memory Protection Keys
Memory protection keys enables an application to protect its address space from inadvertent access by its own code. This feature is now enabled on powerpc and has been available since 4.16-rc1. The patches move the selftests to arch neutral directory and enhance their test coverage. Testing --- Verified for correctness on powerpc. Need help with x86 testing as I do not have access to a Skylake server. Client platforms like Coffee Lake do not have the required feature bits set in CPUID. Changelog - Link to previous version (v14): https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=55981=* v15: (1) Rebased on top of latest master. (2) Addressed review comments from Dave Hansen. (3) Moved code for getting or setting pkey bits to new helpers. These changes replace patch 7 of v14. (4) Added a fix which ensures that the correct count of reserved keys is used across different platforms. (5) Added a fix which ensures that the correct page size is used as powerpc supports both 4K and 64K pages. v14: (1) Incorporated another round of comments from Dave Hansen. v13: (1) Incorporated comments for Dave Hansen. (2) Added one more test for correct pkey-0 behavior. v12: (1) Fixed the offset of pkey field in the siginfo structure for x86_64 and powerpc. And tries to use the actual field if the headers have it defined. v11: (1) Fixed a deadlock in the ptrace testcase. v10 and prior: (1) Moved the testcase to arch neutral directory. (2) Split the changes into incremental patches. Desnes A. 
Nunes do Rosario (1): selftests/vm/pkeys: Fix number of reserved powerpc pkeys Ram Pai (17): selftests/x86/pkeys: Move selftests to arch-neutral directory selftests/vm/pkeys: Rename all references to pkru to a generic name selftests/vm/pkeys: Move generic definitions to header file selftests/vm/pkeys: Typecast the pkey register selftests/vm/pkeys: Fix pkey_disable_clear() selftests/vm/pkeys: Fix assertion in pkey_disable_set/clear() selftests/vm/pkeys: Fix alloc_random_pkey() to make it really random selftests/vm/pkeys: Introduce generic pkey abstractions selftests/vm/pkeys: Introduce powerpc support selftests/vm/pkeys: Fix assertion in test_pkey_alloc_exhaust() selftests/vm/pkeys: Improve checks to determine pkey support selftests/vm/pkeys: Associate key on a mapped page and detect access violation selftests/vm/pkeys: Associate key on a mapped page and detect write violation selftests/vm/pkeys: Detect write violation on a mapped access-denied-key page selftests/vm/pkeys: Introduce a sub-page allocator selftests/vm/pkeys: Test correct behaviour of pkey-0 selftests/vm/pkeys: Override access right definitions on powerpc Sandipan Das (3): selftests: vm: pkeys: Add helpers for pkey bits selftests: vm: pkeys: Use the correct huge page size selftests: vm: pkeys: Use the correct page size on powerpc Thiago Jung Bauermann (2): selftests/vm/pkeys: Move some definitions to arch-specific header selftests/vm/pkeys: Make gcc check arguments of sigsafe_printf() tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 5 + tools/testing/selftests/vm/pkey-helpers.h | 226 ++ tools/testing/selftests/vm/pkey-powerpc.h | 138 tools/testing/selftests/vm/pkey-x86.h | 183 + .../selftests/{x86 => vm}/protection_keys.c | 688 ++ tools/testing/selftests/x86/.gitignore| 1 - tools/testing/selftests/x86/pkey-helpers.h| 219 -- 8 files changed, 931 insertions(+), 530 deletions(-) create mode 100644 tools/testing/selftests/vm/pkey-helpers.h create mode 100644 
tools/testing/selftests/vm/pkey-powerpc.h create mode 100644 tools/testing/selftests/vm/pkey-x86.h rename tools/testing/selftests/{x86 => vm}/protection_keys.c (74%) delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h -- 2.17.1
Re: [PATCH 05/18] powerpc sstep: Prepare to support prefixed instructions
Jordan Niethe writes: > Currently all instructions are a single word long. A future ISA version > will include prefixed instructions which have a double word length. The > functions used for analysing and emulating instructions need to be > modified so that they can handle these new instruction types. > > A prefixed instruction is a word prefix followed by a word suffix. All > prefixes uniquely have the primary op-code 1. Suffixes may be valid word > instructions or instructions that only exist as suffixes. > > In handling prefixed instructions it will be convenient to treat the > suffix and prefix as separate words. To facilitate this modify > analyse_instr() and emulate_step() to take a suffix as a > parameter. For word instructions it does not matter what is passed in > here - it will be ignored. > > We also define a new flag, PREFIXED, to be used in instruction_op:type. > This flag will indicate when emulating an analysed instruction if the > NIP should be advanced by word length or double word length. > > The callers of analyse_instr() and emulate_step() will need their own > changes to be able to support prefixed instructions. For now modify them > to pass in 0 as a suffix. > > Note that at this point no prefixed instructions are emulated or > analysed - this is just making it possible to do so. 
> > Signed-off-by: Jordan Niethe > --- > arch/powerpc/include/asm/ppc-opcode.h | 3 +++ > arch/powerpc/include/asm/sstep.h | 8 +-- > arch/powerpc/include/asm/uaccess.h| 30 +++ > arch/powerpc/kernel/align.c | 2 +- > arch/powerpc/kernel/hw_breakpoint.c | 4 ++-- > arch/powerpc/kernel/kprobes.c | 2 +- > arch/powerpc/kernel/mce_power.c | 2 +- > arch/powerpc/kernel/optprobes.c | 2 +- > arch/powerpc/kernel/uprobes.c | 2 +- > arch/powerpc/kvm/emulate_loadstore.c | 2 +- > arch/powerpc/lib/sstep.c | 12 ++- > arch/powerpc/lib/test_emulate_step.c | 30 +-- > arch/powerpc/xmon/xmon.c | 4 ++-- > 13 files changed, 71 insertions(+), 32 deletions(-) > > diff --git a/arch/powerpc/include/asm/ppc-opcode.h > b/arch/powerpc/include/asm/ppc-opcode.h > index c1df75edde44..a1dfa4bdd22f 100644 > --- a/arch/powerpc/include/asm/ppc-opcode.h > +++ b/arch/powerpc/include/asm/ppc-opcode.h > @@ -377,6 +377,9 @@ > #define PPC_INST_VCMPEQUD0x10c7 > #define PPC_INST_VCMPEQUB0x1006 > > +/* macro to check if a word is a prefix */ > +#define IS_PREFIX(x) (((x) >> 26) == 1) > + > /* macros to insert fields into opcodes */ > #define ___PPC_RA(a) (((a) & 0x1f) << 16) > #define ___PPC_RB(b) (((b) & 0x1f) << 11) > diff --git a/arch/powerpc/include/asm/sstep.h > b/arch/powerpc/include/asm/sstep.h > index 769f055509c9..6d4cb602e231 100644 > --- a/arch/powerpc/include/asm/sstep.h > +++ b/arch/powerpc/include/asm/sstep.h > @@ -89,6 +89,9 @@ enum instruction_type { > #define VSX_LDLEFT 4 /* load VSX register from left */ > #define VSX_CHECK_VEC8 /* check MSR_VEC not MSR_VSX for reg >= > 32 */ > > +/* Prefixed flag, ORed in with type */ > +#define PREFIXED 0x800 > + > /* Size field in type word */ > #define SIZE(n) ((n) << 12) > #define GETSIZE(w) ((w) >> 12) > @@ -132,7 +135,7 @@ union vsx_reg { > * otherwise. 
> */ > extern int analyse_instr(struct instruction_op *op, const struct pt_regs > *regs, > - unsigned int instr); > + unsigned int instr, unsigned int sufx); > > /* > * Emulate an instruction that can be executed just by updating > @@ -149,7 +152,8 @@ void emulate_update_regs(struct pt_regs *reg, struct > instruction_op *op); > * 0 if it could not be emulated, or -1 for an instruction that > * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.). > */ > -extern int emulate_step(struct pt_regs *regs, unsigned int instr); > +extern int emulate_step(struct pt_regs *regs, unsigned int instr, > + unsigned int sufx); > > /* > * Emulate a load or store instruction by reading/writing the > diff --git a/arch/powerpc/include/asm/uaccess.h > b/arch/powerpc/include/asm/uaccess.h > index 15002b51ff18..bc585399e0c7 100644 > --- a/arch/powerpc/include/asm/uaccess.h > +++ b/arch/powerpc/include/asm/uaccess.h > @@ -423,4 +423,34 @@ extern long __copy_from_user_flushcache(void *dst, const > void __user *src, > extern void memcpy_page_flushcache(char *to, struct page *page, size_t > offset, > size_t len); > > +/* > + * When reading an instruction iff it is a prefix, the suffix needs to be > also > + * loaded. > + */ > +#define __get_user_instr(x, y, ptr) \ > +({ \ > + long __gui_ret = 0; \ > + y = 0; \ > + __gui_ret = __get_user(x, ptr);
[PATCH v4 0/7] Introduce CAP_SYS_PERFMON to secure system performance monitoring and observability
Currently access to perf_events, i915_perf and other performance monitoring and observability subsystems of the kernel is open for a privileged process [1] with CAP_SYS_ADMIN capability enabled in the process effective set [2]. This patch set introduces CAP_SYS_PERFMON capability devoted to secure system performance monitoring and observability operations so that CAP_SYS_PERFMON would assist CAP_SYS_ADMIN capability in its governing role for perf_events, i915_perf and other performance monitoring and observability subsystems of the kernel. CAP_SYS_PERFMON intends to meet the demand to secure system performance monitoring and observability operations in security sensitive, restricted, production environments (e.g. HPC clusters, cloud and virtual compute environments) where root or CAP_SYS_ADMIN credentials are not available to mass users of a system because of security considerations. CAP_SYS_PERFMON intends to harden system security and integrity during system performance monitoring and observability operations by decreasing attack surface that is available to CAP_SYS_ADMIN privileged processes [2]. CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related to system performance monitoring and observability operations and balance amount of CAP_SYS_ADMIN credentials following the recommendations in the capabilities man page [2] for CAP_SYS_ADMIN: "Note: this capability is overloaded; see Notes to kernel developers, below." For backward compatibility reasons access to system performance monitoring and observability subsystems of the kernel remains open for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN capability usage for secure system performance monitoring and observability operations is discouraged with respect to the introduced CAP_SYS_PERFMON capability. 
The patch set is for tip perf/core repository: git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip perf/core sha1: ceb9e77324fa661b1001a0ae66f061b5fcb4e4e6 --- Changes in v4: - converted perfmon_capable() into an inline function - made perf_events kprobes, uprobes, hw breakpoints and namespaces data available to CAP_SYS_PERFMON privileged processes - applied perfmon_capable() to drivers/perf and drivers/oprofile - extended __cmd_ftrace() with support of CAP_SYS_PERFMON Changes in v3: - implemented perfmon_capable() macros aggregating required capabilities checks Changes in v2: - made perf_events trace points available to CAP_SYS_PERFMON privileged processes - made perf_event_paranoid_check() treat CAP_SYS_PERFMON equally to CAP_SYS_ADMIN - applied CAP_SYS_PERFMON to i915_perf, bpf_trace, powerpc and parisc system performance monitoring and observability related subsystems --- Alexey Budankov (9): capabilities: introduce CAP_SYS_PERFMON to kernel and user space perf/core: open access for CAP_SYS_PERFMON privileged process perf tool: extend Perf tool with CAP_SYS_PERFMON capability support drm/i915/perf: open access for CAP_SYS_PERFMON privileged process trace/bpf_trace: open access for CAP_SYS_PERFMON privileged process powerpc/perf: open access for CAP_SYS_PERFMON privileged process parisc/perf: open access for CAP_SYS_PERFMON privileged process drivers/perf: open access for CAP_SYS_PERFMON privileged process drivers/oprofile: open access for CAP_SYS_PERFMON privileged process arch/parisc/kernel/perf.c | 2 +- arch/powerpc/perf/imc-pmu.c | 4 ++-- drivers/gpu/drm/i915/i915_perf.c| 13 ++--- drivers/oprofile/event_buffer.c | 2 +- drivers/perf/arm_spe_pmu.c | 4 ++-- include/linux/capability.h | 4 include/linux/perf_event.h | 6 +++--- include/uapi/linux/capability.h | 8 +++- kernel/events/core.c| 6 +++--- kernel/trace/bpf_trace.c| 2 +- security/selinux/include/classmap.h | 4 ++-- tools/perf/builtin-ftrace.c | 5 +++-- tools/perf/design.txt | 3 ++- tools/perf/util/cap.h 
| 4 tools/perf/util/evsel.c | 10 +- tools/perf/util/util.c | 1 + 16 files changed, 47 insertions(+), 31 deletions(-) --- Testing and validation (Intel Skylake, 8 cores, Fedora 29, 5.4.0-rc8+, x86_64): libcap library [3], [4] and Perf tool can be used to apply CAP_SYS_PERFMON capability for secure system performance monitoring and observability beyond the scope permitted by the system wide perf_event_paranoid kernel setting [5] and below are the steps for evaluation: - patch, build and boot the kernel - patch, build Perf tool e.g. to /home/user/perf ... # git clone git://git.kernel.org/pub/scm/libs/libcap/libcap.git libcap # pushd libcap # patch libcap/include/uapi/linux/capabilities.h with [PATCH 1] # make # pushd progs # ./setcap "cap_sys_perfmon,cap_sys_ptrace,cap_syslog=ep" /home/user/perf # ./setcap -v "cap_sys_perfmon,cap_sys_ptrace,cap_syslog=ep" /home/user/perf /home/user/perf: OK # ./getcap
[PATCH v15 11/23] selftests: vm: pkeys: Use the correct huge page size
The huge page size can vary across architectures. This will ensure that the correct huge page size is used when accessing the hugetlb controls under sysfs. Instead of using a hardcoded page size (i.e. 2MB), this now uses the HPAGE_SIZE macro which is arch-specific. Cc: Dave Hansen Cc: Florian Weimer Cc: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 23 ++-- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index d3c13283bbd0..2327cfa0b836 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -738,12 +738,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) } int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 void setup_hugetlbfs(void) { int err; int fd; - char buf[] = "123"; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; if (geteuid() != 0) { fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); @@ -754,11 +757,16 @@ void setup_hugetlbfs(void) /* * Now go make sure that we got the pages and that they -* are 2M pages. Someone might have made 1G the default. +* are PMD-level pages. Someone might have made PUD-level +* pages the default. 
*/ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } @@ -766,13 +774,14 @@ void setup_hugetlbfs(void) err = read(fd, buf, sizeof(buf)-1); close(fd); if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); return; } -- 2.17.1
[PATCH v4 1/9] capabilities: introduce CAP_SYS_PERFMON to kernel and user space
Introduce CAP_SYS_PERFMON capability devoted to secure system performance monitoring and observability operations so that CAP_SYS_PERFMON would assist CAP_SYS_ADMIN capability in its governing role for perf_events, i915_perf and other subsystems of the kernel. CAP_SYS_PERFMON intends to harden system security and integrity during system performance monitoring and observability operations by decreasing attack surface that is available to CAP_SYS_ADMIN privileged processes. CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related to system performance monitoring and observability operations and balance amount of CAP_SYS_ADMIN credentials in accordance with the recommendations provided in the man page for CAP_SYS_ADMIN [1]: "Note: this capability is overloaded; see Notes to kernel developers, below." [1] http://man7.org/linux/man-pages/man7/capabilities.7.html Signed-off-by: Alexey Budankov --- include/linux/capability.h | 4 include/uapi/linux/capability.h | 8 +++- security/selinux/include/classmap.h | 4 ++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index ecce0f43c73a..883c879baa4b 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); +static inline bool perfmon_capable(void) +{ + return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN); +} /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 240fdb9a60f6..98e03cc76c7c 100644 --- 
a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -366,8 +366,14 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +/* + * Allow system performance and observability privileged operations + * using perf_events, i915_perf and other kernel subsystems + */ + +#define CAP_SYS_PERFMON 38 -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_LAST_CAP CAP_SYS_PERFMON #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 7db24855e12d..bae602c623b0 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read" + "wake_alarm", "block_suspend", "audit_read", "sys_perfmon" -#if CAP_LAST_CAP > CAP_AUDIT_READ +#if CAP_LAST_CAP > CAP_SYS_PERFMON #error New capability defined, please update COMMON_CAP2_PERMS. #endif -- 2.20.1
[PATCH v15 01/23] selftests/x86/pkeys: Move selftests to arch-neutral directory
From: Ram Pai cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Acked-by: Ingo Molnar Acked-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/{x86 => vm}/pkey-helpers.h| 0 tools/testing/selftests/{x86 => vm}/protection_keys.c | 0 tools/testing/selftests/x86/.gitignore| 1 - 5 files changed, 2 insertions(+), 1 deletion(-) rename tools/testing/selftests/{x86 => vm}/pkey-helpers.h (100%) rename tools/testing/selftests/{x86 => vm}/protection_keys.c (100%) diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 31b3c98b6d34..c55837bf39fa 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -14,3 +14,4 @@ virtual_address_range gup_benchmark va_128TBswitch map_fixed_noreplace +protection_keys diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 7f9a8a8c31da..4e9c741be6af 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -18,6 +18,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_FILES += protection_keys ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h similarity index 100% rename from tools/testing/selftests/x86/pkey-helpers.h rename to tools/testing/selftests/vm/pkey-helpers.h diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c similarity index 100% rename from tools/testing/selftests/x86/protection_keys.c rename to tools/testing/selftests/vm/protection_keys.c diff --git a/tools/testing/selftests/x86/.gitignore 
b/tools/testing/selftests/x86/.gitignore index 7757f73ff9a3..eb30ffd83876 100644 --- a/tools/testing/selftests/x86/.gitignore +++ b/tools/testing/selftests/x86/.gitignore @@ -11,5 +11,4 @@ ldt_gdt iopl mpx-mini-test ioperm -protection_keys test_vdso -- 2.17.1
[PATCH v15 04/23] selftests/vm/pkeys: Move some definitions to arch-specific header
From: Thiago Jung Bauermann This moves definitions which have arch-specific values to the x86-specific header in preparation for multi-arch support. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Acked-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-helpers.h| 111 + tools/testing/selftests/vm/pkey-x86.h| 156 +++ tools/testing/selftests/vm/protection_keys.c | 47 -- 3 files changed, 162 insertions(+), 152 deletions(-) create mode 100644 tools/testing/selftests/vm/pkey-x86.h diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 6ad1bd54ef94..3ed2f021bf7a 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -21,9 +21,6 @@ #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) -#define NR_PKEYS 16 -#define PKEY_BITS_PER_PKEY 2 - #ifndef DEBUG_LEVEL #define DEBUG_LEVEL 0 #endif @@ -73,19 +70,13 @@ extern void abort_hooks(void); } \ } while (0) +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + extern unsigned int shadow_pkey_reg; -static inline unsigned int __read_pkey_reg(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkey_reg; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" -: "=a" (eax), "=d" (edx) -: "c" (ecx)); - pkey_reg = eax; - return pkey_reg; -} static inline unsigned int _read_pkey_reg(int line) { @@ -100,19 +91,6 @@ static inline unsigned int _read_pkey_reg(int line) #define read_pkey_reg() _read_pkey_reg(__LINE__) -static inline void __write_pkey_reg(unsigned int pkey_reg) -{ - unsigned int eax = pkey_reg; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %08x to %08x\n", __func__, - __read_pkey_reg(), pkey_reg); - asm volatile(".byte 0x0f,0x01,0xef\n\t" -: : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkey_reg == __read_pkey_reg()); -} - 
static inline void write_pkey_reg(unsigned int pkey_reg) { dprintf4("%s() changing %08x to %08x\n", __func__, @@ -157,83 +135,6 @@ static inline void __pkey_write_allow(int pkey, int do_allow_write) dprintf4("pkey_reg now: %08x\n", read_pkey_reg()); } -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -/* Intel-defined CPU features, CPUID level 0x0007:0 (ecx) */ -#define X86_FEATURE_PKU(1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pku(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - eax = 0x7; - ecx = 0x0; - __cpuid(, , , ); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -#define XSTATE_PKEY_BIT(9) -#define XSTATE_PKEY0x200 - -int pkey_reg_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKEY is set in XCR0 */ - leaf = XSTATE_PKEY_BIT; - { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(, , , ); - - if (leaf == XSTATE_PKEY_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKEY in xsave state\n"); - return 0; - } - - return xstate_offset; -} - #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) #define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h new file mode 100644 index ..2f04ade8ca9c --- /dev/null +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -0,0 +1,156 @@ +/*
[PATCH v15 05/23] selftests/vm/pkeys: Make gcc check arguments of sigsafe_printf()
From: Thiago Jung Bauermann This will help us ensure we print pkey_reg_t values correctly in different architectures. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-helpers.h | 4 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 3ed2f021bf7a..7f18a82e54fc 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -27,6 +27,10 @@ #define DPRINT_IN_SIGNAL_BUF_SIZE 4096 extern int dprint_in_signal; extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif static inline void sigsafe_printf(const char *format, ...) { va_list ap; -- 2.17.1
[PATCH v15 15/23] selftests/vm/pkeys: Improve checks to determine pkey support
From: Ram Pai For the pkeys subsystem to work, both the CPU and the kernel need to have support. So, additionally check if the kernel supports pkeys apart from the CPU feature checks. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/pkey-helpers.h| 30 tools/testing/selftests/vm/pkey-powerpc.h| 3 +- tools/testing/selftests/vm/pkey-x86.h| 2 +- tools/testing/selftests/vm/protection_keys.c | 7 +++-- 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h index 5de06d3a81de..43299435f24c 100644 --- a/tools/testing/selftests/vm/pkey-helpers.h +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -76,6 +76,8 @@ extern void abort_hooks(void); __attribute__((noinline)) int read_ptr(int *ptr); void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); #if defined(__i386__) || defined(__x86_64__) /* arch */ #include "pkey-x86.h" @@ -187,4 +189,32 @@ static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) #endif } +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + #endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h index 5ced9c58fa4a..a43d7be85a27 100644 --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -63,8 +63,9 @@ static 
inline void __write_pkey_reg(pkey_reg_t pkey_reg) pkey_reg); } -static inline int cpu_has_pku(void) +static inline int cpu_has_pkeys(void) { + /* No simple way to determine this */ return 1; } diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h index bb2d4dfcacd5..e5fdee39a7d8 100644 --- a/tools/testing/selftests/vm/pkey-x86.h +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -99,7 +99,7 @@ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, #define X86_FEATURE_PKU(1<<3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ -static inline int cpu_has_pku(void) +static inline int cpu_has_pkeys(void) { unsigned int eax; unsigned int ebx; diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 8d90cfe2c9bd..7bceb98662c1 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1377,7 +1377,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) int size = PAGE_SIZE; int sret; - if (cpu_has_pku()) { + if (cpu_has_pkeys()) { dprintf1("SKIP: %s: no CPU support\n", __func__); return; } @@ -1446,12 +1446,13 @@ void pkey_setup_shadow(void) int main(void) { int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); setup_handlers(); - printf("has pku: %d\n", cpu_has_pku()); + printf("has pkeys: %d\n", pkeys_supported); - if (!cpu_has_pku()) { + if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; -- 2.17.1
[PATCH v15 08/23] selftests/vm/pkeys: Fix pkey_disable_clear()
From: Ram Pai Currently, pkey_disable_clear() sets the specified bits instead clearing them. This has been dead code up to now because its only callers i.e. pkey_access/write_allow() are also unused. cc: Dave Hansen cc: Florian Weimer Signed-off-by: Ram Pai Acked-by: Dave Hansen Signed-off-by: Sandipan Das --- tools/testing/selftests/vm/protection_keys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index b474d4fbe92b..9a6c95b220cc 100644 --- a/tools/testing/selftests/vm/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -417,7 +417,7 @@ void pkey_disable_clear(int pkey, int flags) pkey, pkey, pkey_rights); pkey_assert(pkey_rights >= 0); - pkey_rights |= flags; + pkey_rights &= ~flags; ret = hw_pkey_set(pkey, pkey_rights, 0); shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); @@ -430,7 +430,7 @@ void pkey_disable_clear(int pkey, int flags) dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n", __func__, pkey, read_pkey_reg()); if (flags) - assert(read_pkey_reg() > orig_pkey_reg); + assert(read_pkey_reg() < orig_pkey_reg); } void pkey_write_allow(int pkey) -- 2.17.1
Re: [PATCH 03/18] powerpc: Add PREFIXED SRR1 bit for future ISA version
Jordan Niethe writes: > Add the bit definition for exceptions caused by prefixed instructions. > > Signed-off-by: Jordan Niethe > --- > arch/powerpc/include/asm/reg.h | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h > index 6f9fcc3d4c82..0a6d39fb4769 100644 > --- a/arch/powerpc/include/asm/reg.h > +++ b/arch/powerpc/include/asm/reg.h > @@ -778,6 +778,7 @@ > > #define SRR1_MCE_MCP 0x0008 /* Machine check signal > caused interrupt */ > #define SRR1_BOUNDARY 0x1000 /* Prefixed instruction > crosses 64-byte boundary */ > +#define SRR1_PREFIXED 0x2000 /* Exception caused by > prefixed instruction */ You could probably squash this with the previous patch, and maybe the next patch too. Regards, Daniel > > #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ > #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ > -- > 2.20.1
Re: [PATCH v2 1/3] powerpc/mmu_gather: Enable RCU_TABLE_FREE even for !SMP case
I'm going to assume you'll take these 3 patches through the powerpc tree for convenience, a few small nits, but otherwise ACK on the lot. On Wed, Dec 18, 2019 at 11:05:28AM +0530, Aneesh Kumar K.V wrote: > A follow up patch is going to make sure we correctly invalidate page walk > cache > before we free page table pages. In order to keep things simple enable > RCU_TABLE_FREE even for !SMP so that we don't have to fixup the !SMP case > differently in the followup patch Perhaps more clearly state that !SMP is broken for Radix/PWC. The above sorta implies it is, but it's not spelled out in any detail. > Signed-off-by: Aneesh Kumar K.V Acked-by: Peter Zijlstra (Intel)
Re: [PATCH v4 1/9] capabilities: introduce CAP_SYS_PERFMON to kernel and user space
On 12/18/19 4:24 AM, Alexey Budankov wrote: Introduce CAP_SYS_PERFMON capability devoted to secure system performance monitoring and observability operations so that CAP_SYS_PERFMON would assist CAP_SYS_ADMIN capability in its governing role for perf_events, i915_perf and other subsystems of the kernel. CAP_SYS_PERFMON intends to harden system security and integrity during system performance monitoring and observability operations by decreasing attack surface that is available to CAP_SYS_ADMIN privileged processes. CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related to system performance monitoring and observability operations and balance amount of CAP_SYS_ADMIN credentials in accordance with the recommendations provided in the man page for CAP_SYS_ADMIN [1]: "Note: this capability is overloaded; see Notes to kernel developers, below." [1] http://man7.org/linux/man-pages/man7/capabilities.7.html Signed-off-by: Alexey Budankov Acked-by: Stephen Smalley Note for selinux developers: we will need to update the selinux-testsuite tests for perf_event when/if this change lands upstream. 
--- include/linux/capability.h | 4 include/uapi/linux/capability.h | 8 +++- security/selinux/include/classmap.h | 4 ++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index ecce0f43c73a..883c879baa4b 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); +static inline bool perfmon_capable(void) +{ + return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN); +} /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 240fdb9a60f6..98e03cc76c7c 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -366,8 +366,14 @@ struct vfs_ns_cap_data { #define CAP_AUDIT_READ 37 +/* + * Allow system performance and observability privileged operations + * using perf_events, i915_perf and other kernel subsystems + */ + +#define CAP_SYS_PERFMON38 -#define CAP_LAST_CAP CAP_AUDIT_READ +#define CAP_LAST_CAP CAP_SYS_PERFMON #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 7db24855e12d..bae602c623b0 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -27,9 +27,9 @@ "audit_control", "setfcap" #define COMMON_CAP2_PERMS "mac_override", "mac_admin", "syslog", \ - "wake_alarm", "block_suspend", "audit_read" + "wake_alarm", "block_suspend", "audit_read", "sys_perfmon" -#if CAP_LAST_CAP > CAP_AUDIT_READ +#if 
CAP_LAST_CAP > CAP_SYS_PERFMON #error New capability defined, please update COMMON_CAP2_PERMS. #endif
Re: [PATCH v6 05/10] mm/memory_hotplug: Shrink zones when offlining memory
On Wed, 18 Dec 2019 18:08:04 +0100 David Hildenbrand wrote: > On 01.12.19 00:21, Andrew Morton wrote: > > On Sun, 27 Oct 2019 23:45:52 +0100 David Hildenbrand > > wrote: > > > >> I think I just found an issue with try_offline_node(). > >> try_offline_node() is pretty much broken already (touches garbage > >> memmaps and will not considers mixed NIDs within sections), however, > >> relies on the node span to look for memory sections to probe. So it > >> seems to rely on the nodes getting shrunk when removing memory, not when > >> offlining. > >> > >> As we shrink the node span when offlining now and not when removing, > >> this can go wrong once we offline the last memory block of the node and > >> offline the last CPU. We could still have memory around that we could > >> re-online, however, the node would already be offline. Unlikely, but > >> possible. > >> > >> Note that the same is also broken without this patch in case memory is > >> never onlined. The "pfn_to_nid(pfn) != nid" can easily succeed on the > >> garbage memmap, resulting in no memory being detected as belonging to > >> the node. Also, resize_pgdat_range() is called when onlining memory, not > >> when adding it. :/ Oh this is so broken :) > >> > >> The right fix is probably to walk over all memory blocks that could > >> exist and test if they belong to the nid (if offline, check the > >> block->nid, if online check all pageblocks). A fix we can then move in > >> front of this patch. > >> > >> Will look into this this week. > > > > And this series shows almost no sign of having been reviewed. I'll hold > > it over for 5.6. > > > > Hi Andrew, any chance we can get the (now at least reviewed - thx Oscar) > fix in patch #5 into 5.5? (I want to do the final stable backports for > the uninitialized memmap stuff) Sure, I queued it for the next batch of 5.5 fixes.
Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register
On 12/17/19 11:51 PM, Sandipan Das wrote: > write_pkey_reg(pkey_reg); > - dprintf4("pkey_reg now: %08x\n", read_pkey_reg()); > + dprintf4("pkey_reg now: "PKEY_REG_FMT"\n", read_pkey_reg()); > } > > #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) > diff --git a/tools/testing/selftests/vm/pkey-x86.h > b/tools/testing/selftests/vm/pkey-x86.h > index 2f04ade8ca9c..5f40901219d3 100644 > --- a/tools/testing/selftests/vm/pkey-x86.h > +++ b/tools/testing/selftests/vm/pkey-x86.h > @@ -46,6 +46,8 @@ > #define HPAGE_SIZE (1UL<<21) > #define PAGE_SIZE4096 > #define MB (1<<20) > +#define pkey_reg_t u32 > +#define PKEY_REG_FMT "%016x" How big is the ppc one? I'd really just rather do %016lx *everywhere* than sprinkle the PKEY_REG_FMTs around. BTW, why are you doing a %016lx for a u32?
Re: [PATCH v15 00/24] selftests, powerpc, x86: Memory Protection Keys
On 12/17/19 11:51 PM, Sandipan Das wrote: > Testing > --- > Verified for correctness on powerpc. Need help with x86 testing as I > do not have access to a Skylake server. Client platforms like Coffee > Lake do not have the required feature bits set in CPUID. FWIW, you can get a Skylake Server instance from cloud providers. I spooled up an Amazon EC3 instance once to run these tests. It think it cost me $0.08.
Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register
On Wed, Dec 18, 2019 at 12:46:50PM -0800, Dave Hansen wrote: > On 12/17/19 11:51 PM, Sandipan Das wrote: > > write_pkey_reg(pkey_reg); > > - dprintf4("pkey_reg now: %08x\n", read_pkey_reg()); > > + dprintf4("pkey_reg now: "PKEY_REG_FMT"\n", read_pkey_reg()); > > } > > > > #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) > > diff --git a/tools/testing/selftests/vm/pkey-x86.h > > b/tools/testing/selftests/vm/pkey-x86.h > > index 2f04ade8ca9c..5f40901219d3 100644 > > --- a/tools/testing/selftests/vm/pkey-x86.h > > +++ b/tools/testing/selftests/vm/pkey-x86.h > > @@ -46,6 +46,8 @@ > > #define HPAGE_SIZE (1UL<<21) > > #define PAGE_SIZE 4096 > > #define MB (1<<20) > > +#define pkey_reg_t u32 > > +#define PKEY_REG_FMT "%016x" > > How big is the ppc one? u64 > > I'd really just rather do %016lx *everywhere* than sprinkle the > PKEY_REG_FMTs around. Does lx work with u32 without warnings? It's likely the size difference that requires a format specifier definition. > > BTW, why are you doing a %016lx for a u32? It's "%016x" without 'l' for x86 and with 'l' for ppc64. Thanks Michal
Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register
On 12/18/19 12:59 PM, Michal Suchánek wrote: >> I'd really just rather do %016lx *everywhere* than sprinkle the >> PKEY_REG_FMTs around. > Does lx work with u32 without warnings? Either way, I'd be happy to just make the x86 one u64 to make the whole thing look more sane,
Re: [PATCH v11 01/25] mm/gup: factor out duplicate code from four routines
On Wed, Dec 18, 2019 at 02:15:53PM -0800, John Hubbard wrote: > On 12/18/19 7:52 AM, Kirill A. Shutemov wrote: > > On Mon, Dec 16, 2019 at 02:25:13PM -0800, John Hubbard wrote: > > > +static void put_compound_head(struct page *page, int refs) > > > +{ > > > + /* Do a get_page() first, in case refs == page->_refcount */ > > > + get_page(page); > > > + page_ref_sub(page, refs); > > > + put_page(page); > > > +} > > > > It's not terribly efficient. Maybe something like: > > > > VM_BUG_ON_PAGE(page_ref_count(page) < ref, page); > > if (refs > 2) > > page_ref_sub(page, refs - 1); > > put_page(page); > > > > ? > > OK, but how about this instead? I don't see the need for a "2", as that > is a magic number that requires explanation. Whereas "1" is not a magic > number--here it means: either there are "many" (>1) refs, or not. Yeah, it's my thinko. Sure, it has to be '1' (or >= 2, which is less readable). > And the routine won't be called with refs less than about 32 (2MB huge > page, 64KB base page == 32 subpages) anyway. It's hard to make predictions about future :P > VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); > /* >* Calling put_page() for each ref is unnecessarily slow. Only the last >* ref needs a put_page(). >*/ > if (refs > 1) > page_ref_sub(page, refs - 1); > put_page(page); Looks good to me. -- Kirill A. Shutemov
Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register
On Wed, Dec 18, 2019 at 01:01:46PM -0800, Dave Hansen wrote: > On 12/18/19 12:59 PM, Michal Suchánek wrote: > >> I'd really just rather do %016lx *everywhere* than sprinkle the > >> PKEY_REG_FMTs around. > > Does lx work with u32 without warnings? > > Either way, I'd be happy to just make the x86 one u64 to make the whole > thing look more sane, So long as it still works with u64 on x86 it should be pretty future-proof. Does not look like we are getting 128bit registers for anything but math units any time soon. Thanks Michal
Re: [PATCH 1/1] kvm/book3s_64: Fixes crash caused by not cleaning vhost IOTLB
On Wed, 2019-12-18 at 15:53 +1100, Alexey Kardashevskiy wrote: > H_STUFF_TCE is always called with 0. Well, may be some AIX somewhere > calls it with a value other than zero, and I probably saw some other > value somewhere but in QEMU/KVM case it is 0 so you effectively disable > in-kernel acceleration of H_STUFF_TCE which is > undesirable. > Thanks for the feedback! > For now we should disable in-kernel H_STUFF_TCE/... handlers in QEMU > just like we do for VFIO for older host kernels: > > https://git.qemu.org/?p=qemu.git;a=blob;f=hw/ppc/spapr_iommu.c;h=3d3bcc86496a5277d62f7855fbb09c013c015f27;hb=HEAD#l208 I am still reading into this temporary solution, I could still not understand how it works. > I am not sure what a proper solution would be, something like an eventfd > and KVM's kvmppc_h_stuff_tce() signaling vhost that the latter needs to > invalidate iotlbs. Or we can just say that we do not allow KVM > acceleration if there is vhost+iommu on the same liobn (== vPHB, pretty > much). Thanks, I am not used to eventfd, but i agree it's a valid solution to talk to QEMU and then use it to send a message via /dev/vhost. KVM -> QEMU -> vhost But I can't get my mind out of another solution: doing it in kernelspace. I am not sure how that would work, though. If I could understand correctly, there is a vhost IOTLB per vhost_dev, and H_STUFF_TCE is not called in 64-bit DMA case (for tce_value == 0 case, at least), which makes sense, given it doesn't need to invalidate entries on IOTLB. So, we would need to somehow replace `return H_TOO_HARD` in this patch with code that could call vhost_process_iotlb_msg() with VHOST_IOTLB_INVALIDATE. For that, I would need to know what are the vhost_dev's of that process, which I don't know if it's possible to do currently (or safe at all). I am thinking of linking all vhost_dev's with a list (list.h) that could be searched, comparing `mm_struct *` of the calling task with all vhost_dev's, and removing the entry of all IOTLB that hits. 
Not sure if that's the best approach to find the related vhost_dev's. What do you think? Best regards, Leonardo Bras signature.asc Description: This is a digitally signed message part
Re: [PATCH 1/2] powerpc/pseries/svm: Don't access some SPRs
Michael Ellerman [m...@ellerman.id.au] wrote: > > eg. here. > > This is the fast path of context switch. > > That expands to: > > if (!(mfmsr() & MSR_S)) > asm volatile("mfspr %0, SPRN_BESCR" : "=r" (rval)); > if (!(mfmsr() & MSR_S)) > asm volatile("mfspr %0, SPRN_EBBHR" : "=r" (rval)); > if (!(mfmsr() & MSR_S)) > asm volatile("mfspr %0, SPRN_EBBRR" : "=r" (rval)); > Yes, should have optimized this at least :-) > > If the Ultravisor is going to disable EBB and BHRB then we need new > CPU_FTR bits for those, and the code that accesses those registers > needs to be put behind cpu_has_feature(EBB) etc. Will try the cpu_has_feature(). Would it be ok to use a single feature bit, like UV or make it per-register group as that could need more feature bits? Thanks, Sukadev
Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages
On 12/18/19 8:04 AM, Kirill A. Shutemov wrote: On Mon, Dec 16, 2019 at 02:25:16PM -0800, John Hubbard wrote: An upcoming patch changes and complicates the refcounting and especially the "put page" aspects of it. In order to keep everything clean, refactor the devmap page release routines: * Rename put_devmap_managed_page() to page_is_devmap_managed(), and limit the functionality to "read only": return a bool, with no side effects. * Add a new routine, put_devmap_managed_page(), to handle checking what kind of page it is, and what kind of refcount handling it requires. * Rename __put_devmap_managed_page() to free_devmap_managed_page(), and limit the functionality to unconditionally freeing a devmap page. What the reason to separate put_devmap_managed_page() from free_devmap_managed_page() if free_devmap_managed_page() has exacly one caller? Is it preparation for the next patches? Yes. A later patch, #23, adds another caller: __unpin_devmap_managed_user_page(). ... @@ -971,7 +971,14 @@ static inline bool put_devmap_managed_page(struct page *page) return false; } +bool put_devmap_managed_page(struct page *page); + #else /* CONFIG_DEV_PAGEMAP_OPS */ +static inline bool page_is_devmap_managed(struct page *page) +{ + return false; +} + static inline bool put_devmap_managed_page(struct page *page) { return false; @@ -1028,8 +1035,10 @@ static inline void put_page(struct page *page) * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (put_devmap_managed_page(page)) + if (page_is_devmap_managed(page)) { + put_devmap_managed_page(page); put_devmap_managed_page() has yet another page_is_devmap_managed() check inside. It looks strange. Good point, it's an extra unnecessary check. So to clean it up, I'll note that the "if" check is required here in put_page(), in order to stay out of non-inlined function calls in the hot path (put_page()). 
So I'll do the following: * Leave the above code as it is here * Simplify put_devmap_managed_page(), it was trying to do two separate things, and those two things have different requirements. So change it to a void function, with a WARN_ON_ONCE to assert that page_is_devmap_managed() is true, * And change the other caller (release_pages()) to do that check. ... @@ -1102,3 +1102,27 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ } + +#ifdef CONFIG_DEV_PAGEMAP_OPS +bool put_devmap_managed_page(struct page *page) +{ + bool is_devmap = page_is_devmap_managed(page); + + if (is_devmap) { Reversing the condition would save you an indentation level. Yes. Done. I'll also git-reply with an updated patch so you can see what it looks like. thanks, -- John Hubbard NVIDIA
[PATCH v4 2/4] kasan: Document support on 32-bit powerpc
KASAN is supported on 32-bit powerpc and the docs should reflect this. Suggested-by: Christophe Leroy Reviewed-by: Christophe Leroy Signed-off-by: Daniel Axtens --- Documentation/dev-tools/kasan.rst | 3 ++- Documentation/powerpc/kasan.txt | 12 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 Documentation/powerpc/kasan.txt diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index e4d66e7c50de..4af2b5d2c9b4 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -22,7 +22,8 @@ global variables yet. Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later. Currently generic KASAN is supported for the x86_64, arm64, xtensa and s390 -architectures, and tag-based KASAN is supported only for arm64. +architectures. It is also supported on 32-bit powerpc kernels. Tag-based KASAN +is supported only on arm64. Usage - diff --git a/Documentation/powerpc/kasan.txt b/Documentation/powerpc/kasan.txt new file mode 100644 index ..a85ce2ff8244 --- /dev/null +++ b/Documentation/powerpc/kasan.txt @@ -0,0 +1,12 @@ +KASAN is supported on powerpc on 32-bit only. + +32 bit support +== + +KASAN is supported on both hash and nohash MMUs on 32-bit. + +The shadow area sits at the top of the kernel virtual memory space above the +fixmap area and occupies one eighth of the total kernel virtual memory space. + +Instrumentation of the vmalloc area is not currently supported, but modules +are. -- 2.20.1
Re: [PATCH v11 06/25] mm: fix get_user_pages_remote()'s handling of FOLL_LONGTERM
On 12/18/19 8:19 AM, Kirill A. Shutemov wrote: ... diff --git a/mm/gup.c b/mm/gup.c index 3ecce297a47f..c0c56888e7cc 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -29,6 +29,13 @@ struct follow_page_context { unsigned int page_mask; }; +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags); Any particular reason for the forward declaration? Maybe move get_user_pages_remote() down? Yes, that's exactly why: I was thinking it would be cleaner to put in the forward declaration, rather than moving code blocks, but either way seems reasonable. I'll go ahead and move the code blocks and delete the forward declaration, now that someone has weighed in in favor of that. thanks, -- John Hubbard NVIDIA
[PATCH v12] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages
An upcoming patch changes and complicates the refcounting and especially the "put page" aspects of it. In order to keep everything clean, refactor the devmap page release routines: * Rename put_devmap_managed_page() to page_is_devmap_managed(), and limit the functionality to "read only": return a bool, with no side effects. * Add a new routine, put_devmap_managed_page(), to handle decrementing the refcount for ZONE_DEVICE pages. * Change callers (just release_pages() and put_page()) to check page_is_devmap_managed() before calling the new put_devmap_managed_page() routine. This is a performance point: put_page() is a hot path, so we need to avoid non- inline function calls where possible. * Rename __put_devmap_managed_page() to free_devmap_managed_page(), and limit the functionality to unconditionally freeing a devmap page. This is originally based on a separate patch by Ira Weiny, which applied to an early version of the put_user_page() experiments. Since then, Jérôme Glisse suggested the refactoring described above. 
Cc: Christoph Hellwig Suggested-by: Jérôme Glisse Reviewed-by: Dan Williams Reviewed-by: Jan Kara Signed-off-by: Ira Weiny Signed-off-by: John Hubbard --- include/linux/mm.h | 18 +- mm/memremap.c | 16 ++-- mm/swap.c | 27 ++- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c97ea3b694e6..87b54126e46d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -952,9 +952,10 @@ static inline bool is_zone_device_page(const struct page *page) #endif #ifdef CONFIG_DEV_PAGEMAP_OPS -void __put_devmap_managed_page(struct page *page); +void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool put_devmap_managed_page(struct page *page) + +static inline bool page_is_devmap_managed(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; @@ -963,7 +964,6 @@ static inline bool put_devmap_managed_page(struct page *page) switch (page->pgmap->type) { case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_FS_DAX: - __put_devmap_managed_page(page); return true; default: break; @@ -971,11 +971,17 @@ static inline bool put_devmap_managed_page(struct page *page) return false; } +void put_devmap_managed_page(struct page *page); + #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool put_devmap_managed_page(struct page *page) +static inline bool page_is_devmap_managed(struct page *page) { return false; } + +static inline void put_devmap_managed_page(struct page *page) +{ +} #endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool is_device_private_page(const struct page *page) @@ -1028,8 +1034,10 @@ static inline void put_page(struct page *page) * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details.
*/ - if (put_devmap_managed_page(page)) + if (page_is_devmap_managed(page)) { + put_devmap_managed_page(page); return; + } if (put_page_testzero(page)) __put_page(page); diff --git a/mm/memremap.c b/mm/memremap.c index e899fa876a62..2ba773859031 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -411,20 +411,8 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -void __put_devmap_managed_page(struct page *page) +void free_devmap_managed_page(struct page *page) { - int count = page_ref_dec_return(page); - - /* still busy */ - if (count > 1) - return; - - /* only triggered by the dev_pagemap shutdown path */ - if (count == 0) { - __put_page(page); - return; - } - /* notify page idle for dax */ if (!is_device_private_page(page)) { wake_up_var(&page->_refcount); @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); } -EXPORT_SYMBOL(__put_devmap_managed_page); +EXPORT_SYMBOL(free_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index 5341ae93861f..cf39d24ada2a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr) * processing, and instead, expect a call to * put_page_testzero(). */ - if (put_devmap_managed_page(page)) + if (page_is_devmap_managed(page)) { + put_devmap_managed_page(page); continue; + } } page =
Re: [PATCH V3 01/13] powerpc/vas: Describe vas-port and interrupts properties
On Mon, Dec 16, 2019 at 09:48:40PM -0800, Haren Myneni wrote: > Commit message? > Signed-off-by: Haren Myneni Your author and S-o-b emails don't match. > --- > Documentation/devicetree/bindings/powerpc/ibm,vas.txt | 5 + > 1 file changed, 5 insertions(+) > > diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > index bf11d2f..12de08b 100644 > --- a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > @@ -11,6 +11,8 @@ Required properties: >window context start and length, OS/User window context start and length, >"Paste address" start and length, "Paste window id" start bit and number >of bits) > +- ibm,vas-port : Port address for the interrupt. 64-bit? > +- interrupts: IRQ value for each VAS instance and level. > > Example: > > @@ -18,5 +20,8 @@ Example: > compatible = "ibm,vas", "ibm,power9-vas"; > reg = <0x60191 0x200 0x60190 0x1 > 0x8 0x1 0x20 0x10>; > name = "vas"; > + interrupts = <0x1f 0>; > + interrupt-parent = <>; > ibm,vas-id = <0x1>; > + ibm,vas-port = <0x601000100>; > }; > -- > 1.8.3.1 > > >
Re: [PATCH v11 01/25] mm/gup: factor out duplicate code from four routines
On 12/18/19 7:52 AM, Kirill A. Shutemov wrote: On Mon, Dec 16, 2019 at 02:25:13PM -0800, John Hubbard wrote: +static void put_compound_head(struct page *page, int refs) +{ + /* Do a get_page() first, in case refs == page->_refcount */ + get_page(page); + page_ref_sub(page, refs); + put_page(page); +} It's not terribly efficient. Maybe something like: VM_BUG_ON_PAGE(page_ref_count(page) < ref, page); if (refs > 2) page_ref_sub(page, refs - 1); put_page(page); ? OK, but how about this instead? I don't see the need for a "2", as that is a magic number that requires explanation. Whereas "1" is not a magic number--here it means: either there are "many" (>1) refs, or not. And the routine won't be called with refs less than about 32 (2MB huge page, 64KB base page == 32 subpages) anyway. VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); /* * Calling put_page() for each ref is unnecessarily slow. Only the last * ref needs a put_page(). */ if (refs > 1) page_ref_sub(page, refs - 1); put_page(page); thanks, -- John Hubbard NVIDIA
Re: [PATCH V3 01/13] powerpc/vas: Describe vas-port and interrupts properties
On Wed, 2019-12-18 at 16:18 -0600, Rob Herring wrote: > On Mon, Dec 16, 2019 at 09:48:40PM -0800, Haren Myneni wrote: > > > > Commit message? > > > Signed-off-by: Haren Myneni > > Your author and S-o-b emails don't match. Thanks, Oliver suggested IRQ assign in the kernel instead of skiboot. In this case, we may not need this patch. > > > --- > > Documentation/devicetree/bindings/powerpc/ibm,vas.txt | 5 + > > 1 file changed, 5 insertions(+) > > > > diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > > b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > > index bf11d2f..12de08b 100644 > > --- a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > > +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt > > @@ -11,6 +11,8 @@ Required properties: > >window context start and length, OS/User window context start and length, > >"Paste address" start and length, "Paste window id" start bit and number > >of bits) > > +- ibm,vas-port : Port address for the interrupt. > > 64-bit? > > > +- interrupts: IRQ value for each VAS instance and level. > > > > Example: > > > > @@ -18,5 +20,8 @@ Example: > > compatible = "ibm,vas", "ibm,power9-vas"; > > reg = <0x60191 0x200 0x60190 0x1 > > 0x8 0x1 0x20 0x10>; > > name = "vas"; > > + interrupts = <0x1f 0>; > > + interrupt-parent = <>; > > ibm,vas-id = <0x1>; > > + ibm,vas-port = <0x601000100>; > > }; > > -- > > 1.8.3.1 > > > > > >
Re: [PATCH 04/14] powerpc/vas: Setup IRQ mapping and register port for each window
On Wed, 2019-12-18 at 18:18 +1100, Oliver O'Halloran wrote: > On Wed, Nov 27, 2019 at 12:07 PM Haren Myneni > wrote: > > > > *snip* > > > > @@ -36,7 +62,18 @@ static int init_vas_instance(struct platform_device > > *pdev) > > return -ENODEV; > > } > > > > - if (pdev->num_resources != 4) { > > + rc = of_property_read_u64(dn, "ibm,vas-port", ); > > + if (rc) { > > + pr_err("No ibm,vas-port property for %s?\n", pdev->name); > > + /* No interrupts property */ > > + nresources = 4; > > + } > > + > > + /* > > +* interrupts property is available with 'ibm,vas-port' property. > > +* 4 Resources and 1 IRQ if interrupts property is available. > > +*/ > > + if (pdev->num_resources != nresources) { > > pr_err("Unexpected DT configuration for [%s, %d]\n", > > pdev->name, vasid); > > return -ENODEV; > > Right, so adding the IRQ in firmware will break the VAS driver in > existing kernels since it changes the resource count. This is IMO a > bug in the VAS driver that you should fix, but it does mean we need to > think twice about having firmware assign an interrupt at boot. Correct, Hence added vas-user-space nvram switch in skiboot. > > I had a closer look at this series and I'm not convinced that any > firmware changes are actually required either. We already have OPAL > calls for allocating an hwirq for the kernel to use and for getting > the IRQ's XIVE trigger port (see pnv_ocxl_alloc_xive_irq() for an > example). Why not use those here too? Doing so would allow us to > assign interrupts to individual windows too which might be useful for > the windows used by the kernel. Thanks for the pointer. like using pnv_ocxl_alloc_xive_irq(), we can disregard FW change. BTW, VAS fault handling is needed only for user space VAS windows. 
int vas_alloc_xive_irq(u32 chipid, u32 *irq, u64 *trigger_addr) { __be64 flags, trigger_page; u32 hwirq; s64 rc; hwirq = opal_xive_allocate_irq_raw(chipid); if (hwirq < 0) return -ENOENT; rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page, NULL, NULL); if (rc || !trigger_page) { xive_native_free_irq(hwirq); return -ENOENT; } *irq = hwirq; *trigger_addr = be64_to_cpu(trigger_page); return 0; } We can have common function for VAS and cxl except per chip IRQ allocation is needed for each VAS instance. I will post patch-set with this change. Thanks Haren
[PATCH v4 4/4] powerpc: Book3S 64-bit "heavyweight" KASAN support
KASAN support on Book3S is a bit tricky to get right: - It would be good to support inline instrumentation so as to be able to catch stack issues that cannot be caught with outline mode. - Inline instrumentation requires a fixed offset. - Book3S runs code in real mode after booting. Most notably a lot of KVM runs in real mode, and it would be good to be able to instrument it. - Because code runs in real mode after boot, the offset has to point to valid memory both in and out of real mode. [ppc64 mm note: The kernel installs a linear mapping at effective address c000... onward. This is a one-to-one mapping with physical memory from 0x0000... onward. Because of how memory accesses work on powerpc 64-bit Book3S, a kernel pointer in the linear map accesses the same memory both with translations on (accessing as an 'effective address'), and with translations off (accessing as a 'real address'). This works in both guests and the hypervisor. For more details, see s5.7 of Book III of version 3 of the ISA, in particular the Storage Control Overview, s5.7.3, and s5.7.5 - noting that this KASAN implementation currently only supports Radix.] One approach is just to give up on inline instrumentation. This way all checks can be delayed until after everything is set up correctly, and the address-to-shadow calculations can be overridden. However, the features and speed boost provided by inline instrumentation are worth trying to do better. If _at compile time_ it is known how much contiguous physical memory a system has, the top 1/8th of the first block of physical memory can be set aside for the shadow. This is a big hammer and comes with 3 big consequences: - there's no nice way to handle physically discontiguous memory, so only the first physical memory block can be used. - kernels will simply fail to boot on machines with less memory than specified when compiling. - kernels running on machines with more memory than specified when compiling will simply ignore the extra memory. 
Implement and document KASAN this way. The current implementation is Radix only. Despite the limitations, it can still find bugs, e.g. http://patchwork.ozlabs.org/patch/1103775/ At the moment, this physical memory limit must be set _even for outline mode_. This may be changed in a later series - a different implementation could be added for outline mode that dynamically allocates shadow at a fixed offset. For example, see https://patchwork.ozlabs.org/patch/795211/ Suggested-by: Michael Ellerman Cc: Balbir Singh # ppc64 out-of-line radix version Cc: Christophe Leroy # ppc32 version Signed-off-by: Daniel Axtens --- Changes since v3: - Address further feedback from Christophe. - Drop changes to stack walking, it looks like the issue I observed is related to that particular stack, not stack-walking generally. Changes since v2: - Address feedback from Christophe around cleanups and docs. - Address feedback from Balbir: at this point I don't have a good solution for the issues you identify around the limitations of the inline implementation but I think that it's worth trying to get the stack instrumentation support. I'm happy to have an alternative and more flexible outline mode - I had envisioned this would be called 'lightweight' mode as it imposes fewer restrictions. I've linked to your implementation. I think it's best to add it in a follow-up series. - Made the default PHYS_MEM_SIZE_FOR_KASAN value 1024MB. I think most people have guests with at least that much memory in the Radix 64s case so it's a much saner default - it means that if you just turn on KASAN without reading the docs you're much more likely to have a bootable kernel, which you will never have if the value is set to zero! I'm happy to bikeshed the value if we want. Changes since v1: - Landed kasan vmalloc support upstream - Lots of feedback from Christophe. Changes since the rfc: - Boots real and virtual hardware, kvm works. - disabled reporting when we're checking the stack for exception frames. 
The behaviour isn't wrong, just incompatible with KASAN. - Documentation! - Dropped old module stuff in favour of KASAN_VMALLOC. The bugs with ftrace and kuap were due to kernel bloat pushing prom_init calls to be done via the plt. Because we did not have a relocatable kernel, and they are done very early, this caused everything to explode. Compile with CONFIG_RELOCATABLE! --- Documentation/dev-tools/kasan.rst| 8 +- Documentation/powerpc/kasan.txt | 112 ++- arch/powerpc/Kconfig | 2 + arch/powerpc/Kconfig.debug | 21 arch/powerpc/Makefile| 11 ++ arch/powerpc/include/asm/book3s/64/hash.h| 4 + arch/powerpc/include/asm/book3s/64/pgtable.h | 7 ++ arch/powerpc/include/asm/book3s/64/radix.h | 5 +
[PATCH v4 3/4] powerpc/mm/kasan: rename kasan_init_32.c to init_32.c
kasan is already implied by the directory name, we don't need to repeat it. Suggested-by: Christophe Leroy Signed-off-by: Daniel Axtens --- arch/powerpc/mm/kasan/Makefile | 2 +- arch/powerpc/mm/kasan/{kasan_init_32.c => init_32.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename arch/powerpc/mm/kasan/{kasan_init_32.c => init_32.c} (100%) diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile index 6577897673dd..36a4e1b10b2d 100644 --- a/arch/powerpc/mm/kasan/Makefile +++ b/arch/powerpc/mm/kasan/Makefile @@ -2,4 +2,4 @@ KASAN_SANITIZE := n -obj-$(CONFIG_PPC32) += kasan_init_32.o +obj-$(CONFIG_PPC32) += init_32.o diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/init_32.c similarity index 100% rename from arch/powerpc/mm/kasan/kasan_init_32.c rename to arch/powerpc/mm/kasan/init_32.c -- 2.20.1
[PATCH v4 1/4] kasan: define and use MAX_PTRS_PER_* for early shadow tables
powerpc has a variable number of PTRS_PER_*, set at runtime based on the MMU that the kernel is booted under. This means the PTRS_PER_* are no longer constants, and therefore breaks the build. Define default MAX_PTRS_PER_*s in the same style as MAX_PTRS_PER_P4D. As KASAN is the only user at the moment, just define them in the kasan header, and have them default to PTRS_PER_* unless overridden in arch code. Suggested-by: Christophe Leroy Suggested-by: Balbir Singh Reviewed-by: Christophe Leroy Reviewed-by: Balbir Singh Signed-off-by: Daniel Axtens --- include/linux/kasan.h | 18 +++--- mm/kasan/init.c | 6 +++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index e18fe54969e9..70865810d0e7 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -14,10 +14,22 @@ struct task_struct; #include #include +#ifndef MAX_PTRS_PER_PTE +#define MAX_PTRS_PER_PTE PTRS_PER_PTE +#endif + +#ifndef MAX_PTRS_PER_PMD +#define MAX_PTRS_PER_PMD PTRS_PER_PMD +#endif + +#ifndef MAX_PTRS_PER_PUD +#define MAX_PTRS_PER_PUD PTRS_PER_PUD +#endif + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; -extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; -extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; -extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD]; +extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE]; +extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD]; +extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ce45c491ebcd..8b54a96d3b3e 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -46,7 +46,7 @@ static inline bool kasan_p4d_table(pgd_t pgd) } #endif #if CONFIG_PGTABLE_LEVELS > 3 -pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss; +pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss; static inline bool 
kasan_pud_table(p4d_t p4d) { return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); @@ -58,7 +58,7 @@ static inline bool kasan_pud_table(p4d_t p4d) } #endif #if CONFIG_PGTABLE_LEVELS > 2 -pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss; +pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss; static inline bool kasan_pmd_table(pud_t pud) { return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); @@ -69,7 +69,7 @@ static inline bool kasan_pmd_table(pud_t pud) return false; } #endif -pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; +pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE] __page_aligned_bss; static inline bool kasan_pte_table(pmd_t pmd) { -- 2.20.1
[PATCH v4 0/4] KASAN for powerpc64 radix
Building on the work of Christophe, Aneesh and Balbir, I've ported KASAN to 64-bit Book3S kernels running on the Radix MMU. This provides full inline instrumentation on radix, but does require that you be able to specify the amount of physically contiguous memory on the system at compile time. More details in patch 4. v4: More cleanups, split renaming out, clarify bits and bobs. Drop the stack walk disablement, that isn't needed. No other functional change. v3: Reduce the overly ambitious scope of the MAX_PTRS change. Document more things, including around why some of the restrictions apply. Clean up the code more, thanks Christophe. v2: The big change is the introduction of tree-wide(ish) MAX_PTRS_PER_{PTE,PMD,PUD} macros in preference to the previous approach, which was for the arch to override the page table array definitions with their own. (And I squashed the annoying intermittent crash!) Apart from that there's just a lot of cleanup. Christophe, I've addressed most of what you asked for and I will reply to your v1 emails to clarify what remains unchanged. 
Daniel Axtens (4): kasan: define and use MAX_PTRS_PER_* for early shadow tables kasan: Document support on 32-bit powerpc powerpc/mm/kasan: rename kasan_init_32.c to init_32.c powerpc: Book3S 64-bit "heavyweight" KASAN support Documentation/dev-tools/kasan.rst | 7 +- Documentation/powerpc/kasan.txt | 122 ++ arch/powerpc/Kconfig | 2 + arch/powerpc/Kconfig.debug| 21 +++ arch/powerpc/Makefile | 11 ++ arch/powerpc/include/asm/book3s/64/hash.h | 4 + arch/powerpc/include/asm/book3s/64/pgtable.h | 7 + arch/powerpc/include/asm/book3s/64/radix.h| 5 + arch/powerpc/include/asm/kasan.h | 21 ++- arch/powerpc/kernel/prom.c| 61 - arch/powerpc/mm/kasan/Makefile| 3 +- .../mm/kasan/{kasan_init_32.c => init_32.c} | 0 arch/powerpc/mm/kasan/init_book3s_64.c| 70 ++ arch/powerpc/platforms/Kconfig.cputype| 1 + include/linux/kasan.h | 18 ++- mm/kasan/init.c | 6 +- 16 files changed, 346 insertions(+), 13 deletions(-) create mode 100644 Documentation/powerpc/kasan.txt rename arch/powerpc/mm/kasan/{kasan_init_32.c => init_32.c} (100%) create mode 100644 arch/powerpc/mm/kasan/init_book3s_64.c -- 2.20.1
"ftrace: Rework event_create_dir()" triggers boot error messages
The linux-next commit "ftrace: Rework event_create_dir()” [1] triggers boot warnings for Clang-build (Clang version 8.0.1) kernels (reproduced on both arm64 and powerpc). Reverted it (with trivial conflict fixes) on the top of today’s linux-next fixed the issue. configs: https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config https://raw.githubusercontent.com/cailca/linux-mm/master/powerpc.config [1] https://lore.kernel.org/lkml/2019132458.342979...@infradead.org/ [ 115.799327][T1] Registered efivars operations [ 115.849770][T1] clocksource: Switched to clocksource arch_sys_counter [ 115.901145][T1] Could not initialize trace point events/sys_enter_rt_sigreturn [ 115.908854][T1] Could not create directory for event sys_enter_rt_sigreturn [ 115.998949][T1] Could not initialize trace point events/sys_enter_restart_syscall [ 116.006802][T1] Could not create directory for event sys_enter_restart_syscall [ 116.062702][T1] Could not initialize trace point events/sys_enter_getpid [ 116.069828][T1] Could not create directory for event sys_enter_getpid [ 116.078058][T1] Could not initialize trace point events/sys_enter_gettid [ 116.085181][T1] Could not create directory for event sys_enter_gettid [ 116.093405][T1] Could not initialize trace point events/sys_enter_getppid [ 116.100612][T1] Could not create directory for event sys_enter_getppid [ 116.108989][T1] Could not initialize trace point events/sys_enter_getuid [ 116.116058][T1] Could not create directory for event sys_enter_getuid [ 116.124250][T1] Could not initialize trace point events/sys_enter_geteuid [ 116.131457][T1] Could not create directory for event sys_enter_geteuid [ 116.139840][T1] Could not initialize trace point events/sys_enter_getgid [ 116.146908][T1] Could not create directory for event sys_enter_getgid [ 116.155163][T1] Could not initialize trace point events/sys_enter_getegid [ 116.162370][T1] Could not create directory for event sys_enter_getegid [ 116.178015][T1] Could not 
initialize trace point events/sys_enter_setsid [ 116.185138][T1] Could not create directory for event sys_enter_setsid [ 116.269307][T1] Could not initialize trace point events/sys_enter_sched_yield [ 116.276811][T1] Could not create directory for event sys_enter_sched_yield [ 116.527652][T1] Could not initialize trace point events/sys_enter_munlockall [ 116.535126][T1] Could not create directory for event sys_enter_munlockall [ 116.622096][T1] Could not initialize trace point events/sys_enter_vhangup [ 116.629307][T1] Could not create directory for event sys_enter_vhangup [ 116.783867][T1] Could not initialize trace point events/sys_enter_sync [ 116.790819][T1] Could not create directory for event sys_enter_sync [ 117.723402][T1] pnp: PnP ACPI init [ 117.736379][T1] system 00:00: [mem 0x3000-0x3fff window] could not be reserved [ 126.020353][T1] pnp: PnP ACPI: found 1 devices [ 126.093919][T1] NET: Registered protocol family 2 [ 126.180007][T1] tcp_listen_portaddr_hash hash table entries: 65536 (order: 6, 4718592 bytes, vmalloc) [ 126.206510][T1] TCP established hash table entries: 524288 (order: 6, 4194304 bytes, vmalloc) [ 126.227766][T1] TCP bind hash table entries: 65536 (order: 6, 4194304 bytes, vmalloc) [ 126.240146][T1] TCP: Hash tables configured (established 524288 bind 65536)
Re: "ftrace: Rework event_create_dir()" triggers boot error messages
On Wed, 18 Dec 2019 22:58:23 -0500 Qian Cai wrote: > The linux-next commit "ftrace: Rework event_create_dir()” [1] triggers boot > warnings > for Clang-build (Clang version 8.0.1) kernels (reproduced on both arm64 and > powerpc). > Reverted it (with trivial conflict fixes) on the top of today’s linux-next > fixed the issue. > > configs: > https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config > https://raw.githubusercontent.com/cailca/linux-mm/master/powerpc.config > > [1] https://lore.kernel.org/lkml/2019132458.342979...@infradead.org/ > > [ 115.799327][T1] Registered efivars operations > [ 115.849770][T1] clocksource: Switched to clocksource arch_sys_counter > [ 115.901145][T1] Could not initialize trace point > events/sys_enter_rt_sigreturn > [ 115.908854][T1] Could not create directory for event > sys_enter_rt_sigreturn > [ 115.998949][T1] Could not initialize trace point > events/sys_enter_restart_syscall > [ 116.006802][T1] Could not create directory for event > sys_enter_restart_syscall > [ 116.062702][T1] Could not initialize trace point > events/sys_enter_getpid > [ 116.069828][T1] Could not create directory for event sys_enter_getpid > [ 116.078058][T1] Could not initialize trace point > events/sys_enter_gettid > [ 116.085181][T1] Could not create directory for event sys_enter_gettid > [ 116.093405][T1] Could not initialize trace point > events/sys_enter_getppid > [ 116.100612][T1] Could not create directory for event sys_enter_getppid > [ 116.108989][T1] Could not initialize trace point > events/sys_enter_getuid > [ 116.116058][T1] Could not create directory for event sys_enter_getuid > [ 116.124250][T1] Could not initialize trace point > events/sys_enter_geteuid > [ 116.131457][T1] Could not create directory for event sys_enter_geteuid > [ 116.139840][T1] Could not initialize trace point > events/sys_enter_getgid > [ 116.146908][T1] Could not create directory for event sys_enter_getgid > [ 116.155163][T1] Could not initialize trace point 
> events/sys_enter_getegid > [ 116.162370][T1] Could not create directory for event sys_enter_getegid > [ 116.178015][T1] Could not initialize trace point > events/sys_enter_setsid > [ 116.185138][T1] Could not create directory for event sys_enter_setsid > [ 116.269307][T1] Could not initialize trace point > events/sys_enter_sched_yield > [ 116.276811][T1] Could not create directory for event > sys_enter_sched_yield > [ 116.527652][T1] Could not initialize trace point > events/sys_enter_munlockall > [ 116.535126][T1] Could not create directory for event > sys_enter_munlockall > [ 116.622096][T1] Could not initialize trace point > events/sys_enter_vhangup > [ 116.629307][T1] Could not create directory for event sys_enter_vhangup > [ 116.783867][T1] Could not initialize trace point events/sys_enter_sync > [ 116.790819][T1] Could not create directory for event sys_enter_sync > [ 117.723402][T1] pnp: PnP ACPI init I noticed that all of the above have zero parameters. Does the following patch fix it? (note, I prefer "ret" and "i" on different lines anyway) -- Steve diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 53935259f701..abb70c71fe60 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -269,7 +269,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; int offset = offsetof(typeof(trace), args); - int ret, i; + int ret = 0; + int i; for (i = 0; i < meta->nb_args; i++) { ret = trace_define_field(call, meta->types[i],
Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages
On Mon, Dec 16, 2019 at 2:26 PM John Hubbard wrote: > > An upcoming patch changes and complicates the refcounting and > especially the "put page" aspects of it. In order to keep > everything clean, refactor the devmap page release routines: > > * Rename put_devmap_managed_page() to page_is_devmap_managed(), > and limit the functionality to "read only": return a bool, > with no side effects. > > * Add a new routine, put_devmap_managed_page(), to handle checking > what kind of page it is, and what kind of refcount handling it > requires. > > * Rename __put_devmap_managed_page() to free_devmap_managed_page(), > and limit the functionality to unconditionally freeing a devmap > page. > > This is originally based on a separate patch by Ira Weiny, which > applied to an early version of the put_user_page() experiments. > Since then, Jérôme Glisse suggested the refactoring described above. > > Cc: Christoph Hellwig > Suggested-by: Jérôme Glisse > Reviewed-by: Dan Williams > Reviewed-by: Jan Kara > Signed-off-by: Ira Weiny > Signed-off-by: John Hubbard > --- > include/linux/mm.h | 17 + > mm/memremap.c | 16 ++-- > mm/swap.c | 24 > 3 files changed, 39 insertions(+), 18 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index c97ea3b694e6..77a4df06c8a7 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -952,9 +952,10 @@ static inline bool is_zone_device_page(const struct page > *page) > #endif > > #ifdef CONFIG_DEV_PAGEMAP_OPS > -void __put_devmap_managed_page(struct page *page); > +void free_devmap_managed_page(struct page *page); > DECLARE_STATIC_KEY_FALSE(devmap_managed_key); > -static inline bool put_devmap_managed_page(struct page *page) > + > +static inline bool page_is_devmap_managed(struct page *page) > { > if (!static_branch_unlikely(_managed_key)) > return false; > @@ -963,7 +964,6 @@ static inline bool put_devmap_managed_page(struct page > *page) > switch (page->pgmap->type) { > case MEMORY_DEVICE_PRIVATE: > case 
MEMORY_DEVICE_FS_DAX: > - __put_devmap_managed_page(page); > return true; > default: > break; > @@ -971,7 +971,14 @@ static inline bool put_devmap_managed_page(struct page > *page) > return false; > } > > +bool put_devmap_managed_page(struct page *page); > + > #else /* CONFIG_DEV_PAGEMAP_OPS */ > +static inline bool page_is_devmap_managed(struct page *page) > +{ > + return false; > +} > + > static inline bool put_devmap_managed_page(struct page *page) > { > return false; > @@ -1028,8 +1035,10 @@ static inline void put_page(struct page *page) > * need to inform the device driver through callback. See > * include/linux/memremap.h and HMM for details. > */ > - if (put_devmap_managed_page(page)) > + if (page_is_devmap_managed(page)) { > + put_devmap_managed_page(page); > return; > + } > > if (put_page_testzero(page)) > __put_page(page); > diff --git a/mm/memremap.c b/mm/memremap.c > index e899fa876a62..2ba773859031 100644 > --- a/mm/memremap.c > +++ b/mm/memremap.c > @@ -411,20 +411,8 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, > EXPORT_SYMBOL_GPL(get_dev_pagemap); > > #ifdef CONFIG_DEV_PAGEMAP_OPS > -void __put_devmap_managed_page(struct page *page) > +void free_devmap_managed_page(struct page *page) > { > - int count = page_ref_dec_return(page); > - > - /* still busy */ > - if (count > 1) > - return; > - > - /* only triggered by the dev_pagemap shutdown path */ > - if (count == 0) { > - __put_page(page); > - return; > - } > - > /* notify page idle for dax */ > if (!is_device_private_page(page)) { > wake_up_var(>_refcount); > @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page) > page->mapping = NULL; > page->pgmap->ops->page_free(page); > } > -EXPORT_SYMBOL(__put_devmap_managed_page); > +EXPORT_SYMBOL(free_devmap_managed_page); This patch does not have a module consumer for free_devmap_managed_page(), so the export should move to the patch that needs the new export. 
Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL is that there was no practical way to hide the devmap details from every module in the kernel that did put_page(). I would expect free_devmap_managed_page() to EXPORT_SYMBOL_GPL if it is not inlined into an existing exported static inline api.
Re: "ftrace: Rework event_create_dir()" triggers boot error messages
> On Dec 18, 2019, at 11:31 PM, Steven Rostedt wrote: > > On Wed, 18 Dec 2019 22:58:23 -0500 > Qian Cai wrote: > >> The linux-next commit "ftrace: Rework event_create_dir()” [1] triggers boot >> warnings >> for Clang-build (Clang version 8.0.1) kernels (reproduced on both arm64 and >> powerpc). >> Reverted it (with trivial conflict fixes) on the top of today’s linux-next >> fixed the issue. >> >> configs: >> https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config >> https://raw.githubusercontent.com/cailca/linux-mm/master/powerpc.config >> >> [1] https://lore.kernel.org/lkml/2019132458.342979...@infradead.org/ >> >> [ 115.799327][T1] Registered efivars operations >> [ 115.849770][T1] clocksource: Switched to clocksource arch_sys_counter >> [ 115.901145][T1] Could not initialize trace point >> events/sys_enter_rt_sigreturn >> [ 115.908854][T1] Could not create directory for event >> sys_enter_rt_sigreturn >> [ 115.998949][T1] Could not initialize trace point >> events/sys_enter_restart_syscall >> [ 116.006802][T1] Could not create directory for event >> sys_enter_restart_syscall >> [ 116.062702][T1] Could not initialize trace point >> events/sys_enter_getpid >> [ 116.069828][T1] Could not create directory for event sys_enter_getpid >> [ 116.078058][T1] Could not initialize trace point >> events/sys_enter_gettid >> [ 116.085181][T1] Could not create directory for event sys_enter_gettid >> [ 116.093405][T1] Could not initialize trace point >> events/sys_enter_getppid >> [ 116.100612][T1] Could not create directory for event sys_enter_getppid >> [ 116.108989][T1] Could not initialize trace point >> events/sys_enter_getuid >> [ 116.116058][T1] Could not create directory for event sys_enter_getuid >> [ 116.124250][T1] Could not initialize trace point >> events/sys_enter_geteuid >> [ 116.131457][T1] Could not create directory for event sys_enter_geteuid >> [ 116.139840][T1] Could not initialize trace point >> events/sys_enter_getgid >> [ 116.146908][T1] 
Could not create directory for event sys_enter_getgid >> [ 116.155163][T1] Could not initialize trace point >> events/sys_enter_getegid >> [ 116.162370][T1] Could not create directory for event sys_enter_getegid >> [ 116.178015][T1] Could not initialize trace point >> events/sys_enter_setsid >> [ 116.185138][T1] Could not create directory for event sys_enter_setsid >> [ 116.269307][T1] Could not initialize trace point >> events/sys_enter_sched_yield >> [ 116.276811][T1] Could not create directory for event >> sys_enter_sched_yield >> [ 116.527652][T1] Could not initialize trace point >> events/sys_enter_munlockall >> [ 116.535126][T1] Could not create directory for event >> sys_enter_munlockall >> [ 116.622096][T1] Could not initialize trace point >> events/sys_enter_vhangup >> [ 116.629307][T1] Could not create directory for event sys_enter_vhangup >> [ 116.783867][T1] Could not initialize trace point events/sys_enter_sync >> [ 116.790819][T1] Could not create directory for event sys_enter_sync >> [ 117.723402][T1] pnp: PnP ACPI init > > I noticed that all of the above have zero parameters. Does the > following patch fix it? Yes, it works. > > (note, I prefer "ret" and "i" on different lines anyway) > > -- Steve > > diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c > index 53935259f701..abb70c71fe60 100644 > --- a/kernel/trace/trace_syscalls.c > +++ b/kernel/trace/trace_syscalls.c > @@ -269,7 +269,8 @@ static int __init syscall_enter_define_fields(struct > trace_event_call *call) > struct syscall_trace_enter trace; > struct syscall_metadata *meta = call->data; > int offset = offsetof(typeof(trace), args); > - int ret, i; > + int ret = 0; > + int i; > > for (i = 0; i < meta->nb_args; i++) { > ret = trace_define_field(call, meta->types[i],
Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages
On Wed, Dec 18, 2019 at 9:51 PM John Hubbard wrote: > > On 12/18/19 9:27 PM, Dan Williams wrote: > ... > >> @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page) > >> page->mapping = NULL; > >> page->pgmap->ops->page_free(page); > >> } > >> -EXPORT_SYMBOL(__put_devmap_managed_page); > >> +EXPORT_SYMBOL(free_devmap_managed_page); > > > > This patch does not have a module consumer for > > free_devmap_managed_page(), so the export should move to the patch > > that needs the new export. > > Hi Dan, > > OK, I know that's a policy--although it seems quite pointless here given > that this is definitely going to need an EXPORT. > > At the moment, the series doesn't use it in any module at all, so I'll just > delete the EXPORT for now. > > > > > Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL > > instead of EXPORT_SYMBOL_GPL is that there was no practical way to > > hide the devmap details from evey module in the kernel that did > > put_page(). I would expect free_devmap_managed_page() to > > EXPORT_SYMBOL_GPL if it is not inlined into an existing exported > > static inline api. > > > > Sure, I'll change it to EXPORT_SYMBOL_GPL when the time comes. We do have > to be careful that we don't shut out normal put_page() types of callers, > but...glancing through the current callers, that doesn't look to be a problem. > Good. So it should be OK to do EXPORT_SYMBOL_GPL here. > > Are you *sure* you don't want to just pre-emptively EXPORT now, and save > looking at it again? I'm positive. There is enough history for "trust me the consumer is coming" turning out not to be true to justify the hassle in my mind. I do trust you, but things happen.
Re: [PATCH] powerpc/setup_64: use DEFINE_DEBUGFS_ATTRIBUTE to define fops_rfi_flush
Hi Michael, On 2019/12/18 19:02, Michael Ellerman wrote: > Chen Zhou writes: >> Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE for >> debugfs files. >> >> Semantic patch information: >> Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file() >> imposes some significant overhead as compared to >> DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe(). > > I know you didn't write this text, but these change logs are not great. > It doesn't really explain why you're doing it. And it is alarming that > you're converting *to* a function with "unsafe" in the name. > > The commit that added the script: > > 5103068eaca2 ("debugfs, coccinelle: check for obsolete > DEFINE_SIMPLE_ATTRIBUTE() usage") > > Has a bit more explanation. > > Maybe something like this: > > In order to protect against file removal races, debugfs files created via > debugfs_create_file() are wrapped by a struct file_operations at their > opening. > > If the original struct file_operations is known to be safe against removal > races already, the proxy creation may be bypassed by creating the files > using DEFINE_DEBUGFS_ATTRIBUTE() and debugfs_create_file_unsafe(). > > > The part that's not explained is why this file is "known to be safe > against removal races already"? > > It also seems this conversion will make the file no longer seekable, > because DEFINE_SIMPLE_ATTRIBUTE() uses generic_file_llseek() whereas > DEFINE_DEBUGFS_ATTRIBUTE() uses no_llseek. > > That is probably fine, but should be mentioned. Thanks for your explanation. This part indeed should be mentioned. 
Chen Zhou > > cheers > >> Signed-off-by: Chen Zhou >> --- >> arch/powerpc/kernel/setup_64.c | 4 ++-- >> 1 file changed, 2 insertions(+), 2 deletions(-) >> >> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c >> index 6104917..4b9fbb2 100644 >> --- a/arch/powerpc/kernel/setup_64.c >> +++ b/arch/powerpc/kernel/setup_64.c >> @@ -956,11 +956,11 @@ static int rfi_flush_get(void *data, u64 *val) >> return 0; >> } >> >> -DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, >> "%llu\n"); >> +DEFINE_DEBUGFS_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, >> "%llu\n"); >> >> static __init int rfi_flush_debugfs_init(void) >> { >> -debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, >> _rfi_flush); >> +debugfs_create_file_unsafe("rfi_flush", 0600, powerpc_debugfs_root, >> NULL, _rfi_flush); >> return 0; >> } >> device_initcall(rfi_flush_debugfs_init); >> -- >> 2.7.4 > > . >
Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages
On 12/18/19 9:27 PM, Dan Williams wrote: ... @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); } -EXPORT_SYMBOL(__put_devmap_managed_page); +EXPORT_SYMBOL(free_devmap_managed_page); This patch does not have a module consumer for free_devmap_managed_page(), so the export should move to the patch that needs the new export. Hi Dan, OK, I know that's a policy--although it seems quite pointless here given that this is definitely going to need an EXPORT. At the moment, the series doesn't use it in any module at all, so I'll just delete the EXPORT for now. Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL is that there was no practical way to hide the devmap details from evey module in the kernel that did put_page(). I would expect free_devmap_managed_page() to EXPORT_SYMBOL_GPL if it is not inlined into an existing exported static inline api. Sure, I'll change it to EXPORT_SYMBOL_GPL when the time comes. We do have to be careful that we don't shut out normal put_page() types of callers, but...glancing through the current callers, that doesn't look to be a problem. Good. So it should be OK to do EXPORT_SYMBOL_GPL here. Are you *sure* you don't want to just pre-emptively EXPORT now, and save looking at it again? thanks, -- John Hubbard NVIDIA
Re: [PATCH v4 2/2] powerpc/irq: inline call_do_irq() and call_do_softirq()
Le 09/12/2019 à 11:53, Michael Ellerman a écrit : Segher Boessenkool writes: On Sat, Dec 07, 2019 at 10:42:28AM +0100, Christophe Leroy wrote: Le 06/12/2019 à 21:59, Segher Boessenkool a écrit : If the compiler can see the callee wants the same TOC as the caller has, it does not arrange to set (and restore) it, no. If it sees it may be different, it does arrange for that (and the linker then will check if it actually needs to do anything, and do that if needed). In this case, the compiler cannot know the callee wants the same TOC, which complicates thing a lot -- but it all works out. Do we have a way to make sure which TOC the functions are using ? Is there several TOC at all in kernel code ? Kernel modules have their own TOC, I think? Yes. Yes, this means that exported functions have to care about that, right ? And that's the reason why exported assembly functions like copy_page() use _GLOBAL_TOC() and not _GLOBAL() But main part of the kernel only has one TOC, so r2 can be assumed constant for non exported functions, can't it ? I think things can still go wrong if any of this is inlined into a kernel module? Is there anything that prevents this / can this not happen for some fundamental reason I don't see? This can't happen can it ? do_softirq_own_stack() is an outline function, defined in powerpc irq.c Its only caller is do_softirq() which is an outline function defined in kernel/softirq.c That prevents inlining, doesn't it ? Hopefully, sure. Would be nice if it was clearer that this works... It is too much like working by chance, the way it is :-( There's no way any of that code can end up in a module. Or at least if there is, that's a bug. That's my conclusion as well. So I guess we can consider r2 as constant over those functions. Anyway, until we clarify all this I'll limit my patch to PPC32 which is where the real benefit is I guess. At the end, maybe the solution should be to switch to IRQ stack immediately in the exception entry as x86_64 do ? 
Yeah that might be cleaner. I prepared a patch for that on PPC32, but it doesn't get rid of the IRQ stack switch completely because do_IRQ() is also called from other places like the timer interrupt. And we will still have the switch for softirqs. We could move do_softirq_own_stack() to assembly and merge it with call_do_softirq(), but I find it cleaner to inline call_do_softirq() instead, now that we have demonstrated that r2 can't change. Christophe
Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages
On 12/18/19 10:52 PM, Dan Williams wrote: On Wed, Dec 18, 2019 at 9:51 PM John Hubbard wrote: On 12/18/19 9:27 PM, Dan Williams wrote: ... @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); } -EXPORT_SYMBOL(__put_devmap_managed_page); +EXPORT_SYMBOL(free_devmap_managed_page); This patch does not have a module consumer for free_devmap_managed_page(), so the export should move to the patch that needs the new export. Hi Dan, OK, I know that's a policy--although it seems quite pointless here given that this is definitely going to need an EXPORT. At the moment, the series doesn't use it in any module at all, so I'll just delete the EXPORT for now. Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL is that there was no practical way to hide the devmap details from every module in the kernel that did put_page(). I would expect free_devmap_managed_page() to EXPORT_SYMBOL_GPL if it is not inlined into an existing exported static inline api. Sure, I'll change it to EXPORT_SYMBOL_GPL when the time comes. We do have to be careful that we don't shut out normal put_page() types of callers, but...glancing through the current callers, that doesn't look to be a problem. Good. So it should be OK to do EXPORT_SYMBOL_GPL here. Are you *sure* you don't want to just pre-emptively EXPORT now, and save looking at it again? I'm positive. There is enough history for "trust me the consumer is coming" turning out not to be true to justify the hassle in my mind. I do trust you, but things happen. OK, it's deleted locally. Thanks for looking at the patch. I'll post a v12 series that includes the change, once it looks like reviews are slowing down. thanks, -- John Hubbard NVIDIA
Re: [PATCH v4 4/4] powerpc: Book3S 64-bit "heavyweight" KASAN support
On 12/19/2019 12:36 AM, Daniel Axtens wrote: KASAN support on Book3S is a bit tricky to get right: - It would be good to support inline instrumentation so as to be able to catch stack issues that cannot be caught with outline mode. - Inline instrumentation requires a fixed offset. - Book3S runs code in real mode after booting. Most notably a lot of KVM runs in real mode, and it would be good to be able to instrument it. - Because code runs in real mode after boot, the offset has to point to valid memory both in and out of real mode. [ppc64 mm note: The kernel installs a linear mapping at effective address c000... onward. This is a one-to-one mapping with physical memory from ... onward. Because of how memory accesses work on powerpc 64-bit Book3S, a kernel pointer in the linear map accesses the same memory both with translations on (accessing as an 'effective address'), and with translations off (accessing as a 'real address'). This works in both guests and the hypervisor. For more details, see s5.7 of Book III of version 3 of the ISA, in particular the Storage Control Overview, s5.7.3, and s5.7.5 - noting that this KASAN implementation currently only supports Radix.] One approach is just to give up on inline instrumentation. This way all checks can be delayed until after everything set is up correctly, and the address-to-shadow calculations can be overridden. However, the features and speed boost provided by inline instrumentation are worth trying to do better. If _at compile time_ it is known how much contiguous physical memory a system has, the top 1/8th of the first block of physical memory can be set aside for the shadow. This is a big hammer and comes with 3 big consequences: - there's no nice way to handle physically discontiguous memory, so only the first physical memory block can be used. - kernels will simply fail to boot on machines with less memory than specified when compiling. 
- kernels running on machines with more memory than specified when compiling will simply ignore the extra memory. Implement and document KASAN this way. The current implementation is Radix only. Despite the limitations, it can still find bugs, e.g. http://patchwork.ozlabs.org/patch/1103775/ At the moment, this physical memory limit must be set _even for outline mode_. This may be changed in a later series - a different implementation could be added for outline mode that dynamically allocates shadow at a fixed offset. For example, see https://patchwork.ozlabs.org/patch/795211/ Suggested-by: Michael Ellerman Cc: Balbir Singh # ppc64 out-of-line radix version Cc: Christophe Leroy # ppc32 version Signed-off-by: Daniel Axtens --- Changes since v3: - Address further feedback from Christophe. - Drop changes to stack walking, it looks like the issue I observed is related to that particular stack, not stack-walking generally. Changes since v2: - Address feedback from Christophe around cleanups and docs. - Address feedback from Balbir: at this point I don't have a good solution for the issues you identify around the limitations of the inline implementation but I think that it's worth trying to get the stack instrumentation support. I'm happy to have an alternative and more flexible outline mode - I had envisioned this would be called 'lightweight' mode as it imposes fewer restrictions. I've linked to your implementation. I think it's best to add it in a follow-up series. - Made the default PHYS_MEM_SIZE_FOR_KASAN value 1024MB. I think most people have guests with at least that much memory in the Radix 64s case so it's a much saner default - it means that if you just turn on KASAN without reading the docs you're much more likely to have a bootable kernel, which you will never have if the value is set to zero! I'm happy to bikeshed the value if we want. Changes since v1: - Landed kasan vmalloc support upstream - Lots of feedback from Christophe.
Changes since the rfc: - Boots real and virtual hardware, kvm works. - disabled reporting when we're checking the stack for exception frames. The behaviour isn't wrong, just incompatible with KASAN. - Documentation! - Dropped old module stuff in favour of KASAN_VMALLOC. The bugs with ftrace and kuap were due to kernel bloat pushing prom_init calls to be done via the plt. Because we did not have a relocatable kernel, and they are done very early, this caused everything to explode. Compile with CONFIG_RELOCATABLE! --- Documentation/dev-tools/kasan.rst| 8 +- Documentation/powerpc/kasan.txt | 112 ++- arch/powerpc/Kconfig | 2 + arch/powerpc/Kconfig.debug | 21 arch/powerpc/Makefile| 11 ++ arch/powerpc/include/asm/book3s/64/hash.h| 4 +
[PATCH v3] powerpc/32: add support of KASAN_VMALLOC
Add support of KASAN_VMALLOC on PPC32. To allow this, the early shadow covering the VMALLOC space need to be removed once high_memory var is set and before freeing memblock. And the VMALLOC area need to be aligned such that boundaries are covered by a full shadow page. Signed-off-by: Christophe Leroy --- v3: added missing inclusion of asm/kasan.h needed when CONFIG_KASAN is not set. v2: rebased ; exclude specific module handling when CONFIG_KASAN_VMALLOC is set. --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/32/pgtable.h | 5 + arch/powerpc/include/asm/kasan.h | 2 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 5 + arch/powerpc/mm/kasan/kasan_init_32.c| 33 +++- arch/powerpc/mm/mem.c| 4 6 files changed, 49 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1ec34e16ed65..a247bbfb03d4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -173,6 +173,7 @@ config PPC select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if PPC32 + select HAVE_ARCH_KASAN_VMALLOC if PPC32 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 0796533d37dd..5b39c11e884a 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -193,7 +193,12 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #else #define VMALLOC_START long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) #endif + +#ifdef CONFIG_KASAN_VMALLOC +#define VMALLOC_END_ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) +#else #define VMALLOC_ENDioremap_bot +#endif #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index 296e51c2f066..fbff9ff9032e 100644 --- a/arch/powerpc/include/asm/kasan.h +++ 
b/arch/powerpc/include/asm/kasan.h @@ -31,9 +31,11 @@ void kasan_early_init(void); void kasan_mmu_init(void); void kasan_init(void); +void kasan_late_init(void); #else static inline void kasan_init(void) { } static inline void kasan_mmu_init(void) { } +static inline void kasan_late_init(void) { } #endif #endif /* __ASSEMBLY */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 552b96eef0c8..60c4d829152e 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -114,7 +114,12 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #else #define VMALLOC_START long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) #endif + +#ifdef CONFIG_KASAN_VMALLOC +#define VMALLOC_END_ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) +#else #define VMALLOC_ENDioremap_bot +#endif /* * Bits in a linux-style PTE. These match the bits in the diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 0e6ed4413eea..88036fb88350 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -129,6 +129,31 @@ static void __init kasan_remap_early_shadow_ro(void) flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END); } +static void __init kasan_unmap_early_shadow_vmalloc(void) +{ + unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_START); + unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void *)VMALLOC_END); + unsigned long k_cur; + phys_addr_t pa = __pa(kasan_early_shadow_page); + + if (!early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + int ret = kasan_init_shadow_page_tables(k_start, k_end); + + if (ret) + panic("kasan: kasan_init_shadow_page_tables() failed"); + } + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { + pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pte_t *ptep = 
pte_offset_kernel(pmd, k_cur); + + if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) + continue; + + __set_pte_at(&init_mm, k_cur, ptep, __pte(0), 0); + } + flush_tlb_kernel_range(k_start, k_end); +} + void __init kasan_mmu_init(void) { int ret; @@ -165,7 +190,13 @@ void __init kasan_init(void) pr_info("KASAN init done\n"); } -#ifdef CONFIG_MODULES +void __init kasan_late_init(void) +{ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_unmap_early_shadow_vmalloc(); +} + +#if defined(CONFIG_MODULES) &&