[PATCH v4 4/9] drm/i915/perf: open access for CAP_SYS_PERFMON privileged process

2019-12-18 Thread Alexey Budankov


Open access to i915_perf monitoring for CAP_SYS_PERFMON privileged
processes. For backward compatibility reasons access to i915_perf
subsystem remains open for CAP_SYS_ADMIN privileged processes but
CAP_SYS_ADMIN usage for secure i915_perf monitoring is discouraged
with respect to CAP_SYS_PERFMON capability.
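
For reference, perfmon_capable() used in the hunks below is the helper
introduced earlier in this series (patch 1/9, not quoted in this digest).
A minimal sketch of its assumed definition, checking either the new
capability or CAP_SYS_ADMIN:

    /* Assumed sketch of the patch 1/9 helper (not quoted in this digest). */
    static inline bool perfmon_capable(void)
    {
            return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN);
    }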

Signed-off-by: Alexey Budankov 
---
 drivers/gpu/drm/i915/i915_perf.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index e42b86827d6b..e2697f8d04de 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -2748,10 +2748,10 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
 * we check a dev.i915.perf_stream_paranoid sysctl option
 * to determine if it's ok to access system wide OA counters
-* without CAP_SYS_ADMIN privileges.
+* without CAP_SYS_PERFMON or CAP_SYS_ADMIN privileges.
 */
if (privileged_op &&
-   i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
+   i915_perf_stream_paranoid && !perfmon_capable()) {
DRM_DEBUG("Insufficient privileges to open system-wide i915 
perf stream\n");
ret = -EACCES;
goto err_ctx;
@@ -2939,9 +2939,8 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
} else
oa_freq_hz = 0;
 
-   if (oa_freq_hz > i915_oa_max_sample_rate &&
-   !capable(CAP_SYS_ADMIN)) {
-   DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root privileges\n",
+   if (oa_freq_hz > i915_oa_max_sample_rate && !perfmon_capable()) {
+   DRM_DEBUG("OA exponent would exceed the max sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without CAP_SYS_PERFMON or CAP_SYS_ADMIN privileges\n",
  i915_oa_max_sample_rate);
return -EACCES;
}
@@ -3328,7 +3327,7 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
return -EINVAL;
}
 
-   if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
+   if (i915_perf_stream_paranoid && !perfmon_capable()) {
DRM_DEBUG("Insufficient privileges to add i915 OA config\n");
return -EACCES;
}
@@ -3474,7 +3473,7 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
return -ENOTSUPP;
}
 
-   if (i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
+   if (i915_perf_stream_paranoid && !perfmon_capable()) {
DRM_DEBUG("Insufficient privileges to remove i915 OA config\n");
return -EACCES;
}
-- 
2.20.1



Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-18 Thread Michael Ellerman
Linus Torvalds  writes:
> On Tue, Dec 17, 2019 at 10:04 AM Linus Torvalds
>  wrote:
>>
>> Let me think about it.
>
> How about we just get rid of the union entirely, and just use
> 'unsigned long' or 'unsigned long long' depending on the size.
>
> Something like the attached patch - it still requires that it be an
> arithmetic type, but now because of the final cast.
>
> But it might still be a cast to a volatile type, of course. Then the
> result will be volatile, but at least now READ_ONCE() won't be taking
> the address of a volatile variable on the stack - does that at least
> fix some of the horrible code generation. Hmm?

Yes it seems to fix it for me.

There's no unnecessary stack protector gunk, and no store/load to the
stack variable.
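
For context, here is a rough sketch of the kind of READ_ONCE() rework Linus
describes above -- not his attached patch, just an illustration of reading
through a size-matched scalar and casting back at the end (the final cast is
what requires an arithmetic type):

    /* Illustrative sketch only -- not the patch attached to Linus's mail. */
    #define READ_ONCE_SKETCH(x)                                              \
    ({                                                                       \
            unsigned long long __val;                                        \
            switch (sizeof(x)) {                                             \
            case 1: __val = *(const volatile unsigned char *)&(x); break;    \
            case 2: __val = *(const volatile unsigned short *)&(x); break;   \
            case 4: __val = *(const volatile unsigned int *)&(x); break;     \
            default: __val = *(const volatile unsigned long long *)&(x);     \
            }                                                                \
            (__typeof__(x))__val;                                            \
    })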

This is my previous example of ext4_resize_begin(), hacked to use a copy
of the generic version of test_and_set_bit_lock(), which in turn was
hacked to use a local version of your READ_ONCE().

  c0534390 :
  c0534390:   19 01 4c 3c addis   r2,r12,281
  c0534394:   70 c3 42 38 addir2,r2,-15504
  c0534398:   a6 02 08 7c mflrr0
  c053439c:   4d 98 b3 4b bl  c006dbe8 <_mcount>
  c05343a0:   a6 02 08 7c mflrr0
  c05343a4:   f8 ff e1 fb std r31,-8(r1)
  c05343a8:   f0 ff c1 fb std r30,-16(r1)
  c05343ac:   78 1b 7f 7c mr  r31,r3
  c05343b0:   18 00 60 38 li  r3,24
  c05343b4:   10 00 01 f8 std r0,16(r1)
  c05343b8:   91 ff 21 f8 stdur1,-112(r1)
  c05343bc:   98 03 df eb ld  r30,920(r31)
  c05343c0:   d9 d3 c0 4b bl  c0141798 
  c05343c4:   00 00 00 60 nop
  c05343c8:   00 00 a3 2f cmpdi   cr7,r3,0
  c05343cc:   a4 00 9e 41 beq cr7,c0534470 

  c05343d0:   98 03 3f e9 ld  r9,920(r31)
  c05343d4:   60 00 5e e9 ld  r10,96(r30)
  c05343d8:   54 00 fe 80 lwz r7,84(r30)
  c05343dc:   68 00 09 e9 ld  r8,104(r9)
  c05343e0:   18 00 4a e9 ld  r10,24(r10)
  c05343e4:   14 00 08 81 lwz r8,20(r8)
  c05343e8:   36 3c 4a 7d srd r10,r10,r7
  c05343ec:   00 40 aa 7f cmpdcr7,r10,r8
  c05343f0:   b8 00 9e 40 bne cr7,c05344a8 

  c05343f4:   a0 00 49 a1 lhz r10,160(r9)
  c05343f8:   02 00 4a 71 andi.   r10,r10,2
  c05343fc:   84 00 82 40 bne c0534480 

  c0534400:   30 02 49 e9 ld  r10,560(r9)   
# simple load of EXT4_SB(sb)->s_ext4_flags
  c0534404:   01 00 4a 71 andi.   r10,r10,1
  c0534408:   48 00 82 40 bne c0534450 

  c053440c:   30 02 e9 38 addir7,r9,560
  c0534410:   01 00 00 39 li  r8,1
  c0534414:   a8 38 40 7d ldarx   r10,0,r7
  c0534418:   78 53 06 7d or  r6,r8,r10
  c053441c:   ad 39 c0 7c stdcx.  r6,0,r7
  c0534420:   f4 ff c2 40 bne-c0534414 

  c0534424:   2c 01 00 4c isync
  c0534428:   01 00 49 71 andi.   r9,r10,1
  c053442c:   00 00 60 38 li  r3,0
  c0534430:   20 00 82 40 bne c0534450 

  c0534434:   70 00 21 38 addir1,r1,112
  c0534438:   10 00 01 e8 ld  r0,16(r1)
  c053443c:   f0 ff c1 eb ld  r30,-16(r1)
  c0534440:   f8 ff e1 eb ld  r31,-8(r1)
  c053:   a6 03 08 7c mtlrr0
  c0534448:   20 00 80 4e blr
  c053444c:   00 00 00 60 nop
  c0534450:   70 00 21 38 addir1,r1,112
  c0534454:   f0 ff 60 38 li  r3,-16
  c0534458:   10 00 01 e8 ld  r0,16(r1)
  c053445c:   f0 ff c1 eb ld  r30,-16(r1)
  c0534460:   f8 ff e1 eb ld  r31,-8(r1)
  c0534464:   a6 03 08 7c mtlrr0
  c0534468:   20 00 80 4e blr
  c053446c:   00 00 00 60 nop
  c0534470:   ff ff 60 38 li  r3,-1
  c0534474:   c0 ff ff 4b b   c0534434 

  c0534478:   00 00 00 60 nop
  c053447c:   00 00 00 60 nop
  c0534480:   8a ff c2 3c addis   r6,r2,-118
  c0534484:   74 ff 82 3c addis   r4,r2,-140
  c0534488:   78 fb e3 7f mr  r3,r31
  c053448c:   7c 00 a0 38 li  r5,124
  c0534490:   a8 75 c6 38 addir6,r6,30120
  c0534494:   f8 0b 84 38 addir4,r4,3064
  c0534498:

[PATCH v4 9/9] drivers/oprofile: open access for CAP_SYS_PERFMON privileged process

2019-12-18 Thread Alexey Budankov


Open access to monitoring for CAP_SYS_PERFMON privileged processes.
For backward compatibility reasons access to the monitoring remains open
for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure
monitoring is discouraged with respect to CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 
---
 drivers/oprofile/event_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c
index 12ea4a4ad607..6c9edc8bbc95 100644
--- a/drivers/oprofile/event_buffer.c
+++ b/drivers/oprofile/event_buffer.c
@@ -113,7 +113,7 @@ static int event_buffer_open(struct inode *inode, struct file *file)
 {
int err = -EPERM;
 
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EPERM;
 
if (test_and_set_bit_lock(0, _opened))
-- 
2.20.1




[Bug 205889] CONFIG_PPC_85xx with CONFIG_CORENET_GENERIC outputs uImage instead of zImage

2019-12-18 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205889

Michael Ellerman (mich...@ellerman.id.au) changed:

   What|Removed |Added

 CC||mich...@ellerman.id.au

--- Comment #1 from Michael Ellerman (mich...@ellerman.id.au) ---
I think that's working as designed, if not expected.

CONFIG_CORENET_GENERIC selects CONFIG_DEFAULT_UIMAGE, which tells
arch/powerpc/boot/Makefile to build a uImage.

The zImage is just a hardlink to the uImage.

You can see that using the -i flag to ls, eg:

$ ls -il arch/powerpc/boot/{uImage,zImage}
1300656 -rw-rw-r-- 2 michael michael 5356164 Dec 18 22:19 arch/powerpc/boot/uImage
1300656 -rw-rw-r-- 2 michael michael 5356164 Dec 18 22:19 arch/powerpc/boot/zImage

Notice the inode is the same, and the link count is 2.

I think the logic behind the link is that people might have scripts to copy
zImage somewhere to boot it, and we didn't want to break that just because the
target of the build is a uImage.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[PATCH v15 13/23] selftests/vm/pkeys: Introduce powerpc support

2019-12-18 Thread Sandipan Das
From: Ram Pai 

This makes use of the abstractions added earlier and
introduces support for powerpc.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h|   2 +
 tools/testing/selftests/vm/pkey-powerpc.h|  93 +++
 tools/testing/selftests/vm/protection_keys.c | 269 ++-
 3 files changed, 236 insertions(+), 128 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-powerpc.h

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index c929cc09b3cc..5de06d3a81de 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -79,6 +79,8 @@ void expected_pkey_fault(int pkey);
 
 #if defined(__i386__) || defined(__x86_64__) /* arch */
 #include "pkey-x86.h"
+#elif defined(__powerpc64__) /* arch */
+#include "pkey-powerpc.h"
 #else /* arch */
 #error Architecture not supported
 #endif /* arch */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
new file mode 100644
index ..5ced9c58fa4a
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_POWERPC_H
+#define _PKEYS_POWERPC_H
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key  386
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc384
+# define SYS_pkey_free 385
+#endif
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define gregs  gp_regs
+#define fpregs fp_regs
+#define si_pkey_offset 0x20
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS   0x3  /* disable read and write */
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE0x2
+#endif
+
+#define NR_PKEYS   32
+#define NR_RESERVED_PKEYS_4K   27 /* pkey-0, pkey-1, exec-only-pkey
+ and 24 other keys that cannot be
+ represented in the PTE */
+#define NR_RESERVED_PKEYS_64K  3  /* pkey-0, pkey-1 and exec-only-pkey */
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL << 24)
+#define PAGE_SIZE  (1UL << 16)
+#define pkey_reg_t u64
+#define PKEY_REG_FMT   "%016lx"
+
+static inline u32 pkey_bit_position(int pkey)
+{
+   return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+}
+
+static inline pkey_reg_t __read_pkey_reg(void)
+{
+   pkey_reg_t pkey_reg;
+
+   asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
+
+   return pkey_reg;
+}
+
+static inline void __write_pkey_reg(pkey_reg_t pkey_reg)
+{
+   pkey_reg_t eax = pkey_reg;
+
+   dprintf4("%s() changing "PKEY_REG_FMT" to "PKEY_REG_FMT"\n",
+__func__, __read_pkey_reg(), pkey_reg);
+
+   asm volatile("mtspr 0xd, %0" : : "r" ((unsigned long)(eax)) : "memory");
+
+   dprintf4("%s() pkey register after changing "PKEY_REG_FMT" to "
+   PKEY_REG_FMT"\n", __func__, __read_pkey_reg(),
+   pkey_reg);
+}
+
+static inline int cpu_has_pku(void)
+{
+   return 1;
+}
+
+static inline int get_arch_reserved_keys(void)
+{
+   if (sysconf(_SC_PAGESIZE) == 4096)
+   return NR_RESERVED_PKEYS_4K;
+   else
+   return NR_RESERVED_PKEYS_64K;
+}
+
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+   /*
+* powerpc does not allow userspace to change permissions of exec-only
+* keys since those keys are not allocated by userspace. The signal
+* handler wont be able to reset the permissions, which means the code
+* will infinitely continue to segfault here.
+*/
+   return;
+}
+
+/* 8-bytes of instruction * 16384bytes = 1 page */
+#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
+
+#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index dbc2918bec13..1920bca84def 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -168,6 +168,125 @@ void dump_mem(void *dumpme, int len_bytes)
}
 }
 
+static u32 hw_pkey_get(int pkey, unsigned long flags)
+{
+   pkey_reg_t pkey_reg = __read_pkey_reg();
+
+   dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+   __func__, pkey, flags, 0, 0);
+   dprintf2("%s() raw pkey_reg: "PKEY_REG_FMT"\n", __func__, pkey_reg);
+
+   return (u32) get_pkey_bits(pkey_reg, pkey);
+}
+
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+   u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+   pkey_reg_t old_pkey_reg = __read_pkey_reg();
+   pkey_reg_t new_pkey_reg;
+
+   /* make sure that 'rights' only contains the bits we expect: */
+   

[PATCH v15 17/23] selftests/vm/pkeys: Associate key on a mapped page and detect write violation

2019-12-18 Thread Sandipan Das
From: Ram Pai 

Detect write-violation on a page to which write-disabled
key is associated much after the page is mapped.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index f63b9f55b3a7..ff207b765afd 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1001,6 +1001,17 @@ void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
expected_pkey_fault(pkey);
 }
 
+void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   *ptr = __LINE__;
+   dprintf1("disabling write access; after accessing the page, "
+   "to PKEY[%02d], doing write\n", pkey);
+   pkey_write_deny(pkey);
+   *ptr = __LINE__;
+   expected_pkey_fault(pkey);
+}
+
 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
 {
dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
@@ -1409,6 +1420,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_read_of_access_disabled_region,
test_read_of_access_disabled_region_with_page_already_mapped,
test_write_of_write_disabled_region,
+   test_write_of_write_disabled_region_with_page_already_mapped,
test_write_of_access_disabled_region,
test_kernel_write_of_access_disabled_region,
test_kernel_write_of_write_disabled_region,
-- 
2.17.1



Re: [RFC PATCH 1/2] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-18 Thread Peter Zijlstra
On Wed, Dec 18, 2019 at 10:52:53AM +0530, Aneesh Kumar K.V wrote:
> Upstream ppc64 is broken after the commit: a46cc7a90fd8
> ("powerpc/mm/radix: Improve TLB/PWC flushes").
> 
> Also the patches are not adding any extra TLBI on either radix or hash.
> 
> Considering we need to backport this to stable and other distributions,
> how about we do this early patches in your series before the Kconfig rename?
> This should enable stable to pick them up with less dependencies. 

OK I suppose. Will you send a new series?


[PATCH v4 2/9] perf/core: open access for CAP_SYS_PERFMON privileged process

2019-12-18 Thread Alexey Budankov


Open access to perf_events monitoring for CAP_SYS_PERFMON privileged
processes. For backward compatibility reasons access to perf_events
subsystem remains open for CAP_SYS_ADMIN privileged processes but
CAP_SYS_ADMIN usage for secure perf_events monitoring is discouraged
with respect to CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 
---
 include/linux/perf_event.h | 6 +++---
 kernel/events/core.c   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 34c7c6910026..f46acd69425f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1285,7 +1285,7 @@ static inline int perf_is_paranoid(void)
 
 static inline int perf_allow_kernel(struct perf_event_attr *attr)
 {
-   if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
+   if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
return -EACCES;
 
return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
@@ -1293,7 +1293,7 @@ static inline int perf_allow_kernel(struct perf_event_attr *attr)
 
 static inline int perf_allow_cpu(struct perf_event_attr *attr)
 {
-   if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
+   if (sysctl_perf_event_paranoid > 0 && !perfmon_capable())
return -EACCES;
 
return security_perf_event_open(attr, PERF_SECURITY_CPU);
@@ -1301,7 +1301,7 @@ static inline int perf_allow_cpu(struct perf_event_attr *attr)
 
 static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
 {
-   if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
+   if (sysctl_perf_event_paranoid > -1 && !perfmon_capable())
return -EPERM;
 
return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 059ee7116008..d9db414f2197 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9056,7 +9056,7 @@ static int perf_kprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_kprobe.type)
return -ENOENT;
 
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EACCES;
 
/*
@@ -9116,7 +9116,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
 
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EACCES;
 
/*
@@ -11157,7 +11157,7 @@ SYSCALL_DEFINE5(perf_event_open,
}
 
if (attr.namespaces) {
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EACCES;
}
 
-- 
2.20.1



Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-18 Thread Christian Borntraeger
On 12.12.19 21:49, Linus Torvalds wrote:
> On Thu, Dec 12, 2019 at 11:34 AM Will Deacon  wrote:
>>
>> The root of my concern in all of this, and what started me looking at it in
>> the first place, is the interaction with 'typeof()'. Inheriting 'volatile'
>> for a pointer means that local variables in macros declared using typeof()
>> suddenly start generating *hideous* code, particularly when pointless stack
>> spills get stackprotector all excited.
> 
> Yeah, removing volatile can be a bit annoying.
> 
> For the particular case of the bitops, though, it's not an issue.
> Since you know the type there, you can just cast it.
> 
> And if we had the rule that READ_ONCE() was an arithmetic type, you could do
> 
> typeof(0+(*p)) __var;
> 
> since you might as well get the integer promotion anyway (on the
> non-volatile result).
> 
> But that doesn't work with structures or unions, of course.

We do have a READ_ONCE on the following union in s390 code.

union ipte_control {
unsigned long val;
struct {
unsigned long k  : 1;
unsigned long kh : 31;
unsigned long kg : 32;
}; 
};


In fact this one was the original failure case why we change ACCESS_ONCE.

see arch/s390/kvm/gaccess.c



Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-18 Thread Will Deacon
On Wed, Dec 18, 2019 at 11:22:05AM +0100, Christian Borntraeger wrote:
> On 12.12.19 21:49, Linus Torvalds wrote:
> > On Thu, Dec 12, 2019 at 11:34 AM Will Deacon  wrote:
> >>
> >> The root of my concern in all of this, and what started me looking at it in
> >> the first place, is the interaction with 'typeof()'. Inheriting 'volatile'
> >> for a pointer means that local variables in macros declared using typeof()
> >> suddenly start generating *hideous* code, particularly when pointless stack
> >> spills get stackprotector all excited.
> > 
> > Yeah, removing volatile can be a bit annoying.
> > 
> > For the particular case of the bitops, though, it's not an issue.
> > Since you know the type there, you can just cast it.
> > 
> > And if we had the rule that READ_ONCE() was an arithmetic type, you could do
> > 
> > typeof(0+(*p)) __var;
> > 
> > since you might as well get the integer promotion anyway (on the
> > non-volatile result).
> > 
> > But that doesn't work with structures or unions, of course.
> 
> We do have a READ_ONCE on the following union in s390 code.
> 
> union ipte_control {
> unsigned long val;
> struct {
> unsigned long k  : 1;
> unsigned long kh : 31;
> unsigned long kg : 32;
> }; 
> };
> 
> 
> In fact this one was the original failure case why we change ACCESS_ONCE.
> 
> see arch/s390/kvm/gaccess.c

Thanks. I think we should be ok just using the 'val' field instead of the
whole union but, then again, when bitfields are involved who knows what the
compiler might do. I thought we usually shied away from using them to mirror
hardware structures like this?

Will


[Bug 205885] [Bisected] BUG: KASAN: null-ptr-deref in strncpy+0x3c/0x60

2019-12-18 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205885

Michael Ellerman (mich...@ellerman.id.au) changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 CC||mich...@ellerman.id.au
 Resolution|--- |PATCH_ALREADY_AVAILABLE

--- Comment #7 from Michael Ellerman (mich...@ellerman.id.au) ---
This is fixed upstream:

https://git.kernel.org/torvalds/c/7de7de7ca0ae0fc70515ee3154af33af75edae2c

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-18 Thread Aneesh Kumar K.V

On 12/18/19 2:47 PM, Peter Zijlstra wrote:

On Wed, Dec 18, 2019 at 11:05:29AM +0530, Aneesh Kumar K.V wrote:

From: Peter Zijlstra 

Architectures for which we have hardware walkers of Linux page table should
flush TLB on mmu gather batch allocation failures and batch flush. Some
architectures like POWER supports multiple translation modes (hash and radix)


nohash, hash and radix in fact :-)


and in the case of POWER only radix translation mode needs the above TLBI.



This is because for hash translation mode kernel wants to avoid this extra
flush since there are no hardware walkers of linux page table. With radix
translation, the hardware also walks linux page table and with that, kernel
needs to make sure to TLB invalidate page walk cache before page table pages are
freed.

More details in
commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for 
RCU_TABLE_FREE")

Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes")
Signed-off-by: Peter Zijlstra (Intel) 
---



diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index b2c0be93929d..7f3a8b902325 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -26,6 +26,17 @@
  
  #define tlb_flush tlb_flush

  extern void tlb_flush(struct mmu_gather *tlb);
+/*
+ * book3s:
+ * Hash does not use the linux page-tables, so we can avoid
+ * the TLB invalidate for page-table freeing, Radix otoh does use the
+ * page-tables and needs the TLBI.
+ *
+ * nohash:
+ * We still do TLB invalidate in the __pte_free_tlb routine before we
+ * add the page table pages to mmu gather table batch.


I'm a little confused though; if nohash is a software TLB fill, why do
you need a TLBI for tables?



nohash (AKA book3e) has different mmu modes. I don't follow all the 
details w.r.t book3e. Paul or Michael might be able to explain the need 
for table flush with book3e.


Documentation/powerpc/cpu_families.rst describes the different levels of 
hardware-assisted TLB fill support.


What I wanted to convey with the above comment was that we handle only the 
radix translation mode with the tlb_needs_table_invalidate() check. Other 
translations (hash, the different variants of book3e) are all fine for the 
reasons outlined above.
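
As a rough sketch of how such a check could look on powerpc -- assuming, as
argued above, that only radix has a hardware walker of the Linux page tables
(illustrative only, not necessarily the exact hunk from the series):

    /* Sketch: only radix walks the Linux page tables in hardware, so only
     * radix needs the extra invalidate before page-table pages are freed. */
    #define tlb_needs_table_invalidate()    radix_enabled()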


-aneesh


Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-18 Thread Michael Ellerman
"Aneesh Kumar K.V"  writes:
> On 12/18/19 2:47 PM, Peter Zijlstra wrote:
>> On Wed, Dec 18, 2019 at 11:05:29AM +0530, Aneesh Kumar K.V wrote:
>>> From: Peter Zijlstra 
>>>
>>> Architectures for which we have hardware walkers of Linux page table should
>>> flush TLB on mmu gather batch allocation failures and batch flush. Some
>>> architectures like POWER supports multiple translation modes (hash and 
>>> radix)
>> 
>> nohash, hash and radix in fact :-)
>> 
>>> and in the case of POWER only radix translation mode needs the above TLBI.
>> 
>>> This is because for hash translation mode kernel wants to avoid this extra
>>> flush since there are no hardware walkers of linux page table. With radix
>>> translation, the hardware also walks linux page table and with that, kernel
>>> needs to make sure to TLB invalidate page walk cache before page table 
>>> pages are
>>> freed.
>>>
>>> More details in
>>> commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for 
>>> RCU_TABLE_FREE")
>>>
>>> Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes")
>>> Signed-off-by: Peter Zijlstra (Intel) 
>>> Signed-off-by: Aneesh Kumar K.V 
>>> ---
>> 
>>> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
>>> index b2c0be93929d..7f3a8b902325 100644
>>> --- a/arch/powerpc/include/asm/tlb.h
>>> +++ b/arch/powerpc/include/asm/tlb.h
>>> @@ -26,6 +26,17 @@
>>>   
>>>   #define tlb_flush tlb_flush
>>>   extern void tlb_flush(struct mmu_gather *tlb);
>>> +/*
>>> + * book3s:
>>> + * Hash does not use the linux page-tables, so we can avoid
>>> + * the TLB invalidate for page-table freeing, Radix otoh does use the
>>> + * page-tables and needs the TLBI.
>>> + *
>>> + * nohash:
>>> + * We still do TLB invalidate in the __pte_free_tlb routine before we
>>> + * add the page table pages to mmu gather table batch.
>> 
>> I'm a little confused though; if nohash is a software TLB fill, why do
>> you need a TLBI for tables?
>> 
>
> nohash (AKA book3e) has different mmu modes. I don't follow all the 
> details w.r.t book3e. Paul or Michael might be able to explain the need 
> for table flush with book3e.

Some of the Book3E CPUs have a partial hardware table walker. The IBM one (A2)
did, before we ripped that support out. And the Freescale (NXP) e6500
does, see eg:

  28efc35fe68d ("powerpc/e6500: TLB miss handler with hardware tablewalk 
support")

They only support walking one level IIRC, ie. you can create a TLB entry
that points to a PTE page, and the hardware will dereference that to get
a PTE and load that into the TLB.

cheers


[PATCH v4 7/9] parisc/perf: open access for CAP_SYS_PERFMON privileged process

2019-12-18 Thread Alexey Budankov


Open access to monitoring for CAP_SYS_PERFMON privileged processes.
For backward compatibility reasons access to the monitoring remains open
for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure
monitoring is discouraged with respect to CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 
---
 arch/parisc/kernel/perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c
index 676683641d00..c4208d027794 100644
--- a/arch/parisc/kernel/perf.c
+++ b/arch/parisc/kernel/perf.c
@@ -300,7 +300,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf,
else
return -EFAULT;
 
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EACCES;
 
if (count != sizeof(uint32_t))
-- 
2.20.1




Re: [PATCH 1/2] powerpc/pseries/svm: Don't access some SPRs

2019-12-18 Thread Michael Ellerman
Sukadev Bhattiprolu  writes:
> Ultravisor disables some CPU features like EBB and BHRB in the HFSCR
> for secure virtual machines (SVMs). If the SVMs attempt to access
> related registers, they will get a Program Interrupt.
>
> Use macros/wrappers to skip accessing EBB and BHRB registers in secure
> VMs.

I continue to dislike this approach.

The result is code that at a glance looks like it's doing one thing,
ie. reading or writing an SPR, but is in fact doing nothing.

It's confusing for readers.

It also slows down all these already slow register accesses further, by
inserting an mfmsr() in front of every single one.


> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index b3cbb1136bce..026eb20f6d13 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -1379,6 +1379,41 @@ static inline void msr_check_and_clear(unsigned long 
> bits)
>   __msr_check_and_clear(bits);
>  }
>  
> +#ifdef CONFIG_PPC_SVM
> +/*
> + * Move from some "restricted" sprs.
> + * Secure VMs should not access some registers as the related features
> + * are disabled in the CPU. If an SVM is attempting read from the given
> + * SPR, return 0. Otherwise behave like a normal mfspr.
> + */
> +#define mfspr_r(rn)  \
> +({   \
> + unsigned long rval = 0ULL;  \
> + \
> + if (!(mfmsr() & MSR_S)) \
> + asm volatile("mfspr %0," __stringify(rn)\
> + : "=r" (rval)); \
> + rval;   \
> +})
> +
> +/*
> + * Move to some "restricted" sprs.
> + * Secure VMs should not access some registers as the related features
> + * are disabled in the CPU. If an SVM is attempting write to the given
> + * SPR, ignore the write. Otherwise behave like a normal mtspr.
> + */
> +#define mtspr_r(rn, v)   \
> +({   \
> + if (!(mfmsr() & MSR_S)) \
> + asm volatile("mtspr " __stringify(rn) ",%0" :   \
> +  : "r" ((unsigned long)(v)) \
> +  : "memory");   \
> +})
> +#else
> +#define mfspr_r  mfspr
> +#define mtspr_r  mtspr
> +#endif
> +
>  #ifdef __powerpc64__
>  #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
>  #define mftb()   ({unsigned long rval;   
> \
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 639ceae7da9d..9a691452ea3b 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1059,9 +1059,9 @@ static inline void save_sprs(struct thread_struct *t)
>   t->dscr = mfspr(SPRN_DSCR);
>  
>   if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
> - t->bescr = mfspr(SPRN_BESCR);
> - t->ebbhr = mfspr(SPRN_EBBHR);
> - t->ebbrr = mfspr(SPRN_EBBRR);
> + t->bescr = mfspr_r(SPRN_BESCR);
> + t->ebbhr = mfspr_r(SPRN_EBBHR);
> + t->ebbrr = mfspr_r(SPRN_EBBRR);

eg. here.

This is the fast path of context switch.

That expands to:

if (!(mfmsr() & MSR_S))
asm volatile("mfspr %0, SPRN_BESCR" : "=r" (rval));
if (!(mfmsr() & MSR_S))
asm volatile("mfspr %0, SPRN_EBBHR" : "=r" (rval));
if (!(mfmsr() & MSR_S))
asm volatile("mfspr %0, SPRN_EBBRR" : "=r" (rval));


If the Ultravisor is going to disable EBB and BHRB then we need new
CPU_FTR bits for those, and the code that accesses those registers
needs to be put behind cpu_has_feature(EBB) etc.
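
Concretely, the suggestion amounts to something like the sketch below.
CPU_FTR_EBB is hypothetical here -- no such feature bit exists in this
series -- the point is that the secure-VM case is decided once at boot
rather than with an mfmsr() in front of every SPR access:

    if (cpu_has_feature(CPU_FTR_ARCH_207S) && cpu_has_feature(CPU_FTR_EBB)) {
            t->bescr = mfspr(SPRN_BESCR);
            t->ebbhr = mfspr(SPRN_EBBHR);
            t->ebbrr = mfspr(SPRN_EBBRR);
    }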

cheers


[PATCH v4 6/9] powerpc/perf: open access for CAP_SYS_PERFMON privileged process

2019-12-18 Thread Alexey Budankov


Open access to monitoring for CAP_SYS_PERFMON privileged processes.
For backward compatibility reasons access to the monitoring remains open
for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure
monitoring is discouraged with respect to CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 
---
 arch/powerpc/perf/imc-pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index cb50a9e1fd2d..e837717492e4 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -898,7 +898,7 @@ static int thread_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EACCES;
 
/* Sampling not supported */
@@ -1307,7 +1307,7 @@ static int trace_imc_event_init(struct perf_event *event)
if (event->attr.type != event->pmu->type)
return -ENOENT;
 
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EACCES;
 
/* Return if this is a couting event */
-- 
2.20.1




Re: [PATCH] powerpc/setup_64: use DEFINE_DEBUGFS_ATTRIBUTE to define fops_rfi_flush

2019-12-18 Thread Michael Ellerman
Chen Zhou  writes:
> Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE for
> debugfs files.
>
> Semantic patch information:
> Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file()
> imposes some significant overhead as compared to
> DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe().

I know you didn't write this text, but these change logs are not great.
It doesn't really explain why you're doing it. And it is alarming that
you're converting *to* a function with "unsafe" in the name.

The commit that added the script:

  5103068eaca2 ("debugfs, coccinelle: check for obsolete DEFINE_SIMPLE_ATTRIBUTE() usage")

Has a bit more explanation.

Maybe something like this:

  In order to protect against file removal races, debugfs files created via
  debugfs_create_file() are wrapped by a struct file_operations at their
  opening.
  
  If the original struct file_operations is known to be safe against removal
  races already, the proxy creation may be bypassed by creating the files
  using DEFINE_DEBUGFS_ATTRIBUTE() and debugfs_create_file_unsafe().


The part that's not explained is why this file is "known to be safe
against removal races already"?

It also seems this conversion will make the file no longer seekable,
because DEFINE_SIMPLE_ATTRIBUTE() uses generic_file_llseek() whereas
DEFINE_DEBUGFS_ATTRIBUTE() uses no_llseek.

That is probably fine, but should be mentioned.

cheers

> Signed-off-by: Chen Zhou 
> ---
>  arch/powerpc/kernel/setup_64.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 6104917..4b9fbb2 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -956,11 +956,11 @@ static int rfi_flush_get(void *data, u64 *val)
>   return 0;
>  }
>  
> -DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");
> +DEFINE_DEBUGFS_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");
>  
>  static __init int rfi_flush_debugfs_init(void)
>  {
> - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
> + debugfs_create_file_unsafe("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
>   return 0;
>  }
>  device_initcall(rfi_flush_debugfs_init);
> -- 
> 2.7.4


Re: [PATCH v3 3/3] powerpc: Book3S 64-bit "heavyweight" KASAN support

2019-12-18 Thread Christophe Leroy




On 12/18/2019 04:32 AM, Daniel Axtens wrote:

Daniel Axtens  writes:


Hi Christophe,

I'm working through your feedback, thank you. Regarding this one:


--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2081,7 +2081,14 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
/*
 * See if this is an exception frame.
 * We look for the "regshere" marker in the current frame.
+*
+* KASAN may complain about this. If it is an exception frame,
+* we won't have unpoisoned the stack in asm when we set the
+* exception marker. If it's not an exception frame, who knows
+* how things are laid out - the shadow could be in any state
+* at all. Just disable KASAN reporting for now.
 */
+   kasan_disable_current();
if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE)
&& stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
struct pt_regs *regs = (struct pt_regs *)
@@ -2091,6 +2098,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
   regs->trap, (void *)regs->nip, (void *)lr);
firstframe = 1;
}
+   kasan_enable_current();


If this is really a concern for all targets including PPC32, should it
be a separate patch with a Fixes: tag to be applied back in stable as well ?


I've managed to repro this by commenting out the kasan_disable/enable
lines, and just booting in qemu without a disk attached:

sudo qemu-system-ppc64 -accel kvm -m 2G -M pseries -cpu power9  -kernel 
./vmlinux  -nographic -chardev stdio,id=charserial0,mux=on -device 
spapr-vty,chardev=charserial0,reg=0x3000  -mon 
chardev=charserial0,mode=readline -nodefaults -smp 2

...

[0.210740] Kernel panic - not syncing: VFS: Unable to mount root fs on 
unknown-block(0,0)
[0.210789] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
[0.210844] Call Trace:
[0.210866] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154 
(unreliable)
[0.210915] [c0006a483a00] [c025411c] panic+0x258/0x59c
[0.210958] [c0006a483aa0] [c24870b0] 
mount_block_root+0x648/0x7ac
[0.211005] 
==
[0.211054] BUG: KASAN: stack-out-of-bounds in show_stack+0x438/0x580
[0.211095] Read of size 8 at addr c0006a483b00 by task swapper/0/1
[0.211134]
[0.211152] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
[0.211207] Call Trace:
[0.211225] [c0006a483680] [c1f74f48] dump_stack+0xfc/0x154 
(unreliable)
[0.211274] [c0006a4836d0] [c08f877c] 
print_address_description.isra.10+0x7c/0x470
[0.211330] [c0006a483760] [c08f8e7c] __kasan_report+0x1bc/0x244
[0.211380] [c0006a483830] [c08f6eb8] kasan_report+0x18/0x30
[0.211422] [c0006a483850] [c08fa5d4] 
__asan_report_load8_noabort+0x24/0x40
[0.211471] [c0006a483870] [c003d448] show_stack+0x438/0x580
[0.211512] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154
[0.211553] [c0006a483a00] [c025411c] panic+0x258/0x59c
[0.211595] [c0006a483aa0] [c24870b0] 
mount_block_root+0x648/0x7ac
[0.211644] [c0006a483be0] [c2487784] 
prepare_namespace+0x1ec/0x240
[0.211694] [c0006a483c60] [c248669c] 
kernel_init_freeable+0x7f4/0x870
[0.211745] [c0006a483da0] [c0011f30] kernel_init+0x3c/0x15c
[0.211787] [c0006a483e20] [c000bebc] 
ret_from_kernel_thread+0x5c/0x80
[0.211834]
[0.211851] Allocated by task 0:
[0.211878]  save_stack+0x2c/0xe0
[0.211904]  __kasan_kmalloc.isra.16+0x11c/0x150
[0.211937]  kmem_cache_alloc_node+0x114/0x3b0
[0.211971]  copy_process+0x5b8/0x6410
[0.211996]  _do_fork+0x130/0xbf0
[0.212022]  kernel_thread+0xdc/0x130
[0.212047]  rest_init+0x44/0x184
[0.212072]  start_kernel+0x77c/0x7dc
[0.212098]  start_here_common+0x1c/0x20
[0.212122]
[0.212139] Freed by task 0:
[0.212163] (stack is not available)
[0.212187]
[0.212205] The buggy address belongs to the object at c0006a48
[0.212205]  which belongs to the cache thread_stack of size 16384
[0.212285] The buggy address is located 15104 bytes inside of
[0.212285]  16384-byte region [c0006a48, c0006a484000)
[0.212356] The buggy address belongs to the page:
[0.212391] page:c00c001a9200 refcount:1 mapcount:0 
mapping:c0006a019e00 index:0x0 compound_mapcount: 0
[0.212455] raw: 00710200 5deadbeef100 5deadbeef122 
c0006a019e00
[0.212504] raw:  00100010 0001 

[PATCH v15 21/23] selftests/vm/pkeys: Fix number of reserved powerpc pkeys

2019-12-18 Thread Sandipan Das
From: "Desnes A. Nunes do Rosario" 

The number of reserved pkeys in a PowerNV environment is
different from that on PowerVM or KVM.

Tested on PowerVM and PowerNV environments.

Signed-off-by: "Desnes A. Nunes do Rosario" 
Signed-off-by: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-powerpc.h | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
index f7a20bd07870..a70577045074 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -28,7 +28,10 @@
 #define NR_RESERVED_PKEYS_4K   27 /* pkey-0, pkey-1, exec-only-pkey
  and 24 other keys that cannot be
  represented in the PTE */
-#define NR_RESERVED_PKEYS_64K  3  /* pkey-0, pkey-1 and exec-only-pkey */
+#define NR_RESERVED_PKEYS_64K_3KEYS3 /* PowerNV and KVM: pkey-0,
+pkey-1 and exec-only key */
+#define NR_RESERVED_PKEYS_64K_4KEYS4 /* PowerVM: pkey-0, pkey-1,
+pkey-31 and exec-only key */
 #define PKEY_BITS_PER_PKEY 2
 #define HPAGE_SIZE (1UL << 24)
 #define PAGE_SIZE  (1UL << 16)
@@ -69,12 +72,27 @@ static inline int cpu_has_pkeys(void)
return 1;
 }
 
+static inline bool arch_is_powervm()
+{
+   struct stat buf;
+
+   if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", ) == 
0) &&
+   (stat("/sys/firmware/devicetree/base/hmc-managed?", ) == 0) &&
+   (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", 
) == -1) )
+   return true;
+
+   return false;
+}
+
 static inline int get_arch_reserved_keys(void)
 {
if (sysconf(_SC_PAGESIZE) == 4096)
return NR_RESERVED_PKEYS_4K;
else
-   return NR_RESERVED_PKEYS_64K;
+   if (arch_is_powervm())
+   return NR_RESERVED_PKEYS_64K_4KEYS;
+   else
+   return NR_RESERVED_PKEYS_64K_3KEYS;
 }
 
 void expect_fault_on_read_execonly_key(void *p1, int pkey)
-- 
2.17.1



[PATCH v4 3/9] perf tool: extend Perf tool with CAP_SYS_PERFMON capability support

2019-12-18 Thread Alexey Budankov


Extend error messages to mention the CAP_SYS_PERFMON capability as an option
to substitute for the CAP_SYS_ADMIN capability in secure system performance
monitoring and observability operations. Make perf_event_paranoid_check()
and __cmd_ftrace() aware of the CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 
---
 tools/perf/builtin-ftrace.c |  5 +++--
 tools/perf/design.txt   |  3 ++-
 tools/perf/util/cap.h   |  4 
 tools/perf/util/evsel.c | 10 +-
 tools/perf/util/util.c  |  1 +
 5 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index d5adc417a4ca..8096e9b5f4f9 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -284,10 +284,11 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
.events = POLLIN,
};
 
-   if (!perf_cap__capable(CAP_SYS_ADMIN)) {
+   if (!(perf_cap__capable(CAP_SYS_PERFMON) ||
+ perf_cap__capable(CAP_SYS_ADMIN))) {
pr_err("ftrace only works for %s!\n",
 #ifdef HAVE_LIBCAP_SUPPORT
-   "users with the SYS_ADMIN capability"
+   "users with the CAP_SYS_PERFMON or CAP_SYS_ADMIN capability"
 #else
"root"
 #endif
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index 0453ba26cdbd..71755b3e1303 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -258,7 +258,8 @@ gets schedule to. Per task counters can be created by any 
user, for
 their own tasks.
 
 A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
-all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+all events on CPU-x. Per CPU counters need CAP_SYS_PERFMON or
+CAP_SYS_ADMIN privilege.
 
 The 'flags' parameter is currently unused and must be zero.
 
diff --git a/tools/perf/util/cap.h b/tools/perf/util/cap.h
index 051dc590ceee..0f79fbf6638b 100644
--- a/tools/perf/util/cap.h
+++ b/tools/perf/util/cap.h
@@ -29,4 +29,8 @@ static inline bool perf_cap__capable(int cap __maybe_unused)
 #define CAP_SYSLOG 34
 #endif
 
+#ifndef CAP_SYS_PERFMON
+#define CAP_SYS_PERFMON 38
+#endif
+
 #endif /* __PERF_CAP_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index f4dea055b080..3a46325e3702 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -2468,14 +2468,14 @@ int perf_evsel__open_strerror(struct evsel *evsel, struct target *target,
 "You may not have permission to collect %sstats.\n\n"
 "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n"
 "which controls use of the performance events system by\n"
-"unprivileged users (without CAP_SYS_ADMIN).\n\n"
+"unprivileged users (without CAP_SYS_PERFMON or CAP_SYS_ADMIN).\n\n"
 "The current value is %d:\n\n"
 "  -1: Allow use of (almost) all events by all users\n"
 "  Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK\n"
-">= 0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN\n"
-"  Disallow raw tracepoint access by users without CAP_SYS_ADMIN\n"
-">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n"
-">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN\n\n"
+">= 0: Disallow ftrace function tracepoint by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n"
+"  Disallow raw tracepoint access by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n"
+">= 1: Disallow CPU event access by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n"
+">= 2: Disallow kernel profiling by users without CAP_SYS_PERFMON or CAP_SYS_ADMIN\n\n"
 "To make this setting permanent, edit /etc/sysctl.conf too, e.g.:\n\n"
 "  kernel.perf_event_paranoid = -1\n" ,
 target->system_wide ? "system-wide " : "",
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 969ae560dad9..9981db0d8d09 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -272,6 +272,7 @@ int perf_event_paranoid(void)
 bool perf_event_paranoid_check(int max_level)
 {
return perf_cap__capable(CAP_SYS_ADMIN) ||
+   perf_cap__capable(CAP_SYS_PERFMON) ||
perf_event_paranoid() <= max_level;
 }
 
-- 
2.20.1




[PATCH v4 5/9] trace/bpf_trace: open access for CAP_SYS_PERFMON privileged process

2019-12-18 Thread Alexey Budankov


Open access to bpf_trace monitoring for CAP_SYS_PERFMON privileged
processes. For backward compatibility reasons access to bpf_trace
monitoring remains open for CAP_SYS_ADMIN privileged processes but
CAP_SYS_ADMIN usage for secure bpf_trace monitoring is discouraged
with respect to CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 44bd08f2443b..bafe21ac6d92 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1272,7 +1272,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
u32 *ids, prog_cnt, ids_len;
int ret;
 
-   if (!capable(CAP_SYS_ADMIN))
+   if (!perfmon_capable())
return -EPERM;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
-- 
2.20.1




[PATCH v4 8/9] drivers/perf: open access for CAP_SYS_PERFMON privileged process

2019-12-18 Thread Alexey Budankov


Open access to monitoring for CAP_SYS_PERFMON privileged processes.
For backward compatibility reasons access to the monitoring remains open
for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure
monitoring is discouraged with respect to CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 
---
 drivers/perf/arm_spe_pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index 4e4984a55cd1..5dff81bc3324 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -274,7 +274,7 @@ static u64 arm_spe_event_to_pmscr(struct perf_event *event)
if (!attr->exclude_kernel)
reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT);
 
-   if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && capable(CAP_SYS_ADMIN))
+   if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable())
reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);
 
return reg;
@@ -700,7 +700,7 @@ static int arm_spe_pmu_event_init(struct perf_event *event)
return -EOPNOTSUPP;
 
reg = arm_spe_event_to_pmscr(event);
-   if (!capable(CAP_SYS_ADMIN) &&
+   if (!perfmon_capable() &&
(reg & (BIT(SYS_PMSCR_EL1_PA_SHIFT) |
BIT(SYS_PMSCR_EL1_CX_SHIFT) |
BIT(SYS_PMSCR_EL1_PCT_SHIFT
-- 
2.20.1




[PATCH v17 06/23] powerpc: mm: Add p?d_leaf() definitions

2019-12-18 Thread Steven Price
walk_page_range() is going to be allowed to walk page tables other than
those of user space. For this it needs to know when it has reached a
'leaf' entry in the page tables. This information is provided by the
p?d_leaf() functions/macros.

For powerpc p?d_is_leaf() functions already exist. Export them using the
new p?d_leaf() name.

CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: linuxppc-dev@lists.ozlabs.org
CC: kvm-...@vger.kernel.org
Signed-off-by: Steven Price 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b01624e5c467..201a69e6a355 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1355,18 +1355,21 @@ static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_va
  * Like pmd_huge() and pmd_large(), but works regardless of config options
  */
 #define pmd_is_leaf pmd_is_leaf
+#define pmd_leaf pmd_is_leaf
 static inline bool pmd_is_leaf(pmd_t pmd)
 {
return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
 }
 
 #define pud_is_leaf pud_is_leaf
+#define pud_leaf pud_is_leaf
 static inline bool pud_is_leaf(pud_t pud)
 {
return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
 }
 
 #define pgd_is_leaf pgd_is_leaf
+#define pgd_leaf pgd_is_leaf
 static inline bool pgd_is_leaf(pgd_t pgd)
 {
return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
-- 
2.20.1



Re: [PATCH 05/18] powerpc sstep: Prepare to support prefixed instructions

2019-12-18 Thread Daniel Axtens
Jordan Niethe  writes:

> Currently all instructions are a single word long. A future ISA version
> will include prefixed instructions which have a double word length. The
> functions used for analysing and emulating instructions need to be
> modified so that they can handle these new instruction types.
>
> A prefixed instruction is a word prefix followed by a word suffix. All
> prefixes uniquely have the primary op-code 1. Suffixes may be valid word
> instructions or instructions that only exist as suffixes.
>
> In handling prefixed instructions it will be convenient to treat the
> suffix and prefix as separate words. To facilitate this modify
> analyse_instr() and emulate_step() to take a take a suffix as a
> parameter. For word instructions it does not matter what is passed in
> here - it will be ignored.
>
> We also define a new flag, PREFIXED, to be used in instruction_op:type.
> This flag will indicate when emulating an analysed instruction if the
> NIP should be advanced by word length or double word length.
>
> The callers of analyse_instr() and emulate_step() will need their own
> changes to be able to support prefixed instructions. For now modify them
> to pass in 0 as a suffix.
>
> Note that at this point no prefixed instructions are emulated or
> analysed - this is just making it possible to do so.
>
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/ppc-opcode.h |  3 +++
>  arch/powerpc/include/asm/sstep.h  |  8 +--
>  arch/powerpc/include/asm/uaccess.h| 30 +++
>  arch/powerpc/kernel/align.c   |  2 +-
>  arch/powerpc/kernel/hw_breakpoint.c   |  4 ++--
>  arch/powerpc/kernel/kprobes.c |  2 +-
>  arch/powerpc/kernel/mce_power.c   |  2 +-
>  arch/powerpc/kernel/optprobes.c   |  2 +-
>  arch/powerpc/kernel/uprobes.c |  2 +-
>  arch/powerpc/kvm/emulate_loadstore.c  |  2 +-
>  arch/powerpc/lib/sstep.c  | 12 ++-
>  arch/powerpc/lib/test_emulate_step.c  | 30 +--
>  arch/powerpc/xmon/xmon.c  |  4 ++--
>  13 files changed, 71 insertions(+), 32 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
> b/arch/powerpc/include/asm/ppc-opcode.h
> index c1df75edde44..a1dfa4bdd22f 100644
> --- a/arch/powerpc/include/asm/ppc-opcode.h
> +++ b/arch/powerpc/include/asm/ppc-opcode.h
> @@ -377,6 +377,9 @@
>  #define PPC_INST_VCMPEQUD0x10c7
>  #define PPC_INST_VCMPEQUB0x1006
>  
> +/* macro to check if a word is a prefix */
> +#define IS_PREFIX(x) (((x) >> 26) == 1)
> +
>  /* macros to insert fields into opcodes */
>  #define ___PPC_RA(a) (((a) & 0x1f) << 16)
>  #define ___PPC_RB(b) (((b) & 0x1f) << 11)
> diff --git a/arch/powerpc/include/asm/sstep.h 
> b/arch/powerpc/include/asm/sstep.h
> index 769f055509c9..6d4cb602e231 100644
> --- a/arch/powerpc/include/asm/sstep.h
> +++ b/arch/powerpc/include/asm/sstep.h
> @@ -89,6 +89,9 @@ enum instruction_type {
>  #define VSX_LDLEFT   4   /* load VSX register from left */
>  #define VSX_CHECK_VEC8   /* check MSR_VEC not MSR_VSX for reg >= 
> 32 */
>  
> +/* Prefixed flag, ORed in with type */
> +#define PREFIXED 0x800
> +
>  /* Size field in type word */
>  #define SIZE(n)  ((n) << 12)
>  #define GETSIZE(w)   ((w) >> 12)
> @@ -132,7 +135,7 @@ union vsx_reg {
>   * otherwise.
>   */
>  extern int analyse_instr(struct instruction_op *op, const struct pt_regs 
> *regs,
> -  unsigned int instr);
> +  unsigned int instr, unsigned int sufx);
>

I'm not saying this is necessarily better, but did you consider:

 - making instr 64 bits and using masking and shifting macros to get the
   prefix and suffix?

 - defining an instruction type/struct/union/whatever that contains both
   halves in one object?

I'm happy to be told that it ends up being way, way uglier/worse/etc,
but I just thought I'd ask.
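
For illustration, a rough sketch of the second option -- all names here are
hypothetical, not taken from the series:

    /* Hypothetical: carry both halves of a (possibly prefixed) instruction
     * in one object instead of separate instr/sufx parameters. */
    struct ppc_inst {
            u32 prefix;     /* 0 for ordinary word instructions */
            u32 suffix;
    };

    static inline bool ppc_inst_prefixed(struct ppc_inst x)
    {
            return (x.prefix >> 26) == 1;   /* primary opcode 1 marks a prefix */
    }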

Regards,
Daniel

>  /*
>   * Emulate an instruction that can be executed just by updating
> @@ -149,7 +152,8 @@ void emulate_update_regs(struct pt_regs *reg, struct 
> instruction_op *op);
>   * 0 if it could not be emulated, or -1 for an instruction that
>   * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.).
>   */
> -extern int emulate_step(struct pt_regs *regs, unsigned int instr);
> +extern int emulate_step(struct pt_regs *regs, unsigned int instr,
> + unsigned int sufx);
>  
>  /*
>   * Emulate a load or store instruction by reading/writing the
> diff --git a/arch/powerpc/include/asm/uaccess.h 
> b/arch/powerpc/include/asm/uaccess.h
> index 15002b51ff18..bc585399e0c7 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -423,4 +423,34 @@ extern long __copy_from_user_flushcache(void *dst, const 
> void __user *src,
>  extern void memcpy_page_flushcache(char *to, struct page *page, size_t 
> offset,
>  size_t len);
>  
> +/*
> + * When 

Re: [PATCH v11 06/25] mm: fix get_user_pages_remote()'s handling of FOLL_LONGTERM

2019-12-18 Thread Kirill A. Shutemov
On Mon, Dec 16, 2019 at 02:25:18PM -0800, John Hubbard wrote:
> As it says in the updated comment in gup.c: current FOLL_LONGTERM
> behavior is incompatible with FAULT_FLAG_ALLOW_RETRY because of the
> FS DAX check requirement on vmas.
> 
> However, the corresponding restriction in get_user_pages_remote() was
> slightly stricter than is actually required: it forbade all
> FOLL_LONGTERM callers, but we can actually allow FOLL_LONGTERM callers
> that do not set the "locked" arg.
> 
> Update the code and comments to loosen the restriction, allowing
> FOLL_LONGTERM in some cases.
> 
> Also, copy the DAX check ("if a VMA is DAX, don't allow long term
> pinning") from the VFIO call site, all the way into the internals
> of get_user_pages_remote() and __gup_longterm_locked(). That is:
> get_user_pages_remote() calls __gup_longterm_locked(), which in turn
> calls check_dax_vmas(). This check will then be removed from the VFIO
> call site in a subsequent patch.
> 
> Thanks to Jason Gunthorpe for pointing out a clean way to fix this,
> and to Dan Williams for helping clarify the DAX refactoring.
> 
> Tested-by: Alex Williamson 
> Acked-by: Alex Williamson 
> Reviewed-by: Jason Gunthorpe 
> Reviewed-by: Ira Weiny 
> Suggested-by: Jason Gunthorpe 
> Cc: Dan Williams 
> Cc: Jerome Glisse 
> Signed-off-by: John Hubbard 
> ---
>  mm/gup.c | 27 ++-
>  1 file changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 3ecce297a47f..c0c56888e7cc 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -29,6 +29,13 @@ struct follow_page_context {
>   unsigned int page_mask;
>  };
>  
> +static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
> +   struct mm_struct *mm,
> +   unsigned long start,
> +   unsigned long nr_pages,
> +   struct page **pages,
> +   struct vm_area_struct **vmas,
> +   unsigned int flags);

Any particular reason for the forward declaration? Maybe move
get_user_pages_remote() down?

>  /*
>   * Return the compound head page with ref appropriately incremented,
>   * or NULL if that failed.
> @@ -1179,13 +1186,23 @@ long get_user_pages_remote(struct task_struct *tsk, 
> struct mm_struct *mm,
>   struct vm_area_struct **vmas, int *locked)
>  {
>   /*
> -  * FIXME: Current FOLL_LONGTERM behavior is incompatible with
> +  * Parts of FOLL_LONGTERM behavior are incompatible with
>* FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
> -  * vmas.  As there are no users of this flag in this call we simply
> -  * disallow this option for now.
> +  * vmas. However, this only comes up if locked is set, and there are
> +  * callers that do request FOLL_LONGTERM, but do not set locked. So,
> +  * allow what we can.
>*/
> - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
> - return -EINVAL;
> + if (gup_flags & FOLL_LONGTERM) {
> + if (WARN_ON_ONCE(locked))
> + return -EINVAL;
> + /*
> +  * This will check the vmas (even if our vmas arg is NULL)
> +  * and return -ENOTSUPP if DAX isn't allowed in this case:
> +  */
> + return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
> +  vmas, gup_flags | FOLL_TOUCH |
> +  FOLL_REMOTE);
> + }
>  
>   return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
>  locked,
> -- 
> 2.24.1
> 

-- 
 Kirill A. Shutemov


Re: [PATCH v6 05/10] mm/memory_hotplug: Shrink zones when offlining memory

2019-12-18 Thread David Hildenbrand
On 01.12.19 00:21, Andrew Morton wrote:
> On Sun, 27 Oct 2019 23:45:52 +0100 David Hildenbrand  wrote:
> 
>> I think I just found an issue with try_offline_node(). 
>> try_offline_node() is pretty much broken already (touches garbage 
>> memmaps and will not considers mixed NIDs within sections), however, 
>> relies on the node span to look for memory sections to probe. So it 
>> seems to rely on the nodes getting shrunk when removing memory, not when 
>> offlining.
>>
>> As we shrink the node span when offlining now and not when removing, 
>> this can go wrong once we offline the last memory block of the node and 
>> offline the last CPU. We could still have memory around that we could 
>> re-online, however, the node would already be offline. Unlikely, but 
>> possible.
>>
>> Note that the same is also broken without this patch in case memory is 
>> never onlined. The "pfn_to_nid(pfn) != nid" can easily succeed on the 
>> garbage memmap, resulting in  no memory being detected as belonging to 
>> the node. Also, resize_pgdat_range() is called when onlining memory, not 
>> when adding it. :/ Oh this is so broken :)
>>
>> The right fix is probably to walk over all memory blocks that could 
>> exist and test if they belong to the nid (if offline, check the 
>> block->nid, if online check all pageblocks). A fix we can then move in 
>> front of this patch.
>>
>> Will look into this this week.
> 
> And this series shows almost no sign of having been reviewed.  I'll hold
> it over for 5.6.
> 

Hi Andrew, any chance we can get the (now at least reviewed - thx Oscar)
fix in patch #5 into 5.5? (I want to do the final stable backports for
the uninitialized memmap stuff)

-- 
Thanks,

David / dhildenb



Re: [PATCH v11 01/25] mm/gup: factor out duplicate code from four routines

2019-12-18 Thread Kirill A. Shutemov
On Mon, Dec 16, 2019 at 02:25:13PM -0800, John Hubbard wrote:
> +static void put_compound_head(struct page *page, int refs)
> +{
> + /* Do a get_page() first, in case refs == page->_refcount */
> + get_page(page);
> + page_ref_sub(page, refs);
> + put_page(page);
> +}

It's not terribly efficient. Maybe something like:

VM_BUG_ON_PAGE(page_ref_count(page) < ref, page);
if (refs > 2)
page_ref_sub(page, refs - 1);
put_page(page);

?
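
Spelled out as a self-contained function, the suggested variant would look roughly
like this (a sketch only, assuming the comparison is meant to be against 'refs';
this is not the code that was posted):

static void put_compound_head(struct page *page, int refs)
{
	VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
	/*
	 * Drop all but one reference in a single page_ref_sub(), then let
	 * put_page() release the last one so the compound/devmap special
	 * cases in the put_page() path still run.
	 */
	if (refs > 1)
		page_ref_sub(page, refs - 1);
	put_page(page);
}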

-- 
 Kirill A. Shutemov


Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-12-18 Thread Kirill A. Shutemov
On Mon, Dec 16, 2019 at 02:25:16PM -0800, John Hubbard wrote:
> An upcoming patch changes and complicates the refcounting and
> especially the "put page" aspects of it. In order to keep
> everything clean, refactor the devmap page release routines:
> 
> * Rename put_devmap_managed_page() to page_is_devmap_managed(),
>   and limit the functionality to "read only": return a bool,
>   with no side effects.
> 
> * Add a new routine, put_devmap_managed_page(), to handle checking
>   what kind of page it is, and what kind of refcount handling it
>   requires.
> 
> * Rename __put_devmap_managed_page() to free_devmap_managed_page(),
>   and limit the functionality to unconditionally freeing a devmap
>   page.

What the reason to separate put_devmap_managed_page() from
free_devmap_managed_page() if free_devmap_managed_page() has exacly one
caller? Is it preparation for the next patches?

> This is originally based on a separate patch by Ira Weiny, which
> applied to an early version of the put_user_page() experiments.
> Since then, Jérôme Glisse suggested the refactoring described above.
> 
> Cc: Christoph Hellwig 
> Suggested-by: Jérôme Glisse 
> Reviewed-by: Dan Williams 
> Reviewed-by: Jan Kara 
> Signed-off-by: Ira Weiny 
> Signed-off-by: John Hubbard 
> ---
>  include/linux/mm.h | 17 +
>  mm/memremap.c  | 16 ++--
>  mm/swap.c  | 24 
>  3 files changed, 39 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c97ea3b694e6..77a4df06c8a7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -952,9 +952,10 @@ static inline bool is_zone_device_page(const struct page 
> *page)
>  #endif
>  
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page);
> +void free_devmap_managed_page(struct page *page);
>  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
> -static inline bool put_devmap_managed_page(struct page *page)
> +
> +static inline bool page_is_devmap_managed(struct page *page)
>  {
>   if (!static_branch_unlikely(&devmap_managed_key))
>   return false;
> @@ -963,7 +964,6 @@ static inline bool put_devmap_managed_page(struct page 
> *page)
>   switch (page->pgmap->type) {
>   case MEMORY_DEVICE_PRIVATE:
>   case MEMORY_DEVICE_FS_DAX:
> - __put_devmap_managed_page(page);
>   return true;
>   default:
>   break;
> @@ -971,7 +971,14 @@ static inline bool put_devmap_managed_page(struct page 
> *page)
>   return false;
>  }
>  
> +bool put_devmap_managed_page(struct page *page);
> +
>  #else /* CONFIG_DEV_PAGEMAP_OPS */
> +static inline bool page_is_devmap_managed(struct page *page)
> +{
> + return false;
> +}
> +
>  static inline bool put_devmap_managed_page(struct page *page)
>  {
>   return false;
> @@ -1028,8 +1035,10 @@ static inline void put_page(struct page *page)
>* need to inform the device driver through callback. See
>* include/linux/memremap.h and HMM for details.
>*/
> - if (put_devmap_managed_page(page))
> + if (page_is_devmap_managed(page)) {
> + put_devmap_managed_page(page);

put_devmap_managed_page() has yet another page_is_devmap_managed() check
inside. It looks strange.

>   return;
> + }
>  
>   if (put_page_testzero(page))
>   __put_page(page);
> diff --git a/mm/memremap.c b/mm/memremap.c
> index e899fa876a62..2ba773859031 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -411,20 +411,8 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
>  EXPORT_SYMBOL_GPL(get_dev_pagemap);
>  
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page)
> +void free_devmap_managed_page(struct page *page)
>  {
> - int count = page_ref_dec_return(page);
> -
> - /* still busy */
> - if (count > 1)
> - return;
> -
> - /* only triggered by the dev_pagemap shutdown path */
> - if (count == 0) {
> - __put_page(page);
> - return;
> - }
> -
>   /* notify page idle for dax */
>   if (!is_device_private_page(page)) {
>   wake_up_var(&page->_refcount);
> @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page)
>   page->mapping = NULL;
>   page->pgmap->ops->page_free(page);
>  }
> -EXPORT_SYMBOL(__put_devmap_managed_page);
> +EXPORT_SYMBOL(free_devmap_managed_page);
>  #endif /* CONFIG_DEV_PAGEMAP_OPS */
> diff --git a/mm/swap.c b/mm/swap.c
> index 5341ae93861f..49f7c2eea0ba 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -1102,3 +1102,27 @@ void __init swap_setup(void)
>* _really_ don't want to cluster much more
>*/
>  }
> +
> +#ifdef CONFIG_DEV_PAGEMAP_OPS
> +bool put_devmap_managed_page(struct page *page)
> +{
> + bool is_devmap = page_is_devmap_managed(page);
> +
> + if (is_devmap) {

Reversing the condition would save you an indentation level.

> + int count = 
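
For illustration, the reversed-condition form would read roughly like this (a
sketch of the idea only, not the posted hunk):

bool put_devmap_managed_page(struct page *page)
{
	int count;

	if (!page_is_devmap_managed(page))
		return false;

	count = page_ref_dec_return(page);
	/*
	 * devmap page refcounts are 1-based: a count of 1 means the page is
	 * free; 0 only happens on the dev_pagemap shutdown path.
	 */
	if (count == 1)
		free_devmap_managed_page(page);
	else if (!count)
		__put_page(page);

	return true;
}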

Re: [PATCH 18/18] powerpc/fault: Use analyse_instr() to check for store with updates to sp

2019-12-18 Thread Daniel Axtens
Jordan Niethe  writes:

> A user-mode access to an address a long way below the stack pointer is
> only valid if the instruction is one that would update the stack pointer
> to the address accessed. This is checked by directly looking at the
> instruction's op-code. As a result it does not take into account prefixed
> instructions. Instead of looking at the instruction ourselves, use
> analyse_instr() to determine if this is a store instruction that will update
> the stack pointer.
>
> Something to note is that there currently are not any store with update
> prefixed instructions. Actually there is no plan for prefixed
> update-form loads and stores. So this patch is probably not needed but
> it might be preferable to use analyse_instr() rather than open coding
> the test anyway.

Yes please. I was looking through this code recently and was
horrified. This improves things a lot and I think is justification
enough as-is.

Regards,
Daniel
>
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/mm/fault.c | 39 +++
>  1 file changed, 11 insertions(+), 28 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index b5047f9b5dec..cb78b3ca1800 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -41,37 +41,17 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  /*
>   * Check whether the instruction inst is a store using
>   * an update addressing form which will update r1.
>   */
> -static bool store_updates_sp(unsigned int inst)
> +static bool store_updates_sp(struct instruction_op *op)
>  {
> - /* check for 1 in the rA field */
> - if (((inst >> 16) & 0x1f) != 1)
> - return false;
> - /* check major opcode */
> - switch (inst >> 26) {
> - case OP_STWU:
> - case OP_STBU:
> - case OP_STHU:
> - case OP_STFSU:
> - case OP_STFDU:
> - return true;
> - case OP_STD:/* std or stdu */
> - return (inst & 3) == 1;
> - case OP_31:
> - /* check minor opcode */
> - switch ((inst >> 1) & 0x3ff) {
> - case OP_31_XOP_STDUX:
> - case OP_31_XOP_STWUX:
> - case OP_31_XOP_STBUX:
> - case OP_31_XOP_STHUX:
> - case OP_31_XOP_STFSUX:
> - case OP_31_XOP_STFDUX:
> + if (GETTYPE(op->type) == STORE) {
> + if ((op->type & UPDATE) && (op->update_reg == 1))
>   return true;
> - }
>   }
>   return false;
>  }
> @@ -278,14 +258,17 @@ static bool bad_stack_expansion(struct pt_regs *regs, 
> unsigned long address,
>  
>   if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
>   access_ok(nip, sizeof(*nip))) {
> - unsigned int inst;
> + unsigned int inst, sufx;
> + struct instruction_op op;
>   int res;
>  
>   pagefault_disable();
> - res = __get_user_inatomic(inst, nip);
> + res = __get_user_instr_inatomic(inst, sufx, nip);
>   pagefault_enable();
> - if (!res)
> - return !store_updates_sp(inst);
> + if (!res) {
> + analyse_instr(&op, uregs, inst, sufx);
> + return !store_updates_sp(&op);
> + }
>   *must_retry = true;
>   }
>   return true;
> -- 
> 2.20.1


Re: [PATCH 08/18] powerpc sstep: Add support for prefixed VSX load/stores

2019-12-18 Thread Daniel Axtens
Jordan Niethe  writes:

> This adds emulation support for the following prefixed VSX load/stores:
>   * Prefixed Load VSX Scalar Doubleword (plxsd)
>   * Prefixed Load VSX Scalar Single-Precision (plxssp)
>   * Prefixed Load VSX Vector [0|1]  (plxv, plxv0, plxv1)
>   * Prefixed Store VSX Scalar Doubleword (pstxsd)
>   * Prefixed Store VSX Scalar Single-Precision (pstxssp)
>   * Prefixed Store VSX Vector [0|1] (pstxv, pstxv0, pstxv1)
>
> Signed-off-by: Jordan Niethe 

Take this with a grain of salt, but I would probably squish the 3
load/store patches into one.

Part of my hesitation is that I think you also need some sstep tests for
these new instructions - if they massively bloat the patches I might
keep them as separate patches.

I'd also like to see a test for your next patch.

Regards,
Daniel

> ---
>  arch/powerpc/lib/sstep.c | 42 
>  1 file changed, 42 insertions(+)
>
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 9113b9a21ae9..9ae8d177b67f 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -2713,6 +2713,48 @@ int analyse_instr(struct instruction_op *op, const 
> struct pt_regs *regs,
>   case 41:/* plwa */
>   op->type = MKOP(LOAD, PREFIXED | SIGNEXT, 4);
>   break;
> + case 42:/* plxsd */
> + op->reg = rd + 32;
> + op->type = MKOP(LOAD_VSX, PREFIXED, 8);
> + op->element_size = 8;
> + op->vsx_flags = VSX_CHECK_VEC;
> + break;
> + case 43:/* plxssp */
> + op->reg = rd + 32;
> + op->type = MKOP(LOAD_VSX, PREFIXED, 4);
> + op->element_size = 8;
> + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC;
> + break;
> + case 46:/* pstxsd */
> + op->reg = rd + 32;
> + op->type = MKOP(STORE_VSX, PREFIXED, 8);
> + op->element_size = 8;
> + op->vsx_flags = VSX_CHECK_VEC;
> + break;
> + case 47:/* pstxssp */
> + op->reg = rd + 32;
> + op->type = MKOP(STORE_VSX, PREFIXED, 4);
> + op->element_size = 8;
> + op->vsx_flags = VSX_FPCONV | VSX_CHECK_VEC;
> + break;
> + case 51:/* plxv1 */
> + op->reg += 32;
> +
> + /* fallthru */
> + case 50:/* plxv0 */
> + op->type = MKOP(LOAD_VSX, PREFIXED, 16);
> + op->element_size = 16;
> + op->vsx_flags = VSX_CHECK_VEC;
> + break;
> + case 55:/* pstxv1 */
> + op->reg = rd + 32;
> +
> + /* fallthru */
> + case 54:/* pstxv0 */
> + op->type = MKOP(STORE_VSX, PREFIXED, 16);
> + op->element_size = 16;
> + op->vsx_flags = VSX_CHECK_VEC;
> + break;
>   case 56:/* plq */
>   op->type = MKOP(LOAD, PREFIXED, 16);
>   break;
> -- 
> 2.20.1


Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-18 Thread Peter Zijlstra
On Thu, Dec 19, 2019 at 12:13:48AM +1100, Michael Ellerman wrote:

> >> I'm a little confused though; if nohash is a software TLB fill, why do
> >> you need a TLBI for tables?
> >> 
> >
> > nohash (AKA book3e) has different mmu modes. I don't follow all the 
> > details w.r.t book3e. Paul or Michael might be able to explain the need 
> > for table flush with book3e.
> 
> Some of the Book3E CPUs have a partial hardware table walker. The IBM one (A2)
> did, before we ripped that support out. And the Freescale (NXP) e6500
> does, see eg:
> 
>   28efc35fe68d ("powerpc/e6500: TLB miss handler with hardware tablewalk 
> support")
> 
> They only support walking one level IIRC, ie. you can create a TLB entry
> that points to a PTE page, and the hardware will dereference that to get
> a PTE and load that into the TLB.

Shiny!, all the embedded goodness. Thanks for the info.


[PATCH v15 14/23] selftests/vm/pkeys: Fix assertion in test_pkey_alloc_exhaust()

2019-12-18 Thread Sandipan Das
From: Ram Pai 

Some pkeys which are valid on the hardware are reserved
and not available for application use. These keys cannot
be allocated.

test_pkey_alloc_exhaust() tries to account for these and
has an assertion which validates if all available pkeys
have been exhaustively allocated. However, the expression
that is currently used is only valid for x86. On powerpc,
a pkey is additionally reserved as compared to x86. Hence,
the assertion is made to use an arch-specific helper to
get the correct count of reserved pkeys.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 1920bca84def..8d90cfe2c9bd 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1152,6 +1152,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
dprintf3("%s()::%d\n", __func__, __LINE__);
 
/*
+* On x86:
 * There are 16 pkeys supported in hardware.  Three are
 * allocated by the time we get here:
 *   1. The default key (0)
@@ -1159,8 +1160,16 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
 *   3. One allocated by the test code and passed in via
 *  'pkey' to this function.
 * Ensure that we can allocate at least another 13 (16-3).
+*
+* On powerpc:
+* There are either 5 or 32 pkeys supported in hardware
+* depending on the page size (4K or 64K). Four are
+* allocated by the time we get here. This includes
+* pkey-0, pkey-1, exec-only pkey and the one allocated
+* by the test code.
+* Ensure that we can allocate the remaining.
 */
-   pkey_assert(i >= NR_PKEYS-3);
+   pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
 
for (i = 0; i < nr_allocated_pkeys; i++) {
err = sys_pkey_free(allocated_pkeys[i]);
-- 
2.17.1
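
For comparison, the powerpc counterpart introduced later in this series is along
these lines (shown only to illustrate the arch-specific helper the assertion now
relies on; the NR_RESERVED_PKEYS_* constants live in pkey-powerpc.h):

static inline int get_arch_reserved_keys(void)
{
	/* The number of keys unavailable to userspace differs with page size. */
	if (sysconf(_SC_PAGESIZE) == 4096)
		return NR_RESERVED_PKEYS_4KB;
	else
		return NR_RESERVED_PKEYS_64KB;
}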



[PATCH v15 18/23] selftests/vm/pkeys: Detect write violation on a mapped access-denied-key page

2019-12-18 Thread Sandipan Das
From: Ram Pai 

Detect write-violation on a page to which access-disabled
key is associated much after the page is mapped.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index ff207b765afd..176625ded549 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1026,6 +1026,18 @@ void test_write_of_access_disabled_region(int *ptr, u16 
pkey)
*ptr = __LINE__;
expected_pkey_fault(pkey);
 }
+
+void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   *ptr = __LINE__;
+   dprintf1("disabling access; after accessing the page, "
+   " to PKEY[%02d], doing write\n", pkey);
+   pkey_access_deny(pkey);
+   *ptr = __LINE__;
+   expected_pkey_fault(pkey);
+}
+
 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
 {
int ret;
@@ -1422,6 +1434,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_write_of_write_disabled_region,
test_write_of_write_disabled_region_with_page_already_mapped,
test_write_of_access_disabled_region,
+   test_write_of_access_disabled_region_with_page_already_mapped,
test_kernel_write_of_access_disabled_region,
test_kernel_write_of_write_disabled_region,
test_kernel_gup_of_access_disabled_region,
-- 
2.17.1



[PATCH v15 20/23] selftests/vm/pkeys: Test correct behaviour of pkey-0

2019-12-18 Thread Sandipan Das
From: Ram Pai 

Ensure that pkey-0 is allocated on start and that it can
be attached dynamically in various modes, without failures.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 53 
 1 file changed, 53 insertions(+)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 5faf52ba7b8f..af577151d378 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -963,6 +963,58 @@ __attribute__((noinline)) int read_ptr(int *ptr)
return *ptr;
 }
 
+void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
+{
+   int i, err;
+   int max_nr_pkey_allocs;
+   int alloced_pkeys[NR_PKEYS];
+   int nr_alloced = 0;
+   long size;
+
+   pkey_assert(pkey_last_malloc_record);
+   size = pkey_last_malloc_record->size;
+   /*
+* This is a bit of a hack.  But mprotect() requires
+* huge-page-aligned sizes when operating on hugetlbfs.
+* So, make sure that we use something that's a multiple
+* of a huge page when we can.
+*/
+   if (size >= HPAGE_SIZE)
+   size = HPAGE_SIZE;
+
+   /* allocate every possible key and make sure key-0 never got allocated 
*/
+   max_nr_pkey_allocs = NR_PKEYS;
+   for (i = 0; i < max_nr_pkey_allocs; i++) {
+   int new_pkey = alloc_pkey();
+   pkey_assert(new_pkey != 0);
+
+   if (new_pkey < 0)
+   break;
+   alloced_pkeys[nr_alloced++] = new_pkey;
+   }
+   /* free all the allocated keys */
+   for (i = 0; i < nr_alloced; i++) {
+   int free_ret;
+
+   if (!alloced_pkeys[i])
+   continue;
+   free_ret = sys_pkey_free(alloced_pkeys[i]);
+   pkey_assert(!free_ret);
+   }
+
+   /* attach key-0 in various modes */
+   err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
+   pkey_assert(!err);
+}
+
 void test_read_of_write_disabled_region(int *ptr, u16 pkey)
 {
int ptr_contents;
@@ -1447,6 +1499,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_pkey_syscalls_on_non_allocated_pkey,
test_pkey_syscalls_bad_args,
test_pkey_alloc_exhaust,
+   test_pkey_alloc_free_attach_pkey0,
 };
 
 void run_tests_once(void)
-- 
2.17.1



Re: [PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-18 Thread Peter Zijlstra
On Wed, Dec 18, 2019 at 11:05:29AM +0530, Aneesh Kumar K.V wrote:
> From: Peter Zijlstra 
> 
> Architectures for which we have hardware walkers of Linux page table should
> flush TLB on mmu gather batch allocation failures and batch flush. Some
> architectures like POWER supports multiple translation modes (hash and radix)

nohash, hash and radix in fact :-)

> and in the case of POWER only radix translation mode needs the above TLBI.

> This is because for hash translation mode kernel wants to avoid this extra
> flush since there are no hardware walkers of linux page table. With radix
> translation, the hardware also walks linux page table and with that, kernel
> needs to make sure to TLB invalidate page walk cache before page table pages 
> are
> freed.
> 
> More details in
> commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for 
> RCU_TABLE_FREE")
> 
> Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes")
> Signed-off-by: Peter Zijlstra (Intel) 
> Signed-off-by: Aneesh Kumar K.V 
> ---

> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
> index b2c0be93929d..7f3a8b902325 100644
> --- a/arch/powerpc/include/asm/tlb.h
> +++ b/arch/powerpc/include/asm/tlb.h
> @@ -26,6 +26,17 @@
>  
>  #define tlb_flush tlb_flush
>  extern void tlb_flush(struct mmu_gather *tlb);
> +/*
> + * book3s:
> + * Hash does not use the linux page-tables, so we can avoid
> + * the TLB invalidate for page-table freeing, Radix otoh does use the
> + * page-tables and needs the TLBI.
> + *
> + * nohash:
> + * We still do TLB invalidate in the __pte_free_tlb routine before we
> + * add the page table pages to mmu gather table batch.

I'm a little confused though; if nohash is a software TLB fill, why do
you need a TLBI for tables?

> + */
> +#define tlb_needs_table_invalidate() radix_enabled()
>  
>  /* Get the generic bits... */
>  #include 
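
For context, the generic code consumes this override roughly as follows (a sketch
of the asm-generic/tlb.h side after this series, not part of the quoted hunk):

#ifndef tlb_needs_table_invalidate
#define tlb_needs_table_invalidate() (true)
#endif

static void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (tlb_needs_table_invalidate()) {
		/*
		 * Invalidate page-table caches used by hardware walkers; the
		 * RCU-sched wait before actually freeing the pages is still
		 * needed for software walkers.
		 */
		tlb_flush_mmu_tlbonly(tlb);
	}
}

so an architecture that defines tlb_needs_table_invalidate() as radix_enabled()
skips the extra flush entirely when running in hash mode.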


Re: [PATCH v2 3/3] asm-generic/tlb: Avoid potential double flush

2019-12-18 Thread Peter Zijlstra
On Wed, Dec 18, 2019 at 11:05:30AM +0530, Aneesh Kumar K.V wrote:

> --- a/include/asm-generic/tlb.h
> +++ b/include/asm-generic/tlb.h
> @@ -402,7 +402,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct 
> vm_area_struct *vma) { }
>  
>  static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
>  {
> - if (!tlb->end)
> + /*
> +  * Anything calling __tlb_adjust_range() also sets at least one of
> +  * these bits.
> +  */
> + if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
> +   tlb->cleared_puds || tlb->cleared_p4ds))
>   return;

FWIW I looked at the GCC generated assembly output for this (x86_64) and
it did a single load and mask as expected.


[PATCH v15 12/23] selftests/vm/pkeys: Introduce generic pkey abstractions

2019-12-18 Thread Sandipan Das
From: Ram Pai 

This introduces some generic abstractions and provides
the corresponding architecture-specific implementations
for these abstractions.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h| 12 
 tools/testing/selftests/vm/pkey-x86.h| 15 +++
 tools/testing/selftests/vm/protection_keys.c |  8 ++--
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index bd90a49a3229..c929cc09b3cc 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -74,6 +74,9 @@ extern void abort_hooks(void);
}   \
 } while (0)
 
+__attribute__((noinline)) int read_ptr(int *ptr);
+void expected_pkey_fault(int pkey);
+
 #if defined(__i386__) || defined(__x86_64__) /* arch */
 #include "pkey-x86.h"
 #else /* arch */
@@ -173,4 +176,13 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
 #define __stringify_1(x...) #x
 #define __stringify(x...)   __stringify_1(x)
 
+static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
+{
+#ifdef si_pkey
+   return &si->si_pkey;
+#else
+   return (u32 *)(((u8 *)si) + si_pkey_offset);
+#endif
+}
+
 #endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index 4937f48f77cc..bb2d4dfcacd5 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -42,6 +42,7 @@
 #endif
 
 #define NR_PKEYS   16
+#define NR_RESERVED_PKEYS  2 /* pkey-0 and exec-only-pkey */
 #define PKEY_BITS_PER_PKEY 2
 #define HPAGE_SIZE (1UL<<21)
 #define PAGE_SIZE  4096
@@ -160,4 +161,18 @@ int pkey_reg_xstate_offset(void)
return xstate_offset;
 }
 
+static inline int get_arch_reserved_keys(void)
+{
+   return NR_RESERVED_PKEYS;
+}
+
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+   int ptr_contents;
+
+   ptr_contents = read_ptr(p1);
+   dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+   expected_pkey_fault(pkey);
+}
+
 #endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 2327cfa0b836..dbc2918bec13 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1306,9 +1306,7 @@ void test_executing_on_unreadable_memory(int *ptr, u16 
pkey)
madvise(p1, PAGE_SIZE, MADV_DONTNEED);
lots_o_noops_around_write();
do_not_expect_pkey_fault("executing on PROT_EXEC memory");
-   ptr_contents = read_ptr(p1);
-   dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-   expected_pkey_fault(pkey);
+   expect_fault_on_read_execonly_key(p1, pkey);
 }
 
 void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
@@ -1335,9 +1333,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, 
u16 pkey)
madvise(p1, PAGE_SIZE, MADV_DONTNEED);
lots_o_noops_around_write();
do_not_expect_pkey_fault("executing on PROT_EXEC memory");
-   ptr_contents = read_ptr(p1);
-   dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
-   expected_pkey_fault(UNKNOWN_PKEY);
+   expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
 
/*
 * Put the memory back to non-PROT_EXEC.  Should clear the
-- 
2.17.1



[PATCH v15 19/23] selftests/vm/pkeys: Introduce a sub-page allocator

2019-12-18 Thread Sandipan Das
From: Ram Pai 

This introduces a new allocator that allocates 4K hardware
pages to back 64K linux pages. This allocator is available
only on powerpc.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h|  6 +
 tools/testing/selftests/vm/pkey-powerpc.h| 25 
 tools/testing/selftests/vm/pkey-x86.h|  5 
 tools/testing/selftests/vm/protection_keys.c |  1 +
 4 files changed, 37 insertions(+)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 43299435f24c..6a03faf1ef44 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -28,6 +28,9 @@
 extern int dprint_in_signal;
 extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 
+extern int test_nr;
+extern int iteration_nr;
+
 #ifdef __GNUC__
 __attribute__((format(printf, 1, 2)))
 #endif
@@ -78,6 +81,9 @@ __attribute__((noinline)) int read_ptr(int *ptr);
 void expected_pkey_fault(int pkey);
 int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
 int sys_pkey_free(unsigned long pkey);
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+   unsigned long pkey);
+void record_pkey_malloc(void *ptr, long size, int prot);
 
 #if defined(__i386__) || defined(__x86_64__) /* arch */
 #include "pkey-x86.h"
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
index a43d7be85a27..f7a20bd07870 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -91,4 +91,29 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey)
 /* 8-bytes of instruction * 16384bytes = 1 page */
 #define __page_o_noops() asm(".rept 16384 ; nop; .endr")
 
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+   void *ptr;
+   int ret;
+
+   dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+   size, prot, pkey);
+   pkey_assert(pkey < NR_PKEYS);
+   ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+   pkey_assert(ptr != (void *)-1);
+
+   ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+   if (ret) {
+   perror("subpage_perm");
+   return PTR_ERR_ENOTSUP;
+   }
+
+   ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+   pkey_assert(!ret);
+   record_pkey_malloc(ptr, size, prot);
+
+   dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+   return ptr;
+}
+
 #endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index e5fdee39a7d8..87b85f60b43f 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -175,4 +175,9 @@ void expect_fault_on_read_execonly_key(void *p1, int pkey)
expected_pkey_fault(pkey);
 }
 
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+   return PTR_ERR_ENOTSUP;
+}
+
 #endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 176625ded549..5faf52ba7b8f 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -844,6 +844,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
 void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
 
malloc_pkey_with_mprotect,
+   malloc_pkey_with_mprotect_subpage,
malloc_pkey_anon_huge,
malloc_pkey_hugetlb
 /* can not do direct with the pkey_mprotect() API:
-- 
2.17.1



[PATCH v15 09/23] selftests/vm/pkeys: Fix assertion in pkey_disable_set/clear()

2019-12-18 Thread Sandipan Das
From: Ram Pai 

In some cases, a pkey's bits need not necessarily change
in a way that the value of the pkey register increases
when performing a pkey_disable_set() or decreases when
performing a pkey_disable_clear().

For example, on powerpc, if a pkey's current state is
PKEY_DISABLE_ACCESS and we perform a pkey_write_disable()
on it, the bits still remain the same. We will observe
something similar when the pkey's current state is 0 and
a pkey_access_enable() is performed on it.

Either case would cause some assertions to fail. This
fixes the problem.
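
To make that concrete with the powerpc bit encodings (where PKEY_DISABLE_ACCESS
is 0x3 and PKEY_DISABLE_WRITE is 0x2, as defined later in this series):

	key bits before      : 0x3              (access, and hence write, disabled)
	pkey_write_disable() : 0x3 | 0x2 = 0x3  -> pkey register value unchanged

so the old strict '>' assertion fails even though the operation is correct,
while '>=' still holds. The pkey_disable_clear() case with a key whose bits are
already 0 is the mirror image for '<=' versus '<'.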

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 9a6c95b220cc..fbee0b061851 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -399,7 +399,7 @@ void pkey_disable_set(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n",
__func__, pkey, read_pkey_reg());
if (flags)
-   pkey_assert(read_pkey_reg() > orig_pkey_reg);
+   pkey_assert(read_pkey_reg() >= orig_pkey_reg);
dprintf1("END<---%s(%d, 0x%x)\n", __func__,
pkey, flags);
 }
@@ -430,7 +430,7 @@ void pkey_disable_clear(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n", __func__,
pkey, read_pkey_reg());
if (flags)
-   assert(read_pkey_reg() < orig_pkey_reg);
+   assert(read_pkey_reg() <= orig_pkey_reg);
 }
 
 void pkey_write_allow(int pkey)
-- 
2.17.1



[PATCH v15 10/23] selftests/vm/pkeys: Fix alloc_random_pkey() to make it really random

2019-12-18 Thread Sandipan Das
From: Ram Pai 

alloc_random_pkey() was allocating the same pkey every
time. Not all pkeys were getting tested. This fixes it.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index fbee0b061851..d3c13283bbd0 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -24,6 +24,7 @@
 #define _GNU_SOURCE
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -545,10 +546,10 @@ int alloc_random_pkey(void)
int nr_alloced = 0;
int random_index;
memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+   srand((unsigned int)time(NULL));
 
/* allocate every possible key and make a note of which ones we got */
max_nr_pkey_allocs = NR_PKEYS;
-   max_nr_pkey_allocs = 1;
for (i = 0; i < max_nr_pkey_allocs; i++) {
int new_pkey = alloc_pkey();
if (new_pkey < 0)
-- 
2.17.1



[PATCH v15 22/23] selftests/vm/pkeys: Override access right definitions on powerpc

2019-12-18 Thread Sandipan Das
From: Ram Pai 

Some platforms hardcode the x86 values for PKEY_DISABLE_ACCESS
and PKEY_DISABLE_WRITE such as those in:
 /usr/include/bits/mman-shared.h.

This overrides the definitions with correct values for powerpc.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-powerpc.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
index a70577045074..3cd8e03fd640 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -16,11 +16,13 @@
 #define fpregs fp_regs
 #define si_pkey_offset 0x20
 
-#ifndef PKEY_DISABLE_ACCESS
+#ifdef PKEY_DISABLE_ACCESS
+#undef PKEY_DISABLE_ACCESS
 # define PKEY_DISABLE_ACCESS   0x3  /* disable read and write */
 #endif
 
-#ifndef PKEY_DISABLE_WRITE
+#ifdef PKEY_DISABLE_WRITE
+#undef PKEY_DISABLE_WRITE
 # define PKEY_DISABLE_WRITE0x2
 #endif
 
-- 
2.17.1



[PATCH v15 02/23] selftests/vm/pkeys: Rename all references to pkru to a generic name

2019-12-18 Thread Sandipan Das
From: Ram Pai 

This renames pkru references to "pkey_reg" or "pkey" based on
the usage.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Reviewed-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h|  85 +++
 tools/testing/selftests/vm/protection_keys.c | 240 ++-
 2 files changed, 170 insertions(+), 155 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 254e5436bdd9..d5779be4793f 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -14,7 +14,7 @@
 #include 
 
 #define NR_PKEYS 16
-#define PKRU_BITS_PER_PKEY 2
+#define PKEY_BITS_PER_PKEY 2
 
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
@@ -53,85 +53,88 @@ static inline void sigsafe_printf(const char *format, ...)
 #define dprintf3(args...) dprintf_level(3, args)
 #define dprintf4(args...) dprintf_level(4, args)
 
-extern unsigned int shadow_pkru;
-static inline unsigned int __rdpkru(void)
+extern unsigned int shadow_pkey_reg;
+static inline unsigned int __read_pkey_reg(void)
 {
unsigned int eax, edx;
unsigned int ecx = 0;
-   unsigned int pkru;
+   unsigned int pkey_reg;
 
asm volatile(".byte 0x0f,0x01,0xee\n\t"
 : "=a" (eax), "=d" (edx)
 : "c" (ecx));
-   pkru = eax;
-   return pkru;
+   pkey_reg = eax;
+   return pkey_reg;
 }
 
-static inline unsigned int _rdpkru(int line)
+static inline unsigned int _read_pkey_reg(int line)
 {
-   unsigned int pkru = __rdpkru();
+   unsigned int pkey_reg = __read_pkey_reg();
 
-   dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
-   line, pkru, shadow_pkru);
-   assert(pkru == shadow_pkru);
+   dprintf4("read_pkey_reg(line=%d) pkey_reg: %x shadow: %x\n",
+   line, pkey_reg, shadow_pkey_reg);
+   assert(pkey_reg == shadow_pkey_reg);
 
-   return pkru;
+   return pkey_reg;
 }
 
-#define rdpkru() _rdpkru(__LINE__)
+#define read_pkey_reg() _read_pkey_reg(__LINE__)
 
-static inline void __wrpkru(unsigned int pkru)
+static inline void __write_pkey_reg(unsigned int pkey_reg)
 {
-   unsigned int eax = pkru;
+   unsigned int eax = pkey_reg;
unsigned int ecx = 0;
unsigned int edx = 0;
 
-   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   dprintf4("%s() changing %08x to %08x\n", __func__,
+   __read_pkey_reg(), pkey_reg);
asm volatile(".byte 0x0f,0x01,0xef\n\t"
 : : "a" (eax), "c" (ecx), "d" (edx));
-   assert(pkru == __rdpkru());
+   assert(pkey_reg == __read_pkey_reg());
 }
 
-static inline void wrpkru(unsigned int pkru)
+static inline void write_pkey_reg(unsigned int pkey_reg)
 {
-   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   dprintf4("%s() changing %08x to %08x\n", __func__,
+   __read_pkey_reg(), pkey_reg);
/* will do the shadow check for us: */
-   rdpkru();
-   __wrpkru(pkru);
-   shadow_pkru = pkru;
-   dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+   read_pkey_reg();
+   __write_pkey_reg(pkey_reg);
+   shadow_pkey_reg = pkey_reg;
+   dprintf4("%s(%08x) pkey_reg: %08x\n", __func__,
+   pkey_reg, __read_pkey_reg());
 }
 
 /*
  * These are technically racy. since something could
- * change PKRU between the read and the write.
+ * change PKEY register between the read and the write.
  */
 static inline void __pkey_access_allow(int pkey, int do_allow)
 {
-   unsigned int pkru = rdpkru();
+   unsigned int pkey_reg = read_pkey_reg();
int bit = pkey * 2;
 
if (do_allow)
-   pkru &= (1<<bit);
-   dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
-   __rdpkru(), shadow_pkru);
+   dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__,
+   __read_pkey_reg(), shadow_pkey_reg);
 
trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
@@ -289,19 +289,19 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
 */
fpregs += 0x70;
 #endif
-   pkru_ptr = (void *)(&fpregs[pkru_offset]);
+   pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
+   pkey_reg_offset = pkey_reg_xstate_offset();
+   pkey_reg_ptr = (void *)([pkey_reg_offset]);
 
dprintf1("siginfo: %p\n", si);
dprintf1(" fpregs: %p\n", fpregs);
/*
-* If we got a PKRU fault, we *HAVE* to have at least one bit set in
+* If we got a PKEY fault, we *HAVE* to have at least one bit set in
 * here.
 */
-   dprintf1("pkru_xstate_offset: %d\n", 

[PATCH v15 16/23] selftests/vm/pkeys: Associate key on a mapped page and detect access violation

2019-12-18 Thread Sandipan Das
From: Ram Pai 

Detect access-violation on a page to which access-disabled
key is associated much after the page is mapped.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 7bceb98662c1..f63b9f55b3a7 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -983,6 +983,24 @@ void test_read_of_access_disabled_region(int *ptr, u16 
pkey)
dprintf1("*ptr: %d\n", ptr_contents);
expected_pkey_fault(pkey);
 }
+
+void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   int ptr_contents;
+
+   dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+   pkey, ptr);
+   ptr_contents = read_ptr(ptr);
+   dprintf1("reading ptr before disabling the read : %d\n",
+   ptr_contents);
+   read_pkey_reg();
+   pkey_access_deny(pkey);
+   ptr_contents = read_ptr(ptr);
+   dprintf1("*ptr: %d\n", ptr_contents);
+   expected_pkey_fault(pkey);
+}
+
 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
 {
dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
@@ -1389,6 +1407,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
 void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_read_of_write_disabled_region,
test_read_of_access_disabled_region,
+   test_read_of_access_disabled_region_with_page_already_mapped,
test_write_of_write_disabled_region,
test_write_of_access_disabled_region,
test_kernel_write_of_access_disabled_region,
-- 
2.17.1



[PATCH v15 23/23] selftests: vm: pkeys: Use the correct page size on powerpc

2019-12-18 Thread Sandipan Das
Both 4K and 64K pages are supported on powerpc. Parts of
the selftest code perform alignment computations based on
the PAGE_SIZE macro which is currently hardcoded to 64K
for powerpc. This causes some test failures on kernels
configured with 4K page size.

This problem is solved by determining the correct page
size during the build process rather than hardcoding it
in the header file.

cc: Dave Hansen 
cc: Florian Weimer 
cc: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/Makefile   | 4 
 tools/testing/selftests/vm/pkey-powerpc.h | 1 -
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index 4e9c741be6af..ada3a67eaac6 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -4,6 +4,10 @@ uname_M := $(shell uname -m 2>/dev/null || echo not)
 ARCH ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/')
 
 CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
+ifneq (,$(filter $(ARCH), ppc64 ppc64le))
+protection_keys: EXTRA_CFLAGS += -DPAGE_SIZE=$(shell getconf PAGESIZE)
+endif
+
 LDLIBS = -lrt
 TEST_GEN_FILES = compaction_test
 TEST_GEN_FILES += gup_benchmark
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
index 3cd8e03fd640..07fa9f529014 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -36,7 +36,6 @@
 pkey-31 and exec-only key */
 #define PKEY_BITS_PER_PKEY 2
 #define HPAGE_SIZE (1UL << 24)
-#define PAGE_SIZE  (1UL << 16)
 #define pkey_reg_t u64
 #define PKEY_REG_FMT   "%016lx"
 
-- 
2.17.1



[PATCH v15 00/24] selftests, powerpc, x86: Memory Protection Keys

2019-12-18 Thread Sandipan Das
Memory protection keys enable an application to protect its address
space from inadvertent access by its own code.

This feature is now enabled on powerpc and has been available since
4.16-rc1. The patches move the selftests to arch neutral directory
and enhance their test coverage.
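
For readers new to the interface, the core user-space flow that the selftests
exercise boils down to something like this (a minimal sketch using the glibc
wrappers; error handling omitted):

#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
	char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Allocate a key whose initial access rights disable writes. */
	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);

	/* Attach the key to the mapping. */
	pkey_mprotect(buf, 4096, PROT_READ | PROT_WRITE, pkey);

	char c = buf[0];	/* reads still succeed */
	(void)c;
	/* buf[0] = 1; would now fault with si_code == SEGV_PKUERR */

	pkey_free(pkey);
	return 0;
}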

Testing
---
Verified for correctness on powerpc. Need help with x86 testing as I
do not have access to a Skylake server. Client platforms like Coffee
Lake do not have the required feature bits set in CPUID.

Changelog
-
Link to previous version (v14):
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=55981=*

v15:
(1) Rebased on top of latest master.
(2) Addressed review comments from Dave Hansen.
(3) Moved code for getting or setting pkey bits to new
helpers. These changes replace patch 7 of v14.
(4) Added a fix which ensures that the correct count of
reserved keys is used across different platforms.
(5) Added a fix which ensures that the correct page size
is used as powerpc supports both 4K and 64K pages.

v14:
(1) Incorporated another round of comments from Dave Hansen.

v13:
(1) Incorporated comments for Dave Hansen.
(2) Added one more test for correct pkey-0 behavior.

v12:
(1) Fixed the offset of pkey field in the siginfo structure for
x86_64 and powerpc. And tries to use the actual field
if the headers have it defined.

v11:
(1) Fixed a deadlock in the ptrace testcase.

v10 and prior:
(1) Moved the testcase to arch neutral directory.
(2) Split the changes into incremental patches.

Desnes A. Nunes do Rosario (1):
  selftests/vm/pkeys: Fix number of reserved powerpc pkeys

Ram Pai (17):
  selftests/x86/pkeys: Move selftests to arch-neutral directory
  selftests/vm/pkeys: Rename all references to pkru to a generic name
  selftests/vm/pkeys: Move generic definitions to header file
  selftests/vm/pkeys: Typecast the pkey register
  selftests/vm/pkeys: Fix pkey_disable_clear()
  selftests/vm/pkeys: Fix assertion in pkey_disable_set/clear()
  selftests/vm/pkeys: Fix alloc_random_pkey() to make it really random
  selftests/vm/pkeys: Introduce generic pkey abstractions
  selftests/vm/pkeys: Introduce powerpc support
  selftests/vm/pkeys: Fix assertion in test_pkey_alloc_exhaust()
  selftests/vm/pkeys: Improve checks to determine pkey support
  selftests/vm/pkeys: Associate key on a mapped page and detect access
violation
  selftests/vm/pkeys: Associate key on a mapped page and detect write
violation
  selftests/vm/pkeys: Detect write violation on a mapped
access-denied-key page
  selftests/vm/pkeys: Introduce a sub-page allocator
  selftests/vm/pkeys: Test correct behaviour of pkey-0
  selftests/vm/pkeys: Override access right definitions on powerpc

Sandipan Das (3):
  selftests: vm: pkeys: Add helpers for pkey bits
  selftests: vm: pkeys: Use the correct huge page size
  selftests: vm: pkeys: Use the correct page size on powerpc

Thiago Jung Bauermann (2):
  selftests/vm/pkeys: Move some definitions to arch-specific header
  selftests/vm/pkeys: Make gcc check arguments of sigsafe_printf()

 tools/testing/selftests/vm/.gitignore |   1 +
 tools/testing/selftests/vm/Makefile   |   5 +
 tools/testing/selftests/vm/pkey-helpers.h | 226 ++
 tools/testing/selftests/vm/pkey-powerpc.h | 138 
 tools/testing/selftests/vm/pkey-x86.h | 183 +
 .../selftests/{x86 => vm}/protection_keys.c   | 688 ++
 tools/testing/selftests/x86/.gitignore|   1 -
 tools/testing/selftests/x86/pkey-helpers.h| 219 --
 8 files changed, 931 insertions(+), 530 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/vm/pkey-powerpc.h
 create mode 100644 tools/testing/selftests/vm/pkey-x86.h
 rename tools/testing/selftests/{x86 => vm}/protection_keys.c (74%)
 delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h

-- 
2.17.1



Re: [PATCH 05/18] powerpc sstep: Prepare to support prefixed instructions

2019-12-18 Thread Daniel Axtens
Jordan Niethe  writes:

> Currently all instructions are a single word long. A future ISA version
> will include prefixed instructions which have a double word length. The
> functions used for analysing and emulating instructions need to be
> modified so that they can handle these new instruction types.
>
> A prefixed instruction is a word prefix followed by a word suffix. All
> prefixes uniquely have the primary op-code 1. Suffixes may be valid word
> instructions or instructions that only exist as suffixes.
>
> In handling prefixed instructions it will be convenient to treat the
> suffix and prefix as separate words. To facilitate this modify
> analyse_instr() and emulate_step() to take a suffix as a
> parameter. For word instructions it does not matter what is passed in
> here - it will be ignored.
>
> We also define a new flag, PREFIXED, to be used in instruction_op:type.
> This flag will indicate when emulating an analysed instruction if the
> NIP should be advanced by word length or double word length.
>
> The callers of analyse_instr() and emulate_step() will need their own
> changes to be able to support prefixed instructions. For now modify them
> to pass in 0 as a suffix.
>
> Note that at this point no prefixed instructions are emulated or
> analysed - this is just making it possible to do so.
>
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/ppc-opcode.h |  3 +++
>  arch/powerpc/include/asm/sstep.h  |  8 +--
>  arch/powerpc/include/asm/uaccess.h| 30 +++
>  arch/powerpc/kernel/align.c   |  2 +-
>  arch/powerpc/kernel/hw_breakpoint.c   |  4 ++--
>  arch/powerpc/kernel/kprobes.c |  2 +-
>  arch/powerpc/kernel/mce_power.c   |  2 +-
>  arch/powerpc/kernel/optprobes.c   |  2 +-
>  arch/powerpc/kernel/uprobes.c |  2 +-
>  arch/powerpc/kvm/emulate_loadstore.c  |  2 +-
>  arch/powerpc/lib/sstep.c  | 12 ++-
>  arch/powerpc/lib/test_emulate_step.c  | 30 +--
>  arch/powerpc/xmon/xmon.c  |  4 ++--
>  13 files changed, 71 insertions(+), 32 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
> b/arch/powerpc/include/asm/ppc-opcode.h
> index c1df75edde44..a1dfa4bdd22f 100644
> --- a/arch/powerpc/include/asm/ppc-opcode.h
> +++ b/arch/powerpc/include/asm/ppc-opcode.h
> @@ -377,6 +377,9 @@
>  #define PPC_INST_VCMPEQUD	0x100000c7
>  #define PPC_INST_VCMPEQUB	0x10000006
>  
> +/* macro to check if a word is a prefix */
> +#define IS_PREFIX(x) (((x) >> 26) == 1)
> +
>  /* macros to insert fields into opcodes */
>  #define ___PPC_RA(a) (((a) & 0x1f) << 16)
>  #define ___PPC_RB(b) (((b) & 0x1f) << 11)
> diff --git a/arch/powerpc/include/asm/sstep.h 
> b/arch/powerpc/include/asm/sstep.h
> index 769f055509c9..6d4cb602e231 100644
> --- a/arch/powerpc/include/asm/sstep.h
> +++ b/arch/powerpc/include/asm/sstep.h
> @@ -89,6 +89,9 @@ enum instruction_type {
>  #define VSX_LDLEFT   4   /* load VSX register from left */
>  #define VSX_CHECK_VEC8   /* check MSR_VEC not MSR_VSX for reg >= 
> 32 */
>  
> +/* Prefixed flag, ORed in with type */
> +#define PREFIXED 0x800
> +
>  /* Size field in type word */
>  #define SIZE(n)  ((n) << 12)
>  #define GETSIZE(w)   ((w) >> 12)
> @@ -132,7 +135,7 @@ union vsx_reg {
>   * otherwise.
>   */
>  extern int analyse_instr(struct instruction_op *op, const struct pt_regs 
> *regs,
> -  unsigned int instr);
> +  unsigned int instr, unsigned int sufx);
>  
>  /*
>   * Emulate an instruction that can be executed just by updating
> @@ -149,7 +152,8 @@ void emulate_update_regs(struct pt_regs *reg, struct 
> instruction_op *op);
>   * 0 if it could not be emulated, or -1 for an instruction that
>   * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.).
>   */
> -extern int emulate_step(struct pt_regs *regs, unsigned int instr);
> +extern int emulate_step(struct pt_regs *regs, unsigned int instr,
> + unsigned int sufx);
>  
>  /*
>   * Emulate a load or store instruction by reading/writing the
> diff --git a/arch/powerpc/include/asm/uaccess.h 
> b/arch/powerpc/include/asm/uaccess.h
> index 15002b51ff18..bc585399e0c7 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -423,4 +423,34 @@ extern long __copy_from_user_flushcache(void *dst, const 
> void __user *src,
>  extern void memcpy_page_flushcache(char *to, struct page *page, size_t 
> offset,
>  size_t len);
>  
> +/*
> + * When reading an instruction iff it is a prefix, the suffix needs to be 
> also
> + * loaded.
> + */
> +#define __get_user_instr(x, y, ptr)  \
> +({   \
> + long __gui_ret = 0; \
> + y = 0;  \
> + __gui_ret = __get_user(x, ptr);  
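
As an illustration of how a caller is expected to use the new interface (a sketch
only, not part of the patch), fetching and stepping a possibly-prefixed
instruction would look roughly like:

	unsigned int instr, sufx;

	/* __get_user_instr() loads the suffix only when instr is a prefix */
	if (__get_user_instr(instr, sufx, (unsigned int __user *)regs->nip))
		return -EFAULT;

	/* emulate_step() advances regs->nip by 4 or 8 based on PREFIXED */
	if (emulate_step(regs, instr, sufx) > 0)
		return 0;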

[PATCH v4 0/9] Introduce CAP_SYS_PERFMON to secure system performance monitoring and observability

2019-12-18 Thread Alexey Budankov


Currently access to perf_events, i915_perf and other performance monitoring and
observability subsystems of the kernel is open for a privileged process [1] with
CAP_SYS_ADMIN capability enabled in the process effective set [2].

This patch set introduces CAP_SYS_PERFMON capability devoted to secure system
performance monitoring and observability operations so that CAP_SYS_PERFMON 
would
assist CAP_SYS_ADMIN capability in its governing role for perf_events, i915_perf
and other performance monitoring and observability subsystems of the kernel.

CAP_SYS_PERFMON intends to meet the demand to secure system performance 
monitoring
and observability operations in security sensitive, restricted, production
environments (e.g. HPC clusters, cloud and virtual compute environments) where 
root
or CAP_SYS_ADMIN credentials are not available to mass users of a system because
of security considerations.

CAP_SYS_PERFMON intends to harden system security and integrity during system
performance monitoring and observability operations by decreasing attack surface
that is available to CAP_SYS_ADMIN privileged processes [2].

CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related to system
performance monitoring and observability operations and balance amount of
CAP_SYS_ADMIN credentials following the recommendations in the capabilities man
page [2] for CAP_SYS_ADMIN: "Note: this capability is overloaded; see Notes to
kernel developers, below."

For backward compatibility reasons access to system performance monitoring and
observability subsystems of the kernel remains open for CAP_SYS_ADMIN privileged
processes but CAP_SYS_ADMIN capability usage for secure system performance
monitoring and observability operations is discouraged with respect to the
introduced CAP_SYS_PERFMON capability.
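
For reference, the helper that ties the series together reduces to the following
(a sketch of the intended semantics; the exact definition is in patch 1):

static inline bool perfmon_capable(void)
{
	return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN);
}

so call sites such as the perf_events paranoia checks only have to test one
predicate while CAP_SYS_ADMIN keeps working for backward compatibility.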

The patch set is for tip perf/core repository:
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip perf/core
sha1: ceb9e77324fa661b1001a0ae66f061b5fcb4e4e6

---
Changes in v4:
- converted perfmon_capable() into an inline function
- made perf_events kprobes, uprobes, hw breakpoints and namespaces data 
available
  to CAP_SYS_PERFMON privileged processes
- applied perfmon_capable() to drivers/perf and drivers/oprofile
- extended __cmd_ftrace() with support of CAP_SYS_PERFMON
Changes in v3:
- implemented perfmon_capable() macros aggregating required capabilities checks
Changes in v2:
- made perf_events trace points available to CAP_SYS_PERFMON privileged 
processes
- made perf_event_paranoid_check() treat CAP_SYS_PERFMON equally to 
CAP_SYS_ADMIN
- applied CAP_SYS_PERFMON to i915_perf, bpf_trace, powerpc and parisc system
  performance monitoring and observability related subsystems

---
Alexey Budankov (9):
  capabilities: introduce CAP_SYS_PERFMON to kernel and user space
  perf/core: open access for CAP_SYS_PERFMON privileged process
  perf tool: extend Perf tool with CAP_SYS_PERFMON capability support
  drm/i915/perf: open access for CAP_SYS_PERFMON privileged process
  trace/bpf_trace: open access for CAP_SYS_PERFMON privileged process
  powerpc/perf: open access for CAP_SYS_PERFMON privileged process
  parisc/perf: open access for CAP_SYS_PERFMON privileged process
  drivers/perf: open access for CAP_SYS_PERFMON privileged process
  drivers/oprofile: open access for CAP_SYS_PERFMON privileged process

 arch/parisc/kernel/perf.c   |  2 +-
 arch/powerpc/perf/imc-pmu.c |  4 ++--
 drivers/gpu/drm/i915/i915_perf.c| 13 ++---
 drivers/oprofile/event_buffer.c |  2 +-
 drivers/perf/arm_spe_pmu.c  |  4 ++--
 include/linux/capability.h  |  4 
 include/linux/perf_event.h  |  6 +++---
 include/uapi/linux/capability.h |  8 +++-
 kernel/events/core.c|  6 +++---
 kernel/trace/bpf_trace.c|  2 +-
 security/selinux/include/classmap.h |  4 ++--
 tools/perf/builtin-ftrace.c |  5 +++--
 tools/perf/design.txt   |  3 ++-
 tools/perf/util/cap.h   |  4 
 tools/perf/util/evsel.c | 10 +-
 tools/perf/util/util.c  |  1 +
 16 files changed, 47 insertions(+), 31 deletions(-)

---
Testing and validation (Intel Skylake, 8 cores, Fedora 29, 5.4.0-rc8+, x86_64):

libcap library [3], [4] and Perf tool can be used to apply CAP_SYS_PERFMON 
capability for secure system performance monitoring and observability beyond the
scope permitted by the system wide perf_event_paranoid kernel setting [5] and
below are the steps for evaluation:

  - patch, build and boot the kernel
  - patch, build Perf tool e.g. to /home/user/perf
  ...
  # git clone git://git.kernel.org/pub/scm/libs/libcap/libcap.git libcap
  # pushd libcap
  # patch libcap/include/uapi/linux/capabilities.h with [PATCH 1]
  # make
  # pushd progs
  # ./setcap "cap_sys_perfmon,cap_sys_ptrace,cap_syslog=ep" /home/user/perf
  # ./setcap -v "cap_sys_perfmon,cap_sys_ptrace,cap_syslog=ep" /home/user/perf
  /home/user/perf: OK
  # ./getcap 

[PATCH v15 11/23] selftests: vm: pkeys: Use the correct huge page size

2019-12-18 Thread Sandipan Das
The huge page size can vary across architectures. This will
ensure that the correct huge page size is used when accessing
the hugetlb controls under sysfs. Instead of using a hardcoded
page size (i.e. 2MB), this now uses the HPAGE_SIZE macro which
is arch-specific.

Cc: Dave Hansen 
Cc: Florian Weimer 
Cc: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 23 ++--
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index d3c13283bbd0..2327cfa0b836 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -738,12 +738,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
 }
 
 int hugetlb_setup_ok;
+#define SYSFS_FMT_NR_HUGE_PAGES 
"/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
 #define GET_NR_HUGE_PAGES 10
 void setup_hugetlbfs(void)
 {
int err;
int fd;
-   char buf[] = "123";
+   char buf[256];
+   long hpagesz_kb;
+   long hpagesz_mb;
 
if (geteuid() != 0) {
fprintf(stderr, "WARNING: not run as root, can not do hugetlb 
test\n");
@@ -754,11 +757,16 @@ void setup_hugetlbfs(void)
 
/*
 * Now go make sure that we got the pages and that they
-* are 2M pages.  Someone might have made 1G the default.
+* are PMD-level pages. Someone might have made PUD-level
+* pages the default.
 */
-   fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", 
O_RDONLY);
+   hpagesz_kb = HPAGE_SIZE / 1024;
+   hpagesz_mb = hpagesz_kb / 1024;
+   sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
+   fd = open(buf, O_RDONLY);
if (fd < 0) {
-   perror("opening sysfs 2M hugetlb config");
+   fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
+   hpagesz_mb, strerror(errno));
return;
}
 
@@ -766,13 +774,14 @@ void setup_hugetlbfs(void)
err = read(fd, buf, sizeof(buf)-1);
close(fd);
if (err <= 0) {
-   perror("reading sysfs 2M hugetlb config");
+   fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
+   hpagesz_mb, strerror(errno));
return;
}
 
if (atoi(buf) != GET_NR_HUGE_PAGES) {
-   fprintf(stderr, "could not confirm 2M pages, got: '%s' expected 
%d\n",
-   buf, GET_NR_HUGE_PAGES);
+   fprintf(stderr, "could not confirm %ldM pages, got: '%s' 
expected %d\n",
+   hpagesz_mb, buf, GET_NR_HUGE_PAGES);
return;
}
 
-- 
2.17.1
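
(A worked example of the new path construction, assuming the x86 value
HPAGE_SIZE = (1UL << 21) from pkey-x86.h: hpagesz_kb works out to 2048, so the
sprintf() above produces "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages",
i.e. exactly the path that was previously hardcoded. A hypothetical 16MB huge
page would instead yield ".../hugepages-16384kB/nr_hugepages".)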



[PATCH v4 1/9] capabilities: introduce CAP_SYS_PERFMON to kernel and user space

2019-12-18 Thread Alexey Budankov


Introduce CAP_SYS_PERFMON capability devoted to secure system performance
monitoring and observability operations so that CAP_SYS_PERFMON would
assist CAP_SYS_ADMIN capability in its governing role for perf_events,
i915_perf and other subsystems of the kernel.

CAP_SYS_PERFMON intends to harden system security and integrity during
system performance monitoring and observability operations by decreasing
attack surface that is available to CAP_SYS_ADMIN privileged processes.

CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related
to system performance monitoring and observability operations and to balance
the amount of CAP_SYS_ADMIN credentials in accordance with the recommendations
provided in the man page for CAP_SYS_ADMIN [1]: "Note: this capability
is overloaded; see Notes to kernel developers, below."

[1] http://man7.org/linux/man-pages/man7/capabilities.7.html

Signed-off-by: Alexey Budankov 
---
 include/linux/capability.h  | 4 
 include/uapi/linux/capability.h | 8 +++-
 security/selinux/include/classmap.h | 4 ++--
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index ecce0f43c73a..883c879baa4b 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct 
user_namespace *ns, const struct
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace 
*ns, int cap);
 extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace 
*ns);
+static inline bool perfmon_capable(void)
+{
+   return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN);
+}
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..98e03cc76c7c 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -366,8 +366,14 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_READ 37
 
+/*
+ * Allow system performance and observability privileged operations
+ * using perf_events, i915_perf and other kernel subsystems
+ */
+
+#define CAP_SYS_PERFMON 38
 
-#define CAP_LAST_CAP CAP_AUDIT_READ
+#define CAP_LAST_CAP CAP_SYS_PERFMON
 
 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
 
diff --git a/security/selinux/include/classmap.h 
b/security/selinux/include/classmap.h
index 7db24855e12d..bae602c623b0 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -27,9 +27,9 @@
"audit_control", "setfcap"
 
 #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
-   "wake_alarm", "block_suspend", "audit_read"
+   "wake_alarm", "block_suspend", "audit_read", "sys_perfmon"
 
-#if CAP_LAST_CAP > CAP_AUDIT_READ
+#if CAP_LAST_CAP > CAP_SYS_PERFMON
 #error New capability defined, please update COMMON_CAP2_PERMS.
 #endif
 
-- 
2.20.1
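
For reference, the shape of the conversion the rest of the series applies to
the affected call sites is roughly the following (a generic sketch, not taken
from any one subsystem):

	/* before: monitoring gated on full admin rights */
	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;

	/* after: CAP_SYS_PERFMON is sufficient; CAP_SYS_ADMIN still works */
	if (!perfmon_capable())
		return -EACCES;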




[PATCH v15 01/23] selftests/x86/pkeys: Move selftests to arch-neutral directory

2019-12-18 Thread Sandipan Das
From: Ram Pai 

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Acked-by: Ingo Molnar 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/.gitignore | 1 +
 tools/testing/selftests/vm/Makefile   | 1 +
 tools/testing/selftests/{x86 => vm}/pkey-helpers.h| 0
 tools/testing/selftests/{x86 => vm}/protection_keys.c | 0
 tools/testing/selftests/x86/.gitignore| 1 -
 5 files changed, 2 insertions(+), 1 deletion(-)
 rename tools/testing/selftests/{x86 => vm}/pkey-helpers.h (100%)
 rename tools/testing/selftests/{x86 => vm}/protection_keys.c (100%)

diff --git a/tools/testing/selftests/vm/.gitignore 
b/tools/testing/selftests/vm/.gitignore
index 31b3c98b6d34..c55837bf39fa 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -14,3 +14,4 @@ virtual_address_range
 gup_benchmark
 va_128TBswitch
 map_fixed_noreplace
+protection_keys
diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index 7f9a8a8c31da..4e9c741be6af 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -18,6 +18,7 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += protection_keys
 
 ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 
sparc64 x86_64))
 TEST_GEN_FILES += va_128TBswitch
diff --git a/tools/testing/selftests/x86/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
similarity index 100%
rename from tools/testing/selftests/x86/pkey-helpers.h
rename to tools/testing/selftests/vm/pkey-helpers.h
diff --git a/tools/testing/selftests/x86/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
similarity index 100%
rename from tools/testing/selftests/x86/protection_keys.c
rename to tools/testing/selftests/vm/protection_keys.c
diff --git a/tools/testing/selftests/x86/.gitignore 
b/tools/testing/selftests/x86/.gitignore
index 7757f73ff9a3..eb30ffd83876 100644
--- a/tools/testing/selftests/x86/.gitignore
+++ b/tools/testing/selftests/x86/.gitignore
@@ -11,5 +11,4 @@ ldt_gdt
 iopl
 mpx-mini-test
 ioperm
-protection_keys
 test_vdso
-- 
2.17.1



[PATCH v15 04/23] selftests/vm/pkeys: Move some definitions to arch-specific header

2019-12-18 Thread Sandipan Das
From: Thiago Jung Bauermann 

This moves definitions which have arch-specific values to the
x86-specific header in preparation for multi-arch support.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h| 111 +
 tools/testing/selftests/vm/pkey-x86.h| 156 +++
 tools/testing/selftests/vm/protection_keys.c |  47 --
 3 files changed, 162 insertions(+), 152 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-x86.h

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 6ad1bd54ef94..3ed2f021bf7a 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -21,9 +21,6 @@
 
 #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
 
-#define NR_PKEYS 16
-#define PKEY_BITS_PER_PKEY 2
-
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
 #endif
@@ -73,19 +70,13 @@ extern void abort_hooks(void);
}   \
 } while (0)
 
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#include "pkey-x86.h"
+#else /* arch */
+#error Architecture not supported
+#endif /* arch */
+
 extern unsigned int shadow_pkey_reg;
-static inline unsigned int __read_pkey_reg(void)
-{
-   unsigned int eax, edx;
-   unsigned int ecx = 0;
-   unsigned int pkey_reg;
-
-   asm volatile(".byte 0x0f,0x01,0xee\n\t"
-: "=a" (eax), "=d" (edx)
-: "c" (ecx));
-   pkey_reg = eax;
-   return pkey_reg;
-}
 
 static inline unsigned int _read_pkey_reg(int line)
 {
@@ -100,19 +91,6 @@ static inline unsigned int _read_pkey_reg(int line)
 
 #define read_pkey_reg() _read_pkey_reg(__LINE__)
 
-static inline void __write_pkey_reg(unsigned int pkey_reg)
-{
-   unsigned int eax = pkey_reg;
-   unsigned int ecx = 0;
-   unsigned int edx = 0;
-
-   dprintf4("%s() changing %08x to %08x\n", __func__,
-   __read_pkey_reg(), pkey_reg);
-   asm volatile(".byte 0x0f,0x01,0xef\n\t"
-: : "a" (eax), "c" (ecx), "d" (edx));
-   assert(pkey_reg == __read_pkey_reg());
-}
-
 static inline void write_pkey_reg(unsigned int pkey_reg)
 {
dprintf4("%s() changing %08x to %08x\n", __func__,
@@ -157,83 +135,6 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
dprintf4("pkey_reg now: %08x\n", read_pkey_reg());
 }
 
-#define PAGE_SIZE 4096
-#define MB (1<<20)
-
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-   unsigned int *ecx, unsigned int *edx)
-{
-   /* ecx is often an input as well as an output. */
-   asm volatile(
-   "cpuid;"
-   : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
-   : "0" (*eax), "2" (*ecx));
-}
-
-/* Intel-defined CPU features, CPUID level 0x0007:0 (ecx) */
-#define X86_FEATURE_PKU(1<<3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE  (1<<4) /* OS Protection Keys Enable */
-
-static inline int cpu_has_pku(void)
-{
-   unsigned int eax;
-   unsigned int ebx;
-   unsigned int ecx;
-   unsigned int edx;
-
-   eax = 0x7;
-   ecx = 0x0;
-   __cpuid(&eax, &ebx, &ecx, &edx);
-
-   if (!(ecx & X86_FEATURE_PKU)) {
-   dprintf2("cpu does not have PKU\n");
-   return 0;
-   }
-   if (!(ecx & X86_FEATURE_OSPKE)) {
-   dprintf2("cpu does not have OSPKE\n");
-   return 0;
-   }
-   return 1;
-}
-
-#define XSTATE_PKEY_BIT(9)
-#define XSTATE_PKEY0x200
-
-int pkey_reg_xstate_offset(void)
-{
-   unsigned int eax;
-   unsigned int ebx;
-   unsigned int ecx;
-   unsigned int edx;
-   int xstate_offset;
-   int xstate_size;
-   unsigned long XSTATE_CPUID = 0xd;
-   int leaf;
-
-   /* assume that XSTATE_PKEY is set in XCR0 */
-   leaf = XSTATE_PKEY_BIT;
-   {
-   eax = XSTATE_CPUID;
-   ecx = leaf;
-   __cpuid(&eax, &ebx, &ecx, &edx);
-
-   if (leaf == XSTATE_PKEY_BIT) {
-   xstate_offset = ebx;
-   xstate_size = eax;
-   }
-   }
-
-   if (xstate_size == 0) {
-   printf("could not find size/offset of PKEY in xsave state\n");
-   return 0;
-   }
-
-   return xstate_offset;
-}
-
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
 #define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
 #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
new file mode 100644
index ..2f04ade8ca9c
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -0,0 +1,156 @@
+/* 

[PATCH v15 05/23] selftests/vm/pkeys: Make gcc check arguments of sigsafe_printf()

2019-12-18 Thread Sandipan Das
From: Thiago Jung Bauermann 

This will help us ensure we print pkey_reg_t values correctly on
different architectures.

Signed-off-by: Thiago Jung Bauermann 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 3ed2f021bf7a..7f18a82e54fc 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -27,6 +27,10 @@
 #define DPRINT_IN_SIGNAL_BUF_SIZE 4096
 extern int dprint_in_signal;
 extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
 static inline void sigsafe_printf(const char *format, ...)
 {
va_list ap;
-- 
2.17.1
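
To illustrate what the attribute buys (hypothetical call, not from the patch):
with format checking enabled, a mismatched call such as

	sigsafe_printf("pkey_reg: %016lx\n", (u32)pkey_reg);

now draws a -Wformat warning from gcc (on LP64 targets, '%lx' expects an
unsigned long, not an unsigned int), which is exactly the class of pkey_reg_t
formatting mistake the later patches in this series guard against.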



[PATCH v15 15/23] selftests/vm/pkeys: Improve checks to determine pkey support

2019-12-18 Thread Sandipan Das
From: Ram Pai 

For the pkeys subsystem to work, both the CPU and the
kernel need to have support. So, in addition to the CPU
feature checks, also check whether the kernel supports
pkeys.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h| 30 
 tools/testing/selftests/vm/pkey-powerpc.h|  3 +-
 tools/testing/selftests/vm/pkey-x86.h|  2 +-
 tools/testing/selftests/vm/protection_keys.c |  7 +++--
 4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 5de06d3a81de..43299435f24c 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -76,6 +76,8 @@ extern void abort_hooks(void);
 
 __attribute__((noinline)) int read_ptr(int *ptr);
 void expected_pkey_fault(int pkey);
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
+int sys_pkey_free(unsigned long pkey);
 
 #if defined(__i386__) || defined(__x86_64__) /* arch */
 #include "pkey-x86.h"
@@ -187,4 +189,32 @@ static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
 #endif
 }
 
+static inline int kernel_has_pkeys(void)
+{
+   /* try allocating a key and see if it succeeds */
+   int ret = sys_pkey_alloc(0, 0);
+   if (ret <= 0) {
+   return 0;
+   }
+   sys_pkey_free(ret);
+   return 1;
+}
+
+static inline int is_pkeys_supported(void)
+{
+   /* check if the cpu supports pkeys */
+   if (!cpu_has_pkeys()) {
+   dprintf1("SKIP: %s: no CPU support\n", __func__);
+   return 0;
+   }
+
+   /* check if the kernel supports pkeys */
+   if (!kernel_has_pkeys()) {
+   dprintf1("SKIP: %s: no kernel support\n", __func__);
+   return 0;
+   }
+
+   return 1;
+}
+
 #endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
index 5ced9c58fa4a..a43d7be85a27 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -63,8 +63,9 @@ static inline void __write_pkey_reg(pkey_reg_t pkey_reg)
pkey_reg);
 }
 
-static inline int cpu_has_pku(void)
+static inline int cpu_has_pkeys(void)
 {
+   /* No simple way to determine this */
return 1;
 }
 
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index bb2d4dfcacd5..e5fdee39a7d8 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -99,7 +99,7 @@ static inline void __cpuid(unsigned int *eax, unsigned int 
*ebx,
 #define X86_FEATURE_PKU(1<<3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE  (1<<4) /* OS Protection Keys Enable */
 
-static inline int cpu_has_pku(void)
+static inline int cpu_has_pkeys(void)
 {
unsigned int eax;
unsigned int ebx;
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 8d90cfe2c9bd..7bceb98662c1 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1377,7 +1377,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
int size = PAGE_SIZE;
int sret;
 
-   if (cpu_has_pku()) {
+   if (cpu_has_pkeys()) {
dprintf1("SKIP: %s: no CPU support\n", __func__);
return;
}
@@ -1446,12 +1446,13 @@ void pkey_setup_shadow(void)
 int main(void)
 {
int nr_iterations = 22;
+   int pkeys_supported = is_pkeys_supported();
 
setup_handlers();
 
-   printf("has pku: %d\n", cpu_has_pku());
+   printf("has pkeys: %d\n", pkeys_supported);
 
-   if (!cpu_has_pku()) {
+   if (!pkeys_supported) {
int size = PAGE_SIZE;
int *ptr;
 
-- 
2.17.1



[PATCH v15 08/23] selftests/vm/pkeys: Fix pkey_disable_clear()

2019-12-18 Thread Sandipan Das
From: Ram Pai 

Currently, pkey_disable_clear() sets the specified bits
instead of clearing them. This has been dead code up to now
because its only callers, i.e. pkey_access/write_allow(),
are also unused.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/protection_keys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index b474d4fbe92b..9a6c95b220cc 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -417,7 +417,7 @@ void pkey_disable_clear(int pkey, int flags)
pkey, pkey, pkey_rights);
pkey_assert(pkey_rights >= 0);
 
-   pkey_rights |= flags;
+   pkey_rights &= ~flags;
 
ret = hw_pkey_set(pkey, pkey_rights, 0);
shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
@@ -430,7 +430,7 @@ void pkey_disable_clear(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n", __func__,
pkey, read_pkey_reg());
if (flags)
-   assert(read_pkey_reg() > orig_pkey_reg);
+   assert(read_pkey_reg() < orig_pkey_reg);
 }
 
 void pkey_write_allow(int pkey)
-- 
2.17.1
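
A worked example of the fix, using the generic PKEY_DISABLE_ACCESS (0x1) and
PKEY_DISABLE_WRITE (0x2) flag values: if a key's rights are currently 0x3
(access and write both disabled) and the test calls
pkey_disable_clear(pkey, PKEY_DISABLE_WRITE), the old code computed
0x3 | 0x2 = 0x3 and left write disabled, while the corrected
"pkey_rights &= ~flags" gives 0x3 & ~0x2 = 0x1 and actually re-enables write.
Since clearing disable bits can only lower the per-key value, the final
assertion flips from '>' to '<' accordingly.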



Re: [PATCH 03/18] powerpc: Add PREFIXED SRR1 bit for future ISA version

2019-12-18 Thread Daniel Axtens
Jordan Niethe  writes:

> Add the bit definition for exceptions caused by prefixed instructions.
>
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/reg.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index 6f9fcc3d4c82..0a6d39fb4769 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -778,6 +778,7 @@
>  
>  #define   SRR1_MCE_MCP   0x0008 /* Machine check signal 
> caused interrupt */
>  #define   SRR1_BOUNDARY  0x1000 /* Prefixed instruction 
> crosses 64-byte boundary */
> +#define   SRR1_PREFIXED  0x2000 /* Exception caused by 
> prefixed instruction */

You could probably squash this with the previous patch, and maybe the
next patch too.

Regards,
Daniel

>  
>  #define SPRN_HSRR0   0x13A   /* Save/Restore Register 0 */
>  #define SPRN_HSRR1   0x13B   /* Save/Restore Register 1 */
> -- 
> 2.20.1


Re: [PATCH v2 1/3] powerpc/mmu_gather: Enable RCU_TABLE_FREE even for !SMP case

2019-12-18 Thread Peter Zijlstra


I'm going to assume you'll take these 3 patches through the powerpc tree
for convenience, a few small nits, but otherwise ACK on the lot.

On Wed, Dec 18, 2019 at 11:05:28AM +0530, Aneesh Kumar K.V wrote:
> A follow up patch is going to make sure we correctly invalidate page walk 
> cache
> before we free page table pages. In order to keep things simple enable
> RCU_TABLE_FREE even for !SMP so that we don't have to fixup the !SMP case
> differently in the followup patch

Perhaps more clearly state that !SMP is broken for Radix/PWC. The above
sorta implies it is, but it's not spelled out in any detail.

> Signed-off-by: Aneesh Kumar K.V 

Acked-by: Peter Zijlstra (Intel) 


Re: [PATCH v4 1/9] capabilities: introduce CAP_SYS_PERFMON to kernel and user space

2019-12-18 Thread Stephen Smalley

On 12/18/19 4:24 AM, Alexey Budankov wrote:


Introduce CAP_SYS_PERFMON capability devoted to secure system performance
monitoring and observability operations so that CAP_SYS_PERFMON would
assist CAP_SYS_ADMIN capability in its governing role for perf_events,
i915_perf and other subsystems of the kernel.

CAP_SYS_PERFMON intends to harden system security and integrity during
system performance monitoring and observability operations by decreasing
attack surface that is available to CAP_SYS_ADMIN privileged processes.

CAP_SYS_PERFMON intends to take over CAP_SYS_ADMIN credentials related
to system performance monitoring and observability operations and to balance
the amount of CAP_SYS_ADMIN credentials in accordance with the recommendations
provided in the man page for CAP_SYS_ADMIN [1]: "Note: this capability
is overloaded; see Notes to kernel developers, below."

[1] http://man7.org/linux/man-pages/man7/capabilities.7.html

Signed-off-by: Alexey Budankov 


Acked-by: Stephen Smalley 

Note for selinux developers: we will need to update the 
selinux-testsuite tests for perf_event when/if this change lands upstream.



---
  include/linux/capability.h  | 4 
  include/uapi/linux/capability.h | 8 +++-
  security/selinux/include/classmap.h | 4 ++--
  3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index ecce0f43c73a..883c879baa4b 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,10 @@ extern bool privileged_wrt_inode_uidgid(struct 
user_namespace *ns, const struct
  extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
  extern bool file_ns_capable(const struct file *file, struct user_namespace 
*ns, int cap);
  extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace 
*ns);
+static inline bool perfmon_capable(void)
+{
+   return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN);
+}
  
  /* audit system wants to get cap info from files as well */

  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..98e03cc76c7c 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -366,8 +366,14 @@ struct vfs_ns_cap_data {
  
  #define CAP_AUDIT_READ		37
  
+/*

+ * Allow system performance and observability privileged operations
+ * using perf_events, i915_perf and other kernel subsystems
+ */
+
+#define CAP_SYS_PERFMON 38
  
-#define CAP_LAST_CAP CAP_AUDIT_READ

+#define CAP_LAST_CAP CAP_SYS_PERFMON
  
  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
  
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h

index 7db24855e12d..bae602c623b0 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -27,9 +27,9 @@
"audit_control", "setfcap"
  
  #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \

-   "wake_alarm", "block_suspend", "audit_read"
+   "wake_alarm", "block_suspend", "audit_read", "sys_perfmon"
  
-#if CAP_LAST_CAP > CAP_AUDIT_READ

+#if CAP_LAST_CAP > CAP_SYS_PERFMON
  #error New capability defined, please update COMMON_CAP2_PERMS.
  #endif
  





Re: [PATCH v6 05/10] mm/memory_hotplug: Shrink zones when offlining memory

2019-12-18 Thread Andrew Morton
On Wed, 18 Dec 2019 18:08:04 +0100 David Hildenbrand  wrote:

> On 01.12.19 00:21, Andrew Morton wrote:
> > On Sun, 27 Oct 2019 23:45:52 +0100 David Hildenbrand  
> > wrote:
> > 
> >> I think I just found an issue with try_offline_node(). 
> >> try_offline_node() is pretty much broken already (touches garbage 
> >> memmaps and will not considers mixed NIDs within sections), however, 
> >> relies on the node span to look for memory sections to probe. So it 
> >> seems to rely on the nodes getting shrunk when removing memory, not when 
> >> offlining.
> >>
> >> As we shrink the node span when offlining now and not when removing, 
> >> this can go wrong once we offline the last memory block of the node and 
> >> offline the last CPU. We could still have memory around that we could 
> >> re-online, however, the node would already be offline. Unlikely, but 
> >> possible.
> >>
> >> Note that the same is also broken without this patch in case memory is 
> >> never onlined. The "pfn_to_nid(pfn) != nid" can easily succeed on the 
> >> garbage memmap, resulting in  no memory being detected as belonging to 
> >> the node. Also, resize_pgdat_range() is called when onlining memory, not 
> >> when adding it. :/ Oh this is so broken :)
> >>
> >> The right fix is probably to walk over all memory blocks that could 
> >> exist and test if they belong to the nid (if offline, check the 
> >> block->nid, if online check all pageblocks). A fix we can then move in 
> >> front of this patch.
> >>
> >> Will look into this this week.
> > 
> > And this series shows almost no sign of having been reviewed.  I'll hold
> > it over for 5.6.
> > 
> 
> Hi Andrew, any chance we can get the (now at least reviewed - thx Oscar)
> fix in patch #5 into 5.5? (I want to do the final stable backports for
> the uninitialized memmap stuff)

Sure, I queued it for the next batch of 5.5 fixes.


Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register

2019-12-18 Thread Dave Hansen
On 12/17/19 11:51 PM, Sandipan Das wrote:
>   write_pkey_reg(pkey_reg);
> - dprintf4("pkey_reg now: %08x\n", read_pkey_reg());
> + dprintf4("pkey_reg now: "PKEY_REG_FMT"\n", read_pkey_reg());
>  }
>  
>  #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
> diff --git a/tools/testing/selftests/vm/pkey-x86.h 
> b/tools/testing/selftests/vm/pkey-x86.h
> index 2f04ade8ca9c..5f40901219d3 100644
> --- a/tools/testing/selftests/vm/pkey-x86.h
> +++ b/tools/testing/selftests/vm/pkey-x86.h
> @@ -46,6 +46,8 @@
>  #define HPAGE_SIZE   (1UL<<21)
>  #define PAGE_SIZE4096
>  #define MB   (1<<20)
> +#define pkey_reg_t   u32
> +#define PKEY_REG_FMT "%016x"

How big is the ppc one?

I'd really just rather do %016lx *everywhere* than sprinkle the
PKEY_REG_FMTs around.

BTW, why are you doing a %016lx for a u32?


Re: [PATCH v15 00/24] selftests, powerpc, x86: Memory Protection Keys

2019-12-18 Thread Dave Hansen
On 12/17/19 11:51 PM, Sandipan Das wrote:
> Testing
> ---
> Verified for correctness on powerpc. Need help with x86 testing as I
> do not have access to a Skylake server. Client platforms like Coffee
> Lake do not have the required feature bits set in CPUID.

FWIW, you can get a Skylake Server instance from cloud providers.  I
spooled up an Amazon EC2 instance once to run these tests.  I think it
cost me $0.08.


Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register

2019-12-18 Thread Michal Suchánek
On Wed, Dec 18, 2019 at 12:46:50PM -0800, Dave Hansen wrote:
> On 12/17/19 11:51 PM, Sandipan Das wrote:
> > write_pkey_reg(pkey_reg);
> > -   dprintf4("pkey_reg now: %08x\n", read_pkey_reg());
> > +   dprintf4("pkey_reg now: "PKEY_REG_FMT"\n", read_pkey_reg());
> >  }
> >  
> >  #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
> > diff --git a/tools/testing/selftests/vm/pkey-x86.h 
> > b/tools/testing/selftests/vm/pkey-x86.h
> > index 2f04ade8ca9c..5f40901219d3 100644
> > --- a/tools/testing/selftests/vm/pkey-x86.h
> > +++ b/tools/testing/selftests/vm/pkey-x86.h
> > @@ -46,6 +46,8 @@
> >  #define HPAGE_SIZE (1UL<<21)
> >  #define PAGE_SIZE  4096
> >  #define MB (1<<20)
> > +#define pkey_reg_t u32
> > +#define PKEY_REG_FMT   "%016x"
> 
> How big is the ppc one?
u64
> 
> I'd really just rather do %016lx *everywhere* than sprinkle the
> PKEY_REG_FMTs around.

Does lx work with u32 without warnings?

It's likely the size difference that requires a format specifier definition.

> 
> BTW, why are you doing a %016lx for a u32?

It's "%016x" without 'l' for x86 and with 'l' for ppc64.

Thanks

Michal
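
One way to avoid per-arch format strings entirely (offered only as a sketch of
the alternative being discussed, not something this series does) is to widen
at the call sites:

	dprintf4("pkey_reg now: %016llx\n",
		 (unsigned long long)read_pkey_reg());

That prints correctly whether pkey_reg_t is u32 or u64, at the cost of a cast
at every call site.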


Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register

2019-12-18 Thread Dave Hansen
On 12/18/19 12:59 PM, Michal Suchánek wrote:
>> I'd really just rather do %016lx *everywhere* than sprinkle the
>> PKEY_REG_FMTs around.
> Does lx work with u32 without warnings?

Either way, I'd be happy to just make the x86 one u64 to make the whole
thing look more sane,


Re: [PATCH v11 01/25] mm/gup: factor out duplicate code from four routines

2019-12-18 Thread Kirill A. Shutemov
On Wed, Dec 18, 2019 at 02:15:53PM -0800, John Hubbard wrote:
> On 12/18/19 7:52 AM, Kirill A. Shutemov wrote:
> > On Mon, Dec 16, 2019 at 02:25:13PM -0800, John Hubbard wrote:
> > > +static void put_compound_head(struct page *page, int refs)
> > > +{
> > > + /* Do a get_page() first, in case refs == page->_refcount */
> > > + get_page(page);
> > > + page_ref_sub(page, refs);
> > > + put_page(page);
> > > +}
> > 
> > It's not terribly efficient. Maybe something like:
> > 
> > VM_BUG_ON_PAGE(page_ref_count(page) < ref, page);
> > if (refs > 2)
> > page_ref_sub(page, refs - 1);
> > put_page(page);
> > 
> > ?
> 
> OK, but how about this instead? I don't see the need for a "2", as that
> is a magic number that requires explanation. Whereas "1" is not a magic
> number--here it means: either there are "many" (>1) refs, or not.

Yeah, it's my thinko. Sure, it has to be '1' (or >= 2, which is less readable).

> And the routine won't be called with refs less than about 32 (2MB huge
> page, 64KB base page == 32 subpages) anyway.

It's hard to make predictions about future :P

>   VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
>   /*
>* Calling put_page() for each ref is unnecessarily slow. Only the last
>* ref needs a put_page().
>*/
>   if (refs > 1)
>   page_ref_sub(page, refs - 1);
>   put_page(page);

Looks good to me.

-- 
 Kirill A. Shutemov
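
Putting the two suggestions together, the helper would end up roughly as below
(a sketch of the agreed direction, not a quoted patch):

static void put_compound_head(struct page *page, int refs)
{
	VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
	/*
	 * Calling put_page() for each ref is unnecessarily slow. Only the
	 * last ref needs a put_page().
	 */
	if (refs > 1)
		page_ref_sub(page, refs - 1);
	put_page(page);
}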


Re: [PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register

2019-12-18 Thread Michal Suchánek
On Wed, Dec 18, 2019 at 01:01:46PM -0800, Dave Hansen wrote:
> On 12/18/19 12:59 PM, Michal Suchánek wrote:
> >> I'd really just rather do %016lx *everywhere* than sprinkle the
> >> PKEY_REG_FMTs around.
> > Does lx work with u32 without warnings?
> 
> Either way, I'd be happy to just make the x86 one u64 to make the whole
> thing look more sane,

So long as it still works with u64 on x86 it should be pretty
future-proof.  Does not look like we are getting 128bit registers for
anything but math units any time soon.

Thanks

Michal


Re: [PATCH 1/1] kvm/book3s_64: Fixes crash caused by not cleaning vhost IOTLB

2019-12-18 Thread Leonardo Bras
On Wed, 2019-12-18 at 15:53 +1100, Alexey Kardashevskiy wrote:
> H_STUFF_TCE is always called with 0. Well, maybe some AIX somewhere
> calls it with a value other than zero, and I probably saw some other
> value somewhere but in QEMU/KVM case it is 0 so you effectively disable
> in-kernel acceleration of H_STUFF_TCE which is
> undesirable.
> 

Thanks for the feedback!

> For now we should disable in-kernel H_STUFF_TCE/... handlers in QEMU
> just like we do for VFIO for older host kernels:
> 
> https://git.qemu.org/?p=qemu.git;a=blob;f=hw/ppc/spapr_iommu.c;h=3d3bcc86496a5277d62f7855fbb09c013c015f27;hb=HEAD#l208
 
I am still reading into this temporary solution; I could not yet
understand how it works.

> I am not sure what a proper solution would be, something like an eventfd
> and KVM's kvmppc_h_stuff_tce() signaling vhost that the latter needs to
> invalidate iotlbs. Or we can just say that we do not allow KVM
> acceleration if there is vhost+iommu on the same liobn (== vPHB, pretty
> much). Thanks,

I am not used to eventfd, but i agree it's a valid solution to talk to
QEMU and then use it to send a message via /dev/vhost.
KVM -> QEMU -> vhost

But I keep coming back to another solution: doing it in
kernelspace.  I am not sure how that would work, though.

If I could understand correctly, there is a vhost IOTLB per vhost_dev,
and H_STUFF_TCE is not called in 64-bit DMA case (for tce_value == 0
case, at least), which makes sense, given it doesn't need to invalidate
entries on IOTLB.

So, we would need to somehow replace `return H_TOO_HARD` in this patch
with code that could call vhost_process_iotlb_msg() with
VHOST_IOTLB_INVALIDATE.

For that, I would need to know what are the vhost_dev's of that
process, which I don't know if it's possible to do currently (or safe
at all).

I am thinking of linking all vhost_dev's with a list (list.h) that
could be searched, comparing the `mm_struct *` of the calling task with all
vhost_dev's, and removing the matching entries from every IOTLB that hits.

Not sure if that's the best approach to find the related vhost_dev's.

What do you think?

Best regards,
Leonardo Bras


signature.asc
Description: This is a digitally signed message part


Re: [PATCH 1/2] powerpc/pseries/svm: Don't access some SPRs

2019-12-18 Thread Sukadev Bhattiprolu
Michael Ellerman [m...@ellerman.id.au] wrote:
> 
> eg. here.
> 
> This is the fast path of context switch.
> 
> That expands to:
> 
>   if (!(mfmsr() & MSR_S))
>   asm volatile("mfspr %0, SPRN_BESCR" : "=r" (rval));
>   if (!(mfmsr() & MSR_S))
>   asm volatile("mfspr %0, SPRN_EBBHR" : "=r" (rval));
>   if (!(mfmsr() & MSR_S))
>   asm volatile("mfspr %0, SPRN_EBBRR" : "=r" (rval));
> 

Yes, should have optimized this at least :-)
> 
> If the Ultravisor is going to disable EBB and BHRB then we need new
> CPU_FTR bits for those, and the code that accesses those registers
> needs to be put behind cpu_has_feature(EBB) etc.

Will try the cpu_has_feature(). Would it be ok to use a single feature
bit, like UV, or to make it per-register group, as that could need more
feature bits?

Thanks,

Sukadev
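
For illustration, the feature-gated form being discussed would look roughly
like the sketch below, where t is the thread_struct being saved at context
switch; CPU_FTR_EBB_BHRB is a hypothetical feature bit used only for this
example, not one that exists today:

	/* skip the EBB SPRs when the (hypothetical) feature bit is clear */
	if (cpu_has_feature(CPU_FTR_EBB_BHRB)) {
		t->bescr = mfspr(SPRN_BESCR);
		t->ebbhr = mfspr(SPRN_EBBHR);
		t->ebbrr = mfspr(SPRN_EBBRR);
	}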


Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-12-18 Thread John Hubbard

On 12/18/19 8:04 AM, Kirill A. Shutemov wrote:

On Mon, Dec 16, 2019 at 02:25:16PM -0800, John Hubbard wrote:

An upcoming patch changes and complicates the refcounting and
especially the "put page" aspects of it. In order to keep
everything clean, refactor the devmap page release routines:

* Rename put_devmap_managed_page() to page_is_devmap_managed(),
   and limit the functionality to "read only": return a bool,
   with no side effects.

* Add a new routine, put_devmap_managed_page(), to handle checking
   what kind of page it is, and what kind of refcount handling it
   requires.

* Rename __put_devmap_managed_page() to free_devmap_managed_page(),
   and limit the functionality to unconditionally freeing a devmap
   page.


What the reason to separate put_devmap_managed_page() from
free_devmap_managed_page() if free_devmap_managed_page() has exacly one
caller? Is it preparation for the next patches?



Yes. A later patch, #23, adds another caller: 
__unpin_devmap_managed_user_page().

...

@@ -971,7 +971,14 @@ static inline bool put_devmap_managed_page(struct page 
*page)
return false;
  }
  
+bool put_devmap_managed_page(struct page *page);

+
  #else /* CONFIG_DEV_PAGEMAP_OPS */
+static inline bool page_is_devmap_managed(struct page *page)
+{
+   return false;
+}
+
  static inline bool put_devmap_managed_page(struct page *page)
  {
return false;
@@ -1028,8 +1035,10 @@ static inline void put_page(struct page *page)
 * need to inform the device driver through callback. See
 * include/linux/memremap.h and HMM for details.
 */
-   if (put_devmap_managed_page(page))
+   if (page_is_devmap_managed(page)) {
+   put_devmap_managed_page(page);


put_devmap_managed_page() has yet another page_is_devmap_managed() check
inside. It looks strange.



Good point, it's an extra unnecessary check. So to clean it up, I'll note
that the "if" check is required here in put_page(), in order to stay out of
non-inlined function calls in the hot path (put_page()). So I'll do the
following:

* Leave the above code as it is here

* Simplify put_devmap_managed_page(), it was trying to do two separate things,
  and those two things have different requirements. So change it to a void
  function, with a WARN_ON_ONCE to assert that page_is_devmap_managed() is true,

* And change the other caller (release_pages()) to do that check.

...

@@ -1102,3 +1102,27 @@ void __init swap_setup(void)
 * _really_ don't want to cluster much more
 */
  }
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+bool put_devmap_managed_page(struct page *page)
+{
+   bool is_devmap = page_is_devmap_managed(page);
+
+   if (is_devmap) {


Reversing the condition would save you an indentation level.


Yes. Done.

I'll also git-reply with an updated patch so you can see what it looks like.


thanks,
--
John Hubbard
NVIDIA
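
Concretely, the reworked helper described above comes out looking something
like this (a sketch of the direction just described; the v12 repost below
carries the real version):

void put_devmap_managed_page(struct page *page)
{
	int count;

	if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
		return;

	count = page_ref_dec_return(page);

	/* devmap page refcounts are 1-based; a count of 1 means "free". */
	if (count == 1)
		free_devmap_managed_page(page);
	else if (!count)
		__put_page(page);
}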


[PATCH v4 2/4] kasan: Document support on 32-bit powerpc

2019-12-18 Thread Daniel Axtens
KASAN is supported on 32-bit powerpc and the docs should reflect this.

Suggested-by: Christophe Leroy 
Reviewed-by: Christophe Leroy 
Signed-off-by: Daniel Axtens 
---
 Documentation/dev-tools/kasan.rst |  3 ++-
 Documentation/powerpc/kasan.txt   | 12 
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/powerpc/kasan.txt

diff --git a/Documentation/dev-tools/kasan.rst 
b/Documentation/dev-tools/kasan.rst
index e4d66e7c50de..4af2b5d2c9b4 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -22,7 +22,8 @@ global variables yet.
 Tag-based KASAN is only supported in Clang and requires version 7.0.0 or later.
 
 Currently generic KASAN is supported for the x86_64, arm64, xtensa and s390
-architectures, and tag-based KASAN is supported only for arm64.
+architectures. It is also supported on 32-bit powerpc kernels. Tag-based KASAN
+is supported only on arm64.
 
 Usage
 -
diff --git a/Documentation/powerpc/kasan.txt b/Documentation/powerpc/kasan.txt
new file mode 100644
index ..a85ce2ff8244
--- /dev/null
+++ b/Documentation/powerpc/kasan.txt
@@ -0,0 +1,12 @@
+KASAN is supported on powerpc on 32-bit only.
+
+32 bit support
+==
+
+KASAN is supported on both hash and nohash MMUs on 32-bit.
+
+The shadow area sits at the top of the kernel virtual memory space above the
+fixmap area and occupies one eighth of the total kernel virtual memory space.
+
+Instrumentation of the vmalloc area is not currently supported, but modules
+are.
-- 
2.20.1



Re: [PATCH v11 06/25] mm: fix get_user_pages_remote()'s handling of FOLL_LONGTERM

2019-12-18 Thread John Hubbard

On 12/18/19 8:19 AM, Kirill A. Shutemov wrote:
...

diff --git a/mm/gup.c b/mm/gup.c
index 3ecce297a47f..c0c56888e7cc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,13 @@ struct follow_page_context {
unsigned int page_mask;
  };
  
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,

+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags);


Any particular reason for the forward declaration? Maybe move
get_user_pages_remote() down?



Yes, that's exactly why: I was thinking it would be cleaner to put in the
forward declaration, rather than moving code blocks, but either way seems
reasonable. I'll go ahead and move the code blocks and delete the forward
declaration, now that someone has weighed in in favor of that.

thanks,
--
John Hubbard
NVIDIA


[PATCH v12] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-12-18 Thread John Hubbard
An upcoming patch changes and complicates the refcounting and
especially the "put page" aspects of it. In order to keep
everything clean, refactor the devmap page release routines:

* Rename put_devmap_managed_page() to page_is_devmap_managed(),
  and limit the functionality to "read only": return a bool,
  with no side effects.

* Add a new routine, put_devmap_managed_page(), to handle
  decrementing the refcount for ZONE_DEVICE pages.

* Change callers (just release_pages() and put_page()) to check
  page_is_devmap_managed() before calling the new
  put_devmap_managed_page() routine. This is a performance
  point: put_page() is a hot path, so we need to avoid non-
  inline function calls where possible.

* Rename __put_devmap_managed_page() to free_devmap_managed_page(),
  and limit the functionality to unconditionally freeing a devmap
  page.

This is originally based on a separate patch by Ira Weiny, which
applied to an early version of the put_user_page() experiments.
Since then, Jérôme Glisse suggested the refactoring described above.

Cc: Christoph Hellwig 
Suggested-by: Jérôme Glisse 
Reviewed-by: Dan Williams 
Reviewed-by: Jan Kara 
Signed-off-by: Ira Weiny 
Signed-off-by: John Hubbard 
---
 include/linux/mm.h | 18 +-
 mm/memremap.c  | 16 ++--
 mm/swap.c  | 27 ++-
 3 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c97ea3b694e6..87b54126e46d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -952,9 +952,10 @@ static inline bool is_zone_device_page(const struct page 
*page)
 #endif
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page);
+void free_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-static inline bool put_devmap_managed_page(struct page *page)
+
+static inline bool page_is_devmap_managed(struct page *page)
 {
	if (!static_branch_unlikely(&devmap_managed_key))
return false;
@@ -963,7 +964,6 @@ static inline bool put_devmap_managed_page(struct page 
*page)
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_FS_DAX:
-   __put_devmap_managed_page(page);
return true;
default:
break;
@@ -971,11 +971,17 @@ static inline bool put_devmap_managed_page(struct page 
*page)
return false;
 }
 
+void put_devmap_managed_page(struct page *page);
+
 #else /* CONFIG_DEV_PAGEMAP_OPS */
-static inline bool put_devmap_managed_page(struct page *page)
+static inline bool page_is_devmap_managed(struct page *page)
 {
return false;
 }
+
+static inline void put_devmap_managed_page(struct page *page)
+{
+}
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
 static inline bool is_device_private_page(const struct page *page)
@@ -1028,8 +1034,10 @@ static inline void put_page(struct page *page)
 * need to inform the device driver through callback. See
 * include/linux/memremap.h and HMM for details.
 */
-   if (put_devmap_managed_page(page))
+   if (page_is_devmap_managed(page)) {
+   put_devmap_managed_page(page);
return;
+   }
 
if (put_page_testzero(page))
__put_page(page);
diff --git a/mm/memremap.c b/mm/memremap.c
index e899fa876a62..2ba773859031 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -411,20 +411,8 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 EXPORT_SYMBOL_GPL(get_dev_pagemap);
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page)
+void free_devmap_managed_page(struct page *page)
 {
-   int count = page_ref_dec_return(page);
-
-   /* still busy */
-   if (count > 1)
-   return;
-
-   /* only triggered by the dev_pagemap shutdown path */
-   if (count == 0) {
-   __put_page(page);
-   return;
-   }
-
/* notify page idle for dax */
if (!is_device_private_page(page)) {
	wake_up_var(&page->_refcount);
@@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page)
page->mapping = NULL;
page->pgmap->ops->page_free(page);
 }
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(free_devmap_managed_page);
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/mm/swap.c b/mm/swap.c
index 5341ae93861f..cf39d24ada2a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr)
 * processing, and instead, expect a call to
 * put_page_testzero().
 */
-   if (put_devmap_managed_page(page))
+   if (page_is_devmap_managed(page)) {
+   put_devmap_managed_page(page);
continue;
+   }
}
 
page = 

Re: [PATCH V3 01/13] powerpc/vas: Describe vas-port and interrupts properties

2019-12-18 Thread Rob Herring
On Mon, Dec 16, 2019 at 09:48:40PM -0800, Haren Myneni wrote:
> 

Commit message?

> Signed-off-by: Haren Myneni 

Your author and S-o-b emails don't match.

> ---
>  Documentation/devicetree/bindings/powerpc/ibm,vas.txt | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt 
> b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> index bf11d2f..12de08b 100644
> --- a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> @@ -11,6 +11,8 @@ Required properties:
>window context start and length, OS/User window context start and length,
>"Paste address" start and length, "Paste window id" start bit and number
>of bits)
> +- ibm,vas-port : Port address for the interrupt.

64-bit?

> +- interrupts: IRQ value for each VAS instance and level.
>  
>  Example:
>  
> @@ -18,5 +20,8 @@ Example:
>   compatible = "ibm,vas", "ibm,power9-vas";
>   reg = <0x60191 0x200 0x60190 0x1 
> 0x8 0x1 0x20 0x10>;
>   name = "vas";
> + interrupts = <0x1f 0>;
> + interrupt-parent = <>;
>   ibm,vas-id = <0x1>;
> + ibm,vas-port = <0x601000100>;
>   };
> -- 
> 1.8.3.1
> 
> 
> 


Re: [PATCH v11 01/25] mm/gup: factor out duplicate code from four routines

2019-12-18 Thread John Hubbard

On 12/18/19 7:52 AM, Kirill A. Shutemov wrote:

On Mon, Dec 16, 2019 at 02:25:13PM -0800, John Hubbard wrote:

+static void put_compound_head(struct page *page, int refs)
+{
+   /* Do a get_page() first, in case refs == page->_refcount */
+   get_page(page);
+   page_ref_sub(page, refs);
+   put_page(page);
+}


It's not terribly efficient. Maybe something like:

VM_BUG_ON_PAGE(page_ref_count(page) < ref, page);
if (refs > 2)
page_ref_sub(page, refs - 1);
put_page(page);

?


OK, but how about this instead? I don't see the need for a "2", as that
is a magic number that requires explanation. Whereas "1" is not a magic
number--here it means: either there are "many" (>1) refs, or not.

And the routine won't be called with refs less than about 32 (2MB huge
page, 64KB base page == 32 subpages) anyway.

VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
/*
 * Calling put_page() for each ref is unnecessarily slow. Only the last
 * ref needs a put_page().
 */
if (refs > 1)
page_ref_sub(page, refs - 1);
put_page(page);


thanks,
--
John Hubbard
NVIDIA
 


Re: [PATCH V3 01/13] powerpc/vas: Describe vas-port and interrupts properties

2019-12-18 Thread Haren Myneni
On Wed, 2019-12-18 at 16:18 -0600, Rob Herring wrote:
> On Mon, Dec 16, 2019 at 09:48:40PM -0800, Haren Myneni wrote:
> > 
> 
> Commit message?
> 
> > Signed-off-by: Haren Myneni 
> 
> Your author and S-o-b emails don't match.

Thanks. Oliver suggested assigning the IRQ in the kernel instead of skiboot. In
that case, we may not need this patch.
> 
> > ---
> >  Documentation/devicetree/bindings/powerpc/ibm,vas.txt | 5 +
> >  1 file changed, 5 insertions(+)
> > 
> > diff --git a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt 
> > b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> > index bf11d2f..12de08b 100644
> > --- a/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> > +++ b/Documentation/devicetree/bindings/powerpc/ibm,vas.txt
> > @@ -11,6 +11,8 @@ Required properties:
> >window context start and length, OS/User window context start and length,
> >"Paste address" start and length, "Paste window id" start bit and number
> >of bits)
> > +- ibm,vas-port : Port address for the interrupt.
> 
> 64-bit?
> 
> > +- interrupts: IRQ value for each VAS instance and level.
> >  
> >  Example:
> >  
> > @@ -18,5 +20,8 @@ Example:
> > compatible = "ibm,vas", "ibm,power9-vas";
> > reg = <0x60191 0x200 0x60190 0x1 
> > 0x8 0x1 0x20 0x10>;
> > name = "vas";
> > +   interrupts = <0x1f 0>;
> > +   interrupt-parent = <>;
> > ibm,vas-id = <0x1>;
> > +   ibm,vas-port = <0x601000100>;
> > };
> > -- 
> > 1.8.3.1
> > 
> > 
> > 




Re: [PATCH 04/14] powerpc/vas: Setup IRQ mapping and register port for each window

2019-12-18 Thread Haren Myneni
On Wed, 2019-12-18 at 18:18 +1100, Oliver O'Halloran wrote:
> On Wed, Nov 27, 2019 at 12:07 PM Haren Myneni  
> wrote:
> >
> > *snip*
> >
> > @@ -36,7 +62,18 @@ static int init_vas_instance(struct platform_device 
> > *pdev)
> > return -ENODEV;
> > }
> >
> > -   if (pdev->num_resources != 4) {
> > +   rc = of_property_read_u64(dn, "ibm,vas-port", );
> > +   if (rc) {
> > +   pr_err("No ibm,vas-port property for %s?\n", pdev->name);
> > +   /* No interrupts property */
> > +   nresources = 4;
> > +   }
> > +
> > +   /*
> > +* interrupts property is available with 'ibm,vas-port' property.
> > +* 4 Resources and 1 IRQ if interrupts property is available.
> > +*/
> > +   if (pdev->num_resources != nresources) {
> > pr_err("Unexpected DT configuration for [%s, %d]\n",
> > pdev->name, vasid);
> > return -ENODEV;
> 
> Right, so adding the IRQ in firmware will break the VAS driver in
> existing kernels since it changes the resource count. This is IMO a
> bug in the VAS driver that you should fix, but it does mean we need to
> think twice about having firmware assign an interrupt at boot.

Correct. Hence the vas-user-space nvram switch was added in skiboot.

> 
> I had a closer look at this series and I'm not convinced that any
> firmware changes are actually required either. We already have OPAL
> calls for allocating an hwirq for the kernel to use and for getting
> the IRQ's XIVE trigger port (see pnv_ocxl_alloc_xive_irq() for an
> example). Why not use those here too? Doing so would allow us to
> assign interrupts to individual windows too which might be useful for
> the windows used by the kernel.

Thanks for the pointer. Like using pnv_ocxl_alloc_xive_irq(), we can
disregard the FW change. BTW, VAS fault handling is needed only for user
space VAS windows.

 int vas_alloc_xive_irq(u32 chipid, u32 *irq, u64 *trigger_addr)
{
__be64 flags, trigger_page;
u32 hwirq;
s64 rc;

hwirq = opal_xive_allocate_irq_raw(chipid);
if (hwirq < 0)
return -ENOENT;

rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page,
NULL, NULL);
if (rc || !trigger_page) {
xive_native_free_irq(hwirq);
return -ENOENT;
}

*irq = hwirq;
*trigger_addr = be64_to_cpu(trigger_page);
return 0;
}

We can have a common function for VAS and cxl, except that per-chip IRQ
allocation is needed for each VAS instance. I will post a patch set with
this change.

Thanks
Haren




[PATCH v4 4/4] powerpc: Book3S 64-bit "heavyweight" KASAN support

2019-12-18 Thread Daniel Axtens
KASAN support on Book3S is a bit tricky to get right:

 - It would be good to support inline instrumentation so as to be able to
   catch stack issues that cannot be caught with outline mode.

 - Inline instrumentation requires a fixed offset.

 - Book3S runs code in real mode after booting. Most notably a lot of KVM
   runs in real mode, and it would be good to be able to instrument it.

 - Because code runs in real mode after boot, the offset has to point to
   valid memory both in and out of real mode.

[ppc64 mm note: The kernel installs a linear mapping at effective
address c000... onward. This is a one-to-one mapping with physical
memory from ... onward. Because of how memory accesses work on
powerpc 64-bit Book3S, a kernel pointer in the linear map accesses the
same memory both with translations on (accessing as an 'effective
address'), and with translations off (accessing as a 'real
address'). This works in both guests and the hypervisor. For more
details, see s5.7 of Book III of version 3 of the ISA, in particular
the Storage Control Overview, s5.7.3, and s5.7.5 - noting that this
KASAN implementation currently only supports Radix.]

One approach is just to give up on inline instrumentation. This way all
checks can be delayed until after everything is set up correctly, and the
address-to-shadow calculations can be overridden. However, the features and
speed boost provided by inline instrumentation are worth trying to do
better.

If _at compile time_ it is known how much contiguous physical memory a
system has, the top 1/8th of the first block of physical memory can be set
aside for the shadow. This is a big hammer and comes with 3 big
consequences:

 - there's no nice way to handle physically discontiguous memory, so only
   the first physical memory block can be used.

 - kernels will simply fail to boot on machines with less memory than
   specified when compiling.

 - kernels running on machines with more memory than specified when
   compiling will simply ignore the extra memory.
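
(As a rough sizing example: with the default PHYS_MEM_SIZE_FOR_KASAN of 1024MB
mentioned in the changelog below and the generic KASAN shadow scale of 1/8,
the top 1024/8 = 128MB of that first 1GB block is set aside for shadow,
leaving 896MB for normal use; memory beyond the first 1GB is simply ignored,
per the last point above.)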

Implement and document KASAN this way. The current implementation is Radix
only.

Despite the limitations, it can still find bugs,
e.g. http://patchwork.ozlabs.org/patch/1103775/

At the moment, this physical memory limit must be set _even for outline
mode_. This may be changed in a later series - a different implementation
could be added for outline mode that dynamically allocates shadow at a
fixed offset. For example, see https://patchwork.ozlabs.org/patch/795211/

Suggested-by: Michael Ellerman 
Cc: Balbir Singh  # ppc64 out-of-line radix version
Cc: Christophe Leroy  # ppc32 version
Signed-off-by: Daniel Axtens 

---
Changes since v3:
 - Address further feedback from Christophe.
 - Drop changes to stack walking, it looks like the issue I observed is
   related to that particular stack, not stack-walking generally.

Changes since v2:

 - Address feedback from Christophe around cleanups and docs.
 - Address feedback from Balbir: at this point I don't have a good solution
   for the issues you identify around the limitations of the inline 
implementation
   but I think that it's worth trying to get the stack instrumentation support.
   I'm happy to have an alternative and more flexible outline mode - I had
   envisioned this would be called 'lightweight' mode as it imposes fewer 
restrictions.
   I've linked to your implementation. I think it's best to add it in a 
follow-up series.
 - Made the default PHYS_MEM_SIZE_FOR_KASAN value 1024MB. I think most people 
have
   guests with at least that much memory in the Radix 64s case so it's a much
   saner default - it means that if you just turn on KASAN without reading the
   docs you're much more likely to have a bootable kernel, which you will never
   have if the value is set to zero! I'm happy to bikeshed the value if we want.

Changes since v1:
 - Landed kasan vmalloc support upstream
 - Lots of feedback from Christophe.

Changes since the rfc:

 - Boots real and virtual hardware, kvm works.

 - disabled reporting when we're checking the stack for exception
   frames. The behaviour isn't wrong, just incompatible with KASAN.

 - Documentation!

 - Dropped old module stuff in favour of KASAN_VMALLOC.

The bugs with ftrace and kuap were due to kernel bloat pushing
prom_init calls to be done via the plt. Because we did not have
a relocatable kernel, and they are done very early, this caused
everything to explode. Compile with CONFIG_RELOCATABLE!
---
 Documentation/dev-tools/kasan.rst|   8 +-
 Documentation/powerpc/kasan.txt  | 112 ++-
 arch/powerpc/Kconfig |   2 +
 arch/powerpc/Kconfig.debug   |  21 
 arch/powerpc/Makefile|  11 ++
 arch/powerpc/include/asm/book3s/64/hash.h|   4 +
 arch/powerpc/include/asm/book3s/64/pgtable.h |   7 ++
 arch/powerpc/include/asm/book3s/64/radix.h   |   5 +
 

[PATCH v4 3/4] powerpc/mm/kasan: rename kasan_init_32.c to init_32.c

2019-12-18 Thread Daniel Axtens
kasan is already implied by the directory name; we don't need to
repeat it.

Suggested-by: Christophe Leroy 
Signed-off-by: Daniel Axtens 
---
 arch/powerpc/mm/kasan/Makefile   | 2 +-
 arch/powerpc/mm/kasan/{kasan_init_32.c => init_32.c} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename arch/powerpc/mm/kasan/{kasan_init_32.c => init_32.c} (100%)

diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
index 6577897673dd..36a4e1b10b2d 100644
--- a/arch/powerpc/mm/kasan/Makefile
+++ b/arch/powerpc/mm/kasan/Makefile
@@ -2,4 +2,4 @@
 
 KASAN_SANITIZE := n
 
-obj-$(CONFIG_PPC32)   += kasan_init_32.o
+obj-$(CONFIG_PPC32)   += init_32.o
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c 
b/arch/powerpc/mm/kasan/init_32.c
similarity index 100%
rename from arch/powerpc/mm/kasan/kasan_init_32.c
rename to arch/powerpc/mm/kasan/init_32.c
-- 
2.20.1



[PATCH v4 1/4] kasan: define and use MAX_PTRS_PER_* for early shadow tables

2019-12-18 Thread Daniel Axtens
powerpc has a variable number of PTRS_PER_*, set at runtime based
on the MMU that the kernel is booted under.

This means the PTRS_PER_* values are no longer compile-time constants,
which breaks the build.

Define default MAX_PTRS_PER_*s in the same style as MAX_PTRS_PER_P4D.
As KASAN is the only user at the moment, just define them in the kasan
header, and have them default to PTRS_PER_* unless overridden in arch
code.

Suggested-by: Christophe Leroy 
Suggested-by: Balbir Singh 
Reviewed-by: Christophe Leroy 
Reviewed-by: Balbir Singh 
Signed-off-by: Daniel Axtens 
---
 include/linux/kasan.h | 18 +++---
 mm/kasan/init.c   |  6 +++---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index e18fe54969e9..70865810d0e7 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -14,10 +14,22 @@ struct task_struct;
 #include 
 #include 
 
+#ifndef MAX_PTRS_PER_PTE
+#define MAX_PTRS_PER_PTE PTRS_PER_PTE
+#endif
+
+#ifndef MAX_PTRS_PER_PMD
+#define MAX_PTRS_PER_PMD PTRS_PER_PMD
+#endif
+
+#ifndef MAX_PTRS_PER_PUD
+#define MAX_PTRS_PER_PUD PTRS_PER_PUD
+#endif
+
 extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
-extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE];
-extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
-extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
+extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE];
+extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD];
+extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD];
 extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];
 
 int kasan_populate_early_shadow(const void *shadow_start,
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ce45c491ebcd..8b54a96d3b3e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -46,7 +46,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
 static inline bool kasan_pud_table(p4d_t p4d)
 {
return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
@@ -58,7 +58,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
 }
 #endif
 #if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
 static inline bool kasan_pmd_table(pud_t pud)
 {
return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
@@ -69,7 +69,7 @@ static inline bool kasan_pmd_table(pud_t pud)
return false;
 }
 #endif
-pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
+pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE] __page_aligned_bss;
 
 static inline bool kasan_pte_table(pmd_t pmd)
 {
-- 
2.20.1



[PATCH v4 0/4] KASAN for powerpc64 radix

2019-12-18 Thread Daniel Axtens
Building on the work of Christophe, Aneesh and Balbir, I've ported
KASAN to 64-bit Book3S kernels running on the Radix MMU.

This provides full inline instrumentation on radix, but does require
that you be able to specify the amount of physically contiguous memory
on the system at compile time. More details in patch 4.
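
As a rough illustration of what that compile-time size costs (assuming the
generic KASAN scale of 8 bytes of memory per shadow byte; this is a
stand-alone sketch, not kernel code):

#include <stdio.h>

/* Generic KASAN maps 8 bytes of memory to 1 shadow byte
 * (KASAN_SHADOW_SCALE_SHIFT == 3), so the shadow region carved out of
 * the first contiguous block is 1/8th of the configured size.
 */
#define KASAN_SHADOW_SCALE_SHIFT 3

int main(void)
{
	unsigned long long mem_mb = 1024;	/* example: 1024 MB configured */
	unsigned long long shadow_mb = mem_mb >> KASAN_SHADOW_SCALE_SHIFT;

	printf("%llu MB of memory -> %llu MB reserved for shadow\n",
	       mem_mb, shadow_mb);
	return 0;
}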

v4: More cleanups, split renaming out, clarify bits and bobs.
Drop the stack walk disablement, that isn't needed. No other
functional change.

v3: Reduce the overly ambitious scope of the MAX_PTRS change.
Document more things, including around why some of the
restrictions apply.
Clean up the code more, thanks Christophe.

v2: The big change is the introduction of tree-wide(ish)
MAX_PTRS_PER_{PTE,PMD,PUD} macros in preference to the previous
approach, which was for the arch to override the page table array
definitions with their own. (And I squashed the annoying
intermittent crash!)

Apart from that there's just a lot of cleanup. Christophe, I've
addressed most of what you asked for and I will reply to your v1
emails to clarify what remains unchanged.

Daniel Axtens (4):
  kasan: define and use MAX_PTRS_PER_* for early shadow tables
  kasan: Document support on 32-bit powerpc
  powerpc/mm/kasan: rename kasan_init_32.c to init_32.c
  powerpc: Book3S 64-bit "heavyweight" KASAN support

 Documentation/dev-tools/kasan.rst |   7 +-
 Documentation/powerpc/kasan.txt   | 122 ++
 arch/powerpc/Kconfig  |   2 +
 arch/powerpc/Kconfig.debug|  21 +++
 arch/powerpc/Makefile |  11 ++
 arch/powerpc/include/asm/book3s/64/hash.h |   4 +
 arch/powerpc/include/asm/book3s/64/pgtable.h  |   7 +
 arch/powerpc/include/asm/book3s/64/radix.h|   5 +
 arch/powerpc/include/asm/kasan.h  |  21 ++-
 arch/powerpc/kernel/prom.c|  61 -
 arch/powerpc/mm/kasan/Makefile|   3 +-
 .../mm/kasan/{kasan_init_32.c => init_32.c}   |   0
 arch/powerpc/mm/kasan/init_book3s_64.c|  70 ++
 arch/powerpc/platforms/Kconfig.cputype|   1 +
 include/linux/kasan.h |  18 ++-
 mm/kasan/init.c   |   6 +-
 16 files changed, 346 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/powerpc/kasan.txt
 rename arch/powerpc/mm/kasan/{kasan_init_32.c => init_32.c} (100%)
 create mode 100644 arch/powerpc/mm/kasan/init_book3s_64.c

-- 
2.20.1



"ftrace: Rework event_create_dir()" triggers boot error messages

2019-12-18 Thread Qian Cai
The linux-next commit "ftrace: Rework event_create_dir()" [1] triggers boot
warnings for Clang-built (Clang version 8.0.1) kernels (reproduced on both
arm64 and powerpc). Reverting it (with trivial conflict fixes) on top of
today's linux-next fixed the issue.

configs:
https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config
https://raw.githubusercontent.com/cailca/linux-mm/master/powerpc.config

[1] https://lore.kernel.org/lkml/2019132458.342979...@infradead.org/

[  115.799327][T1] Registered efivars operations
[  115.849770][T1] clocksource: Switched to clocksource arch_sys_counter
[  115.901145][T1] Could not initialize trace point 
events/sys_enter_rt_sigreturn
[  115.908854][T1] Could not create directory for event 
sys_enter_rt_sigreturn
[  115.998949][T1] Could not initialize trace point 
events/sys_enter_restart_syscall
[  116.006802][T1] Could not create directory for event 
sys_enter_restart_syscall
[  116.062702][T1] Could not initialize trace point events/sys_enter_getpid
[  116.069828][T1] Could not create directory for event sys_enter_getpid
[  116.078058][T1] Could not initialize trace point events/sys_enter_gettid
[  116.085181][T1] Could not create directory for event sys_enter_gettid
[  116.093405][T1] Could not initialize trace point events/sys_enter_getppid
[  116.100612][T1] Could not create directory for event sys_enter_getppid
[  116.108989][T1] Could not initialize trace point events/sys_enter_getuid
[  116.116058][T1] Could not create directory for event sys_enter_getuid
[  116.124250][T1] Could not initialize trace point events/sys_enter_geteuid
[  116.131457][T1] Could not create directory for event sys_enter_geteuid
[  116.139840][T1] Could not initialize trace point events/sys_enter_getgid
[  116.146908][T1] Could not create directory for event sys_enter_getgid
[  116.155163][T1] Could not initialize trace point events/sys_enter_getegid
[  116.162370][T1] Could not create directory for event sys_enter_getegid
[  116.178015][T1] Could not initialize trace point events/sys_enter_setsid
[  116.185138][T1] Could not create directory for event sys_enter_setsid
[  116.269307][T1] Could not initialize trace point 
events/sys_enter_sched_yield
[  116.276811][T1] Could not create directory for event 
sys_enter_sched_yield
[  116.527652][T1] Could not initialize trace point 
events/sys_enter_munlockall
[  116.535126][T1] Could not create directory for event sys_enter_munlockall
[  116.622096][T1] Could not initialize trace point events/sys_enter_vhangup
[  116.629307][T1] Could not create directory for event sys_enter_vhangup
[  116.783867][T1] Could not initialize trace point events/sys_enter_sync
[  116.790819][T1] Could not create directory for event sys_enter_sync
[  117.723402][T1] pnp: PnP ACPI init
[  117.736379][T1] system 00:00: [mem 0x3000-0x3fff window] could 
not be reserved
[  126.020353][T1] pnp: PnP ACPI: found 1 devices
[  126.093919][T1] NET: Registered protocol family 2
[  126.180007][T1] tcp_listen_portaddr_hash hash table entries: 65536 
(order: 6, 4718592 bytes, vmalloc)
[  126.206510][T1] TCP established hash table entries: 524288 (order: 6, 
4194304 bytes, vmalloc)
[  126.227766][T1] TCP bind hash table entries: 65536 (order: 6, 4194304 
bytes, vmalloc)
[  126.240146][T1] TCP: Hash tables configured (established 524288 bind 
65536)

Re: "ftrace: Rework event_create_dir()" triggers boot error messages

2019-12-18 Thread Steven Rostedt
On Wed, 18 Dec 2019 22:58:23 -0500
Qian Cai  wrote:

> The linux-next commit "ftrace: Rework event_create_dir()" [1] triggers boot
> warnings for Clang-built (Clang version 8.0.1) kernels (reproduced on both
> arm64 and powerpc). Reverting it (with trivial conflict fixes) on top of
> today's linux-next fixed the issue.
> 
> configs:
> https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config
> https://raw.githubusercontent.com/cailca/linux-mm/master/powerpc.config
> 
> [1] https://lore.kernel.org/lkml/2019132458.342979...@infradead.org/
> 
> [  115.799327][T1] Registered efivars operations
> [  115.849770][T1] clocksource: Switched to clocksource arch_sys_counter
> [  115.901145][T1] Could not initialize trace point 
> events/sys_enter_rt_sigreturn
> [  115.908854][T1] Could not create directory for event 
> sys_enter_rt_sigreturn
> [  115.998949][T1] Could not initialize trace point 
> events/sys_enter_restart_syscall
> [  116.006802][T1] Could not create directory for event 
> sys_enter_restart_syscall
> [  116.062702][T1] Could not initialize trace point 
> events/sys_enter_getpid
> [  116.069828][T1] Could not create directory for event sys_enter_getpid
> [  116.078058][T1] Could not initialize trace point 
> events/sys_enter_gettid
> [  116.085181][T1] Could not create directory for event sys_enter_gettid
> [  116.093405][T1] Could not initialize trace point 
> events/sys_enter_getppid
> [  116.100612][T1] Could not create directory for event sys_enter_getppid
> [  116.108989][T1] Could not initialize trace point 
> events/sys_enter_getuid
> [  116.116058][T1] Could not create directory for event sys_enter_getuid
> [  116.124250][T1] Could not initialize trace point 
> events/sys_enter_geteuid
> [  116.131457][T1] Could not create directory for event sys_enter_geteuid
> [  116.139840][T1] Could not initialize trace point 
> events/sys_enter_getgid
> [  116.146908][T1] Could not create directory for event sys_enter_getgid
> [  116.155163][T1] Could not initialize trace point 
> events/sys_enter_getegid
> [  116.162370][T1] Could not create directory for event sys_enter_getegid
> [  116.178015][T1] Could not initialize trace point 
> events/sys_enter_setsid
> [  116.185138][T1] Could not create directory for event sys_enter_setsid
> [  116.269307][T1] Could not initialize trace point 
> events/sys_enter_sched_yield
> [  116.276811][T1] Could not create directory for event 
> sys_enter_sched_yield
> [  116.527652][T1] Could not initialize trace point 
> events/sys_enter_munlockall
> [  116.535126][T1] Could not create directory for event 
> sys_enter_munlockall
> [  116.622096][T1] Could not initialize trace point 
> events/sys_enter_vhangup
> [  116.629307][T1] Could not create directory for event sys_enter_vhangup
> [  116.783867][T1] Could not initialize trace point events/sys_enter_sync
> [  116.790819][T1] Could not create directory for event sys_enter_sync
> [  117.723402][T1] pnp: PnP ACPI init

I noticed that all of the above have zero parameters, which means the
for loop in syscall_enter_define_fields() never runs and "ret" is
returned uninitialized. Does the following patch fix it?

(note, I prefer "ret" and "i" on different lines anyway)

-- Steve

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 53935259f701..abb70c71fe60 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -269,7 +269,8 @@ static int __init syscall_enter_define_fields(struct 
trace_event_call *call)
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
int offset = offsetof(typeof(trace), args);
-   int ret, i;
+   int ret = 0;
+   int i;
 
for (i = 0; i < meta->nb_args; i++) {
ret = trace_define_field(call, meta->types[i],


Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-12-18 Thread Dan Williams
On Mon, Dec 16, 2019 at 2:26 PM John Hubbard  wrote:
>
> An upcoming patch changes and complicates the refcounting and
> especially the "put page" aspects of it. In order to keep
> everything clean, refactor the devmap page release routines:
>
> * Rename put_devmap_managed_page() to page_is_devmap_managed(),
>   and limit the functionality to "read only": return a bool,
>   with no side effects.
>
> * Add a new routine, put_devmap_managed_page(), to handle checking
>   what kind of page it is, and what kind of refcount handling it
>   requires.
>
> * Rename __put_devmap_managed_page() to free_devmap_managed_page(),
>   and limit the functionality to unconditionally freeing a devmap
>   page.
>
> This is originally based on a separate patch by Ira Weiny, which
> applied to an early version of the put_user_page() experiments.
> Since then, Jérôme Glisse suggested the refactoring described above.
>
> Cc: Christoph Hellwig 
> Suggested-by: Jérôme Glisse 
> Reviewed-by: Dan Williams 
> Reviewed-by: Jan Kara 
> Signed-off-by: Ira Weiny 
> Signed-off-by: John Hubbard 
> ---
>  include/linux/mm.h | 17 +
>  mm/memremap.c  | 16 ++--
>  mm/swap.c  | 24 
>  3 files changed, 39 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index c97ea3b694e6..77a4df06c8a7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -952,9 +952,10 @@ static inline bool is_zone_device_page(const struct page 
> *page)
>  #endif
>
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page);
> +void free_devmap_managed_page(struct page *page);
>  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
> -static inline bool put_devmap_managed_page(struct page *page)
> +
> +static inline bool page_is_devmap_managed(struct page *page)
>  {
> if (!static_branch_unlikely(&devmap_managed_key))
> return false;
> @@ -963,7 +964,6 @@ static inline bool put_devmap_managed_page(struct page 
> *page)
> switch (page->pgmap->type) {
> case MEMORY_DEVICE_PRIVATE:
> case MEMORY_DEVICE_FS_DAX:
> -   __put_devmap_managed_page(page);
> return true;
> default:
> break;
> @@ -971,7 +971,14 @@ static inline bool put_devmap_managed_page(struct page 
> *page)
> return false;
>  }
>
> +bool put_devmap_managed_page(struct page *page);
> +
>  #else /* CONFIG_DEV_PAGEMAP_OPS */
> +static inline bool page_is_devmap_managed(struct page *page)
> +{
> +   return false;
> +}
> +
>  static inline bool put_devmap_managed_page(struct page *page)
>  {
> return false;
> @@ -1028,8 +1035,10 @@ static inline void put_page(struct page *page)
>  * need to inform the device driver through callback. See
>  * include/linux/memremap.h and HMM for details.
>  */
> -   if (put_devmap_managed_page(page))
> +   if (page_is_devmap_managed(page)) {
> +   put_devmap_managed_page(page);
> return;
> +   }
>
> if (put_page_testzero(page))
> __put_page(page);
> diff --git a/mm/memremap.c b/mm/memremap.c
> index e899fa876a62..2ba773859031 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -411,20 +411,8 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
>  EXPORT_SYMBOL_GPL(get_dev_pagemap);
>
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page)
> +void free_devmap_managed_page(struct page *page)
>  {
> -   int count = page_ref_dec_return(page);
> -
> -   /* still busy */
> -   if (count > 1)
> -   return;
> -
> -   /* only triggered by the dev_pagemap shutdown path */
> -   if (count == 0) {
> -   __put_page(page);
> -   return;
> -   }
> -
> /* notify page idle for dax */
> if (!is_device_private_page(page)) {
> wake_up_var(&page->_refcount);
> @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page)
> page->mapping = NULL;
> page->pgmap->ops->page_free(page);
>  }
> -EXPORT_SYMBOL(__put_devmap_managed_page);
> +EXPORT_SYMBOL(free_devmap_managed_page);

This patch does not have a module consumer for
free_devmap_managed_page(), so the export should move to the patch
that needs the new export.

Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL
instead of EXPORT_SYMBOL_GPL is that there was no practical way to
hide the devmap details from every module in the kernel that did
put_page(). I would expect free_devmap_managed_page() to be
EXPORT_SYMBOL_GPL if it is not inlined into an existing exported
static inline api.


Re: "ftrace: Rework event_create_dir()" triggers boot error messages

2019-12-18 Thread Qian Cai



> On Dec 18, 2019, at 11:31 PM, Steven Rostedt  wrote:
> 
> On Wed, 18 Dec 2019 22:58:23 -0500
> Qian Cai  wrote:
> 
>> The linux-next commit "ftrace: Rework event_create_dir()" [1] triggers boot
>> warnings for Clang-built (Clang version 8.0.1) kernels (reproduced on both
>> arm64 and powerpc). Reverting it (with trivial conflict fixes) on top of
>> today's linux-next fixed the issue.
>> 
>> configs:
>> https://raw.githubusercontent.com/cailca/linux-mm/master/arm64.config
>> https://raw.githubusercontent.com/cailca/linux-mm/master/powerpc.config
>> 
>> [1] https://lore.kernel.org/lkml/2019132458.342979...@infradead.org/
>> 
>> [  115.799327][T1] Registered efivars operations
>> [  115.849770][T1] clocksource: Switched to clocksource arch_sys_counter
>> [  115.901145][T1] Could not initialize trace point 
>> events/sys_enter_rt_sigreturn
>> [  115.908854][T1] Could not create directory for event 
>> sys_enter_rt_sigreturn
>> [  115.998949][T1] Could not initialize trace point 
>> events/sys_enter_restart_syscall
>> [  116.006802][T1] Could not create directory for event 
>> sys_enter_restart_syscall
>> [  116.062702][T1] Could not initialize trace point 
>> events/sys_enter_getpid
>> [  116.069828][T1] Could not create directory for event sys_enter_getpid
>> [  116.078058][T1] Could not initialize trace point 
>> events/sys_enter_gettid
>> [  116.085181][T1] Could not create directory for event sys_enter_gettid
>> [  116.093405][T1] Could not initialize trace point 
>> events/sys_enter_getppid
>> [  116.100612][T1] Could not create directory for event sys_enter_getppid
>> [  116.108989][T1] Could not initialize trace point 
>> events/sys_enter_getuid
>> [  116.116058][T1] Could not create directory for event sys_enter_getuid
>> [  116.124250][T1] Could not initialize trace point 
>> events/sys_enter_geteuid
>> [  116.131457][T1] Could not create directory for event sys_enter_geteuid
>> [  116.139840][T1] Could not initialize trace point 
>> events/sys_enter_getgid
>> [  116.146908][T1] Could not create directory for event sys_enter_getgid
>> [  116.155163][T1] Could not initialize trace point 
>> events/sys_enter_getegid
>> [  116.162370][T1] Could not create directory for event sys_enter_getegid
>> [  116.178015][T1] Could not initialize trace point 
>> events/sys_enter_setsid
>> [  116.185138][T1] Could not create directory for event sys_enter_setsid
>> [  116.269307][T1] Could not initialize trace point 
>> events/sys_enter_sched_yield
>> [  116.276811][T1] Could not create directory for event 
>> sys_enter_sched_yield
>> [  116.527652][T1] Could not initialize trace point 
>> events/sys_enter_munlockall
>> [  116.535126][T1] Could not create directory for event 
>> sys_enter_munlockall
>> [  116.622096][T1] Could not initialize trace point 
>> events/sys_enter_vhangup
>> [  116.629307][T1] Could not create directory for event sys_enter_vhangup
>> [  116.783867][T1] Could not initialize trace point events/sys_enter_sync
>> [  116.790819][T1] Could not create directory for event sys_enter_sync
>> [  117.723402][T1] pnp: PnP ACPI init
> 
> I noticed that all of the above have zero parameters. Does the
> following patch fix it?

Yes, it works.

> 
> (note, I prefer "ret" and "i" on different lines anyway)
> 
> -- Steve
> 
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index 53935259f701..abb70c71fe60 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -269,7 +269,8 @@ static int __init syscall_enter_define_fields(struct 
> trace_event_call *call)
>   struct syscall_trace_enter trace;
>   struct syscall_metadata *meta = call->data;
>   int offset = offsetof(typeof(trace), args);
> - int ret, i;
> + int ret = 0;
> + int i;
> 
>   for (i = 0; i < meta->nb_args; i++) {
>   ret = trace_define_field(call, meta->types[i],



Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-12-18 Thread Dan Williams
On Wed, Dec 18, 2019 at 9:51 PM John Hubbard  wrote:
>
> On 12/18/19 9:27 PM, Dan Williams wrote:
> ...
> >> @@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page)
> >>  page->mapping = NULL;
> >>  page->pgmap->ops->page_free(page);
> >>   }
> >> -EXPORT_SYMBOL(__put_devmap_managed_page);
> >> +EXPORT_SYMBOL(free_devmap_managed_page);
> >
> > This patch does not have a module consumer for
> > free_devmap_managed_page(), so the export should move to the patch
> > that needs the new export.
>
> Hi Dan,
>
> OK, I know that's a policy--although it seems quite pointless here given
> that this is definitely going to need an EXPORT.
>
> At the moment, the series doesn't use it in any module at all, so I'll just
> delete the EXPORT for now.
>
> >
> > Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL
> > instead of EXPORT_SYMBOL_GPL is that there was no practical way to
> > hide the devmap details from every module in the kernel that did
> > put_page(). I would expect free_devmap_managed_page() to be
> > EXPORT_SYMBOL_GPL if it is not inlined into an existing exported
> > static inline api.
> >
>
> Sure, I'll change it to EXPORT_SYMBOL_GPL when the time comes. We do have
> to be careful that we don't shut out normal put_page() types of callers,
> but...glancing through the current callers, that doesn't look to be a problem.
> Good. So it should be OK to do EXPORT_SYMBOL_GPL here.
>
> Are you *sure* you don't want to just pre-emptively EXPORT now, and save
> looking at it again?

I'm positive. There is enough history for "trust me the consumer is
coming" turning out not to be true to justify the hassle in my mind. I
do trust you, but things happen.


Re: [PATCH] powerpc/setup_64: use DEFINE_DEBUGFS_ATTRIBUTE to define fops_rfi_flush

2019-12-18 Thread Chen Zhou
Hi Michael,

On 2019/12/18 19:02, Michael Ellerman wrote:
> Chen Zhou  writes:
>> Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE for
>> debugfs files.
>>
>> Semantic patch information:
>> Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file()
>> imposes some significant overhead as compared to
>> DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe().
> 
> I know you didn't write this text, but these change logs are not great.
> It doesn't really explain why you're doing it. And it is alarming that
> you're converting *to* a function with "unsafe" in the name.
> 
> The commit that added the script:
> 
>   5103068eaca2 ("debugfs, coccinelle: check for obsolete 
> DEFINE_SIMPLE_ATTRIBUTE() usage")
> 
> Has a bit more explanation.
> 
> Maybe something like this:
> 
>   In order to protect against file removal races, debugfs files created via
>   debugfs_create_file() are wrapped by a struct file_operations at their
>   opening.
>   
>   If the original struct file_operations is known to be safe against removal
>   races already, the proxy creation may be bypassed by creating the files
>   using DEFINE_DEBUGFS_ATTRIBUTE() and debugfs_create_file_unsafe().
> 
> 
> The part that's not explained is why this file is "known to be safe
> against removal races already"?
> 
> It also seems this conversion will make the file no longer seekable,
> because DEFINE_SIMPLE_ATTRIBUTE() uses generic_file_llseek() whereas
> DEFINE_DEBUGFS_ATTRIBUTE() uses no_llseek.
> 
> That is probably fine, but should be mentioned.

Thanks for your explanation. This part indeed should be mentioned.

Chen Zhou

> 
> cheers
> 
>> Signed-off-by: Chen Zhou 
>> ---
>>  arch/powerpc/kernel/setup_64.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
>> index 6104917..4b9fbb2 100644
>> --- a/arch/powerpc/kernel/setup_64.c
>> +++ b/arch/powerpc/kernel/setup_64.c
>> @@ -956,11 +956,11 @@ static int rfi_flush_get(void *data, u64 *val)
>>  return 0;
>>  }
>>  
>> -DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, 
>> "%llu\n");
>> +DEFINE_DEBUGFS_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, 
>> "%llu\n");
>>  
>>  static __init int rfi_flush_debugfs_init(void)
>>  {
>> -debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, 
>> &fops_rfi_flush);
>> +debugfs_create_file_unsafe("rfi_flush", 0600, powerpc_debugfs_root, 
>> NULL, &fops_rfi_flush);
>>  return 0;
>>  }
>>  device_initcall(rfi_flush_debugfs_init);
>> -- 
>> 2.7.4
> 
> .
> 



Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-12-18 Thread John Hubbard

On 12/18/19 9:27 PM, Dan Williams wrote:
...

@@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page)
 page->mapping = NULL;
 page->pgmap->ops->page_free(page);
  }
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(free_devmap_managed_page);


This patch does not have a module consumer for
free_devmap_managed_page(), so the export should move to the patch
that needs the new export.


Hi Dan,

OK, I know that's a policy--although it seems quite pointless here given
that this is definitely going to need an EXPORT.

At the moment, the series doesn't use it in any module at all, so I'll just
delete the EXPORT for now.



Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL
instead of EXPORT_SYMBOL_GPL is that there was no practical way to
hide the devmap details from every module in the kernel that did
put_page(). I would expect free_devmap_managed_page() to be
EXPORT_SYMBOL_GPL if it is not inlined into an existing exported
static inline api.



Sure, I'll change it to EXPORT_SYMBOL_GPL when the time comes. We do have
to be careful that we don't shut out normal put_page() types of callers,
but...glancing through the current callers, that doesn't look to be a problem.
Good. So it should be OK to do EXPORT_SYMBOL_GPL here.

Are you *sure* you don't want to just pre-emptively EXPORT now, and save
looking at it again?

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v4 2/2] powerpc/irq: inline call_do_irq() and call_do_softirq()

2019-12-18 Thread Christophe Leroy




Le 09/12/2019 à 11:53, Michael Ellerman a écrit :

Segher Boessenkool  writes:

On Sat, Dec 07, 2019 at 10:42:28AM +0100, Christophe Leroy wrote:

Le 06/12/2019 à 21:59, Segher Boessenkool a écrit :

If the compiler can see the callee wants the same TOC as the caller has,
it does not arrange to set (and restore) it, no.  If it sees it may be
different, it does arrange for that (and the linker then will check if
it actually needs to do anything, and do that if needed).

In this case, the compiler cannot know the callee wants the same TOC,
which complicates thing a lot -- but it all works out.


Do we have a way to make sure which TOC the functions are using ? Are
there several TOCs at all in kernel code ?


Kernel modules have their own TOC, I think?


Yes.


Yes, this means that exported functions have to care about that, right ?
And that's the reason why exported assembly functions like copy_page() 
use _GLOBAL_TOC() and not _GLOBAL()


But main part of the kernel only has one TOC, so r2 can be assumed 
constant for non exported functions, can't it ?





I think things can still go wrong if any of this is inlined into a kernel
module?  Is there anything that prevents this / can this not happen for
some fundamental reason I don't see?


This can't happen can it ?
do_softirq_own_stack() is an outline function, defined in powerpc irq.c
Its only caller is do_softirq() which is an outline function defined in
kernel/softirq.c

That prevents inlining, doesn't it ?


Hopefully, sure.  Would be nice if it was clearer that this works...  It
is too much like working by chance, the way it is :-(


There's no way any of that code can end up in a module. Or at least if
there is, that's a bug.


That's my conclusion as well. So I guess we can consider r2 as constant 
over those functions.





Anyway, until we clarify all this I'll limit my patch to PPC32 which is
where the real benefit is I guess.

At the end, maybe the solution should be to switch to IRQ stack
immediately in the exception entry as x86_64 do ?


Yeah that might be cleaner.



I prepared a patch for that on PPC32, but it doesn't get rid of the IRQ 
stack switch completely because do_IRQ() is also called from other 
places like the timer interrupt.


And we will still have the switch for softirqs. We could move 
do_softirq_own_stack() to assembly and merge it with call_do_softirq(), 
but I find it cleaner to inline call_do_softirq() instead, now that we 
have demonstrated that r2 can't change.


Christophe


Re: [PATCH v11 04/25] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-12-18 Thread John Hubbard

On 12/18/19 10:52 PM, Dan Williams wrote:

On Wed, Dec 18, 2019 at 9:51 PM John Hubbard  wrote:


On 12/18/19 9:27 PM, Dan Williams wrote:
...

@@ -461,5 +449,5 @@ void __put_devmap_managed_page(struct page *page)
  page->mapping = NULL;
  page->pgmap->ops->page_free(page);
   }
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(free_devmap_managed_page);


This patch does not have a module consumer for
free_devmap_managed_page(), so the export should move to the patch
that needs the new export.


Hi Dan,

OK, I know that's a policy--although it seems quite pointless here given
that this is definitely going to need an EXPORT.

At the moment, the series doesn't use it in any module at all, so I'll just
delete the EXPORT for now.



Also the only reason that put_devmap_managed_page() is EXPORT_SYMBOL
instead of EXPORT_SYMBOL_GPL is that there was no practical way to
hide the devmap details from every module in the kernel that did
put_page(). I would expect free_devmap_managed_page() to be
EXPORT_SYMBOL_GPL if it is not inlined into an existing exported
static inline api.



Sure, I'll change it to EXPORT_SYMBOL_GPL when the time comes. We do have
to be careful that we don't shut out normal put_page() types of callers,
but...glancing through the current callers, that doesn't look to be a problem.
Good. So it should be OK to do EXPORT_SYMBOL_GPL here.

Are you *sure* you don't want to just pre-emptively EXPORT now, and save
looking at it again?


I'm positive. There is enough history for "trust me the consumer is
coming" turning out not to be true to justify the hassle in my mind. I
do trust you, but things happen.



OK, it's deleted locally. Thanks for looking at the patch. I'll post a v12 
series
that includes the change, once it looks like reviews are slowing down.


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v4 4/4] powerpc: Book3S 64-bit "heavyweight" KASAN support

2019-12-18 Thread Christophe Leroy




On 12/19/2019 12:36 AM, Daniel Axtens wrote:

KASAN support on Book3S is a bit tricky to get right:

  - It would be good to support inline instrumentation so as to be able to
catch stack issues that cannot be caught with outline mode.

  - Inline instrumentation requires a fixed offset.

  - Book3S runs code in real mode after booting. Most notably a lot of KVM
runs in real mode, and it would be good to be able to instrument it.

  - Because code runs in real mode after boot, the offset has to point to
valid memory both in and out of real mode.

 [ppc64 mm note: The kernel installs a linear mapping at effective
 address c000... onward. This is a one-to-one mapping with physical
 memory from ... onward. Because of how memory accesses work on
 powerpc 64-bit Book3S, a kernel pointer in the linear map accesses the
 same memory both with translations on (accessing as an 'effective
 address'), and with translations off (accessing as a 'real
 address'). This works in both guests and the hypervisor. For more
 details, see s5.7 of Book III of version 3 of the ISA, in particular
 the Storage Control Overview, s5.7.3, and s5.7.5 - noting that this
 KASAN implementation currently only supports Radix.]

One approach is just to give up on inline instrumentation. This way all
checks can be delayed until after everything is set up correctly, and the
address-to-shadow calculations can be overridden. However, the features and
speed boost provided by inline instrumentation are worth trying to do
better.

If _at compile time_ it is known how much contiguous physical memory a
system has, the top 1/8th of the first block of physical memory can be set
aside for the shadow. This is a big hammer and comes with 3 big
consequences:

  - there's no nice way to handle physically discontiguous memory, so only
the first physical memory block can be used.

  - kernels will simply fail to boot on machines with less memory than
specified when compiling.

  - kernels running on machines with more memory than specified when
compiling will simply ignore the extra memory.

Implement and document KASAN this way. The current implementation is Radix
only.

Despite the limitations, it can still find bugs,
e.g. http://patchwork.ozlabs.org/patch/1103775/

At the moment, this physical memory limit must be set _even for outline
mode_. This may be changed in a later series - a different implementation
could be added for outline mode that dynamically allocates shadow at a
fixed offset. For example, see https://patchwork.ozlabs.org/patch/795211/

Suggested-by: Michael Ellerman 
Cc: Balbir Singh  # ppc64 out-of-line radix version
Cc: Christophe Leroy  # ppc32 version
Signed-off-by: Daniel Axtens 

---
Changes since v3:
  - Address further feedback from Christophe.
  - Drop changes to stack walking, it looks like the issue I observed is
related to that particular stack, not stack-walking generally.

Changes since v2:

  - Address feedback from Christophe around cleanups and docs.
  - Address feedback from Balbir: at this point I don't have a good solution
for the issues you identify around the limitations of the inline 
implementation
but I think that it's worth trying to get the stack instrumentation support.
I'm happy to have an alternative and more flexible outline mode - I had
envisioned this would be called 'lightweight' mode as it imposes fewer 
restrictions.
I've linked to your implementation. I think it's best to add it in a 
follow-up series.
  - Made the default PHYS_MEM_SIZE_FOR_KASAN value 1024MB. I think most people 
have
guests with at least that much memory in the Radix 64s case so it's a much
saner default - it means that if you just turn on KASAN without reading the
docs you're much more likely to have a bootable kernel, which you will never
have if the value is set to zero! I'm happy to bikeshed the value if we 
want.

Changes since v1:
  - Landed kasan vmalloc support upstream
  - Lots of feedback from Christophe.

Changes since the rfc:

  - Boots real and virtual hardware, kvm works.

  - disabled reporting when we're checking the stack for exception
frames. The behaviour isn't wrong, just incompatible with KASAN.

  - Documentation!

  - Dropped old module stuff in favour of KASAN_VMALLOC.

The bugs with ftrace and kuap were due to kernel bloat pushing
prom_init calls to be done via the plt. Because we did not have
a relocatable kernel, and they are done very early, this caused
everything to explode. Compile with CONFIG_RELOCATABLE!
---
  Documentation/dev-tools/kasan.rst|   8 +-
  Documentation/powerpc/kasan.txt  | 112 ++-
  arch/powerpc/Kconfig |   2 +
  arch/powerpc/Kconfig.debug   |  21 
  arch/powerpc/Makefile|  11 ++
  arch/powerpc/include/asm/book3s/64/hash.h|   4 +
  

[PATCH v3] powerpc/32: add support of KASAN_VMALLOC

2019-12-18 Thread Christophe Leroy
Add support of KASAN_VMALLOC on PPC32.

To allow this, the early shadow covering the VMALLOC space needs to be
removed once the high_memory variable is set and before memblock is
freed.

And the VMALLOC area needs to be aligned such that its boundaries are
covered by a full shadow page.
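
To illustrate the alignment requirement, a small stand-alone sketch (the 4k
page size and the example ioremap_bot value are assumptions, not taken from
the kernel): one shadow page covers PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT
bytes of address space, so VMALLOC_END has to sit on that granularity.

#include <stdio.h>

#define PAGE_SIZE		 4096UL			/* assumed 4k pages */
#define KASAN_SHADOW_SCALE_SHIFT 3			/* 8 bytes per shadow byte */
#define ALIGN_DOWN(x, a)	 ((x) & ~((a) - 1))	/* a must be a power of 2 */

int main(void)
{
	unsigned long span = PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT;	/* 32k */
	unsigned long ioremap_bot = 0xff001234UL;	/* made-up example value */

	printf("one shadow page covers 0x%lx bytes\n", span);
	printf("VMALLOC_END rounded down: 0x%lx\n", ALIGN_DOWN(ioremap_bot, span));
	return 0;
}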

Signed-off-by: Christophe Leroy 

---
v3: added missing inclusion of asm/kasan.h needed when CONFIG_KASAN is not set.

v2: rebased ; exclude specific module handling when CONFIG_KASAN_VMALLOC is set.
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/include/asm/book3s/32/pgtable.h |  5 +
 arch/powerpc/include/asm/kasan.h |  2 ++
 arch/powerpc/include/asm/nohash/32/pgtable.h |  5 +
 arch/powerpc/mm/kasan/kasan_init_32.c| 33 +++-
 arch/powerpc/mm/mem.c|  4 
 6 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1ec34e16ed65..a247bbfb03d4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -173,6 +173,7 @@ config PPC
select HAVE_ARCH_HUGE_VMAP  if PPC_BOOK3S_64 && 
PPC_RADIX_MMU
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN  if PPC32
+   select HAVE_ARCH_KASAN_VMALLOC  if PPC32
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS   if COMPAT
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 0796533d37dd..5b39c11e884a 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -193,7 +193,12 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, 
pgprot_t prot);
 #else
 #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
 #endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+#define VMALLOC_END	ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+#else
 #define VMALLOC_END	ioremap_bot
+#endif
 
 #ifndef __ASSEMBLY__
 #include 
diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
index 296e51c2f066..fbff9ff9032e 100644
--- a/arch/powerpc/include/asm/kasan.h
+++ b/arch/powerpc/include/asm/kasan.h
@@ -31,9 +31,11 @@
 void kasan_early_init(void);
 void kasan_mmu_init(void);
 void kasan_init(void);
+void kasan_late_init(void);
 #else
 static inline void kasan_init(void) { }
 static inline void kasan_mmu_init(void) { }
+static inline void kasan_late_init(void) { }
 #endif
 
 #endif /* __ASSEMBLY */
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 552b96eef0c8..60c4d829152e 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -114,7 +114,12 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, 
pgprot_t prot);
 #else
 #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
 #endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+#define VMALLOC_END	ALIGN_DOWN(ioremap_bot, PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+#else
 #define VMALLOC_END	ioremap_bot
+#endif
 
 /*
  * Bits in a linux-style PTE.  These match the bits in the
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c 
b/arch/powerpc/mm/kasan/kasan_init_32.c
index 0e6ed4413eea..88036fb88350 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -129,6 +129,31 @@ static void __init kasan_remap_early_shadow_ro(void)
flush_tlb_kernel_range(KASAN_SHADOW_START, KASAN_SHADOW_END);
 }
 
+static void __init kasan_unmap_early_shadow_vmalloc(void)
+{
+   unsigned long k_start = (unsigned long)kasan_mem_to_shadow((void 
*)VMALLOC_START);
+   unsigned long k_end = (unsigned long)kasan_mem_to_shadow((void 
*)VMALLOC_END);
+   unsigned long k_cur;
+   phys_addr_t pa = __pa(kasan_early_shadow_page);
+
+   if (!early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) {
+   int ret = kasan_init_shadow_page_tables(k_start, k_end);
+
+   if (ret)
+   panic("kasan: kasan_init_shadow_page_tables() failed");
+   }
+   for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) {
+   pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), 
k_cur);
+   pte_t *ptep = pte_offset_kernel(pmd, k_cur);
+
+   if ((pte_val(*ptep) & PTE_RPN_MASK) != pa)
+   continue;
+
+   __set_pte_at(&init_mm, k_cur, ptep, __pte(0), 0);
+   }
+   flush_tlb_kernel_range(k_start, k_end);
+}
+
 void __init kasan_mmu_init(void)
 {
int ret;
@@ -165,7 +190,13 @@ void __init kasan_init(void)
pr_info("KASAN init done\n");
 }
 
-#ifdef CONFIG_MODULES
+void __init kasan_late_init(void)
+{
+   if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+   kasan_unmap_early_shadow_vmalloc();
+}
+
+#if defined(CONFIG_MODULES) &&