[PATCH v2 4/4] powerpc/selftests: Add selftest to test concurrent perf/ptrace events

2021-04-06 Thread Ravi Bangoria
ptrace and perf watchpoints can't co-exist if their address ranges
overlap. See commit 29da4f91c0c1 ("powerpc/watchpoint: Don't allow
concurrent perf and ptrace events") for more detail. Add a selftest
for this.

Sample o/p:
  # ./ptrace-perf-hwbreak
  test: ptrace-perf-hwbreak
  tags: git_version:powerpc-5.8-7-118-g937fa174a15d-dirty
  perf cpu event -> ptrace thread event (Overlapping): Ok
  perf cpu event -> ptrace thread event (Non-overlapping): Ok
  perf thread event -> ptrace same thread event (Overlapping): Ok
  perf thread event -> ptrace same thread event (Non-overlapping): Ok
  perf thread event -> ptrace other thread event: Ok
  ptrace thread event -> perf kernel event: Ok
  ptrace thread event -> perf same thread event (Overlapping): Ok
  ptrace thread event -> perf same thread event (Non-overlapping): Ok
  ptrace thread event -> perf other thread event: Ok
  ptrace thread event -> perf cpu event (Overlapping): Ok
  ptrace thread event -> perf cpu event (Non-overlapping): Ok
  ptrace thread event -> perf same thread & cpu event (Overlapping): Ok
  ptrace thread event -> perf same thread & cpu event (Non-overlapping): Ok
  ptrace thread event -> perf other thread & cpu event: Ok
  success: ptrace-perf-hwbreak
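
For reference, below is a minimal, hand-written sketch (not part of the
patch) of the kind of perf watchpoint the test opens against an address
that is already covered by a ptrace watchpoint; per the commit referenced
above, such an open is expected to fail when the ranges overlap (the
exact errno is kernel-specific). The helper name and header list are
illustrative only.

/* Sketch only: open a perf read watchpoint on 'addr' for thread 'pid'. */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

static int open_perf_read_wp(pid_t pid, void *addr, __u64 len)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_R;
	attr.bp_addr = (__u64)(unsigned long)addr;
	attr.bp_len = len;
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	/* a negative return means the watchpoint could not be reserved */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}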

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/.gitignore   |   1 +
 .../testing/selftests/powerpc/ptrace/Makefile |   2 +-
 .../powerpc/ptrace/ptrace-perf-hwbreak.c  | 659 ++
 3 files changed, 661 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c

diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore
index 0e96150b7c7e..eb75e5360e31 100644
--- a/tools/testing/selftests/powerpc/ptrace/.gitignore
+++ b/tools/testing/selftests/powerpc/ptrace/.gitignore
@@ -14,3 +14,4 @@ perf-hwbreak
 core-pkey
 ptrace-pkey
 ptrace-syscall
+ptrace-perf-hwbreak
diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile
index 8d3f006c98cc..a500639da97a 100644
--- a/tools/testing/selftests/powerpc/ptrace/Makefile
+++ b/tools/testing/selftests/powerpc/ptrace/Makefile
@@ -2,7 +2,7 @@
 TEST_GEN_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \
   ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx ptrace-tm-vsx \
   ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \
-  perf-hwbreak ptrace-syscall
+  perf-hwbreak ptrace-syscall ptrace-perf-hwbreak
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c
new file mode 100644
index ..6b8804a4942e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c
@@ -0,0 +1,659 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "ptrace.h"
+
+char data[16];
+
+/* Overlapping address range */
+volatile __u64 *ptrace_data1 = (__u64 *)&data[0];
+volatile __u64 *perf_data1 = (__u64 *)&data[4];
+
+/* Non-overlapping address range */
+volatile __u64 *ptrace_data2 = (__u64 *)&data[0];
+volatile __u64 *perf_data2 = (__u64 *)&data[8];
+
+static unsigned long pid_max_addr(void)
+{
+   FILE *fp;
+   char *line, *c;
+   char addr[100];
+   size_t len = 0;
+
+   fp = fopen("/proc/kallsyms", "r");
+   if (!fp) {
+   printf("Failed to read /proc/kallsyms. Exiting..\n");
+   exit(EXIT_FAILURE);
+   }
+
+   while (getline(&line, &len, fp) != -1) {
+   if (!strstr(line, "pid_max") || strstr(line, "pid_max_max") ||
+   strstr(line, "pid_max_min"))
+   continue;
+
+   strncpy(addr, line, len < 100 ? len : 100);
+   c = strchr(addr, ' ');
+   *c = '\0';
+   return strtoul(addr, &c, 16);
+   }
+   fclose(fp);
+   printf("Could not find pix_max. Exiting..\n");
+   exit(EXIT_FAILURE);
+   return -1;
+}
+
+static void perf_user_event_attr_set(struct perf_event_attr *attr, __u64 addr, __u64 len)
+{
+   memset(attr, 0, sizeof(struct perf_event_attr));
+   attr->type   = PERF_TYPE_BREAKPOINT;
+   attr->size   = sizeof(struct perf_event_attr);
+   attr->bp_type= HW_BREAKPOINT_R;
+   attr->bp_addr= addr;
+   attr->bp_len = len;
+   attr->exclude_kernel = 1;
+   attr->exclude_hv = 1;
+}
+
+static void perf_kernel_event_attr_set(struct perf_event_attr *attr)
+{
+   memset(attr, 0, sizeof(struct perf_event_attr));
+   attr->type   = PERF_TYPE_BREAKPOINT;
+   attr->size   = sizeof(struct perf_event_attr);
+   attr->bp_type= HW_BREAKPOINT_R;
+   attr->bp_addr= 

[PATCH v2 3/4] powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR

2021-04-06 Thread Ravi Bangoria
Extend the perf-hwbreak.c selftest to test multiple DAWRs. Also add a
testcase to verify that the 512-byte boundary limitation has been removed.

Sample o/p:
  # ./perf-hwbreak
  ...
  TESTED: Process specific, Two events, diff addr
  TESTED: Process specific, Two events, same addr
  TESTED: Process specific, Two events, diff addr, one is RO, other is WO
  TESTED: Process specific, Two events, same addr, one is RO, other is WO
  TESTED: Systemwide, Two events, diff addr
  TESTED: Systemwide, Two events, same addr
  TESTED: Systemwide, Two events, diff addr, one is RO, other is WO
  TESTED: Systemwide, Two events, same addr, one is RO, other is WO
  TESTED: Process specific, 512 bytes, unaligned
  success: perf_hwbreak
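
As context for the last testcase above, here is a rough illustrative
sketch (not taken from the patch) of what exercising the removed
512-byte boundary looks like: a single watchpoint whose range crosses a
512-byte boundary, which older kernels rejected. The buffer mirrors the
'c' array added by the patch; the helper name is made up.

/* Sketch only: request a 512-byte watchpoint that crosses a 512-byte
 * boundary.  Kernels that still enforce the old DAWR boundary reject
 * this open; with the boundary removed it should succeed.
 */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile char buf[512 + 8] __attribute__((aligned(512)));

static int open_unaligned_512_wp(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_RW;
	/* start 8 bytes in so the 512-byte range straddles buf[512] */
	attr.bp_addr = (__u64)(unsigned long)&buf[8];
	attr.bp_len = 512;
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
}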

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 568 +-
 1 file changed, 567 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index bde475341c8a..1dafba42c23d 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -21,8 +21,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -34,6 +39,12 @@
 
 #define DAWR_LENGTH_MAX ((0x3f + 1) * 8)
 
+int nprocs;
+
+static volatile int a = 10;
+static volatile int b = 10;
+static volatile char c[512 + 8] __attribute__((aligned(512)));
+
 static void perf_event_attr_set(struct perf_event_attr *attr,
__u32 type, __u64 addr, __u64 len,
bool exclude_user)
@@ -68,6 +79,76 @@ static int perf_process_event_open(__u32 type, __u64 addr, __u64 len)
   return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
 }
 
+static int perf_cpu_event_open(long cpu, __u32 type, __u64 addr, __u64 len)
+{
+   struct perf_event_attr attr;
+
+   perf_event_attr_set(&attr, type, addr, len, 0);
+   return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
+}
+
+static void close_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   close(fd[i]);
+}
+
+static unsigned long read_fds(int *fd, int n)
+{
+   int i;
+   unsigned long c = 0;
+   unsigned long count = 0;
+   size_t res;
+
+   for (i = 0; i < n; i++) {
+   res = read(fd[i], &c, sizeof(c));
+   assert(res == sizeof(unsigned long long));
+   count += c;
+   }
+   return count;
+}
+
+static void reset_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   ioctl(fd[i], PERF_EVENT_IOC_RESET);
+}
+
+static void enable_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   ioctl(fd[i], PERF_EVENT_IOC_ENABLE);
+}
+
+static void disable_fds(int *fd, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++)
+   ioctl(fd[i], PERF_EVENT_IOC_DISABLE);
+}
+
+static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len)
+{
+   int i = 0;
+
+   /* Assume online processors are 0 to nprocs for simplicity */
+   for (i = 0; i < nprocs; i++) {
+   fd[i] = perf_cpu_event_open(i, type, addr, len);
+   if (fd[i] < 0) {
+   close_fds(fd, i);
+   return fd[i];
+   }
+   }
+   return 0;
+}
+
 static inline bool breakpoint_test(int len)
 {
int fd;
@@ -261,11 +342,483 @@ static int runtest_dar_outside(void)
return fail;
 }
 
+static void multi_dawr_workload(void)
+{
+   a += 10;
+   b += 10;
+   c[512 + 1] += 'a';
+}
+
+static int test_process_multi_diff_addr(void)
+{
+   unsigned long long breaks1 = 0, breaks2 = 0;
+   int fd1, fd2;
+   char *desc = "Process specific, Two events, diff addr";
+   size_t res;
+
+   fd1 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a));
+   if (fd1 < 0) {
+   perror("perf_process_event_open");
+   exit(EXIT_FAILURE);
+   }
+
+   fd2 = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)&b, (__u64)sizeof(b));
+   if (fd2 < 0) {
+   close(fd1);
+   perror("perf_process_event_open");
+   exit(EXIT_FAILURE);
+   }
+
+   ioctl(fd1, PERF_EVENT_IOC_RESET);
+   ioctl(fd2, PERF_EVENT_IOC_RESET);
+   ioctl(fd1, PERF_EVENT_IOC_ENABLE);
+   ioctl(fd2, PERF_EVENT_IOC_ENABLE);
+   multi_dawr_workload();
+   ioctl(fd1, PERF_EVENT_IOC_DISABLE);
+   ioctl(fd2, PERF_EVENT_IOC_DISABLE);
+
+   res = read(fd1, &breaks1, sizeof(breaks1));
+   assert(res == sizeof(unsigned long long));
+   res = read(fd2, &breaks2, sizeof(breaks2));
+   assert(res == sizeof(unsigned long long));
+
+   close(fd1);
+   close(fd2);
+
+   if (breaks1 != 2 || breaks2 != 2) {
+   printf("FAILED: %s: %lld != 2 || %lld 

[PATCH v2 2/4] powerpc/selftests/perf-hwbreak: Coalesce event creation code

2021-04-06 Thread Ravi Bangoria
The perf-hwbreak selftest opens a hw-breakpoint event at multiple places,
repeating the same code each time. Coalesce that code into helper functions.

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 78 +--
 1 file changed, 38 insertions(+), 40 deletions(-)

diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index c1f324afdbf3..bde475341c8a 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -34,28 +34,46 @@
 
 #define DAWR_LENGTH_MAX ((0x3f + 1) * 8)
 
-static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
- int cpu, int group_fd,
- unsigned long flags)
+static void perf_event_attr_set(struct perf_event_attr *attr,
+   __u32 type, __u64 addr, __u64 len,
+   bool exclude_user)
 {
-   attr->size = sizeof(*attr);
-   return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+   memset(attr, 0, sizeof(struct perf_event_attr));
+   attr->type   = PERF_TYPE_BREAKPOINT;
+   attr->size   = sizeof(struct perf_event_attr);
+   attr->bp_type= type;
+   attr->bp_addr= addr;
+   attr->bp_len = len;
+   attr->exclude_kernel = 1;
+   attr->exclude_hv = 1;
+   attr->exclude_guest  = 1;
+   attr->exclude_user   = exclude_user;
+   attr->disabled   = 1;
 }
 
-static inline bool breakpoint_test(int len)
+static int
+perf_process_event_open_exclude_user(__u32 type, __u64 addr, __u64 len, bool exclude_user)
 {
struct perf_event_attr attr;
+
+   perf_event_attr_set(&attr, type, addr, len, exclude_user);
+   return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+}
+
+static int perf_process_event_open(__u32 type, __u64 addr, __u64 len)
+{
+   struct perf_event_attr attr;
+
+   perf_event_attr_set(&attr, type, addr, len, 0);
+   return syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+}
+
+static inline bool breakpoint_test(int len)
+{
int fd;
 
-   /* setup counters */
-   memset(&attr, 0, sizeof(attr));
-   attr.disabled = 1;
-   attr.type = PERF_TYPE_BREAKPOINT;
-   attr.bp_type = HW_BREAKPOINT_R;
/* bp_addr can point anywhere but needs to be aligned */
-   attr.bp_addr = (__u64)() & 0xf800;
-   attr.bp_len = len;
-   fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+   fd = perf_process_event_open(HW_BREAKPOINT_R, (__u64)() & 
0xf800, len);
if (fd < 0)
return false;
close(fd);
@@ -75,7 +93,6 @@ static inline bool dawr_supported(void)
 static int runtestsingle(int readwriteflag, int exclude_user, int arraytest)
 {
int i,j;
-   struct perf_event_attr attr;
size_t res;
unsigned long long breaks, needed;
int readint;
@@ -94,19 +111,11 @@ static int runtestsingle(int readwriteflag, int exclude_user, int arraytest)
if (arraytest)
ptr = [0];
 
-   /* setup counters */
-   memset(&attr, 0, sizeof(attr));
-   attr.disabled = 1;
-   attr.type = PERF_TYPE_BREAKPOINT;
-   attr.bp_type = readwriteflag;
-   attr.bp_addr = (__u64)ptr;
-   attr.bp_len = sizeof(int);
-   if (arraytest)
-   attr.bp_len = DAWR_LENGTH_MAX;
-   attr.exclude_user = exclude_user;
-   break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+   break_fd = perf_process_event_open_exclude_user(readwriteflag, (__u64)ptr,
+   arraytest ? DAWR_LENGTH_MAX : sizeof(int),
+   exclude_user);
if (break_fd < 0) {
-   perror("sys_perf_event_open");
+   perror("perf_process_event_open_exclude_user");
exit(1);
}
 
@@ -153,7 +162,6 @@ static int runtest_dar_outside(void)
void *target;
volatile __u16 temp16;
volatile __u64 temp64;
-   struct perf_event_attr attr;
int break_fd;
unsigned long long breaks;
int fail = 0;
@@ -165,21 +173,11 @@ static int runtest_dar_outside(void)
exit(EXIT_FAILURE);
}
 
-   /* setup counters */
-   memset(&attr, 0, sizeof(attr));
-   attr.disabled = 1;
-   attr.type = PERF_TYPE_BREAKPOINT;
-   attr.exclude_kernel = 1;
-   attr.exclude_hv = 1;
-   attr.exclude_guest = 1;
-   attr.bp_type = HW_BREAKPOINT_RW;
/* watch middle half of target array */
-   attr.bp_addr = (__u64)(target + 2);
-   attr.bp_len = 4;
-   break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+   break_fd = perf_process_event_open(HW_BREAKPOINT_RW, (__u64)(target + 2), 4);
if (break_fd < 0) {
free(target);
-   perror("sys_perf_event_open");
+ 

[PATCH v2 1/4] powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR

2021-04-06 Thread Ravi Bangoria
Add selftests to test multiple active DAWRs with the ptrace interface.

Sample o/p:
  $ ./ptrace-hwbreak
  ...
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO, len: 6: Ok
  PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO, len: 6: Ok
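
For readers unfamiliar with the interface, the sketch below (not from
the patch) shows roughly how a single MODE_RANGE watchpoint is requested
through PPC_PTRACE_SETHWDEBUG; the selftest's get_ppc_hw_breakpoint()
and ptrace_sethwdebug() helpers wrap essentially this. The exact field
usage here is an assumption based on the ppc_hw_breakpoint uapi.

/* Sketch only: ask ptrace for a write watchpoint on [addr, addr + len).
 * Returns the debug-register handle (>= 1) or a negative value on error.
 */
#include <asm/ptrace.h>
#include <sys/ptrace.h>
#include <sys/types.h>

static long set_range_watchpoint(pid_t child, unsigned long addr, unsigned long len)
{
	struct ppc_hw_breakpoint info = {
		.version		= PPC_DEBUG_CURRENT_VERSION,
		.trigger_type		= PPC_BREAKPOINT_TRIGGER_WRITE,
		.addr_mode		= PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE,
		.condition_mode		= PPC_BREAKPOINT_CONDITION_NONE,
		.addr			= addr,
		.addr2			= addr + len,
		.condition_value	= 0,
	};

	return ptrace(PPC_PTRACE_SETHWDEBUG, child, 0, &info);
}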

Signed-off-by: Ravi Bangoria 
---
 .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 79 +++
 1 file changed, 79 insertions(+)

diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
index 2e0d86e0687e..a0635a3819aa 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
@@ -194,6 +194,18 @@ static void test_workload(void)
big_var[rand() % DAWR_MAX_LEN] = 'a';
else
cvar = big_var[rand() % DAWR_MAX_LEN];
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */
+   gstruct.a[rand() % A_LEN] = 'a';
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */
+   cvar = gstruct.b[rand() % B_LEN];
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */
+   gstruct.a[rand() % A_LEN] = 'a';
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */
+   cvar = gstruct.a[rand() % A_LEN];
 }
 
 static void check_success(pid_t child_pid, const char *name, const char *type,
@@ -417,6 +429,69 @@ static void test_sethwdebug_range_aligned(pid_t child_pid)
ptrace_delhwdebug(child_pid, wh);
 }
 
+static void test_multi_sethwdebug_range(pid_t child_pid)
+{
+   struct ppc_hw_breakpoint info1, info2;
+   unsigned long wp_addr1, wp_addr2;
+   char *name1 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED";
+   char *name2 = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED";
+   int len1, len2;
+   int wh1, wh2;
+
+   wp_addr1 = (unsigned long)&gstruct.a;
+   wp_addr2 = (unsigned long)&gstruct.b;
+   len1 = A_LEN;
+   len2 = B_LEN;
+   get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1);
+   get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW ALIGNED, WO test */
+   wh1 = ptrace_sethwdebug(child_pid, &info1);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DW UNALIGNED, RO test */
+   wh2 = ptrace_sethwdebug(child_pid, &info2);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name1, "WO", wp_addr1, len1);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name2, "RO", wp_addr2, len2);
+
+   ptrace_delhwdebug(child_pid, wh1);
+   ptrace_delhwdebug(child_pid, wh2);
+}
+
+static void test_multi_sethwdebug_range_dawr_overlap(pid_t child_pid)
+{
+   struct ppc_hw_breakpoint info1, info2;
+   unsigned long wp_addr1, wp_addr2;
+   char *name = "PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap";
+   int len1, len2;
+   int wh1, wh2;
+
+   wp_addr1 = (unsigned long)&gstruct.a;
+   wp_addr2 = (unsigned long)&gstruct.a;
+   len1 = A_LEN;
+   len2 = A_LEN;
+   get_ppc_hw_breakpoint(&info1, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr1, len1);
+   get_ppc_hw_breakpoint(&info2, PPC_BREAKPOINT_TRIGGER_READ, wp_addr2, len2);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, WO test */
+   wh1 = ptrace_sethwdebug(child_pid, &info1);
+
+   /* PPC_PTRACE_SETHWDEBUG 2, MODE_RANGE, DAWR Overlap, RO test */
+   wh2 = ptrace_sethwdebug(child_pid, &info2);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name, "WO", wp_addr1, len1);
+
+   ptrace(PTRACE_CONT, child_pid, NULL, 0);
+   check_success(child_pid, name, "RO", wp_addr2, len2);
+
+   ptrace_delhwdebug(child_pid, wh1);
+   ptrace_delhwdebug(child_pid, wh2);
+}
+
 static void test_sethwdebug_range_unaligned(pid_t child_pid)
 {
struct ppc_hw_breakpoint info;
@@ -504,6 +579,10 @@ run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr)
test_sethwdebug_range_unaligned(child_pid);
test_sethwdebug_range_unaligned_dar(child_pid);
test_sethwdebug_dawr_max_range(child_pid);
+   if (dbginfo->num_data_bps > 1) {
+   test_multi_sethwdebug_range(child_pid);
+   test_multi_sethwdebug_range_dawr_overlap(child_pid);
+   }
}
}
 }
-- 
2.27.0



[PATCH v2 0/4] powerpc/selftests: Add Power10 2nd DAWR selftests

2021-04-06 Thread Ravi Bangoria
Add selftests for 2nd DAWR supported by Power10.

v1: 
https://lore.kernel.org/r/20200723102058.312282-1-ravi.bango...@linux.ibm.com
v1->v2:
 - Kvm patches are already upstream
 - Rebased selftests to powerpc/next

Ravi Bangoria (4):
  powerpc/selftests/ptrace-hwbreak: Add testcases for 2nd DAWR
  powerpc/selftests/perf-hwbreak: Coalesce event creation code
  powerpc/selftests/perf-hwbreak: Add testcases for 2nd DAWR
  powerpc/selftests: Add selftest to test concurrent perf/ptrace events

 .../selftests/powerpc/ptrace/.gitignore   |   1 +
 .../testing/selftests/powerpc/ptrace/Makefile |   2 +-
 .../selftests/powerpc/ptrace/perf-hwbreak.c   | 646 +++--
 .../selftests/powerpc/ptrace/ptrace-hwbreak.c |  79 +++
 .../powerpc/ptrace/ptrace-perf-hwbreak.c  | 659 ++
 5 files changed, 1345 insertions(+), 42 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-perf-hwbreak.c

-- 
2.27.0



[PATCH 16/20] kbuild: powerpc: use common install script

2021-04-06 Thread Greg Kroah-Hartman
The common scripts/install.sh script will now work for powerpc; all that
is needed is to add it to the list of arches that do not put the version
number in the installed file name.

After the kernel is installed, powerpc also likes to install a few
random files, so provide the ability to do that as well.

With that we can remove the powerpc-only version of the install script.

Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Greg Kroah-Hartman 
---
 arch/powerpc/boot/Makefile   |  4 +--
 arch/powerpc/boot/install.sh | 55 
 scripts/install.sh   | 14 -
 3 files changed, 15 insertions(+), 58 deletions(-)
 delete mode 100644 arch/powerpc/boot/install.sh

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 2b8da923ceca..bbfcbd33e0b7 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -442,11 +442,11 @@ $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y))
 
 # Only install the vmlinux
 install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y))
-   sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)"
+   sh -x $(srctree)/scripts/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)"
 
 # Install the vmlinux and other built boot targets.
 zInstall: $(CONFIGURE) $(addprefix $(obj)/, $(image-y))
-   sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" $^
+   sh -x $(srctree)/scripts/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" $^
 
 PHONY += install zInstall
 
diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh
deleted file mode 100644
index b6a256bc96ee..
--- a/arch/powerpc/boot/install.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/sh
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 1995 by Linus Torvalds
-#
-# Blatantly stolen from in arch/i386/boot/install.sh by Dave Hansen 
-#
-# "make install" script for ppc64 architecture
-#
-# Arguments:
-#   $1 - kernel version
-#   $2 - kernel image file
-#   $3 - kernel map file
-#   $4 - default install path (blank if root directory)
-#   $5 and more - kernel boot files; zImage*, uImage, cuImage.*, etc.
-#
-
-# Bail with error code if anything goes wrong
-set -e
-
-# User may have a custom install script
-
-if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi
-if [ -x /sbin/${INSTALLKERNEL} ]; then exec /sbin/${INSTALLKERNEL} "$@"; fi
-
-# Default install
-
-# this should work for both the pSeries zImage and the iSeries vmlinux.sm
-image_name=`basename $2`
-
-if [ -f $4/$image_name ]; then
-   mv $4/$image_name $4/$image_name.old
-fi
-
-if [ -f $4/System.map ]; then
-   mv $4/System.map $4/System.old
-fi
-
-cat $2 > $4/$image_name
-cp $3 $4/System.map
-
-# Copy all the bootable image files
-path=$4
-shift 4
-while [ $# -ne 0 ]; do
-   image_name=`basename $1`
-   if [ -f $path/$image_name ]; then
-   mv $path/$image_name $path/$image_name.old
-   fi
-   cat $1 > $path/$image_name
-   shift
-done;
diff --git a/scripts/install.sh b/scripts/install.sh
index e0ffb95737d4..67c0a5f74af2 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -67,7 +67,7 @@ fi
 # Some architectures name their files based on version number, and
 # others do not.  Call out the ones that do not to make it obvious.
 case "${ARCH}" in
-   ia64 | m68k | nios2 | x86)
+   ia64 | m68k | nios2 | powerpc | x86)
version=""
;;
*)
@@ -93,6 +93,18 @@ case "${ARCH}" in
/usr/sbin/elilo
fi
;;
+   powerpc)
+   # powerpc installation can list other boot targets after the
+   # install path that should be copied to the correct location
+   path=$4
+   shift 4
+   while [ $# -ne 0 ]; do
+   image_name=$(basename "$1")
+   install "$1" "$path"/"$image_name"
+   shift
+   done;
+   sync
+   ;;
x86)
if [ -x /sbin/lilo ]; then
/sbin/lilo
-- 
2.31.1



[PATCH] powerpc/mce: save ignore_event flag unconditionally for UE

2021-04-06 Thread Ganesh Goudar
When we hit a UE while using the machine check safe copy routines, the
ignore_event flag is set and the event is ignored by the MCE handler.
The flag is also saved for deferred handling and printing of MCE event
information. But as of now, the flag is only saved after checking
whether the effective address is provided or the physical address is
calculated, which is not right.

Save the ignore_event flag regardless of whether the effective address
is provided or the physical address is calculated.

Without this change, the following log is seen when the event is to be
ignored.

[  512.971365] MCE: CPU1: machine check (Severe)  UE Load/Store [Recovered]
[  512.971509] MCE: CPU1: NIP: [c00b67c0] memcpy+0x40/0x90
[  512.971655] MCE: CPU1: Initiator CPU
[  512.971739] MCE: CPU1: Unknown
[  512.972209] MCE: CPU1: machine check (Severe)  UE Load/Store [Recovered]
[  512.972334] MCE: CPU1: NIP: [c00b6808] memcpy+0x88/0x90
[  512.972456] MCE: CPU1: Initiator CPU
[  512.972534] MCE: CPU1: Unknown

Signed-off-by: Ganesh Goudar 
---
 arch/powerpc/kernel/mce.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 11f0cae086ed..db9363e131ce 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -131,6 +131,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
 * Populate the mce error_type and type-specific error_type.
 */
mce_set_error_info(mce, mce_err);
+   if (mce->error_type == MCE_ERROR_TYPE_UE)
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
 
if (!addr)
return;
@@ -159,7 +161,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   mce->u.ue_error.ignore_event = mce_err->ignore_event;
machine_check_ue_event(mce);
}
}
-- 
2.26.2



Re: [PATCH net-next v3 2/2] of: net: fix of_get_mac_addr_nvmem() for PCI and DSA nodes

2021-04-06 Thread Florian Fainelli



On 4/6/2021 3:09 PM, Michael Walle wrote:
> of_get_mac_address() already supports fetching the MAC address by an
> nvmem provider. But until now, it was just working for platform devices.
> Esp. it was not working for DSA ports and PCI devices. It gets more
> common that PCI devices have a device tree binding since SoCs contain
> integrated root complexes.
> 
> Use the nvmem of_* binding to fetch the nvmem cells by a struct
> device_node. We still have to try to read the cell by device first
> because there might be a nvmem_cell_lookup associated with that device.
> 
> Signed-off-by: Michael Walle 
> ---
> Please note, that I've kept the nvmem_get_mac_address() which operates
> on a device. The new of_get_mac_addr_nvmem() is almost identical and
> there are no users of the former function right now, but it seems to be
> the "newer" version to get the MAC address for a "struct device". Thus
> I've kept it. Please advise, if I should kill it though.

Nit: if you need to resubmit you could rephrase the subject such that
the limitation of of_get_mac_addr_nvmem() is lifted to include all kinds
of devices, and no longer just platform_device instances as before.
-- 
Florian


Re: [PATCH 2/2] of: net: fix of_get_mac_addr_nvmem() for PCI and DSA nodes

2021-04-06 Thread Rob Herring
On Mon, Apr 5, 2021 at 11:47 AM Michael Walle  wrote:
>
> of_get_mac_address() already supports fetching the MAC address by an
> nvmem provider. But until now, it was just working for platform devices.
> Esp. it was not working for DSA ports and PCI devices. It gets more
> common that PCI devices have a device tree binding since SoCs contain
> integrated root complexes.
>
> Use the nvmem of_* binding to fetch the nvmem cells by a struct
> device_node. We still have to try to read the cell by device first
> because there might be a nvmem_cell_lookup associated with that device.
>
> Signed-off-by: Michael Walle 
> ---
> Please note, that I've kept the nvmem_get_mac_address() which operates
> on a device. The new of_get_mac_addr_nvmem() is almost identical and
> there are no users of the former function right now, but it seems to be
> the "newer" version to get the MAC address for a "struct device". Thus
> I've kept it. Please advise, if I should kill it though.

It seems kind of backwards from how we normally design this type of
API where the API with a struct device will call a firmware specific
version if there's a firmware handle. But certainly, I don't think we
should be operating on platform device if we can help it.

>  drivers/of/of_net.c | 37 +++--
>  1 file changed, 31 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c
> index 2344ad7fff5e..2323c6063eaf 100644
> --- a/drivers/of/of_net.c
> +++ b/drivers/of/of_net.c
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  /**
>   * of_get_phy_mode - Get phy mode for given device_node
> @@ -56,18 +57,42 @@ static int of_get_mac_addr(struct device_node *np, const 
> char *name, u8 *addr)
> return -ENODEV;
>  }
>
> -static int of_get_mac_addr_nvmem(struct device_node *np, u8 addr)
> +static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
>  {
> struct platform_device *pdev = of_find_device_by_node(np);
> +   struct nvmem_cell *cell;
> +   const void *mac;
> +   size_t len;
> int ret;
>
> -   if (!pdev)
> -   return -ENODEV;
> +   /* Try lookup by device first, there might be a nvmem_cell_lookup
> +* associated with a given device.
> +*/
> +   if (pdev) {
> +   ret = nvmem_get_mac_address(>dev, addr);
> +   put_device(>dev);
> +   return ret;
> +   }
> +
> +   cell = of_nvmem_cell_get(np, "mac-address");
> +   if (IS_ERR(cell))
> +   return PTR_ERR(cell);
> +
> +   mac = nvmem_cell_read(cell, );
> +   nvmem_cell_put(cell);
> +
> +   if (IS_ERR(mac))
> +   return PTR_ERR(mac);
> +
> +   if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
> +   kfree(mac);
> +   return -EINVAL;
> +   }
>
> -   ret = nvmem_get_mac_address(>dev, addr);
> -   put_device(>dev);
> +   ether_addr_copy(addr, mac);
> +   kfree(mac);
>
> -   return ret;
> +   return 0;
>  }
>
>  /**
> --
> 2.20.1
>


RE: [PATCH v6] soc: fsl: enable acpi support in RCPM driver

2021-04-06 Thread Leo Li


> -Original Message-
> From: Ran Wang 
> Sent: Tuesday, April 6, 2021 8:32 PM
> To: Leo Li 
> Cc: Christophe Leroy ; linuxppc-dev
> ; moderated list:ARM/FREESCALE IMX / MXC
> ARM ARCHITECTURE ; lkml  ker...@vger.kernel.org>
> Subject: RE: [PATCH v6] soc: fsl: enable acpi support in RCPM driver
> 
> Hi Leo,
> 
> On Wednesday, April 7, 2021 5:45 AM, Li Yang wrote:
> >
> > On Fri, Mar 12, 2021 at 2:56 AM Ran Wang  wrote:
> > >
> > > From: Peng Ma 
> > >
> > > This patch enables ACPI support in RCPM driver.
> > >
> > > Signed-off-by: Peng Ma 
> > > Signed-off-by: Ran Wang 
> > > ---
> > > Change in v6:
> > >  - Remove copyright udpate to rebase on latest mainline
> > >
> > > Change in v5:
> > >  - Fix panic when dev->of_node is null
> > >
> > > Change in v4:
> > >  - Make commit subject more accurate
> > >  - Remove unrelated new blank line
> > >
> > > Change in v3:
> > >  - Add #ifdef CONFIG_ACPI for acpi_device_id
> > >  - Rename rcpm_acpi_imx_ids to rcpm_acpi_ids
> > >
> > > Change in v2:
> > >  - Update acpi_device_id to fix conflict with other driver
> > >
> > >  drivers/soc/fsl/rcpm.c | 18 --
> > >  1 file changed, 16 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c index
> > > 4ace28cab314..7aa997b932d1 100644
> > > --- a/drivers/soc/fsl/rcpm.c
> > > +++ b/drivers/soc/fsl/rcpm.c
> > > @@ -13,6 +13,7 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #define RCPM_WAKEUP_CELL_MAX_SIZE  7
> > >
> > > @@ -78,10 +79,14 @@ static int rcpm_pm_prepare(struct device *dev)
> > > "fsl,rcpm-wakeup", value,
> > > rcpm->wakeup_cells + 1);
> > >
> > > -   /*  Wakeup source should refer to current rcpm device */
> > > -   if (ret || (np->phandle != value[0]))
> > > +   if (ret)
> > > continue;
> > >
> > > +   if (is_of_node(dev->fwnode))
> > > +   /*  Should refer to current rcpm device */

Better to be /* Only handle devices with fsl,rcpm-wakeup pointing to the current rcpm node */
> > > +   if (np->phandle != value[0])
> > > +   continue;
> >
> > It looks like that we assume that in the ACPI scenario there will only
> > be one RCPM controller and all devices are controlled by this single
> > PM controller.  This probably is true for all existing SoCs with a RCPM.  
> > But
> since the driver tried to support multiple RCPMs, maybe we should continue
> to support multiple RCPM controllers or at least mention that in the
> comment.
> 
> How about adding some comment as below:
> 
> /* For ACPI mode, currently we assume there is only one RCPM controller
> existing */

Ok.  On the other hand, it will be clearer to update the existing comment above.

> 
> Regards,
> Ran
> 
> >
> > > +
> > > /* Property "#fsl,rcpm-wakeup-cells" of rcpm node defines 
> > > the
> > >  * number of IPPDEXPCR register cells, and 
> > > "fsl,rcpm-wakeup"
> > >  * of wakeup source IP contains an integer array:
> > >  > > rcpm_of_match[] = {  };  MODULE_DEVICE_TABLE(of, rcpm_of_match);
> > >
> > > +#ifdef CONFIG_ACPI
> > > +static const struct acpi_device_id rcpm_acpi_ids[] = {
> > > +   {"NXP0015",},
> > > +   { }
> > > +};
> > > +MODULE_DEVICE_TABLE(acpi, rcpm_acpi_ids); #endif
> > > +
> > >  static struct platform_driver rcpm_driver = {
> > > .driver = {
> > > .name = "rcpm",
> > > .of_match_table = rcpm_of_match,
> > > +   .acpi_match_table = ACPI_PTR(rcpm_acpi_ids),
> > > .pm = &rcpm_pm_ops,
> > > },
> > > .probe = rcpm_probe,
> > > --
> > > 2.25.1
> > >


RE: [PATCH v6] soc: fsl: enable acpi support in RCPM driver

2021-04-06 Thread Ran Wang
Hi Leo,

On Wednesday, April 7, 2021 5:45 AM, Li Yang wrote:
> 
> On Fri, Mar 12, 2021 at 2:56 AM Ran Wang  wrote:
> >
> > From: Peng Ma 
> >
> > This patch enables ACPI support in RCPM driver.
> >
> > Signed-off-by: Peng Ma 
> > Signed-off-by: Ran Wang 
> > ---
> > Change in v6:
> >  - Remove copyright udpate to rebase on latest mainline
> >
> > Change in v5:
> >  - Fix panic when dev->of_node is null
> >
> > Change in v4:
> >  - Make commit subject more accurate
> >  - Remove unrelated new blank line
> >
> > Change in v3:
> >  - Add #ifdef CONFIG_ACPI for acpi_device_id
> >  - Rename rcpm_acpi_imx_ids to rcpm_acpi_ids
> >
> > Change in v2:
> >  - Update acpi_device_id to fix conflict with other driver
> >
> >  drivers/soc/fsl/rcpm.c | 18 --
> >  1 file changed, 16 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c index
> > 4ace28cab314..7aa997b932d1 100644
> > --- a/drivers/soc/fsl/rcpm.c
> > +++ b/drivers/soc/fsl/rcpm.c
> > @@ -13,6 +13,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  #define RCPM_WAKEUP_CELL_MAX_SIZE  7
> >
> > @@ -78,10 +79,14 @@ static int rcpm_pm_prepare(struct device *dev)
> > "fsl,rcpm-wakeup", value,
> > rcpm->wakeup_cells + 1);
> >
> > -   /*  Wakeup source should refer to current rcpm device */
> > -   if (ret || (np->phandle != value[0]))
> > +   if (ret)
> > continue;
> >
> > +   if (is_of_node(dev->fwnode))
> > +   /*  Should refer to current rcpm device */
> > +   if (np->phandle != value[0])
> > +   continue;
> 
> It looks like that we assume that in the ACPI scenario there will only be one 
> RCPM controller and all devices are controlled by this single
> PM controller.  This probably is true for all existing SoCs with a RCPM.  But 
> since the driver tried to support multiple RCPMs, maybe we
> should continue to support multiple RCPM controllers or at least mention that 
> in the comment.

How about adding some comment as below:

/* For ACPI mode, currently we assume there is only one RCPM controller 
existing */

Regards,
Ran

> 
> > +
> > /* Property "#fsl,rcpm-wakeup-cells" of rcpm node defines 
> > the
> >  * number of IPPDEXPCR register cells, and "fsl,rcpm-wakeup"
> >  * of wakeup source IP contains an integer array:
> >  > rcpm_of_match[] = {  };  MODULE_DEVICE_TABLE(of, rcpm_of_match);
> >
> > +#ifdef CONFIG_ACPI
> > +static const struct acpi_device_id rcpm_acpi_ids[] = {
> > +   {"NXP0015",},
> > +   { }
> > +};
> > +MODULE_DEVICE_TABLE(acpi, rcpm_acpi_ids); #endif
> > +
> >  static struct platform_driver rcpm_driver = {
> > .driver = {
> > .name = "rcpm",
> > .of_match_table = rcpm_of_match,
> > +   .acpi_match_table = ACPI_PTR(rcpm_acpi_ids),
> > .pm = &rcpm_pm_ops,
> > },
> > .probe = rcpm_probe,
> > --
> > 2.25.1
> >


Re: [OpenRISC] [PATCH v6 1/9] locking/qspinlock: Add ARCH_USE_QUEUED_SPINLOCKS_XCHG32

2021-04-06 Thread Stafford Horne
On Wed, Apr 07, 2021 at 12:51:56AM +0800, Boqun Feng wrote:
> Hi,
> 
> On Wed, Mar 31, 2021 at 02:30:32PM +, guo...@kernel.org wrote:
> > From: Guo Ren 
> > 
> > Some architectures don't have sub-word swap atomic instruction,
> > they only have the full word's one.
> > 
> > The sub-word swap only improve the performance when:
> > NR_CPUS < 16K
> >  *  0- 7: locked byte
> >  * 8: pending
> >  *  9-15: not used
> >  * 16-17: tail index
> >  * 18-31: tail cpu (+1)
> > 
> > The 9-15 bits are wasted to use xchg16 in xchg_tail.
> > 
> > Please let architecture select xchg16/xchg32 to implement
> > xchg_tail.
> > 
> 
> If the architecture doesn't have sub-word swap atomic, won't it generate
> the same/similar code no matter which version xchg_tail() is used? That
> is even CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32=y, xchg_tail() acts
> similar to an xchg16() implemented by cmpxchg(), which means we still
> don't have forward progress guarantee. So this configuration doesn't
> solve the problem.
> 
> I think it's OK to introduce this config and don't provide xchg16() for
> risc-v. But I don't see the point of converting other architectures to
> use it.

Hello,

For OpenRISC I did ack the patch to convert to
CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32=y.  But I think you are right, the
generic code in xchg_tail and the xchg16 emulation code produced by OpenRISC
using xchg32 would produce very similar code.  I have not compared instructions,
but it does seem like duplicate functionality.

Why doesn't RISC-V add the xchg16 emulation code similar to OpenRISC?  For
OpenRISC we added xchg16 and xchg8 emulation code to enable qspinlocks.  So
one thought is with CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32=y, can we remove our
xchg16/xchg8 emulation code?
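
To make the comparison concrete, here is an illustrative, hand-written
sketch (not from any tree) of the usual way a 16-bit exchange gets
emulated on top of a 32-bit compare-and-swap. Both the generic
xchg_tail() fallback and an arch-provided xchg16() built this way end up
in a retry loop like this, which is why neither gives a forward-progress
guarantee:

/* Illustration only: emulate a 16-bit xchg with a 32-bit CAS loop.
 * 'p' points into a naturally aligned 32-bit word, 'shift' selects the
 * half-word within it (0 or 16).
 */
static inline unsigned short xchg16_emulated(volatile unsigned int *p,
					     unsigned int shift,
					     unsigned short newval)
{
	unsigned int old, new, mask = 0xffffU << shift;

	do {
		old = *p;
		new = (old & ~mask) | ((unsigned int)newval << shift);
		/* can retry forever if other bits of *p keep changing */
	} while (!__sync_bool_compare_and_swap(p, old, new));

	return (unsigned short)((old & mask) >> shift);
}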

-Stafford


Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Wei Liu
On Tue, Apr 06, 2021 at 04:31:58PM +0300, Andy Shevchenko wrote:
> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
> 
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.
> 
> Signed-off-by: Andy Shevchenko 

Acked-by: Wei Liu 


[PATCH net-next v3 1/2] of: net: pass the dst buffer to of_get_mac_address()

2021-04-06 Thread Michael Walle
of_get_mac_address() returns a "const void*" pointer to a MAC address.
Lately, support to fetch the MAC address by an NVMEM provider was added.
But this will only work with platform devices. It will not work with
PCI devices (e.g. of an integrated root complex) and esp. not with DSA
ports.

There is an of_* variant of the nvmem binding which works without
devices. The returned data of a nvmem_cell_read() has to be freed after
use. On the other hand the return of_get_mac_address() points to some
static data without a lifetime. The trick for now, was to allocate a
device resource managed buffer which is then returned. This will only
work if we have an actual device.

Change it, so that the caller of of_get_mac_address() has to supply a
buffer where the MAC address is written to. Unfortunately, this will
touch all drivers which use the of_get_mac_address().

Usually the code looks like:

  const char *addr;
  addr = of_get_mac_address(np);
  if (!IS_ERR(addr))
ether_addr_copy(ndev->dev_addr, addr);

This can then be simply rewritten as:

  of_get_mac_address(np, ndev->dev_addr);

Sometimes is_valid_ether_addr() is used to test the MAC address.
of_get_mac_address() already makes sure, it just returns a valid MAC
address. Thus we can just test its return code. But we have to be
careful if there are still other sources for the MAC address before the
of_get_mac_address(). In this case we have to keep the
is_valid_ether_addr() call.

The following coccinelle patch was used to convert common cases to the
new style. Afterwards, I've manually gone over the drivers and fixed the
return code variable: either used a new one or if one was already
available use that. Mansour Moufid, thanks for that coccinelle patch!


@a@
identifier x;
expression y, z;
@@
- x = of_get_mac_address(y);
+ x = of_get_mac_address(y, z);
  <...
- ether_addr_copy(z, x);
  ...>

@@
identifier a.x;
@@
- if (<+... x ...+>) {}

@@
identifier a.x;
@@
  if (<+... x ...+>) {
  ...
  }
- else {}

@@
identifier a.x;
expression e;
@@
- if (<+... x ...+>@e)
- {}
- else
+ if (!(e))
  {...}

@@
expression x, y, z;
@@
- x = of_get_mac_address(y, z);
+ of_get_mac_address(y, z);
  ... when != x


All drivers, except drivers/net/ethernet/aeroflex/greth.c, were
compile-time tested.

Suggested-by: Andrew Lunn 
Signed-off-by: Michael Walle 
---
 arch/arm/mach-mvebu/kirkwood.c|  3 +-
 arch/powerpc/sysdev/tsi108_dev.c  |  5 +-
 drivers/net/ethernet/aeroflex/greth.c |  6 +-
 drivers/net/ethernet/allwinner/sun4i-emac.c   | 10 +---
 drivers/net/ethernet/altera/altera_tse_main.c |  7 +--
 drivers/net/ethernet/arc/emac_main.c  |  8 +--
 drivers/net/ethernet/atheros/ag71xx.c |  7 +--
 drivers/net/ethernet/broadcom/bcm4908_enet.c  |  7 +--
 drivers/net/ethernet/broadcom/bcmsysport.c|  7 +--
 drivers/net/ethernet/broadcom/bgmac-bcma.c| 10 ++--
 .../net/ethernet/broadcom/bgmac-platform.c| 11 ++--
 drivers/net/ethernet/cadence/macb_main.c  | 11 +---
 .../net/ethernet/cavium/octeon/octeon_mgmt.c  |  8 +--
 .../net/ethernet/cavium/thunder/thunder_bgx.c |  5 +-
 drivers/net/ethernet/davicom/dm9000.c | 10 ++--
 drivers/net/ethernet/ethoc.c  |  6 +-
 drivers/net/ethernet/ezchip/nps_enet.c|  7 +--
 drivers/net/ethernet/freescale/fec_main.c |  7 ++-
 drivers/net/ethernet/freescale/fec_mpc52xx.c  |  7 +--
 drivers/net/ethernet/freescale/fman/mac.c |  9 +--
 .../ethernet/freescale/fs_enet/fs_enet-main.c |  5 +-
 drivers/net/ethernet/freescale/gianfar.c  |  8 +--
 drivers/net/ethernet/freescale/ucc_geth.c |  5 +-
 drivers/net/ethernet/hisilicon/hisi_femac.c   |  7 +--
 drivers/net/ethernet/hisilicon/hix5hd2_gmac.c |  7 +--
 drivers/net/ethernet/lantiq_xrx200.c  |  7 +--
 drivers/net/ethernet/marvell/mv643xx_eth.c|  5 +-
 drivers/net/ethernet/marvell/mvneta.c |  6 +-
 .../ethernet/marvell/prestera/prestera_main.c | 11 ++--
 drivers/net/ethernet/marvell/pxa168_eth.c |  9 +--
 drivers/net/ethernet/marvell/sky2.c   |  8 +--
 drivers/net/ethernet/mediatek/mtk_eth_soc.c   | 11 ++--
 drivers/net/ethernet/micrel/ks8851_common.c   |  7 +--
 drivers/net/ethernet/microchip/lan743x_main.c |  5 +-
 drivers/net/ethernet/nxp/lpc_eth.c|  4 +-
 drivers/net/ethernet/qualcomm/qca_spi.c   | 10 +---
 drivers/net/ethernet/qualcomm/qca_uart.c  |  9 +--
 drivers/net/ethernet/renesas/ravb_main.c  | 12 ++--
 drivers/net/ethernet/renesas/sh_eth.c |  5 +-
 .../ethernet/samsung/sxgbe/sxgbe_platform.c   | 13 ++--
 drivers/net/ethernet/socionext/sni_ave.c  | 10 +---
 .../ethernet/stmicro/stmmac/dwmac-anarion.c   |  2 +-
 .../stmicro/stmmac/dwmac-dwc-qos-eth.c|  2 +-
 .../ethernet/stmicro/stmmac/dwmac-generic.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-imx.c   |  2 +-
 .../stmicro/stmmac/dwmac-intel-plat.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-ipq806x.c   |  2 +-
 

[PATCH net-next v3 2/2] of: net: fix of_get_mac_addr_nvmem() for PCI and DSA nodes

2021-04-06 Thread Michael Walle
of_get_mac_address() already supports fetching the MAC address by an
nvmem provider. But until now, it was just working for platform devices.
Esp. it was not working for DSA ports and PCI devices. It gets more
common that PCI devices have a device tree binding since SoCs contain
integrated root complexes.

Use the nvmem of_* binding to fetch the nvmem cells by a struct
device_node. We still have to try to read the cell by device first
because there might be a nvmem_cell_lookup associated with that device.

Signed-off-by: Michael Walle 
---
Please note, that I've kept the nvmem_get_mac_address() which operates
on a device. The new of_get_mac_addr_nvmem() is almost identical and
there are no users of the former function right now, but it seems to be
the "newer" version to get the MAC address for a "struct device". Thus
I've kept it. Please advise, if I should kill it though.

 drivers/of/of_net.c | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c
index 2d5d5e59aea5..2323c6063eaf 100644
--- a/drivers/of/of_net.c
+++ b/drivers/of/of_net.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * of_get_phy_mode - Get phy mode for given device_node
@@ -59,15 +60,39 @@ static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
 static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
 {
struct platform_device *pdev = of_find_device_by_node(np);
+   struct nvmem_cell *cell;
+   const void *mac;
+   size_t len;
int ret;
 
-   if (!pdev)
-   return -ENODEV;
+   /* Try lookup by device first, there might be a nvmem_cell_lookup
+* associated with a given device.
+*/
+   if (pdev) {
+   ret = nvmem_get_mac_address(&pdev->dev, addr);
+   put_device(&pdev->dev);
+   return ret;
+   }
+
+   cell = of_nvmem_cell_get(np, "mac-address");
+   if (IS_ERR(cell))
+   return PTR_ERR(cell);
+
+   mac = nvmem_cell_read(cell, &len);
+   nvmem_cell_put(cell);
+
+   if (IS_ERR(mac))
+   return PTR_ERR(mac);
+
+   if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+   kfree(mac);
+   return -EINVAL;
+   }
 
-   ret = nvmem_get_mac_address(&pdev->dev, addr);
-   put_device(&pdev->dev);
+   ether_addr_copy(addr, mac);
+   kfree(mac);
 
-   return ret;
+   return 0;
 }
 
 /**
-- 
2.20.1



[PATCH net-next v3 0/2] of: net: support non-platform devices in of_get_mac_address()

2021-04-06 Thread Michael Walle
of_get_mac_address() is commonly used to fetch the MAC address
from the device tree. It also supports reading it from a NVMEM
provider. But the latter is only possible for platform devices,
because only platform devices are searched for a matching device
node.

Add a second method to fetch the NVMEM cell by a device tree node
instead of a "struct device".

Moreover, the NVMEM subsystem will return dynamically allocated
data which has to be freed after use. Currently, this is handled
by allocating a device resource managed buffer to store the MAC
address. of_get_mac_address() then returns a pointer to this
buffer. Without a device, this trick is not possible anymore.
Thus, change the of_get_mac_address() API to have the caller
supply a buffer.

It was considered to use the network device to attach the buffer
to, but then the order matters and netdev_register() has to be
called before of_get_mac_address(). No driver does it this way.
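
As a quick illustration of the new calling convention, a minimal sketch
of a typical caller after the conversion (the driver name and the
fallback policy are made up for the example; only the int-returning,
caller-supplied-buffer signature comes from patch 1/2):

#include <linux/etherdevice.h>
#include <linux/of_net.h>
#include <linux/platform_device.h>

static int foo_set_mac(struct platform_device *pdev, struct net_device *ndev)
{
	/* the MAC address is written straight into ndev->dev_addr */
	if (of_get_mac_address(pdev->dev.of_node, ndev->dev_addr))
		eth_hw_addr_random(ndev);	/* no DT/NVMEM MAC: fall back */

	return 0;
}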

changes since v2:
 - fixed of_get_mac_addr_nvmem() signature, which was accidentially
   fixed in patch 2/2 again

changes since v1:
 - fixed stmmac_probe_config_dt() for !CONFIG_OF
 - added missing queue in patch subject

Michael Walle (2):
  of: net: pass the dst buffer to of_get_mac_address()
  of: net: fix of_get_mac_addr_nvmem() for PCI and DSA nodes

 arch/arm/mach-mvebu/kirkwood.c|  3 +-
 arch/powerpc/sysdev/tsi108_dev.c  |  5 +-
 drivers/net/ethernet/aeroflex/greth.c |  6 +-
 drivers/net/ethernet/allwinner/sun4i-emac.c   | 10 +--
 drivers/net/ethernet/altera/altera_tse_main.c |  7 +-
 drivers/net/ethernet/arc/emac_main.c  |  8 +-
 drivers/net/ethernet/atheros/ag71xx.c |  7 +-
 drivers/net/ethernet/broadcom/bcm4908_enet.c  |  7 +-
 drivers/net/ethernet/broadcom/bcmsysport.c|  7 +-
 drivers/net/ethernet/broadcom/bgmac-bcma.c| 10 +--
 .../net/ethernet/broadcom/bgmac-platform.c| 11 ++-
 drivers/net/ethernet/cadence/macb_main.c  | 11 +--
 .../net/ethernet/cavium/octeon/octeon_mgmt.c  |  8 +-
 .../net/ethernet/cavium/thunder/thunder_bgx.c |  5 +-
 drivers/net/ethernet/davicom/dm9000.c | 10 +--
 drivers/net/ethernet/ethoc.c  |  6 +-
 drivers/net/ethernet/ezchip/nps_enet.c|  7 +-
 drivers/net/ethernet/freescale/fec_main.c |  7 +-
 drivers/net/ethernet/freescale/fec_mpc52xx.c  |  7 +-
 drivers/net/ethernet/freescale/fman/mac.c |  9 +-
 .../ethernet/freescale/fs_enet/fs_enet-main.c |  5 +-
 drivers/net/ethernet/freescale/gianfar.c  |  8 +-
 drivers/net/ethernet/freescale/ucc_geth.c |  5 +-
 drivers/net/ethernet/hisilicon/hisi_femac.c   |  7 +-
 drivers/net/ethernet/hisilicon/hix5hd2_gmac.c |  7 +-
 drivers/net/ethernet/lantiq_xrx200.c  |  7 +-
 drivers/net/ethernet/marvell/mv643xx_eth.c|  5 +-
 drivers/net/ethernet/marvell/mvneta.c |  6 +-
 .../ethernet/marvell/prestera/prestera_main.c | 11 +--
 drivers/net/ethernet/marvell/pxa168_eth.c |  9 +-
 drivers/net/ethernet/marvell/sky2.c   |  8 +-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c   | 11 +--
 drivers/net/ethernet/micrel/ks8851_common.c   |  7 +-
 drivers/net/ethernet/microchip/lan743x_main.c |  5 +-
 drivers/net/ethernet/nxp/lpc_eth.c|  4 +-
 drivers/net/ethernet/qualcomm/qca_spi.c   | 10 +--
 drivers/net/ethernet/qualcomm/qca_uart.c  |  9 +-
 drivers/net/ethernet/renesas/ravb_main.c  | 12 +--
 drivers/net/ethernet/renesas/sh_eth.c |  5 +-
 .../ethernet/samsung/sxgbe/sxgbe_platform.c   | 13 +--
 drivers/net/ethernet/socionext/sni_ave.c  | 10 +--
 .../ethernet/stmicro/stmmac/dwmac-anarion.c   |  2 +-
 .../stmicro/stmmac/dwmac-dwc-qos-eth.c|  2 +-
 .../ethernet/stmicro/stmmac/dwmac-generic.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-imx.c   |  2 +-
 .../stmicro/stmmac/dwmac-intel-plat.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-ipq806x.c   |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-lpc18xx.c   |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-mediatek.c  |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-meson.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-meson8b.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-oxnas.c |  2 +-
 .../stmicro/stmmac/dwmac-qcom-ethqos.c|  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-rk.c|  2 +-
 .../ethernet/stmicro/stmmac/dwmac-socfpga.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-sti.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-stm32.c |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-sunxi.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-visconti.c  |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_main.c |  4 +-
 .../ethernet/stmicro/stmmac/stmmac_platform.c | 14 +--
 .../ethernet/stmicro/stmmac/stmmac_platform.h |  2 +-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c  | 19 ++---
 drivers/net/ethernet/ti/cpsw.c|  7 +-
 drivers/net/ethernet/ti/cpsw_new.c| 

[PATCH net-next v2 1/2] of: net: pass the dst buffer to of_get_mac_address()

2021-04-06 Thread Michael Walle
of_get_mac_address() returns a "const void*" pointer to a MAC address.
Lately, support to fetch the MAC address by an NVMEM provider was added.
But this will only work with platform devices. It will not work with
PCI devices (e.g. of an integrated root complex) and esp. not with DSA
ports.

There is an of_* variant of the nvmem binding which works without
devices. The returned data of a nvmem_cell_read() has to be freed after
use. On the other hand the return of_get_mac_address() points to some
static data without a lifetime. The trick for now, was to allocate a
device resource managed buffer which is then returned. This will only
work if we have an actual device.

Change it, so that the caller of of_get_mac_address() has to supply a
buffer where the MAC address is written to. Unfortunately, this will
touch all drivers which use the of_get_mac_address().

Usually the code looks like:

  const char *addr;
  addr = of_get_mac_address(np);
  if (!IS_ERR(addr))
ether_addr_copy(ndev->dev_addr, addr);

This can then be simply rewritten as:

  of_get_mac_address(np, ndev->dev_addr);

Sometimes is_valid_ether_addr() is used to test the MAC address.
of_get_mac_address() already makes sure, it just returns a valid MAC
address. Thus we can just test its return code. But we have to be
careful if there are still other sources for the MAC address before the
of_get_mac_address(). In this case we have to keep the
is_valid_ether_addr() call.

The following coccinelle patch was used to convert common cases to the
new style. Afterwards, I've manually gone over the drivers and fixed the
return code variable: either used a new one or if one was already
available use that. Mansour Moufid, thanks for that coccinelle patch!


@a@
identifier x;
expression y, z;
@@
- x = of_get_mac_address(y);
+ x = of_get_mac_address(y, z);
  <...
- ether_addr_copy(z, x);
  ...>

@@
identifier a.x;
@@
- if (<+... x ...+>) {}

@@
identifier a.x;
@@
  if (<+... x ...+>) {
  ...
  }
- else {}

@@
identifier a.x;
expression e;
@@
- if (<+... x ...+>@e)
- {}
- else
+ if (!(e))
  {...}

@@
expression x, y, z;
@@
- x = of_get_mac_address(y, z);
+ of_get_mac_address(y, z);
  ... when != x


All drivers, except drivers/net/ethernet/aeroflex/greth.c, were
compile-time tested.

Suggested-by: Andrew Lunn 
Signed-off-by: Michael Walle 
---
 arch/arm/mach-mvebu/kirkwood.c|  3 +-
 arch/powerpc/sysdev/tsi108_dev.c  |  5 +-
 drivers/net/ethernet/aeroflex/greth.c |  6 +-
 drivers/net/ethernet/allwinner/sun4i-emac.c   | 10 +---
 drivers/net/ethernet/altera/altera_tse_main.c |  7 +--
 drivers/net/ethernet/arc/emac_main.c  |  8 +--
 drivers/net/ethernet/atheros/ag71xx.c |  7 +--
 drivers/net/ethernet/broadcom/bcm4908_enet.c  |  7 +--
 drivers/net/ethernet/broadcom/bcmsysport.c|  7 +--
 drivers/net/ethernet/broadcom/bgmac-bcma.c| 10 ++--
 .../net/ethernet/broadcom/bgmac-platform.c| 11 ++--
 drivers/net/ethernet/cadence/macb_main.c  | 11 +---
 .../net/ethernet/cavium/octeon/octeon_mgmt.c  |  8 +--
 .../net/ethernet/cavium/thunder/thunder_bgx.c |  5 +-
 drivers/net/ethernet/davicom/dm9000.c | 10 ++--
 drivers/net/ethernet/ethoc.c  |  6 +-
 drivers/net/ethernet/ezchip/nps_enet.c|  7 +--
 drivers/net/ethernet/freescale/fec_main.c |  7 ++-
 drivers/net/ethernet/freescale/fec_mpc52xx.c  |  7 +--
 drivers/net/ethernet/freescale/fman/mac.c |  9 +--
 .../ethernet/freescale/fs_enet/fs_enet-main.c |  5 +-
 drivers/net/ethernet/freescale/gianfar.c  |  8 +--
 drivers/net/ethernet/freescale/ucc_geth.c |  5 +-
 drivers/net/ethernet/hisilicon/hisi_femac.c   |  7 +--
 drivers/net/ethernet/hisilicon/hix5hd2_gmac.c |  7 +--
 drivers/net/ethernet/lantiq_xrx200.c  |  7 +--
 drivers/net/ethernet/marvell/mv643xx_eth.c|  5 +-
 drivers/net/ethernet/marvell/mvneta.c |  6 +-
 .../ethernet/marvell/prestera/prestera_main.c | 11 ++--
 drivers/net/ethernet/marvell/pxa168_eth.c |  9 +--
 drivers/net/ethernet/marvell/sky2.c   |  8 +--
 drivers/net/ethernet/mediatek/mtk_eth_soc.c   | 11 ++--
 drivers/net/ethernet/micrel/ks8851_common.c   |  7 +--
 drivers/net/ethernet/microchip/lan743x_main.c |  5 +-
 drivers/net/ethernet/nxp/lpc_eth.c|  4 +-
 drivers/net/ethernet/qualcomm/qca_spi.c   | 10 +---
 drivers/net/ethernet/qualcomm/qca_uart.c  |  9 +--
 drivers/net/ethernet/renesas/ravb_main.c  | 12 ++--
 drivers/net/ethernet/renesas/sh_eth.c |  5 +-
 .../ethernet/samsung/sxgbe/sxgbe_platform.c   | 13 ++--
 drivers/net/ethernet/socionext/sni_ave.c  | 10 +---
 .../ethernet/stmicro/stmmac/dwmac-anarion.c   |  2 +-
 .../stmicro/stmmac/dwmac-dwc-qos-eth.c|  2 +-
 .../ethernet/stmicro/stmmac/dwmac-generic.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-imx.c   |  2 +-
 .../stmicro/stmmac/dwmac-intel-plat.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-ipq806x.c   |  2 +-
 

[PATCH net-next v2 2/2] of: net: fix of_get_mac_addr_nvmem() for PCI and DSA nodes

2021-04-06 Thread Michael Walle
of_get_mac_address() already supports fetching the MAC address by an
nvmem provider. But until now, it was just working for platform devices.
Esp. it was not working for DSA ports and PCI devices. It gets more
common that PCI devices have a device tree binding since SoCs contain
integrated root complexes.

Use the nvmem of_* binding to fetch the nvmem cells by a struct
device_node. We still have to try to read the cell by device first
because there might be a nvmem_cell_lookup associated with that device.

Signed-off-by: Michael Walle 
---
Please note, that I've kept the nvmem_get_mac_address() which operates
on a device. The new of_get_mac_addr_nvmem() is almost identical and
there are no users of the former function right now, but it seems to be
the "newer" version to get the MAC address for a "struct device". Thus
I've kept it. Please advise, if I should kill it though.

 drivers/of/of_net.c | 37 +++--
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c
index 2344ad7fff5e..2323c6063eaf 100644
--- a/drivers/of/of_net.c
+++ b/drivers/of/of_net.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * of_get_phy_mode - Get phy mode for given device_node
@@ -56,18 +57,42 @@ static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
return -ENODEV;
 }
 
-static int of_get_mac_addr_nvmem(struct device_node *np, u8 addr)
+static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr)
 {
struct platform_device *pdev = of_find_device_by_node(np);
+   struct nvmem_cell *cell;
+   const void *mac;
+   size_t len;
int ret;
 
-   if (!pdev)
-   return -ENODEV;
+   /* Try lookup by device first, there might be a nvmem_cell_lookup
+* associated with a given device.
+*/
+   if (pdev) {
+   ret = nvmem_get_mac_address(&pdev->dev, addr);
+   put_device(&pdev->dev);
+   return ret;
+   }
+
+   cell = of_nvmem_cell_get(np, "mac-address");
+   if (IS_ERR(cell))
+   return PTR_ERR(cell);
+
+   mac = nvmem_cell_read(cell, &len);
+   nvmem_cell_put(cell);
+
+   if (IS_ERR(mac))
+   return PTR_ERR(mac);
+
+   if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+   kfree(mac);
+   return -EINVAL;
+   }
 
-   ret = nvmem_get_mac_address(&pdev->dev, addr);
-   put_device(&pdev->dev);
+   ether_addr_copy(addr, mac);
+   kfree(mac);
 
-   return ret;
+   return 0;
 }
 
 /**
-- 
2.20.1



[PATCH net-next v2 0/2] of: net: support non-platform devices in of_get_mac_address()

2021-04-06 Thread Michael Walle
of_get_mac_address() is commonly used to fetch the MAC address
from the device tree. It also supports reading it from an NVMEM
provider. But the latter is only possible for platform devices,
because only platform devices are searched for a matching device
node.

Add a second method to fetch the NVMEM cell by a device tree node
instead of a "struct device".

Moreover, the NVMEM subsystem will return dynamically allocated
data which has to be freed after use. Currently, this is handled
by allocating a device resource managed buffer to store the MAC
address. of_get_mac_address() then returns a pointer to this
buffer. Without a device, this trick is not possible anymore.
Thus, change the of_get_mac_address() API to have the caller
supply a buffer.

It was considered to use the network device to attach the buffer
to, but then the order matters and netdev_register() has to be
called before of_get_mac_address(). No driver does it this way.
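
For illustration, a minimal sketch of the resulting calling convention. The
driver function and the random-address fallback below are hypothetical, not
taken from the series; only the caller-supplied buffer shape of
of_get_mac_address(np, addr) follows the API change described above, assuming
it returns 0 on success and a negative error code otherwise, as the nvmem
helper in patch 2/2 does:

#include <linux/etherdevice.h>
#include <linux/of_net.h>

static int example_set_mac_from_dt(struct net_device *ndev,
                                   struct device_node *np)
{
        u8 addr[ETH_ALEN];
        int ret;

        /* The caller owns the buffer; no devres-managed allocation needed. */
        ret = of_get_mac_address(np, addr);
        if (ret) {
                /* Hypothetical fallback if neither DT nor NVMEM provides one. */
                eth_hw_addr_random(ndev);
                return 0;
        }

        ether_addr_copy(ndev->dev_addr, addr);
        return 0;
}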

changes since v1:
 - fixed stmmac_probe_config_dt() for !CONFIG_OF
 - added missing queue in patch subject

Michael Walle (2):
  of: net: pass the dst buffer to of_get_mac_address()
  of: net: fix of_get_mac_addr_nvmem() for PCI and DSA nodes

 arch/arm/mach-mvebu/kirkwood.c|  3 +-
 arch/powerpc/sysdev/tsi108_dev.c  |  5 +-
 drivers/net/ethernet/aeroflex/greth.c |  6 +-
 drivers/net/ethernet/allwinner/sun4i-emac.c   | 10 +--
 drivers/net/ethernet/altera/altera_tse_main.c |  7 +-
 drivers/net/ethernet/arc/emac_main.c  |  8 +-
 drivers/net/ethernet/atheros/ag71xx.c |  7 +-
 drivers/net/ethernet/broadcom/bcm4908_enet.c  |  7 +-
 drivers/net/ethernet/broadcom/bcmsysport.c|  7 +-
 drivers/net/ethernet/broadcom/bgmac-bcma.c| 10 +--
 .../net/ethernet/broadcom/bgmac-platform.c| 11 ++-
 drivers/net/ethernet/cadence/macb_main.c  | 11 +--
 .../net/ethernet/cavium/octeon/octeon_mgmt.c  |  8 +-
 .../net/ethernet/cavium/thunder/thunder_bgx.c |  5 +-
 drivers/net/ethernet/davicom/dm9000.c | 10 +--
 drivers/net/ethernet/ethoc.c  |  6 +-
 drivers/net/ethernet/ezchip/nps_enet.c|  7 +-
 drivers/net/ethernet/freescale/fec_main.c |  7 +-
 drivers/net/ethernet/freescale/fec_mpc52xx.c  |  7 +-
 drivers/net/ethernet/freescale/fman/mac.c |  9 +-
 .../ethernet/freescale/fs_enet/fs_enet-main.c |  5 +-
 drivers/net/ethernet/freescale/gianfar.c  |  8 +-
 drivers/net/ethernet/freescale/ucc_geth.c |  5 +-
 drivers/net/ethernet/hisilicon/hisi_femac.c   |  7 +-
 drivers/net/ethernet/hisilicon/hix5hd2_gmac.c |  7 +-
 drivers/net/ethernet/lantiq_xrx200.c  |  7 +-
 drivers/net/ethernet/marvell/mv643xx_eth.c|  5 +-
 drivers/net/ethernet/marvell/mvneta.c |  6 +-
 .../ethernet/marvell/prestera/prestera_main.c | 11 +--
 drivers/net/ethernet/marvell/pxa168_eth.c |  9 +-
 drivers/net/ethernet/marvell/sky2.c   |  8 +-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c   | 11 +--
 drivers/net/ethernet/micrel/ks8851_common.c   |  7 +-
 drivers/net/ethernet/microchip/lan743x_main.c |  5 +-
 drivers/net/ethernet/nxp/lpc_eth.c|  4 +-
 drivers/net/ethernet/qualcomm/qca_spi.c   | 10 +--
 drivers/net/ethernet/qualcomm/qca_uart.c  |  9 +-
 drivers/net/ethernet/renesas/ravb_main.c  | 12 +--
 drivers/net/ethernet/renesas/sh_eth.c |  5 +-
 .../ethernet/samsung/sxgbe/sxgbe_platform.c   | 13 +--
 drivers/net/ethernet/socionext/sni_ave.c  | 10 +--
 .../ethernet/stmicro/stmmac/dwmac-anarion.c   |  2 +-
 .../stmicro/stmmac/dwmac-dwc-qos-eth.c|  2 +-
 .../ethernet/stmicro/stmmac/dwmac-generic.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-imx.c   |  2 +-
 .../stmicro/stmmac/dwmac-intel-plat.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-ipq806x.c   |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-lpc18xx.c   |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-mediatek.c  |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-meson.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-meson8b.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-oxnas.c |  2 +-
 .../stmicro/stmmac/dwmac-qcom-ethqos.c|  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-rk.c|  2 +-
 .../ethernet/stmicro/stmmac/dwmac-socfpga.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-sti.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-stm32.c |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-sun8i.c |  2 +-
 .../net/ethernet/stmicro/stmmac/dwmac-sunxi.c |  2 +-
 .../ethernet/stmicro/stmmac/dwmac-visconti.c  |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac.h  |  2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_main.c |  4 +-
 .../ethernet/stmicro/stmmac/stmmac_platform.c | 14 +--
 .../ethernet/stmicro/stmmac/stmmac_platform.h |  2 +-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c  | 19 ++---
 drivers/net/ethernet/ti/cpsw.c|  7 +-
 drivers/net/ethernet/ti/cpsw_new.c|  7 +-
 drivers/net/ethernet/ti/davinci_emac.c|  8 +-
 drivers/net/ethernet/ti/netcp_core.c  |  7 

Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Corey Minyard
On Tue, Apr 06, 2021 at 04:31:58PM +0300, Andy Shevchenko wrote:
> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
> 
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.

For the IPMI portion:

Acked-by: Corey Minyard 

> 
> Signed-off-by: Andy Shevchenko 
> ---
>  arch/powerpc/kernel/setup-common.c   |  1 +
>  arch/x86/include/asm/desc.h  |  1 +
>  arch/x86/kernel/cpu/mshyperv.c   |  1 +
>  arch/x86/kernel/setup.c  |  1 +
>  drivers/char/ipmi/ipmi_msghandler.c  |  1 +
>  drivers/remoteproc/remoteproc_core.c |  1 +
>  include/asm-generic/bug.h|  3 +-
>  include/linux/kernel.h   | 84 +---
>  include/linux/panic.h| 98 
>  include/linux/panic_notifier.h   | 12 
>  kernel/hung_task.c   |  1 +
>  kernel/kexec_core.c  |  1 +
>  kernel/panic.c   |  1 +
>  kernel/rcu/tree.c|  2 +
>  kernel/sysctl.c  |  1 +
>  kernel/trace/trace.c |  1 +
>  16 files changed, 126 insertions(+), 84 deletions(-)
>  create mode 100644 include/linux/panic.h
>  create mode 100644 include/linux/panic_notifier.h
> 
> diff --git a/arch/powerpc/kernel/setup-common.c 
> b/arch/powerpc/kernel/setup-common.c
> index 74a98fff2c2f..046fe21b5c3b 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -9,6 +9,7 @@
>  #undef DEBUG
>  
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index 476082a83d1c..ceb12683b6d1 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -9,6 +9,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  #include 
>  
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 22f13343b5da..9e5c6f2b044d 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -17,6 +17,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 59e5e0903b0c..570699eecf90 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
> b/drivers/char/ipmi/ipmi_msghandler.c
> index 8a0e97b33cae..e96cb5c4f97a 100644
> --- a/drivers/char/ipmi/ipmi_msghandler.c
> +++ b/drivers/char/ipmi/ipmi_msghandler.c
> @@ -16,6 +16,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/remoteproc/remoteproc_core.c 
> b/drivers/remoteproc/remoteproc_core.c
> index 626a6b90fba2..76dd8e2b1e7e 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -20,6 +20,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
> index 76a10e0dca9f..719410b93f99 100644
> --- a/include/asm-generic/bug.h
> +++ b/include/asm-generic/bug.h
> @@ -17,7 +17,8 @@
>  #endif
>  
>  #ifndef __ASSEMBLY__
> -#include 
> +#include 
> +#include 
>  
>  #ifdef CONFIG_BUG
>  
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index 09035ac67d4b..6c5a05ac1ecb 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -70,7 +71,6 @@
>  #define lower_32_bits(n) ((u32)((n) & 0xffffffff))
>  
>  struct completion;
> -struct pt_regs;
>  struct user;
>  
>  #ifdef CONFIG_PREEMPT_VOLUNTARY
> @@ -175,14 +175,6 @@ void __might_fault(const char *file, int line);
>  static inline void might_fault(void) { }
>  #endif
>  
> -extern struct atomic_notifier_head panic_notifier_list;
> -extern long (*panic_blink)(int state);
> -__printf(1, 2)
> -void panic(const char *fmt, ...) __noreturn __cold;
> -void nmi_panic(struct pt_regs *regs, const char *msg);
> -extern void oops_enter(void);
> -extern void oops_exit(void);
> -extern bool oops_may_print(void);
>  void do_exit(long error_code) __noreturn;
>  void complete_and_exit(struct completion *, long) __noreturn;
>  
> @@ -368,52 +360,8 @@ extern int __kernel_text_address(unsigned long addr);
>  extern int kernel_text_address(unsigned long addr);
>  extern int func_ptr_is_kernel_text(void *ptr);
>  
> -#ifdef CONFIG_SMP
> -extern unsigned int sysctl_oops_all_cpu_backtrace;
> -#else
> -#define sysctl_oops_all_cpu_backtrace 0
> -#endif /* 

Re: [PATCH v1 0/3] Remove qe_io{read,write}* IO accessors

2021-04-06 Thread Li Yang
On Sat, Mar 6, 2021 at 12:11 PM Christophe Leroy
 wrote:
>
> Commit 6ac9b61786cc ("soc: fsl: qe: introduce qe_io{read,write}*
> wrappers") added specific I/O accessors for qe because at that
> time ioread/iowrite functions were sub-optimal on powerpc/32
> compared to the architecture specific in_/out_ IO accessors.
>
> But as ioread/iowrite accessors are now equivalent since
> commit 894fa235eb4c ("powerpc: inline iomap accessors"),
> use them in order to allow removal of the qe specific ones.
>
> Christophe Leroy (3):
>   soc: fsl: qe: replace qe_io{read,write}* wrappers by generic
> io{read,write}*
>   tty: serial: ucc_uart: replace qe_io{read,write}* wrappers by generic
> io{read,write}*
>   Revert "soc: fsl: qe: introduce qe_io{read,write}* wrappers"

Series applied.  Thanks.

>
>  drivers/soc/fsl/qe/gpio.c |  20 +++---
>  drivers/soc/fsl/qe/qe.c   |  24 +++
>  drivers/soc/fsl/qe/qe_ic.c|   4 +-
>  drivers/soc/fsl/qe/qe_io.c|  36 +-
>  drivers/soc/fsl/qe/ucc_fast.c |  68 +--
>  drivers/soc/fsl/qe/ucc_slow.c |  42 ++--
>  drivers/tty/serial/ucc_uart.c | 124 +-
>  include/soc/fsl/qe/qe.h   |  34 +++---
>  8 files changed, 168 insertions(+), 184 deletions(-)
>
> --
> 2.25.0
>


[PATCH v2] KVM: PPC: Book3S HV: Sanitise vcpu registers in nested path

2021-04-06 Thread Fabiano Rosas
As one of the arguments of the H_ENTER_NESTED hypercall, the nested
hypervisor (L1) prepares a structure containing the values of various
hypervisor-privileged registers with which it wants the nested guest
(L2) to run. Since the nested HV runs in supervisor mode it needs the
host to write to these registers.

To stop a nested HV manipulating this mechanism and using a nested
guest as a proxy to access a facility that has been made unavailable
to it, we have a routine that sanitises the values of the HV registers
before copying them into the nested guest's vcpu struct.

However, when coming out of the guest the values are copied as they
were back into L1 memory, which means that any sanitisation we did
during guest entry will be exposed to L1 after H_ENTER_NESTED returns.

This patch alters this sanitisation to have effect on the vcpu->arch
registers directly before entering and after exiting the guest,
leaving the structure that is copied back into L1 unchanged (except
when we really want L1 to access the value, e.g the Cause bits of
HFSCR).

Signed-off-by: Fabiano Rosas 
---
I'm taking another shot at fixing this locally without resorting to
more complex things such as error handling and feature
advertisement/negotiation.

Changes since v1:

- made the change more generic, not only applies to hfscr anymore;
- sanitisation is now done directly on the vcpu struct, l2_hv is left unchanged;

v1:

https://lkml.kernel.org/r/20210305231055.2913892-1-faro...@linux.ibm.com
---
 arch/powerpc/kvm/book3s_hv_nested.c | 33 +++--
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 0cd0e7aad588..a60fccb2c4f2 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -132,21 +132,37 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, 
int trap,
}
 }
 
-static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+static void sanitise_vcpu_entry_state(struct kvm_vcpu *vcpu,
+ const struct hv_guest_state *l2_hv,
+ const struct hv_guest_state *l1_hv)
 {
/*
 * Don't let L1 enable features for L2 which we've disabled for L1,
 * but preserve the interrupt cause field.
 */
-   hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
+   vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | l1_hv->hfscr);
 
/* Don't let data address watchpoint match in hypervisor state */
-   hr->dawrx0 &= ~DAWRX_HYP;
-   hr->dawrx1 &= ~DAWRX_HYP;
+   vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP;
+   vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP;
 
/* Don't let completed instruction address breakpt match in HV state */
-   if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
-   hr->ciabr &= ~CIABR_PRIV;
+   if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+   vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV;
+}
+
+
+/*
+ * During sanitise_vcpu_entry_state() we might have used bits from L1
+ * state to restrict what the L2 state is allowed to be. Since L1 is
+ * not allowed to read the HV registers, do not include these
+ * modifications in the return state.
+ */
+static void sanitise_vcpu_return_state(struct kvm_vcpu *vcpu,
+  const struct hv_guest_state *l2_hv)
+{
+   vcpu->arch.hfscr = ((~HFSCR_INTR_CAUSE & l2_hv->hfscr) |
+   (HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
 }
 
 static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
@@ -324,9 +340,10 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
LPCR_LPES | LPCR_MER;
lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
-   sanitise_hv_regs(vcpu, &l2_hv);
restore_hv_regs(vcpu, &l2_hv);
 
+   sanitise_vcpu_entry_state(vcpu, _hv, _l1_hv);
+
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
do {
@@ -338,6 +355,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr);
} while (is_kvmppc_resume_guest(r));
 
+   sanitise_vcpu_return_state(vcpu, &l2_hv);
+
/* save L2 state for return */
l2_regs = vcpu->arch.regs;
l2_regs.msr = vcpu->arch.shregs.msr;
-- 
2.29.2



Re: [PATCH v6] soc: fsl: enable acpi support in RCPM driver

2021-04-06 Thread Li Yang
On Fri, Mar 12, 2021 at 2:56 AM Ran Wang  wrote:
>
> From: Peng Ma 
>
> This patch enables ACPI support in RCPM driver.
>
> Signed-off-by: Peng Ma 
> Signed-off-by: Ran Wang 
> ---
> Change in v6:
>  - Remove copyright udpate to rebase on latest mainline
>
> Change in v5:
>  - Fix panic when dev->of_node is null
>
> Change in v4:
>  - Make commit subject more accurate
>  - Remove unrelated new blank line
>
> Change in v3:
>  - Add #ifdef CONFIG_ACPI for acpi_device_id
>  - Rename rcpm_acpi_imx_ids to rcpm_acpi_ids
>
> Change in v2:
>  - Update acpi_device_id to fix conflict with other driver
>
>  drivers/soc/fsl/rcpm.c | 18 --
>  1 file changed, 16 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c
> index 4ace28cab314..7aa997b932d1 100644
> --- a/drivers/soc/fsl/rcpm.c
> +++ b/drivers/soc/fsl/rcpm.c
> @@ -13,6 +13,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #define RCPM_WAKEUP_CELL_MAX_SIZE  7
>
> @@ -78,10 +79,14 @@ static int rcpm_pm_prepare(struct device *dev)
> "fsl,rcpm-wakeup", value,
> rcpm->wakeup_cells + 1);
>
> -   /*  Wakeup source should refer to current rcpm device */
> -   if (ret || (np->phandle != value[0]))
> +   if (ret)
> continue;
>
> +   if (is_of_node(dev->fwnode))
> +   /*  Should refer to current rcpm device */
> +   if (np->phandle != value[0])
> +   continue;

It looks like we assume that in the ACPI scenario there will only
be one RCPM controller and all devices are controlled by this single
PM controller.  This probably is true for all existing SoCs with a
RCPM.  But since the driver tried to support multiple RCPMs, maybe we
should continue to support multiple RCPM controllers or at least
mention that in the comment.

> +
> /* Property "#fsl,rcpm-wakeup-cells" of rcpm node defines the
>  * number of IPPDEXPCR register cells, and "fsl,rcpm-wakeup"
> * of wakeup source IP contains an integer array:
> @@ -172,10 +177,19 @@ static const struct of_device_id rcpm_of_match[] = {
>  };
>  MODULE_DEVICE_TABLE(of, rcpm_of_match);
>
> +#ifdef CONFIG_ACPI
> +static const struct acpi_device_id rcpm_acpi_ids[] = {
> +   {"NXP0015",},
> +   { }
> +};
> +MODULE_DEVICE_TABLE(acpi, rcpm_acpi_ids);
> +#endif
> +
>  static struct platform_driver rcpm_driver = {
> .driver = {
> .name = "rcpm",
> .of_match_table = rcpm_of_match,
> +   .acpi_match_table = ACPI_PTR(rcpm_acpi_ids),
> .pm = &rcpm_pm_ops,
> },
> .probe = rcpm_probe,
> --
> 2.25.1
>


Re: [PATCH 1/2] vfio/pci: remove vfio_pci_nvlink2

2021-04-06 Thread Alex Williamson
On Fri, 26 Mar 2021 07:13:10 +0100
Christoph Hellwig  wrote:

> This driver never had any open userspace (which for VFIO would include
> VM kernel drivers) that use it, and thus should never have been added
> by our normal userspace ABI rules.
> 
> Signed-off-by: Christoph Hellwig 
> Acked-by: Greg Kroah-Hartman 
> ---
>  drivers/vfio/pci/Kconfig|   6 -
>  drivers/vfio/pci/Makefile   |   1 -
>  drivers/vfio/pci/vfio_pci.c |  18 -
>  drivers/vfio/pci/vfio_pci_nvlink2.c | 490 
>  drivers/vfio/pci/vfio_pci_private.h |  14 -
>  include/uapi/linux/vfio.h   |  38 +--
>  6 files changed, 4 insertions(+), 563 deletions(-)
>  delete mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

Hearing no objections, applied to vfio next branch for v5.13.  Thanks,

Alex



Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Kees Cook
On Tue, Apr 06, 2021 at 04:31:58PM +0300, Andy Shevchenko wrote:
> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
> 
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.
> 
> Signed-off-by: Andy Shevchenko 

I like it! Do you have a multi-arch CI to do allmodconfig builds to
double-check this?

Acked-by: Kees Cook 

-Kees

-- 
Kees Cook


[PATCH v3] pseries: prevent free CPU ids to be reused on another node

2021-04-06 Thread Laurent Dufour
When a CPU is hot added, the CPU ids are taken from the available mask,
starting from the lowest possible set. If that set of values was previously
used for CPUs attached to a different node, this appears to applications as
if these CPUs had migrated from one node to another, which is not expected
in real life.

To prevent this, the CPU ids used on each node need to be recorded and not
reused on another node. However, to prevent CPU hot plug from failing when
the CPU ids are exhausted on a node, the capability to reuse other nodes'
free CPU ids is kept. A warning is displayed in such a case to alert the
user.

A new CPU bit mask (node_recorded_ids_map) is introduced for each possible
node. It is populated with the CPUs onlined at boot time, and then updated
when a CPU is hot plugged to a node. The bits in that mask remain set when
the CPU is hot unplugged, to record that these CPU ids have been used for
this node.

The effect of this patch can be seen by removing and adding CPUs using the
Qemu monitor. In the following case, the first CPU of node 2 is removed,
then the first one of node 1 is removed too. Later, the first CPU of node 2
is added back. Without this patch, the kernel will number these CPUs using
the first CPU ids available, which are the ones freed by the second removal
(the CPU of node 1). This causes the CPU ids 16-23 to move from node 1 to
node 2. With the patch applied, the CPU ids 32-39 are used since they are
the lowest free ones which have not been used on another node.
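
A condensed sketch of the allocation policy described above (illustrative
only; apart from node_recorded_ids_map the names are hypothetical, and the
real hunks further below differ in details such as locking and thread
handling):

static void exclude_other_nodes_ids(cpumask_var_t candidate_mask, int nid)
{
        int node;

        /* Never hand out ids that another node has already used. */
        for_each_online_node(node) {
                if (node == nid)
                        continue;       /* keep our own node's recorded ids */
                cpumask_andnot(candidate_mask, candidate_mask,
                               node_recorded_ids_map[node]);
        }

        if (cpumask_empty(candidate_mask)) {
                /* Starved: fall back to reusing other nodes' free ids. */
                pr_warn("Node %d: no free CPU ids left, reusing other nodes' ids\n",
                        nid);
                cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
        }
}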

At boot time:
[root@vm40 ~]# numactl -H | grep cpus
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
node 1 cpus: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47

Vanilla kernel, after the CPU hot unplug/plug operations:
[root@vm40 ~]# numactl -H | grep cpus
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
node 1 cpus: 24 25 26 27 28 29 30 31
node 2 cpus: 16 17 18 19 20 21 22 23 40 41 42 43 44 45 46 47

Patched kernel, after the CPU hot unplug/plug operations:
[root@vm40 ~]# numactl -H | grep cpus
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
node 1 cpus: 24 25 26 27 28 29 30 31
node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47

Changes since V2, addressing Nathan's comments:
 - Remove the retry feature
 - Reduce the number of local variables (removing 'i')
 - Add comment about the cpu_add_remove_lock protecting the added CPU mask.
Changes since V1 (no functional changes):
 - update the test's output in the commit's description
 - node_recorded_ids_map should be static

Signed-off-by: Laurent Dufour 
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 51 ++--
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index ec478f8a98ff..f3fd4807dc3e 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -39,6 +39,12 @@
 /* This version can't take the spinlock, because it never returns */
 static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE;
 
+/*
+ * Record the CPU ids used on each nodes.
+ * Protected by cpu_add_remove_lock.
+ */
+static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES];
+
 static void rtas_stop_self(void)
 {
static struct rtas_args args;
@@ -151,9 +157,9 @@ static void pseries_cpu_die(unsigned int cpu)
  */
 static int pseries_add_processor(struct device_node *np)
 {
-   unsigned int cpu;
+   unsigned int cpu, node;
cpumask_var_t candidate_mask, tmp;
-   int err = -ENOSPC, len, nthreads, i;
+   int err = -ENOSPC, len, nthreads, nid;
const __be32 *intserv;
 
intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
@@ -163,9 +169,17 @@ static int pseries_add_processor(struct device_node *np)
zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
zalloc_cpumask_var(&tmp, GFP_KERNEL);
 
+   /*
+* Fetch from the DT nodes read by dlpar_configure_connector() the NUMA
+* node id the added CPU belongs to.
+*/
+   nid = of_node_to_nid(np);
+   if (nid < 0 || !node_possible(nid))
+   nid = first_online_node;
+
nthreads = len / sizeof(u32);
-   for (i = 0; i < nthreads; i++)
-   cpumask_set_cpu(i, tmp);
+   for (cpu = 0; cpu < nthreads; cpu++)
+   cpumask_set_cpu(cpu, tmp);
 
cpu_maps_update_begin();
 
@@ -173,6 +187,19 @@ static int pseries_add_processor(struct device_node *np)
 
/* Get a bitmap of unoccupied slots. */
cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
+
+   /*
+* Remove free ids previously assigned on the other nodes. We can walk
+* only online nodes because once a node became online it is not turned
+* offlined back.
+*/
+   for_each_online_node(node) {
+   if (node == nid) /* Keep our node's recorded ids */
+

Re: [PATCH v4 19/20] mips: Convert to GENERIC_CMDLINE

2021-04-06 Thread Daniel Walker
On Fri, Apr 02, 2021 at 03:18:21PM +, Christophe Leroy wrote:
> -config CMDLINE_BOOL
> - bool "Built-in kernel command line"
> - help
> -   For most systems, it is firmware or second stage bootloader that
> -   by default specifies the kernel command line options.  However,
> -   it might be necessary or advantageous to either override the
> -   default kernel command line or add a few extra options to it.
> -   For such cases, this option allows you to hardcode your own
> -   command line options directly into the kernel.  For that, you
> -   should choose 'Y' here, and fill in the extra boot arguments
> -   in CONFIG_CMDLINE.
> -
> -   The built-in options will be concatenated to the default command
> -   line if CMDLINE_OVERRIDE is set to 'N'. Otherwise, the default
> -   command line will be ignored and replaced by the built-in string.
> -
> -   Most MIPS systems will normally expect 'N' here and rely upon
> -   the command line from the firmware or the second-stage bootloader.
> -


See how you complained that I have CMDLINE_BOOL in my changes, and you think it
shouldn't exist.

Yet here mips has it, and you just deleted it with no feature parity in your
changes for this.

In my changes I tried to maintain as much feature parity as I could with the
architectures. I did the same huge conversion you've done here a long time
ago, to be sure all platforms have the features needed.

Daniel


Re: [PATCH v4 00/20] Implement GENERIC_CMDLINE

2021-04-06 Thread Daniel Walker
On Fri, Apr 02, 2021 at 03:18:01PM +, Christophe Leroy wrote:
> The purpose of this series is to improve and enhance the
> handling of kernel boot arguments.
> 
> Current situation is that most if not all architectures are using
> similar options to do some manupulation on command line arguments:
> - Prepend built-in arguments in front of bootloader provided arguments
> - Append built-in arguments after bootloader provided arguments
> - Replace bootloader provided arguments by built-in arguments
> - Use built-in arguments when none is provided by bootloader.
> 
> On some architectures, all the options are possible. On other ones,
> only a subset are available.
> 
> The purpose of this series is to refactor and enhance the
> handling of kernel boot arguments so that every architecture can
> benefit from all possibilities.
> 
> It is first focussed on powerpc but also extends the capability
> for other arches.
> 
> The work has been focussed on minimising the churn in architectures
> by keeping the most commonly used namings.
> 
> Main changes in V4:
> - Included patch from Daniel to replace powerpc's strcpy() by strlcpy()
> - Using strlcpy() instead of zeroing first char + strlcat() (idea taken frm 
> Daniel's series)
> - Reworked the convertion of EFI which was wrong in V3
> - Added "too long" command line handling
> - Changed cmdline macro into a function
> - Done a few fixes in arch (NIOS2, SH, ARM)
> - Taken comments into account (see individual responses for details)
> - Tested on powerpc, build tested on ARM64, X86_64.
> 

Why submit your changes? My changes have been around for almost 10 years, and
are more widely used. Your changes are very new and unstable, but don't really
solve the needs of people using my series.

I've tried to work with you and I take comments from you, but yet you insist
on submitting your own series.

I would suggest this isn't going to go anyplace unless we work together.

I can't really support your changes because, honestly, your changes are really
ugly and they just look more and more like my changes with every passing
iteration .. As the maturity of your changes continues, they will just become my
change set.

I've been thru every iteration of these changes, and I see those attempts in
your changes. Everything different in your changes I've tried, and found not to
be useful, then it falls away in later iterations.

When you give me comments on something which I haven't tried I typically
incorporate it.

Daniel


Re: [PATCH v6 1/9] locking/qspinlock: Add ARCH_USE_QUEUED_SPINLOCKS_XCHG32

2021-04-06 Thread Boqun Feng
Hi,

On Wed, Mar 31, 2021 at 02:30:32PM +, guo...@kernel.org wrote:
> From: Guo Ren 
> 
> Some architectures don't have sub-word swap atomic instruction,
> they only have the full word's one.
> 
> The sub-word swap only improve the performance when:
> NR_CPUS < 16K
>  *  0- 7: locked byte
>  * 8: pending
>  *  9-15: not used
>  * 16-17: tail index
>  * 18-31: tail cpu (+1)
> 
> The 9-15 bits are wasted to use xchg16 in xchg_tail.
> 
> Please let architecture select xchg16/xchg32 to implement
> xchg_tail.
> 

If the architecture doesn't have a sub-word swap atomic, won't it generate
the same/similar code no matter which version of xchg_tail() is used? That
is, even with CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32=y, xchg_tail() acts
similarly to an xchg16() implemented by cmpxchg(), which means we still
don't have a forward progress guarantee. So this configuration doesn't
solve the problem.
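
For illustration, a generic sketch (hypothetical helper, not from the patch)
of a 16-bit exchange emulated with a full-word cmpxchg() loop; the retry loop
is exactly where the forward progress guarantee is lost, since other CPUs can
keep modifying the word and make the cmpxchg() fail indefinitely:

static inline u16 emulated_xchg16(u32 *word, u16 newval, int shift)
{
        u32 old, tmp, seen;
        u32 mask = 0xffffU << shift;

        old = READ_ONCE(*word);
        for (;;) {
                tmp = (old & ~mask) | ((u32)newval << shift);
                seen = cmpxchg(word, old, tmp); /* returns the previous value */
                if (seen == old)
                        break;          /* exchange succeeded */
                old = seen;             /* another CPU changed the word, retry */
        }
        return (u16)((old & mask) >> shift);
}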

I think it's OK to introduce this config and not provide xchg16() for
risc-v. But I don't see the point of converting other architectures to
use it.

Regards,
Boqun

> Signed-off-by: Guo Ren 
> Cc: Peter Zijlstra 
> Cc: Will Deacon 
> Cc: Ingo Molnar 
> Cc: Waiman Long 
> Cc: Arnd Bergmann 
> Cc: Anup Patel 
> ---
>  kernel/Kconfig.locks   |  3 +++
>  kernel/locking/qspinlock.c | 46 +-
>  2 files changed, 28 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
> index 3de8fd11873b..d02f1261f73f 100644
> --- a/kernel/Kconfig.locks
> +++ b/kernel/Kconfig.locks
> @@ -239,6 +239,9 @@ config LOCK_SPIN_ON_OWNER
>  config ARCH_USE_QUEUED_SPINLOCKS
>   bool
>  
> +config ARCH_USE_QUEUED_SPINLOCKS_XCHG32
> + bool
> +
>  config QUEUED_SPINLOCKS
>   def_bool y if ARCH_USE_QUEUED_SPINLOCKS
>   depends on SMP
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index cbff6ba53d56..4bfaa969bd15 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -163,26 +163,6 @@ static __always_inline void 
> clear_pending_set_locked(struct qspinlock *lock)
>   WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
>  }
>  
> -/*
> - * xchg_tail - Put in the new queue tail code word & retrieve previous one
> - * @lock : Pointer to queued spinlock structure
> - * @tail : The new queue tail code word
> - * Return: The previous queue tail code word
> - *
> - * xchg(lock, tail), which heads an address dependency
> - *
> - * p,*,* -> n,*,* ; prev = xchg(lock, node)
> - */
> -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> -{
> - /*
> -  * We can use relaxed semantics since the caller ensures that the
> -  * MCS node is properly initialized before updating the tail.
> -  */
> - return (u32)xchg_relaxed(&lock->tail,
> -  tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> -}
> -
>  #else /* _Q_PENDING_BITS == 8 */
>  
>  /**
> @@ -206,6 +186,30 @@ static __always_inline void 
> clear_pending_set_locked(struct qspinlock *lock)
>  {
>   atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
>  }
> +#endif /* _Q_PENDING_BITS == 8 */
> +
> +#if _Q_PENDING_BITS == 8 && !defined(CONFIG_ARCH_USE_QUEUED_SPINLOCKS_XCHG32)
> +/*
> + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> + * @lock : Pointer to queued spinlock structure
> + * @tail : The new queue tail code word
> + * Return: The previous queue tail code word
> + *
> + * xchg(lock, tail), which heads an address dependency
> + *
> + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> + */
> +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
> +{
> + /*
> +  * We can use relaxed semantics since the caller ensures that the
> +  * MCS node is properly initialized before updating the tail.
> +  */
> + return (u32)xchg_relaxed(&lock->tail,
> +  tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
> +}
> +
> +#else
>  
>  /**
>   * xchg_tail - Put in the new queue tail code word & retrieve previous one
> @@ -236,7 +240,7 @@ static __always_inline u32 xchg_tail(struct qspinlock 
> *lock, u32 tail)
>   }
>   return old;
>  }
> -#endif /* _Q_PENDING_BITS == 8 */
> +#endif
>  
>  /**
>   * queued_fetch_set_pending_acquire - fetch the whole lock value and set 
> pending
> -- 
> 2.17.1
> 


Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Luis Chamberlain
On Tue, Apr 06, 2021 at 04:31:58PM +0300, Andy Shevchenko wrote:
> diff --git a/include/linux/panic_notifier.h b/include/linux/panic_notifier.h
> new file mode 100644
> index ..41e32483d7a7
> --- /dev/null
> +++ b/include/linux/panic_notifier.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_PANIC_NOTIFIERS_H
> +#define _LINUX_PANIC_NOTIFIERS_H
> +
> +#include 
> +#include 
> +
> +extern struct atomic_notifier_head panic_notifier_list;
> +
> +extern bool crash_kexec_post_notifiers;
> +
> +#endif   /* _LINUX_PANIC_NOTIFIERS_H */

Why is it worth it to add another file just for this? Seems like a very
small file.

  Luis


Re: [PATCH 6/8] drivers: firmware: efi: libstub: enable generic commandline

2021-04-06 Thread Daniel Walker
On Fri, Apr 02, 2021 at 07:36:53PM +0200, Christophe Leroy wrote:
> 
> 
> Le 30/03/2021 à 19:57, Daniel Walker a écrit :
> > This adds code to handle the generic command line changes.
> > The efi code appears that it doesn't benefit as much from this design
> > as it could.
> > 
> > For example, if you had a prepend command line with "nokaslr" then
> > you might be helpful to re-enable it in the boot loader or dts,
> > but there appears to be no way to re-enable kaslr or some of the
> > other options.
> > 
> > Cc: xe-linux-exter...@cisco.com
> > Signed-off-by: Daniel Walker 
> > ---
> >   .../firmware/efi/libstub/efi-stub-helper.c| 35 +++
> >   drivers/firmware/efi/libstub/efi-stub.c   |  7 
> >   drivers/firmware/efi/libstub/efistub.h|  1 +
> >   drivers/firmware/efi/libstub/x86-stub.c   | 13 +--
> >   4 files changed, 54 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c 
> > b/drivers/firmware/efi/libstub/efi-stub-helper.c
> > index aa8da0a49829..c155837cedc9 100644
> > --- a/drivers/firmware/efi/libstub/efi-stub-helper.c
> > +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
> > @@ -13,6 +13,7 @@
> >   #include 
> >   #include 
> >   #include  /* For CONSOLE_LOGLEVEL_* */
> > +#include 
> >   #include 
> >   #include 
> > @@ -172,6 +173,40 @@ int efi_printk(const char *fmt, ...)
> > return printed;
> >   }
> > +/**
> > + * efi_handle_cmdline() - handle adding in building parts of the command 
> > line
> > + * @cmdline:   kernel command line
> > + *
> > + * Add in the generic parts of the commandline and start the parsing of the
> > + * command line.
> > + *
> > + * Return: status code
> > + */
> > +efi_status_t efi_handle_cmdline(char const *cmdline)
> > +{
> > +   efi_status_t status;
> > +
> > +   status = efi_parse_options(CMDLINE_PREPEND);
> > +   if (status != EFI_SUCCESS) {
> > +   efi_err("Failed to parse options\n");
> > +   return status;
> > +   }
> > +
> > +   status = efi_parse_options(IS_ENABLED(CONFIG_CMDLINE_OVERRIDE) ? "" : 
> > cmdline);
> > +   if (status != EFI_SUCCESS) {
> > +   efi_err("Failed to parse options\n");
> > +   return status;
> > +   }
> > +
> > +   status = efi_parse_options(CMDLINE_APPEND);
> > +   if (status != EFI_SUCCESS) {
> > +   efi_err("Failed to parse options\n");
> > +   return status;
> > +   }
> > +
> > +   return EFI_SUCCESS;
> > +}
> 
> I think we can refactor to first build the final command line, then call
> efi_parse_options() only once after that.
 
I tried this, like what you did in your v4 .. The issues are similar to the
prom_init.c problems. The environment is delicate and requires careful
programming to get it done correctly.
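
For reference, a rough sketch of the "build the final command line first,
parse once" shape being discussed (the helper and buffer names are
hypothetical, and it deliberately glosses over the EFI stub allocation and
string-helper constraints mentioned above):

static efi_status_t example_handle_cmdline_once(const char *bootloader_cmdline)
{
        static char final[COMMAND_LINE_SIZE];

        strlcpy(final, CMDLINE_PREPEND, sizeof(final));
        if (!IS_ENABLED(CONFIG_CMDLINE_OVERRIDE)) {
                strlcat(final, " ", sizeof(final));
                strlcat(final, bootloader_cmdline, sizeof(final));
        }
        strlcat(final, " ", sizeof(final));
        strlcat(final, CMDLINE_APPEND, sizeof(final));

        /* Single parse of the already-combined string. */
        return efi_parse_options(final);
}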

> The big advantage of GENERIC_CMDLINE should be to not address anymore
> CONFIG_CMDLINE_XXX options at all outside of linux/cmdline.h
 
I agree, but I've found that it's not likely to get this all changed in a
single series.

Daniel


Re: [PATCH 4/8] CMDLINE: powerpc: convert to generic builtin command line

2021-04-06 Thread Daniel Walker
On Fri, Apr 02, 2021 at 07:34:19PM +0200, Christophe Leroy wrote:
> 
> 
> Le 30/03/2021 à 19:56, Daniel Walker a écrit :
> > This updates the powerpc code to use the CONFIG_GENERIC_CMDLINE
> > option.
> > 
> > This includes a scripted mass convert of the config files to use
> > the new generic cmdline. There is a bit of a trim effect here.
> > It would seems that some of the config haven't been trimmed in
> > a while.
> 
> Sorry, this patch is not acceptable as is, the default for powerpc is
> CMDLINE_FROM_BOOTLOADER, ie builtin-cmdline is taken if and only if none is
> provided by the bootloader.
> 
> As far as I understand, that disappear with this patch.

We've talked about it previously. Maybe you're not understanding the precedence of
the command line options. I tried to explain that one before.

What problems do you think are caused if this patch is applied ?

Daniel


Re: [PATCH 2/8] CMDLINE: drivers: of: ifdef out cmdline section

2021-04-06 Thread Daniel Walker
On Fri, Apr 02, 2021 at 07:32:08PM +0200, Christophe Leroy wrote:
> 
> 
> Le 30/03/2021 à 19:56, Daniel Walker a écrit :
> > It looks like there's some seepage of cmdline stuff into
> > the generic device tree code. This conflicts with the
> > generic cmdline implementation so I remove it in the case
> > when that's enabled.
> > 
> > Cc: xe-linux-exter...@cisco.com
> > Signed-off-by: Ruslan Ruslichenko 
> > Signed-off-by: Daniel Walker 
> > ---
> >   drivers/of/fdt.c | 14 ++
> >   1 file changed, 14 insertions(+)
> > 
> > diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
> > index dcc1dd96911a..d8805cd9717a 100644
> > --- a/drivers/of/fdt.c
> > +++ b/drivers/of/fdt.c
> > @@ -25,6 +25,7 @@
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   #include   /* for COMMAND_LINE_SIZE */
> >   #include 
> > @@ -1050,6 +1051,18 @@ int __init early_init_dt_scan_chosen(unsigned long 
> > node, const char *uname,
> > /* Retrieve command line */
> > p = of_get_flat_dt_prop(node, "bootargs", &l);
> > +
> > +#if defined(CONFIG_GENERIC_CMDLINE) && defined(CONFIG_GENERIC_CMDLINE_OF)
> > +   /*
> > +* The builtin command line will be added here, or it can override
> > +* with the DT bootargs.
> > +*/
> > +   cmdline_add_builtin(data,
> > +   (l > 0 ? p : NULL), /* This is sanity checking */
> > +   COMMAND_LINE_SIZE);
> > +#elif defined(CONFIG_GENERIC_CMDLINE)
> > +   strlcpy(data, p, COMMAND_LINE_SIZE);
> > +#else
> 
> Ugly.
> 
> Linux codying style recommend to limit the use of #ifdefs to headers as much 
> as possible.
> 
> Why do we need so many alternatives ? Allthough they are temporary, can we
> order the changes in another way to reduce that ?

I think this whole section can be removed, down to even all the CMDLINE ifdefs ..
The only architecture which needs this is powerpc because it calls this function
three times.

If powerpc were made to call this only once, and then call the generic handling
for the command line, then this whole section would get removed.

Daniel


Re: [PATCH v2 3/7] powerpc: convert config files to generic cmdline

2021-04-06 Thread Daniel Walker
On Thu, Apr 01, 2021 at 03:08:04PM -0500, Rob Herring wrote:
> On Tue, Mar 30, 2021 at 6:31 PM Daniel Walker  wrote:
> >
> > On Tue, Mar 30, 2021 at 03:13:04PM -0500, Rob Herring wrote:
> > > On Tue, Mar 30, 2021 at 12:33 PM Daniel Walker  wrote:
> > > >
> > > > On Thu, Mar 25, 2021 at 05:29:44PM -0600, Rob Herring wrote:
> > > > > On Thu, Mar 25, 2021 at 2:00 PM Daniel Walker  
> > > > > wrote:
> > > > > >
> > > > > > On Thu, Mar 25, 2021 at 01:03:55PM +0100, Christophe Leroy wrote:
> > > > > > >
> > > > > > > Ok, so you agree we don't need to provide two CMDLINE, one to be 
> > > > > > > appended and one to be prepended.
> > > > > > >
> > > > > > > Let's only provide once CMDLINE as of today, and ask the user to 
> > > > > > > select
> > > > > > > whether he wants it appended or prepended or replacee. Then no 
> > > > > > > need to
> > > > > > > change all existing config to rename CONFIG_CMDLINE into either 
> > > > > > > of the new
> > > > > > > ones.
> > > > > > >
> > > > > > > That's the main difference between my series and Daniel's series. 
> > > > > > > So I'll
> > > > > > > finish taking Will's comment into account and we'll send out a v3 
> > > > > > > soon.
> > > > > >
> > > > > > It doesn't solve the needs of Cisco, I've stated many times your 
> > > > > > changes have
> > > > > > little value. Please stop submitting them.
> > > > >
> > > > > Can you please outline what those needs are which aren't met?
> > > >
> > > > append AND prepend at the same time on all architectures. Christophe 
> > > > doesn't
> > > > understand the need, and hence tries to minimize the feature set which 
> > > > is
> > > > incompatible with Cisco needs and all the other out of tree users.
> > >
> > > Okay, but that's never been a feature in upstream. For upstream, we
> > > refactor first and add features 2nd. In this case, the difference is
> > > largely the kconfig and it would be better to not change the options
> > > twice, but that's not a blocker for taking the refactoring. You won't
> > > find a maintainer that's going to take adding a feature over cleanups
> > > and unification.
> >
> > It kind of is a feature in upstream, it's a matter of opinion. Some platform
> > used append and some use prepend, and it's likely because the maintainers 
> > needed
> > one or the other for development.
> 
> Which arch/platform upstream does both prepend and append at the same time?
 
None do it at the same time, however x86 and mips have switched between the 
two. 

> > I'm not sure why you think I can't add the features in one go. It would be
> > horrid to take Christophe's changes, then have to do basically all the same 
> > work
> > a second time which is what Christophe's changes would force me to do.
> 
> I didn't say it couldn't be done. In fact, I said it would be better
> all at once: "it would be better to not change the options twice"
> 
> But both of you ignoring comments and continuing to post competing
> series is not going to get us there. TBC, I think Christophe's series
> is much closer to being in shape to merge upstream.
 
I'm not the one ignoring comments .. I've taken a number of comments from
Christophe, but he still submits his own series..

Christophe's series doesn't look good to me.. I suspect you like it because it
deletes lines from of.

> > Say for example I implement this change only on one architecture. In that 
> > case
> > the maintainer would be accepting a feature enhancement , but there would 
> > be no
> > stopping it. I shouldn't have to go two strokes on one architecture, but 
> > each
> > change I'm making is essentially a single architecture. They can go in all
> > together or one at a time.
> 
> Features do get implemented all the time on one arch. And then maybe a
> 2nd and 3rd. At some point we decide no more copying, it needs to be
> common and refactored. We're at that point for cmdline handling IMO.

I don't think it can be done with one series all at once ..

Daniel


Re: [PATCH] powerpc/perf: prevent mixed EBB and non-EBB events

2021-04-06 Thread Athira Rajeev



> On 05-Mar-2021, at 11:20 AM, Athira Rajeev  
> wrote:
> 
> 
> 
>> On 24-Feb-2021, at 5:51 PM, Thadeu Lima de Souza Cascardo 
>>  wrote:
>> 
>> EBB events must be under exclusive groups, so there is no mix of EBB and
>> non-EBB events on the same PMU. This requirement worked fine as perf core
>> would not allow other pinned events to be scheduled together with exclusive
>> events.
>> 
>> This assumption was broken by commit 1908dc911792 ("perf: Tweak
>> perf_event_attr::exclusive semantics").
>> 
>> After that, the test cpu_event_pinned_vs_ebb_test started succeeding after
>> read_events, but worse, the task would not have given access to PMC1, so
>> when it tried to write to it, it was killed with "illegal instruction".
>> 
>> Preventing mixed EBB and non-EBB events from being add to the same PMU will
>> just revert to the previous behavior and the test will succeed.
> 
> 
> Hi,
> 
> Thanks for checking this. I checked your patch which is fixing 
> “check_excludes” to make
> sure all events must agree on EBB. But in the PMU group constraints, we 
> already have check for
> EBB events. This is in arch/powerpc/perf/isa207-common.c ( 
> isa207_get_constraint function ).
> 
> <<>>
> mask  |= CNST_EBB_VAL(ebb);
> value |= CNST_EBB_MASK;
> <<>>
> 
> But the above setting for mask and value is interchanged. We actually need to 
> fix here.
> 

Hi,

I have sent a patch for fixing this EBB mask/value setting.
This is the link to patch:

powerpc/perf: Fix PMU constraint check for EBB events
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=237669

Thanks
Athira

> Below patch should fix this:
> 
> diff --git a/arch/powerpc/perf/isa207-common.c 
> b/arch/powerpc/perf/isa207-common.c
> index e4f577da33d8..8b5eeb6fb2fb 100644
> --- a/arch/powerpc/perf/isa207-common.c
> +++ b/arch/powerpc/perf/isa207-common.c
> @@ -447,8 +447,8 @@ int isa207_get_constraint(u64 event, unsigned long 
> *maskp, unsigned long *valp,
> * EBB events are pinned & exclusive, so this should never actually
> * hit, but we leave it as a fallback in case.
> */
> -   mask  |= CNST_EBB_VAL(ebb);
> -   value |= CNST_EBB_MASK;
> +   mask  |= CNST_EBB_MASK;
> +   value |= CNST_EBB_VAL(ebb);
> 
>*maskp = mask;
>*valp = value;
> 
> 
> Can you please try with this patch.
> 
> Thanks
> Athira
> 
> 
>> 
>> Fixes: 1908dc911792 (perf: Tweak perf_event_attr::exclusive semantics)
>> Signed-off-by: Thadeu Lima de Souza Cascardo 
>> ---
>> arch/powerpc/perf/core-book3s.c | 20 
>> 1 file changed, 16 insertions(+), 4 deletions(-)
>> 
>> diff --git a/arch/powerpc/perf/core-book3s.c 
>> b/arch/powerpc/perf/core-book3s.c
>> index 43599e671d38..d767f7944f85 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -1010,9 +1010,25 @@ static int check_excludes(struct perf_event **ctrs, 
>> unsigned int cflags[],
>>int n_prev, int n_new)
>> {
>>  int eu = 0, ek = 0, eh = 0;
>> +bool ebb = false;
>>  int i, n, first;
>>  struct perf_event *event;
>> 
>> +n = n_prev + n_new;
>> +if (n <= 1)
>> +return 0;
>> +
>> +first = 1;
>> +for (i = 0; i < n; ++i) {
>> +event = ctrs[i];
>> +if (first) {
>> +ebb = is_ebb_event(event);
>> +first = 0;
>> +} else if (is_ebb_event(event) != ebb) {
>> +return -EAGAIN;
>> +}
>> +}
>> +
>>  /*
>>   * If the PMU we're on supports per event exclude settings then we
>>   * don't need to do any of this logic. NB. This assumes no PMU has both
>> @@ -1021,10 +1037,6 @@ static int check_excludes(struct perf_event **ctrs, 
>> unsigned int cflags[],
>>  if (ppmu->flags & PPMU_ARCH_207S)
>>  return 0;
>> 
>> -n = n_prev + n_new;
>> -if (n <= 1)
>> -return 0;
>> -
>>  first = 1;
>>  for (i = 0; i < n; ++i) {
>>  if (cflags[i] & PPMU_LIMITED_PMC_OK) {
>> -- 
>> 2.27.0



[PATCH] powerpc/perf: Fix PMU constraint check for EBB events

2021-04-06 Thread Athira Rajeev
The power PMU group constraints include a check for EBB events
to make sure all events in a group agree on EBB. This
prevents scheduling EBB and non-EBB events together.
But in the existing check, the settings for constraint mask and
value are interchanged. This patch fixes that.

Before the patch, the PMU selftest "cpu_event_pinned_vs_ebb_test"
fails with the below in the dmesg logs. This happens because an EBB event
gets enabled along with a non-EBB cpu event.

<<>>
[35600.453346] cpu_event_pinne[41326]: illegal instruction (4)
at 10004a18 nip 10004a18 lr 100049f8 code 1 in
cpu_event_pinned_vs_ebb_test[1000+1]
<<>>

Test results after the patch:

 ./pmu/ebb/cpu_event_pinned_vs_ebb_test
test: cpu_event_pinned_vs_ebb
tags: git_version:v5.12-rc5-93-gf28c3125acd3-dirty
Binding to cpu 8
EBB Handler is at 0x100050c8
read error on event 0x7fffe6bd4040!
PM_RUN_INST_CMPL: result 9872 running/enabled 37930432
success: cpu_event_pinned_vs_ebb

Fixes: 4df489991182 ("powerpc/perf: Add power8 EBB support")
Reported-by: Thadeu Lima de Souza Cascardo 
Signed-off-by: Athira Rajeev 
---
 arch/powerpc/perf/isa207-common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/isa207-common.c 
b/arch/powerpc/perf/isa207-common.c
index e4f577da33d8..8b5eeb6fb2fb 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -447,8 +447,8 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, 
unsigned long *valp,
 * EBB events are pinned & exclusive, so this should never actually
 * hit, but we leave it as a fallback in case.
 */
-   mask  |= CNST_EBB_VAL(ebb);
-   value |= CNST_EBB_MASK;
+   mask  |= CNST_EBB_MASK;
+   value |= CNST_EBB_VAL(ebb);
 
*maskp = mask;
*valp = value;
-- 
1.8.3.1



Re: [PATCH] swim3: support highmem

2021-04-06 Thread Jens Axboe
On 4/6/21 12:18 AM, Christoph Hellwig wrote:
> swim3 only uses the virtual address of a bio to stash it into the data
> transfer using virt_to_bus.  But the ppc32 virt_to_bus just uses the
> physical address with an offset.  Replace virt_to_bus with a local hack
> that performs the equivalent transformation and stop asking for block
> layer bounce buffering.

Applied, thanks.

-- 
Jens Axboe



Re: [PATCH printk v2 2/5] printk: remove safe buffers

2021-04-06 Thread Petr Mladek
On Fri 2021-04-02 11:14:18, Sergey Senozhatsky wrote:
> On (21/04/01 16:17), Petr Mladek wrote:
> > > For the long term, we should introduce a printk-context API that allows
> > > callers to perfectly pack their multi-line output into a single
> > > entry. We discussed [0][1] this back in August 2020.
> > 
> > We need a "short" term solution. There are currently 3 solutions:
> > 
> > 1. Keep nmi_safe() and all the hacks around.
> > 
> > 2. Serialize nmi_cpu_backtrace() by a spin lock and later by
> >the special lock used also by atomic consoles.
> > 
> > 3. Tell complaining people how to sort the messed logs.
> 
> Are we talking about nmi_cpu_backtrace()->dump_stack() or some
> other path?

It is about serializing

if (regs)
show_regs(regs);
else
dump_stack();

in nmi_cpu_backtrace() when it is triggered on many(all) CPUs
at the same time.


> dump_stack() seems to be already serialized by `dump_lock`. Hmm,
> show_regs() is not serialized, seems like it should be under the
> same `dump_lock` as dump_stack().

Ah, I think that you already mentioned it in the past and I forgot it.

Yes, we would need to synchronize all these dump/show functions using
the same lock. It is already the lock that might be taken recursively
on the same CPU.

In each case, we must not introduce another lock in
nmi_cpu_backtrace() because it might cause deadlock with
@dump_lock.
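
For illustration only, a sketch of the kind of CPU-reentrant serialization
meant here (hypothetical names, not from any posted patch; spinning on the
lock from NMI context is exactly the trade-off of option 2 above):

static DEFINE_RAW_SPINLOCK(dump_serial_lock);
static int dump_serial_owner = -1;

static bool dump_serial_lock_irqsave(unsigned long *flags)
{
        local_irq_save(*flags);
        if (READ_ONCE(dump_serial_owner) == raw_smp_processor_id())
                return false;           /* recursion on this CPU, already held */

        raw_spin_lock(&dump_serial_lock);
        WRITE_ONCE(dump_serial_owner, raw_smp_processor_id());
        return true;                    /* this call took the lock */
}

static void dump_serial_unlock_irqrestore(unsigned long flags, bool locked)
{
        if (locked) {
                WRITE_ONCE(dump_serial_owner, -1);
                raw_spin_unlock(&dump_serial_lock);
        }
        local_irq_restore(flags);
}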

Anyway, I would really like to keep the dumps serialized. So, we
either need to use the same lock everywhere or we need to keep
nmi_safe buffers for now.

I would like to remove the nmi_safe buffers in the long term
but I am fine with doing it later after the consoles rework.
I'll leave the prioritization for John who is doing the work
and might have some preferences.

Best Regards,
Petr


Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Arnd Bergmann
On Tue, Apr 6, 2021 at 3:31 PM Andy Shevchenko
 wrote:
>
> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
>
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.
>
> Signed-off-by: Andy Shevchenko 

Nice!

Acked-by: Arnd Bergmann 


[PATCH] powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC

2021-04-06 Thread Athira Rajeev
Running perf fuzzer showed below in dmesg logs:
"Can't find PMC that caused IRQ"

This means a PMU exception happened, but none of the PMCs (Performance
Monitor Counters) were found to be overflown. There are some corner cases
that clear the PMCs after the PMI gets masked. In such cases, the perf
interrupt handler will not find the active PMC values that had caused
the overflow, and this leads to this message while replaying.

Case 1: PMU Interrupt happens during replay of other interrupts and
counter values gets cleared by PMU callbacks before replay:

During replay of interrupts like timer, __do_irq and doorbell exception, we
conditionally enable interrupts via may_hard_irq_enable(). This could
potentially create a window in which a PMI is generated. Since the irq soft
mask is set to ALL_DISABLED, the PMI will get masked here. IPIs could run
before the perf interrupt is replayed, and the PMU events could be deleted
or stopped. This will change the PMU SPR values and reset the counters.
Snippet of the ftrace log showing the PMU callbacks invoked in "__do_irq":

-0 [051] dns. 132025441306354: __do_irq <-call_do_irq
-0 [051] dns. 132025441306430: irq_enter <-__do_irq
-0 [051] dns. 132025441306503: irq_enter_rcu <-__do_irq
-0 [051] dnH. 132025441306599: xive_get_irq <-__do_irq
<<>>
-0 [051] dnH. 132025441307770: generic_smp_call_function_single_interrupt 
<-smp_ipi_demux_relaxed
-0 [051] dnH. 132025441307839: flush_smp_call_function_queue 
<-smp_ipi_demux_relaxed
-0 [051] dnH. 132025441308057: _raw_spin_lock <-event_function
-0 [051] dnH. 132025441308206: power_pmu_disable <-perf_pmu_disable
-0 [051] dnH. 132025441308337: power_pmu_del <-event_sched_out
-0 [051] dnH. 132025441308407: power_pmu_read <-power_pmu_del
-0 [051] dnH. 132025441308477: read_pmc <-power_pmu_read
-0 [051] dnH. 132025441308590: isa207_disable_pmc <-power_pmu_del
-0 [051] dnH. 132025441308663: write_pmc <-power_pmu_del
-0 [051] dnH. 132025441308787: power_pmu_event_idx 
<-perf_event_update_userpage
-0 [051] dnH. 132025441308859: rcu_read_unlock_strict 
<-perf_event_update_userpage
-0 [051] dnH. 132025441308975: power_pmu_enable <-perf_pmu_enable
<<>>
-0 [051] dnH. 132025441311108: irq_exit <-__do_irq
-0 [051] dns. 132025441311319: performance_monitor_exception 
<-replay_soft_interrupts

Case 2: PMIs masked during local_* operations, for example local_add.
If the local_add operation happens within a local_irq_save region, the
replay of the PMI will happen during local_irq_restore. Similar to case 1,
this could also create a window before replay where PMU events get deleted
or stopped.
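
A minimal illustration of that window (sketch only, not from the patch;
example_counter is hypothetical). On powerpc the local_* op masks PMIs
internally, and because it runs inside a local_irq_save() section the
pending PMI is only replayed at local_irq_restore():

#include <asm/local.h>

static local_t example_counter;

static void example_local_add_window(void)
{
        unsigned long flags;

        local_irq_save(flags);
        /* local_add() masks PMIs internally; a PMI arriving during it is
         * marked pending instead of being taken.
         */
        local_add(1, &example_counter);
        /*
         * Window: before the replay below, the PMU events may be deleted or
         * stopped and the overflown PMC cleared, as described above.
         */
        local_irq_restore(flags);       /* the pending PMI is replayed here */
}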

The patch adds a fix to update the PMU callback functions (del, stop, enable)
to check for a pending perf interrupt. If there is an overflown PMC and a
pending perf interrupt indicated in the paca, clear the PMI bit in the paca
to drop that sample. In the case of power_pmu_del, also clear the MMCR0 PMAO
bit, which otherwise could lead to spurious interrupts in some corner cases.
For example, a timer interrupt after power_pmu_del will re-enable interrupts
since the PMI is cleared, and will trigger a PMI again since the PMAO bit is
still set.

We can't just replay the PMI at any time. Hence this approach is preferred
rather than replaying the PMI before resetting the overflown PMC. The patch
also documents in core-book3s a race condition which can trigger these PMC
messages during the idle path on PowerNV.

Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and 
replay them")
Reported-by: Nageswara R Sastry 
Suggested-by: Nicholas Piggin 
Suggested-by: Madhavan Srinivasan 
Signed-off-by: Athira Rajeev 
---
 arch/powerpc/include/asm/pmc.h  | 11 +
 arch/powerpc/perf/core-book3s.c | 55 +
 2 files changed, 66 insertions(+)

diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h
index c6bbe9778d3c..97b4bd8de25b 100644
--- a/arch/powerpc/include/asm/pmc.h
+++ b/arch/powerpc/include/asm/pmc.h
@@ -34,11 +34,22 @@ static inline void ppc_set_pmu_inuse(int inuse)
 #endif
 }
 
+static inline int clear_paca_irq_pmi(void)
+{
+   if (get_paca()->irq_happened & PACA_IRQ_PMI) {
+   WARN_ON_ONCE(mfmsr() & MSR_EE);
+   get_paca()->irq_happened &= ~PACA_IRQ_PMI;
+   return 1;
+   }
+   return 0;
+}
+
 extern void power4_enable_pmcs(void);
 
 #else /* CONFIG_PPC64 */
 
 static inline void ppc_set_pmu_inuse(int inuse) { }
+static inline int clear_paca_irq_pmi(void) { return 0; }
 
 #endif
 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 766f064f00fb..18ca3c90f866 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -847,6 +847,20 @@ static void write_pmc(int idx, unsigned long val)
}
 }
 
+static int pmc_overflown(int idx)
+{
+   unsigned long val[8];
+   int i;
+
+   for (i = 0; i < ppmu->n_counter; i++)
+   val[i] = read_pmc(i + 1);
+
+   if ((int)val[idx-1] < 0)
+   return 1;
+
+   return 0;
+}
+
 /* Called from 

[PATCH] powerpc/perf: Clear pending PMI in ppmu callbacks

2021-04-06 Thread Athira Rajeev
Running the perf fuzzer testsuite popped up the below message
in the dmesg logs:

"Can't find PMC that caused IRQ"

This means a PMU exception happened, but none of the PMCs (Performance
Monitor Counters) were found to be overflown. The perf interrupt handler
checks the PMCs to see which PMC has overflown, and if none of the PMCs is
overflown (counter value not >= 0x80000000), it throws the warning:
"Can't find PMC that caused IRQ".

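For reference, "overflown" here means the most significant bit of the 32-bit
counter value is set; a minimal sketch of the check the handler performs
(the helper name is made up):

/* A PMC is considered overflown when its MSB is set. */
static inline bool pmc_value_overflown(unsigned long pmc_val)
{
	return (int)pmc_val < 0;	/* i.e. pmc_val >= 0x80000000 */
}
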
Powerpc has the capability to mask and replay a performance monitoring
interrupt (PMI). In the case of a replayed PMI, there are some corner cases
that clear the PMCs after masking. In such cases, the perf interrupt
handler will not find the active PMC values that had caused the overflow,
leading to this message. This patchset attempts to fix those
corner cases.

However, there is one more case on PowerNV where these messages are
emitted: during system-wide profiling, or when a specific CPU is monitored
for an event. That is, when a counter overflows just before entering idle
and a PMI gets triggered after wakeup from idle. Since the PMCs
are not saved in the idle path, the perf interrupt handler will not
find the overflown counter value and emits the "Can't find PMC" messages.
This patch documents this race condition in powerpc core-book3s.

The patch fixes the ppmu callbacks to clear the pending interrupt before
resetting the overflown PMC, and documents the race condition in the idle path.

Athira Rajeev (1):
  powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting
an overflown PMC

 arch/powerpc/include/asm/pmc.h  | 11 +
 arch/powerpc/perf/core-book3s.c | 55 +
 2 files changed, 66 insertions(+)

-- 
1.8.3.1



Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Christian Brauner
On Tue, Apr 06, 2021 at 04:31:58PM +0300, Andy Shevchenko wrote:
> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
> 
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.
> 
> Signed-off-by: Andy Shevchenko 
> ---

(I think David has tried something like this a few years ago too?)
Good idea in any case. (Be good to see kbuild do an allmodconfig build
of this though.)
Acked-by: Christian Brauner 

>  arch/powerpc/kernel/setup-common.c   |  1 +
>  arch/x86/include/asm/desc.h  |  1 +
>  arch/x86/kernel/cpu/mshyperv.c   |  1 +
>  arch/x86/kernel/setup.c  |  1 +
>  drivers/char/ipmi/ipmi_msghandler.c  |  1 +
>  drivers/remoteproc/remoteproc_core.c |  1 +
>  include/asm-generic/bug.h|  3 +-
>  include/linux/kernel.h   | 84 +---
>  include/linux/panic.h| 98 
>  include/linux/panic_notifier.h   | 12 
>  kernel/hung_task.c   |  1 +
>  kernel/kexec_core.c  |  1 +
>  kernel/panic.c   |  1 +
>  kernel/rcu/tree.c|  2 +
>  kernel/sysctl.c  |  1 +
>  kernel/trace/trace.c |  1 +
>  16 files changed, 126 insertions(+), 84 deletions(-)
>  create mode 100644 include/linux/panic.h
>  create mode 100644 include/linux/panic_notifier.h
> 
> diff --git a/arch/powerpc/kernel/setup-common.c 
> b/arch/powerpc/kernel/setup-common.c
> index 74a98fff2c2f..046fe21b5c3b 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -9,6 +9,7 @@
>  #undef DEBUG
>  
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index 476082a83d1c..ceb12683b6d1 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -9,6 +9,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  #include 
>  
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 22f13343b5da..9e5c6f2b044d 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -17,6 +17,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 59e5e0903b0c..570699eecf90 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
> b/drivers/char/ipmi/ipmi_msghandler.c
> index 8a0e97b33cae..e96cb5c4f97a 100644
> --- a/drivers/char/ipmi/ipmi_msghandler.c
> +++ b/drivers/char/ipmi/ipmi_msghandler.c
> @@ -16,6 +16,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/drivers/remoteproc/remoteproc_core.c 
> b/drivers/remoteproc/remoteproc_core.c
> index 626a6b90fba2..76dd8e2b1e7e 100644
> --- a/drivers/remoteproc/remoteproc_core.c
> +++ b/drivers/remoteproc/remoteproc_core.c
> @@ -20,6 +20,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
> index 76a10e0dca9f..719410b93f99 100644
> --- a/include/asm-generic/bug.h
> +++ b/include/asm-generic/bug.h
> @@ -17,7 +17,8 @@
>  #endif
>  
>  #ifndef __ASSEMBLY__
> -#include 
> +#include 
> +#include 
>  
>  #ifdef CONFIG_BUG
>  
> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index 09035ac67d4b..6c5a05ac1ecb 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -70,7 +71,6 @@
>  #define lower_32_bits(n) ((u32)((n) & 0x))
>  
>  struct completion;
> -struct pt_regs;
>  struct user;
>  
>  #ifdef CONFIG_PREEMPT_VOLUNTARY
> @@ -175,14 +175,6 @@ void __might_fault(const char *file, int line);
>  static inline void might_fault(void) { }
>  #endif
>  
> -extern struct atomic_notifier_head panic_notifier_list;
> -extern long (*panic_blink)(int state);
> -__printf(1, 2)
> -void panic(const char *fmt, ...) __noreturn __cold;
> -void nmi_panic(struct pt_regs *regs, const char *msg);
> -extern void oops_enter(void);
> -extern void oops_exit(void);
> -extern bool oops_may_print(void);
>  void do_exit(long error_code) __noreturn;
>  void complete_and_exit(struct completion *, long) __noreturn;
>  
> @@ -368,52 +360,8 @@ extern int __kernel_text_address(unsigned long addr);
>  extern int kernel_text_address(unsigned long addr);
>  extern int func_ptr_is_kernel_text(void *ptr);
>  
> 

Re: [PATCH] powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC

2021-04-06 Thread Athira Rajeev
Hi,

The cover letter is missing for this patch. I will resend the patch along with the
cover letter.
Sorry for the noise.

Thanks,
Athira
> On 06-Apr-2021, at 7:44 PM, Athira Rajeev  wrote:
> 
> Running perf fuzzer showed below in dmesg logs:
> "Can't find PMC that caused IRQ"
> 
> This means a PMU exception happened, but none of the PMC's (Performance
> Monitor Counter) were found to be overflown. There are some corner cases
> that clears the PMCs after PMI gets masked. In such cases, the perf
> interrupt handler will not find the active PMC values that had caused
> the overflow and thus leads to this message while replaying.
> 
> Case 1: PMU Interrupt happens during replay of other interrupts and
> counter values gets cleared by PMU callbacks before replay:
> 
> During replay of interrupts like timer, __do_irq and doorbell exception, we
> conditionally enable interrupts via may_hard_irq_enable(). This could
> potentially create a window to generate a PMI. Since irq soft mask is set
> to ALL_DISABLED, the PMI will get masked here. We could get IPIs run before
> perf interrupt is replayed and the PMU events could deleted or stopped.
> This will change the PMU SPR values and resets the counters. Snippet of
> ftrace log showing PMU callbacks invoked in "__do_irq":
> 
> -0 [051] dns. 132025441306354: __do_irq <-call_do_irq
> -0 [051] dns. 132025441306430: irq_enter <-__do_irq
> -0 [051] dns. 132025441306503: irq_enter_rcu <-__do_irq
> -0 [051] dnH. 132025441306599: xive_get_irq <-__do_irq
> <<>>
> -0 [051] dnH. 132025441307770: 
> generic_smp_call_function_single_interrupt <-smp_ipi_demux_relaxed
> -0 [051] dnH. 132025441307839: flush_smp_call_function_queue 
> <-smp_ipi_demux_relaxed
> -0 [051] dnH. 132025441308057: _raw_spin_lock <-event_function
> -0 [051] dnH. 132025441308206: power_pmu_disable <-perf_pmu_disable
> -0 [051] dnH. 132025441308337: power_pmu_del <-event_sched_out
> -0 [051] dnH. 132025441308407: power_pmu_read <-power_pmu_del
> -0 [051] dnH. 132025441308477: read_pmc <-power_pmu_read
> -0 [051] dnH. 132025441308590: isa207_disable_pmc <-power_pmu_del
> -0 [051] dnH. 132025441308663: write_pmc <-power_pmu_del
> -0 [051] dnH. 132025441308787: power_pmu_event_idx 
> <-perf_event_update_userpage
> -0 [051] dnH. 132025441308859: rcu_read_unlock_strict 
> <-perf_event_update_userpage
> -0 [051] dnH. 132025441308975: power_pmu_enable <-perf_pmu_enable
> <<>>
> -0 [051] dnH. 132025441311108: irq_exit <-__do_irq
> -0 [051] dns. 132025441311319: performance_monitor_exception 
> <-replay_soft_interrupts
> 
> Case 2: PMI's masked during local_* operations, example local_add.
> If the local_add operation happens within a local_irq_save, replay of
> PMI will be during local_irq_restore. Similar to case 1, this could
> also create a window before replay where PMU events gets deleted or
> stopped.
> 
> Patch adds a fix to update the PMU callback functions (del,stop,enable) to
> check for pending perf interrupt. If there is an overflown PMC and pending
> perf interrupt indicated in Paca, clear the PMI bit in paca to drop that
> sample. In case of power_pmu_del, also clear the MMCR0 PMAO bit which
> otherwise could lead to spurious interrupts in some corner cases. Example,
> a timer after power_pmu_del which will re-enable interrupts since PMI is
> cleared and triggers a PMI again since PMAO bit is still set.
> 
> We can't just replay PMI any time. Hence this approach is preferred rather
> than replaying PMI before resetting overflown PMC. Patch also documents
> core-book3s on a race condition which can trigger these PMC messages during
> idle path in PowerNV.
> 
> Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and 
> replay them")
> Reported-by: Nageswara R Sastry 
> Suggested-by: Nicholas Piggin 
> Suggested-by: Madhavan Srinivasan 
> Signed-off-by: Athira Rajeev 
> ---
> arch/powerpc/include/asm/pmc.h  | 11 +
> arch/powerpc/perf/core-book3s.c | 55 +
> 2 files changed, 66 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h
> index c6bbe9778d3c..97b4bd8de25b 100644
> --- a/arch/powerpc/include/asm/pmc.h
> +++ b/arch/powerpc/include/asm/pmc.h
> @@ -34,11 +34,22 @@ static inline void ppc_set_pmu_inuse(int inuse)
> #endif
> }
> 
> +static inline int clear_paca_irq_pmi(void)
> +{
> + if (get_paca()->irq_happened & PACA_IRQ_PMI) {
> + WARN_ON_ONCE(mfmsr() & MSR_EE);
> + get_paca()->irq_happened &= ~PACA_IRQ_PMI;
> + return 1;
> + }
> + return 0;
> +}
> +
> extern void power4_enable_pmcs(void);
> 
> #else /* CONFIG_PPC64 */
> 
> static inline void ppc_set_pmu_inuse(int inuse) { }
> +static inline int clear_paca_irq_pmi(void) { return 0; }
> 
> #endif
> 
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index 766f064f00fb..18ca3c90f866 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ 

Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Mike Rapoport
On Tue, Apr 06, 2021 at 04:31:58PM +0300, Andy Shevchenko wrote:
> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
> 
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.
> 
> Signed-off-by: Andy Shevchenko 

Acked-by: Mike Rapoport 

> ---
>  arch/powerpc/kernel/setup-common.c   |  1 +
>  arch/x86/include/asm/desc.h  |  1 +
>  arch/x86/kernel/cpu/mshyperv.c   |  1 +
>  arch/x86/kernel/setup.c  |  1 +
>  drivers/char/ipmi/ipmi_msghandler.c  |  1 +
>  drivers/remoteproc/remoteproc_core.c |  1 +
>  include/asm-generic/bug.h|  3 +-
>  include/linux/kernel.h   | 84 +---
>  include/linux/panic.h| 98 
>  include/linux/panic_notifier.h   | 12 
>  kernel/hung_task.c   |  1 +
>  kernel/kexec_core.c  |  1 +
>  kernel/panic.c   |  1 +
>  kernel/rcu/tree.c|  2 +
>  kernel/sysctl.c  |  1 +
>  kernel/trace/trace.c |  1 +
>  16 files changed, 126 insertions(+), 84 deletions(-)
>  create mode 100644 include/linux/panic.h
>  create mode 100644 include/linux/panic_notifier.h
> 
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index 476082a83d1c..ceb12683b6d1 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -9,6 +9,7 @@
>  #include 
>  #include 
>  
> +#include 

This seems unrelated, but I might be missing something.

>  #include 
>  #include 
>  

-- 
Sincerely yours,
Mike.


Re: [PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Bjorn Andersson
On Tue 06 Apr 08:31 CDT 2021, Andy Shevchenko wrote:

> kernel.h is being used as a dump for all kinds of stuff for a long time.
> Here is the attempt to start cleaning it up by splitting out panic and
> oops helpers.
> 
> At the same time convert users in header and lib folder to use new header.
> Though for time being include new header back to kernel.h to avoid twisted
> indirected includes for existing users.
> 

Reviewed-by: Bjorn Andersson 

Regards,
Bjorn


[PATCH] powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting an overflown PMC

2021-04-06 Thread Athira Rajeev
Running the perf fuzzer showed the following in the dmesg logs:
"Can't find PMC that caused IRQ"

This means a PMU exception happened, but none of the PMCs (Performance
Monitor Counters) were found to be overflown. There are some corner cases
that clear the PMCs after the PMI gets masked. In such cases, the perf
interrupt handler will not find the active PMC values that had caused
the overflow, and thus emits this message while replaying.

Case 1: A PMU interrupt happens during replay of other interrupts, and
the counter values get cleared by PMU callbacks before the replay:

During replay of interrupts like the timer, __do_irq and doorbell exceptions, we
conditionally enable interrupts via may_hard_irq_enable(). This could
potentially create a window in which a PMI is generated. Since the irq soft mask
is set to ALL_DISABLED, the PMI gets masked here. IPIs can then run before the
perf interrupt is replayed, and the PMU events could be deleted or stopped.
This changes the PMU SPR values and resets the counters. Snippet of an
ftrace log showing the PMU callbacks invoked in "__do_irq":

<idle>-0 [051] dns. 132025441306354: __do_irq <-call_do_irq
<idle>-0 [051] dns. 132025441306430: irq_enter <-__do_irq
<idle>-0 [051] dns. 132025441306503: irq_enter_rcu <-__do_irq
<idle>-0 [051] dnH. 132025441306599: xive_get_irq <-__do_irq
<<>>
<idle>-0 [051] dnH. 132025441307770: generic_smp_call_function_single_interrupt <-smp_ipi_demux_relaxed
<idle>-0 [051] dnH. 132025441307839: flush_smp_call_function_queue <-smp_ipi_demux_relaxed
<idle>-0 [051] dnH. 132025441308057: _raw_spin_lock <-event_function
<idle>-0 [051] dnH. 132025441308206: power_pmu_disable <-perf_pmu_disable
<idle>-0 [051] dnH. 132025441308337: power_pmu_del <-event_sched_out
<idle>-0 [051] dnH. 132025441308407: power_pmu_read <-power_pmu_del
<idle>-0 [051] dnH. 132025441308477: read_pmc <-power_pmu_read
<idle>-0 [051] dnH. 132025441308590: isa207_disable_pmc <-power_pmu_del
<idle>-0 [051] dnH. 132025441308663: write_pmc <-power_pmu_del
<idle>-0 [051] dnH. 132025441308787: power_pmu_event_idx <-perf_event_update_userpage
<idle>-0 [051] dnH. 132025441308859: rcu_read_unlock_strict <-perf_event_update_userpage
<idle>-0 [051] dnH. 132025441308975: power_pmu_enable <-perf_pmu_enable
<<>>
<idle>-0 [051] dnH. 132025441311108: irq_exit <-__do_irq
<idle>-0 [051] dns. 132025441311319: performance_monitor_exception <-replay_soft_interrupts

Case 2: PMIs are masked during local_* operations, for example local_add().
If the local_add() operation happens within a local_irq_save() section, the
PMI is replayed during local_irq_restore(). Similar to case 1, this could
also create a window before the replay in which the PMU events get deleted
or stopped.

Patch updates the PMU callback functions (del, stop, enable) to check for a
pending perf interrupt. If there is an overflown PMC and a pending perf
interrupt is indicated in the PACA, clear the PMI bit in the PACA to drop that
sample. In the case of power_pmu_del, also clear the MMCR0 PMAO bit, which
could otherwise lead to spurious interrupts in some corner cases. For example,
a timer interrupt after power_pmu_del will re-enable interrupts since the PMI
is cleared, and will trigger a PMI again since the PMAO bit is still set.

We can't just replay the PMI at any time. Hence this approach is preferred
rather than replaying the PMI before resetting the overflown PMC. The patch
also adds a comment in core-book3s documenting a race condition which can
trigger these PMC messages during the idle path on PowerNV.

Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and replay them")
Reported-by: Nageswara R Sastry 
Suggested-by: Nicholas Piggin 
Suggested-by: Madhavan Srinivasan 
Signed-off-by: Athira Rajeev 
---
 arch/powerpc/include/asm/pmc.h  | 11 +
 arch/powerpc/perf/core-book3s.c | 55 +
 2 files changed, 66 insertions(+)

diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h
index c6bbe9778d3c..97b4bd8de25b 100644
--- a/arch/powerpc/include/asm/pmc.h
+++ b/arch/powerpc/include/asm/pmc.h
@@ -34,11 +34,22 @@ static inline void ppc_set_pmu_inuse(int inuse)
 #endif
 }
 
+static inline int clear_paca_irq_pmi(void)
+{
+   if (get_paca()->irq_happened & PACA_IRQ_PMI) {
+   WARN_ON_ONCE(mfmsr() & MSR_EE);
+   get_paca()->irq_happened &= ~PACA_IRQ_PMI;
+   return 1;
+   }
+   return 0;
+}
+
 extern void power4_enable_pmcs(void);
 
 #else /* CONFIG_PPC64 */
 
 static inline void ppc_set_pmu_inuse(int inuse) { }
+static inline int clear_paca_irq_pmi(void) { return 0; }
 
 #endif
 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 766f064f00fb..18ca3c90f866 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -847,6 +847,20 @@ static void write_pmc(int idx, unsigned long val)
}
 }
 
+static int pmc_overflown(int idx)
+{
+   unsigned long val[8];
+   int i;
+
+   for (i = 0; i < ppmu->n_counter; i++)
+   val[i] = read_pmc(i + 1);
+
+   if ((int)val[idx-1] < 0)
+   return 1;
+
+   return 0;
+}
+
 /* Called from 

[PATCH v1 1/1] kernel.h: Split out panic and oops helpers

2021-04-06 Thread Andy Shevchenko
kernel.h has been used as a dumping ground for all kinds of stuff for a long time.
Here is an attempt to start cleaning it up by splitting out the panic and
oops helpers.

At the same time, convert users in the header and lib folders to use the new header.
Though for the time being, include the new header back into kernel.h to avoid
twisted indirect includes for existing users.
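
The contents of the new headers are truncated by the archive below; as a rough
idea of their shape, based on the declarations being removed from kernel.h, the
panic_notifier.h side would look roughly like this (an illustrative sketch, not
the literal file contents):

/* Sketch of include/linux/panic_notifier.h (illustrative only). */
#ifndef _LINUX_PANIC_NOTIFIER_H
#define _LINUX_PANIC_NOTIFIER_H

#include <linux/notifier.h>

extern struct atomic_notifier_head panic_notifier_list;

#endif	/* _LINUX_PANIC_NOTIFIER_H */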

Signed-off-by: Andy Shevchenko 
---
 arch/powerpc/kernel/setup-common.c   |  1 +
 arch/x86/include/asm/desc.h  |  1 +
 arch/x86/kernel/cpu/mshyperv.c   |  1 +
 arch/x86/kernel/setup.c  |  1 +
 drivers/char/ipmi/ipmi_msghandler.c  |  1 +
 drivers/remoteproc/remoteproc_core.c |  1 +
 include/asm-generic/bug.h|  3 +-
 include/linux/kernel.h   | 84 +---
 include/linux/panic.h| 98 
 include/linux/panic_notifier.h   | 12 
 kernel/hung_task.c   |  1 +
 kernel/kexec_core.c  |  1 +
 kernel/panic.c   |  1 +
 kernel/rcu/tree.c|  2 +
 kernel/sysctl.c  |  1 +
 kernel/trace/trace.c |  1 +
 16 files changed, 126 insertions(+), 84 deletions(-)
 create mode 100644 include/linux/panic.h
 create mode 100644 include/linux/panic_notifier.h

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 74a98fff2c2f..046fe21b5c3b 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -9,6 +9,7 @@
 #undef DEBUG
 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 476082a83d1c..ceb12683b6d1 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -9,6 +9,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 22f13343b5da..9e5c6f2b044d 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 59e5e0903b0c..570699eecf90 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/char/ipmi/ipmi_msghandler.c 
b/drivers/char/ipmi/ipmi_msghandler.c
index 8a0e97b33cae..e96cb5c4f97a 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -16,6 +16,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/remoteproc/remoteproc_core.c 
b/drivers/remoteproc/remoteproc_core.c
index 626a6b90fba2..76dd8e2b1e7e 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 76a10e0dca9f..719410b93f99 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -17,7 +17,8 @@
 #endif
 
 #ifndef __ASSEMBLY__
-#include 
+#include 
+#include 
 
 #ifdef CONFIG_BUG
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 09035ac67d4b..6c5a05ac1ecb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -70,7 +71,6 @@
 #define lower_32_bits(n) ((u32)((n) & 0xffffffff))
 
 struct completion;
-struct pt_regs;
 struct user;
 
 #ifdef CONFIG_PREEMPT_VOLUNTARY
@@ -175,14 +175,6 @@ void __might_fault(const char *file, int line);
 static inline void might_fault(void) { }
 #endif
 
-extern struct atomic_notifier_head panic_notifier_list;
-extern long (*panic_blink)(int state);
-__printf(1, 2)
-void panic(const char *fmt, ...) __noreturn __cold;
-void nmi_panic(struct pt_regs *regs, const char *msg);
-extern void oops_enter(void);
-extern void oops_exit(void);
-extern bool oops_may_print(void);
 void do_exit(long error_code) __noreturn;
 void complete_and_exit(struct completion *, long) __noreturn;
 
@@ -368,52 +360,8 @@ extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
 extern int func_ptr_is_kernel_text(void *ptr);
 
-#ifdef CONFIG_SMP
-extern unsigned int sysctl_oops_all_cpu_backtrace;
-#else
-#define sysctl_oops_all_cpu_backtrace 0
-#endif /* CONFIG_SMP */
-
 extern void bust_spinlocks(int yes);
-extern int panic_timeout;
-extern unsigned long panic_print;
-extern int panic_on_oops;
-extern int panic_on_unrecovered_nmi;
-extern int panic_on_io_nmi;
-extern int panic_on_warn;
-extern unsigned long panic_on_taint;
-extern bool panic_on_taint_nousertaint;
-extern int sysctl_panic_on_rcu_stall;
-extern int sysctl_max_rcu_stall_to_panic;
-extern int sysctl_panic_on_stackoverflow;
-

Re: [PATCH v6 30/48] KVM: PPC: Book3S HV P9: Implement the rest of the P9 path in C

2021-04-06 Thread Nicholas Piggin
Excerpts from Nicholas Piggin's message of April 6, 2021 7:12 pm:
> Excerpts from Paul Mackerras's message of April 6, 2021 5:27 pm:
>> On Mon, Apr 05, 2021 at 11:19:30AM +1000, Nicholas Piggin wrote:
>>> Almost all logic is moved to C, by introducing a new in_guest mode for
>>> the P9 path that branches very early in the KVM interrupt handler to
>>> P9 exit code.
>>> 
>>> The main P9 entry and exit assembly is now only about 160 lines of low
>>> level stack setup and register save/restore, plus a bad-interrupt
>>> handler.
>>> 
>>> There are two motivations for this, the first is just make the code more
>>> maintainable being in C. The second is to reduce the amount of code
>>> running in a special KVM mode, "realmode". In quotes because with radix
>>> it is no longer necessarily real-mode in the MMU, but it still has to be
>>> treated specially because it may be in real-mode, and has various
>>> important registers like PID, DEC, TB, etc set to guest. This is hostile
>>> to the rest of Linux and can't use arbitrary kernel functionality or be
>>> instrumented well.
>>> 
>>> This initial patch is a reasonably faithful conversion of the asm code,
>>> but it does lack any loop to return quickly back into the guest without
>>> switching out of realmode in the case of unimportant or easily handled
>>> interrupts. As explained in previous changes, handling HV interrupts
>>> in real mode is not so important for P9.
>>> 
>>> Use of Linux 64s interrupt entry code register conventions including
>>> paca EX_ save areas are brought into the KVM code. There is no point
>>> shuffling things into different paca save areas and making up a
>>> different calling convention for KVM.
>>> 
>>> Signed-off-by: Nicholas Piggin 
>> 
>> [snip]
>> 
>>> +/*
>>> + * Took an interrupt somewhere right before HRFID to guest, so registers 
>>> are
>>> + * in a bad way. Return things hopefully enough to run host virtual code 
>>> and
>>> + * run the Linux interrupt handler (SRESET or MCE) to print something 
>>> useful.
>>> + *
>>> + * We could be really clever and save all host registers in known locations
>>> + * before setting HSTATE_IN_GUEST, then restoring them all here, and 
>>> setting
>>> + * return address to a fixup that sets them up again. But that's a lot of
>>> + * effort for a small bit of code. Lots of other things to do first.
>>> + */
>>> +kvmppc_p9_bad_interrupt:
>>> +   /*
>>> +* Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
>>> +* MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
>>> +*/
>>> +   li  r10,KVM_GUEST_MODE_NONE
>>> +   stb r10,HSTATE_IN_GUEST(r13)
>>> +   li  r10,MSR_RI
>>> +   andcr12,r12,r10
>>> +
>>> +   /*
>>> +* Clean up guest registers to give host a chance to run.
>>> +*/
>>> +   li  r10,0
>>> +   mtspr   SPRN_AMR,r10
>>> +   mtspr   SPRN_IAMR,r10
>>> +   mtspr   SPRN_CIABR,r10
>>> +   mtspr   SPRN_DAWRX0,r10
>>> +BEGIN_FTR_SECTION
>>> +   mtspr   SPRN_DAWRX1,r10
>>> +END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
>>> +   mtspr   SPRN_PID,r10
>>> +
>>> +   /*
>>> +* Switch to host MMU mode
>>> +*/
>>> +   ld  r10, HSTATE_KVM_VCPU(r13)
>>> +   ld  r10, VCPU_KVM(r10)
>>> +   lwz r10, KVM_HOST_LPID(r10)
>>> +   mtspr   SPRN_LPID,r10
>>> +
>>> +   ld  r10, HSTATE_KVM_VCPU(r13)
>>> +   ld  r10, VCPU_KVM(r10)
>>> +   ld  r10, KVM_HOST_LPCR(r10)
>>> +   mtspr   SPRN_LPCR,r10
>>> +
>>> +   /*
>>> +* Go back to interrupt handler
>>> +*/
>>> +   ld  r10,HSTATE_SCRATCH0(r13)
>>> +   cmpwi   r10,BOOK3S_INTERRUPT_MACHINE_CHECK
>>> +   beq machine_check_common
>>> +
>>> +   ld  r10,HSTATE_SCRATCH0(r13)
>>> +   cmpwi   r10,BOOK3S_INTERRUPT_SYSTEM_RESET
>>> +   beq system_reset_common
>>> +
>>> +   b   .
>> 
>> So you only handle machine check and system reset here?  I would think
>> that program check would also be useful, for the cases where people
>> put BUG_ON in sensitive places (see below).  DSI and ISI could also be
>> useful for the null pointer dereference cases, I would think.
> 
> Those ones have their own stack, so a bit simpler to run them (and
> they obviously have to be handled as they are NMIs). I'll see if we
> can do something to improve the others a bit. Maybe just call program
> check for any other exception might work, making sure that it'll use
> the emergency stack rather than something that looks like a kernel
> stack but is a guest value, I'll see what we can get to work.

So program check isn't tested in virt mode, and neither are ISI or
DSI unless PR is possible, so the bad host interrupt check in
rmhandlers doesn't really catch those cases either.

We may want to improve on that but I might wait until after this
series.

I did get rid of the BUG_ON though.

Thanks,
Nick

>>> +static inline void mtslb(unsigned int idx, u64 slbee, u64 slbev)
>>> +{
>>> +   BUG_ON((slbee & 0xfff) != idx);
>>> +
>>> +   asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
>>> +}
>> 
>> 

Re: [PATCH 4/9] sh: intc: Drop the use of irq_create_identity_mapping()

2021-04-06 Thread Marc Zyngier
On Tue, 06 Apr 2021 11:32:13 +0100,
Geert Uytterhoeven  wrote:
> 
> Hi Marc,
> 
> On Tue, Apr 6, 2021 at 11:44 AM Marc Zyngier  wrote:
> > Instead of playing games with using irq_create_identity_mapping()
> > and irq_domain_associate(), drop the use of the former and only
> > use the latter, together with the allocation of the irq_desc
> > as needed.
> >
> > It doesn't make the code less awful, but at least the intent
> > is clearer.
> >
> > Signed-off-by: Marc Zyngier 
> 
> Thanks for your patch!
> 
> > --- a/drivers/sh/intc/core.c
> > +++ b/drivers/sh/intc/core.c
> > @@ -179,6 +179,23 @@ static unsigned int __init save_reg(struct 
> > intc_desc_int *d,
> > return 0;
> >  }
> >
> > +static bool __init intc_map(struct irq_domain *domain, int irq)
> > +{
> > +   int res;
> 
> warning: unused variable ‘res’ [-Wunused-variable]
> 
> > +
> > +   if (!irq_to_desc(irq) && irq_alloc_desc_at(irq, NUMA_NO_NODE) != 
> > irq) {
> > +   pr_err("uname to allocate IRQ %d\n", irq);
> > +   return false;
> > +   }
> > +
> > +   if (irq_domain_associate(domain, irq, irq)) {
> > +   pr_err("domain association failure\n");
> > +   return false;
> > +   }
> > +
> > +   return true;
> > +}
> > +
> >  int __init register_intc_controller(struct intc_desc *desc)
> >  {
> > unsigned int i, k, smp;
> > @@ -316,19 +333,8 @@ int __init register_intc_controller(struct intc_desc 
> > *desc)
> 
> warning: unused variable ‘res’ [-Wunused-variable]

Ah, thanks for spotting these.

> 
> > if (!vect->enum_id)
> > continue;
> >
> > -   res = irq_create_identity_mapping(d->domain, irq);
> 
> 
> > -   if (unlikely(res)) {
> > -   if (res == -EEXIST) {
> > -   res = irq_domain_associate(d->domain, irq, 
> > irq);
> > -   if (unlikely(res)) {
> > -   pr_err("domain association 
> > failure\n");
> > -   continue;
> > -   }
> > -   } else {
> > -   pr_err("can't identity map IRQ %d\n", irq);
> > -   continue;
> > -   }
> > -   }
> > +   if (!intc_map(d->domain, irq))
> > +   continue;
> >
> > intc_irq_xlate_set(irq, vect->enum_id, d);
> > intc_register_irq(desc, d, vect->enum_id, irq);
> 
> Otherwise this seems to work fine on real hardware (landisk) and qemu
> (rts7751r2d).  I did verify that the new function intc_map() is called.
> 
> Tested-by: Geert Uytterhoeven 

Awesome, thanks Geert.

M.

-- 
Without deviation from the norm, progress is not possible.
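
For reference, a possible respin of the helper from this patch with the unused
'res' variable dropped (and the "uname"/"unable" typo in the error message
fixed) could look like the sketch below; this is an illustration, not an actual
follow-up posted to the thread:

static bool __init intc_map(struct irq_domain *domain, int irq)
{
	if (!irq_to_desc(irq) && irq_alloc_desc_at(irq, NUMA_NO_NODE) != irq) {
		pr_err("unable to allocate IRQ %d\n", irq);
		return false;
	}

	if (irq_domain_associate(domain, irq, irq)) {
		pr_err("domain association failure\n");
		return false;
	}

	return true;
}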


Re: [PATCH 1/9] irqdomain: Reimplement irq_linear_revmap() with irq_find_mapping()

2021-04-06 Thread Marc Zyngier
Christophe,

On Tue, 06 Apr 2021 12:21:33 +0100,
Christophe Leroy  wrote:
> 
> 
> 
> Le 06/04/2021 à 11:35, Marc Zyngier a écrit :
> > irq_linear_revmap() is supposed to be a fast path for domain
> > lookups, but it only exposes low-level details of the irqdomain
> > implementation, details which are better kept private.
> 
> Can you elaborate with more details ?

Things like directly picking into the revmap are positively awful, and
don't work if the domain has been constructed using the radix
tree. Which on its own is totally broken, because things like
irq_domain_create_hierarchy() will pick one implementation or the
other.

> 
> > 
> > The *overhead* between the two is only a function call and
> > a couple of tests, so it is likely that noone can show any
> > meaningful difference compared to the cost of taking an
> > interrupt.
> 
> Do you have any measurement ?

I did measure things on arm64, and couldn't come up with any
difference other than noise.

> Can you make the "likely" a certitude ?

Of course not. You can always come up with an artificial CPU
implementation that has a very small exception entry overhead, and a
ridiculously slow memory subsystem. Do I care about these? No.

If you can come up with realistic platforms that show a regression
with this patch, I'm all ears.

> 
> > 
> > Reimplement irq_linear_revmap() with irq_find_mapping()
> > in order to preserve source code compatibility, and
> > rename the internal field for a measure.
> 
> This is in complete contradiction with commit 
> https://github.com/torvalds/linux/commit/d3dcb436
> 
> At that time, irq_linear_revmap() was less complex than what
> irq_find_mapping() is today, and nevertheless it was considered worth
> restoring in as a fast path. What has changed since then ?

Over 8 years? Plenty. The use of irqdomains has been generalised, we
have domain hierarchies, and if anything, this commit introduces the
buggy behaviour I was mentioning above. I also don't see any mention
of actual performance in that commit.

And if we're worried about a fast path, being able to directly cache
the irq_data in the revmap, hence skipping the irq_desc lookup that
inevitable follows, is a much more interesting prospect than the "get
useless data quick" that irq_linear_revmap() implements.

This latter optimisation is what I am after.
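
To make that direction concrete, a sketch of what a data-returning reverse map
could look like (names and layout are made up for illustration; this is not an
existing interface):

/*
 * Illustrative only: store irq_data pointers in the reverse map so a
 * lookup hands back the data directly, instead of a virq number that
 * still requires an irq_desc lookup afterwards.
 */
struct irq_domain_sketch {
	unsigned int	revmap_size;
	struct irq_data	*revmap[];	/* was: unsigned int revmap[] */
};

static inline struct irq_data *
revmap_lookup_data(struct irq_domain_sketch *d, irq_hw_number_t hwirq)
{
	return hwirq < d->revmap_size ? d->revmap[hwirq] : NULL;
}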

> Can you also explain the reason for the renaming of "linear_revmap"
> into "revmap" ? What is that "measure" ?

To catch the potential direct use of the reverse map field.

Thanks,

M.

-- 
Without deviation from the norm, progress is not possible.


Re: [PATCH 6/9] mips: netlogic: Use irq_domain_simple_ops for XLP PIC

2021-04-06 Thread Thomas Bogendoerfer
On Tue, Apr 06, 2021 at 10:35:54AM +0100, Marc Zyngier wrote:
> Use the generic irq_domain_simple_ops structure instead of
> a home-grown one.
> 
> Signed-off-by: Marc Zyngier 
> ---
>  arch/mips/netlogic/common/irq.c | 6 +-
>  1 file changed, 1 insertion(+), 5 deletions(-)

Acked-by: Thomas Bogendoerfer 

-- 
Crap can work. Given enough thrust pigs will fly, but it's not necessarily a
good idea.[ RFC1925, 2.3 ]


Re: [PATCH 1/9] irqdomain: Reimplement irq_linear_revmap() with irq_find_mapping()

2021-04-06 Thread Christophe Leroy




Le 06/04/2021 à 11:35, Marc Zyngier a écrit :

irq_linear_revmap() is supposed to be a fast path for domain
lookups, but it only exposes low-level details of the irqdomain
implementation, details which are better kept private.


Can you elaborate with more details ?



The *overhead* between the two is only a function call and
a couple of tests, so it is likely that noone can show any
meaningful difference compared to the cost of taking an
interrupt.


Do you have any measurement ?

Can you make the "likely" a certitude ?



Reimplement irq_linear_revmap() with irq_find_mapping()
in order to preserve source code compatibility, and
rename the internal field for a measure.


This is in complete contradiction with commit 
https://github.com/torvalds/linux/commit/d3dcb436

At that time, irq_linear_revmap() was less complex than what irq_find_mapping() is today, and
nevertheless it was considered worth restoring as a fast path. What has changed since then?


Can you also explain the reason for the renaming of "linear_revmap" into "revmap" ? What is that 
"measure" ?




Signed-off-by: Marc Zyngier 
---
  include/linux/irqdomain.h | 22 +-
  kernel/irq/irqdomain.c|  6 +++---
  2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 33cacc8af26d..b9600f24878a 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -154,9 +154,9 @@ struct irq_domain_chip_generic;
   * Revmap data, used internally by irq_domain
   * @revmap_direct_max_irq: The largest hwirq that can be set for controllers 
that
   * support direct mapping
- * @revmap_size: Size of the linear map table @linear_revmap[]
+ * @revmap_size: Size of the linear map table @revmap[]
   * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
- * @linear_revmap: Linear table of hwirq->virq reverse mappings
+ * @revmap: Linear table of hwirq->virq reverse mappings
   */
  struct irq_domain {
struct list_head link;
@@ -180,7 +180,7 @@ struct irq_domain {
unsigned int revmap_size;
struct radix_tree_root revmap_tree;
struct mutex revmap_tree_mutex;
-   unsigned int linear_revmap[];
+   unsigned int revmap[];
  };
  
  /* Irq domain flags */

@@ -396,24 +396,20 @@ static inline unsigned int irq_create_mapping(struct 
irq_domain *host,
return irq_create_mapping_affinity(host, hwirq, NULL);
  }
  
-

  /**
- * irq_linear_revmap() - Find a linux irq from a hw irq number.
+ * irq_find_mapping() - Find a linux irq from a hw irq number.
   * @domain: domain owning this hardware interrupt
   * @hwirq: hardware irq number in that domain space
- *
- * This is a fast path alternative to irq_find_mapping() that can be
- * called directly by irq controller code to save a handful of
- * instructions. It is always safe to call, but won't find irqs mapped
- * using the radix tree.
   */
+extern unsigned int irq_find_mapping(struct irq_domain *host,
+irq_hw_number_t hwirq);
+
  static inline unsigned int irq_linear_revmap(struct irq_domain *domain,
 irq_hw_number_t hwirq)
  {
-   return hwirq < domain->revmap_size ? domain->linear_revmap[hwirq] : 0;
+   return irq_find_mapping(domain, hwirq);
  }
-extern unsigned int irq_find_mapping(struct irq_domain *host,
-irq_hw_number_t hwirq);
+
  extern unsigned int irq_create_direct_mapping(struct irq_domain *host);
  extern int irq_create_strict_mappings(struct irq_domain *domain,
  unsigned int irq_base,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d10ab1d689d5..dfa716305ea9 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -486,7 +486,7 @@ static void irq_domain_clear_mapping(struct irq_domain 
*domain,
 irq_hw_number_t hwirq)
  {
if (hwirq < domain->revmap_size) {
-   domain->linear_revmap[hwirq] = 0;
+   domain->revmap[hwirq] = 0;
} else {
		mutex_lock(&domain->revmap_tree_mutex);
		radix_tree_delete(&domain->revmap_tree, hwirq);
@@ -499,7 +499,7 @@ static void irq_domain_set_mapping(struct irq_domain 
*domain,
   struct irq_data *irq_data)
  {
if (hwirq < domain->revmap_size) {
-   domain->linear_revmap[hwirq] = irq_data->irq;
+   domain->revmap[hwirq] = irq_data->irq;
} else {
		mutex_lock(&domain->revmap_tree_mutex);
		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
@@ -920,7 +920,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
  
  	/* Check if the hwirq is in the linear revmap. */

if (hwirq < domain->revmap_size)
-   return domain->linear_revmap[hwirq];
+   return domain->revmap[hwirq];
  
  	

Re: [PATCH printk v2 2/5] printk: remove safe buffers

2021-04-06 Thread John Ogness
On 2021-04-01, Petr Mladek  wrote:
>> Caller-id solves this problem and is easy to sort for anyone with
>> `grep'. Yes, it is a shame that `dmesg' does not show it, but
>> directly using any of the printk interfaces does show it (kmsg_dump,
>> /dev/kmsg, syslog, console).
>
> True but frankly, the current situation is _far_ from convenient:
>
>+ consoles do not show it by default
>+ none userspace tool (dmesg, journalctl, crash) is able to show it
>+ grep is a nightmare, especially if you have more than handful of CPUs
>
> Yes, everything is solvable but not easily.
>
>> > I get this with "echo l >/proc/sysrq-trigger" and this patchset:
>> 
>> Of course. Without caller-id, it is a mess. But this has nothing to do
>> with NMI. The same problem exists for WARN_ON() on multiple CPUs
>> simultaneously. If the user is not using caller-id, they are
>> lost. Caller-id is the current solution to the interlaced logs.
>
> Sure. But in reality, the risk of mixed WARN_ONs is small. While
> this patch makes backtraces from all CPUs always unusable without
> caller_id and non-trivial effort.

I would prefer we solve the situation for non-NMI as well, not just for
the sysrq "l" case.

>> For the long term, we should introduce a printk-context API that allows
>> callers to perfectly pack their multi-line output into a single
>> entry. We discussed [0][1] this back in August 2020.
>
> We need a "short" term solution. There are currently 3 solutions:
>
> 1. Keep nmi_safe() and all the hacks around.
>
> 2. Serialize nmi_cpu_backtrace() by a spin lock and later by
>the special lock used also by atomic consoles.
>
> 3. Tell complaining people how to sort the messed logs.

Or we look into the long-term solution now. If caller-ids cannot be
used as the solution (because nobody turns it on, nobody knows about it,
and/or distros do not enable it), then we should look at how to make at
least the backtraces contiguous. I have a few ideas here.

John Ogness


Re: [PATCH 4/9] sh: intc: Drop the use of irq_create_identity_mapping()

2021-04-06 Thread Geert Uytterhoeven
Hi Marc,

On Tue, Apr 6, 2021 at 11:44 AM Marc Zyngier  wrote:
> Instead of playing games with using irq_create_identity_mapping()
> and irq_domain_associate(), drop the use of the former and only
> use the latter, together with the allocation of the irq_desc
> as needed.
>
> It doesn't make the code less awful, but at least the intent
> is clearer.
>
> Signed-off-by: Marc Zyngier 

Thanks for your patch!

> --- a/drivers/sh/intc/core.c
> +++ b/drivers/sh/intc/core.c
> @@ -179,6 +179,23 @@ static unsigned int __init save_reg(struct intc_desc_int 
> *d,
> return 0;
>  }
>
> +static bool __init intc_map(struct irq_domain *domain, int irq)
> +{
> +   int res;

warning: unused variable ‘res’ [-Wunused-variable]

> +
> +   if (!irq_to_desc(irq) && irq_alloc_desc_at(irq, NUMA_NO_NODE) != irq) 
> {
> +   pr_err("uname to allocate IRQ %d\n", irq);
> +   return false;
> +   }
> +
> +   if (irq_domain_associate(domain, irq, irq)) {
> +   pr_err("domain association failure\n");
> +   return false;
> +   }
> +
> +   return true;
> +}
> +
>  int __init register_intc_controller(struct intc_desc *desc)
>  {
> unsigned int i, k, smp;
> @@ -316,19 +333,8 @@ int __init register_intc_controller(struct intc_desc 
> *desc)

warning: unused variable ‘res’ [-Wunused-variable]

> if (!vect->enum_id)
> continue;
>
> -   res = irq_create_identity_mapping(d->domain, irq);


> -   if (unlikely(res)) {
> -   if (res == -EEXIST) {
> -   res = irq_domain_associate(d->domain, irq, 
> irq);
> -   if (unlikely(res)) {
> -   pr_err("domain association 
> failure\n");
> -   continue;
> -   }
> -   } else {
> -   pr_err("can't identity map IRQ %d\n", irq);
> -   continue;
> -   }
> -   }
> +   if (!intc_map(d->domain, irq))
> +   continue;
>
> intc_irq_xlate_set(irq, vect->enum_id, d);
> intc_register_irq(desc, d, vect->enum_id, irq);

Otherwise this seems to work fine on real hardware (landisk) and qemu
(rts7751r2d).  I did verify that the new function intc_map() is called.

Tested-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


[PATCH v1 1/2] powerpc/inst: ppc_inst_as_u64() becomes ppc_inst_as_ulong()

2021-04-06 Thread Christophe Leroy
In order to simplify use on PPC32, change ppc_inst_as_u64()
into ppc_inst_as_ulong(), which returns the 32-bit instruction
on PPC32.

This will be used when porting OPTPROBES to PPC32.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/inst.h  | 13 +++--
 arch/powerpc/kernel/optprobes.c  |  2 +-
 arch/powerpc/lib/code-patching.c |  2 +-
 arch/powerpc/xmon/xmon.c |  2 +-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index cc73c1267572..8ea0b503f32f 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -113,13 +113,14 @@ static inline struct ppc_inst *ppc_inst_next(void 
*location, struct ppc_inst *va
return location + ppc_inst_len(tmp);
 }
 
-static inline u64 ppc_inst_as_u64(struct ppc_inst x)
+static inline unsigned long ppc_inst_as_ulong(struct ppc_inst x)
 {
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-   return (u64)ppc_inst_suffix(x) << 32 | ppc_inst_val(x);
-#else
-   return (u64)ppc_inst_val(x) << 32 | ppc_inst_suffix(x);
-#endif
+   if (IS_ENABLED(CONFIG_PPC32))
+   return ppc_inst_val(x);
+   else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
+   return (u64)ppc_inst_suffix(x) << 32 | ppc_inst_val(x);
+   else
+   return (u64)ppc_inst_val(x) << 32 | ppc_inst_suffix(x);
 }
 
 #define PPC_INST_STR_LEN sizeof(" ")
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 7f7cdbeacd1a..58fdb9f66e0f 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -264,7 +264,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe 
*op, struct kprobe *p)
 * 3. load instruction to be emulated into relevant register, and
 */
temp = ppc_inst_read((struct ppc_inst *)p->ainsn.insn);
-   patch_imm64_load_insns(ppc_inst_as_u64(temp), 4, buff + TMPL_INSN_IDX);
+   patch_imm64_load_insns(ppc_inst_as_ulong(temp), 4, buff + 
TMPL_INSN_IDX);
 
/*
 * 4. branch back from trampoline
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 65aec4d6d9ba..870b30d9be2f 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -26,7 +26,7 @@ static int __patch_instruction(struct ppc_inst *exec_addr, 
struct ppc_inst instr
 
		__put_kernel_nofault(patch_addr, &val, u32, failed);
} else {
-   u64 val = ppc_inst_as_u64(instr);
+   u64 val = ppc_inst_as_ulong(instr);
 
		__put_kernel_nofault(patch_addr, &val, u64, failed);
}
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 3fe37495f63d..ff10a357d41d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2980,7 +2980,7 @@ generic_inst_dump(unsigned long adr, long count, int 
praddr,
if (!ppc_inst_prefixed(inst))
dump_func(ppc_inst_val(inst), adr);
else
-   dump_func(ppc_inst_as_u64(inst), adr);
+   dump_func(ppc_inst_as_ulong(inst), adr);
printf("\n");
}
return adr - first_adr;
-- 
2.25.0



[PATCH v1 2/2] powerpc: Enable OPTPROBES on PPC32

2021-04-06 Thread Christophe Leroy
For that, create a 32-bit version of patch_imm64_load_insns(),
and create patch_imm_load_insns(), which calls
patch_imm32_load_insns() on PPC32 and patch_imm64_load_insns()
on PPC64.

Adapt optprobes_head.S for PPC32: use the PPC_LL/PPC_STL macros instead
of raw ld/std, opt out of things linked to the paca, and use stmw/lmw to
save/restore registers.
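
As a concrete illustration of what patch_imm32_load_insns() emits (the value is
made up for the example):

/*
 * Loading val = 0x12345678 into r3 on PPC32 takes two instructions,
 * built from the high and low halves of the value:
 *
 *   lis  r3, 0x1234        ; r3 = 0x12340000   (IMM_H(val))
 *   ori  r3, r3, 0x5678    ; r3 = 0x12345678   (IMM_L(val))
 */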

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig |  2 +-
 arch/powerpc/kernel/optprobes.c  | 24 +--
 arch/powerpc/kernel/optprobes_head.S | 46 +++-
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c1344c05226c..49b538e54efb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -227,7 +227,7 @@ config PPC
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI if PERF_EVENTS || (PPC64 && 
PPC_BOOK3S)
select HAVE_HARDLOCKUP_DETECTOR_ARCHif PPC64 && PPC_BOOK3S && SMP
-   select HAVE_OPTPROBES   if PPC64
+   select HAVE_OPTPROBES
select HAVE_PERF_EVENTS
select HAVE_PERF_EVENTS_NMI if PPC64
select HAVE_HARDLOCKUP_DETECTOR_PERFif PERF_EVENTS && 
HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 58fdb9f66e0f..cdf87086fa33 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -141,11 +141,21 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe 
*op)
}
 }
 
+static void patch_imm32_load_insns(unsigned long val, int reg, kprobe_opcode_t 
*addr)
+{
+   patch_instruction((struct ppc_inst *)addr,
+ ppc_inst(PPC_RAW_LIS(reg, IMM_H(val))));
+   addr++;
+
+   patch_instruction((struct ppc_inst *)addr,
+ ppc_inst(PPC_RAW_ORI(reg, reg, IMM_L(val))));
+}
+
 /*
  * Generate instructions to load provided immediate 64-bit value
  * to register 'reg' and patch these instructions at 'addr'.
  */
-static void patch_imm64_load_insns(unsigned long val, int reg, kprobe_opcode_t 
*addr)
+static void patch_imm64_load_insns(unsigned long long val, int reg, 
kprobe_opcode_t *addr)
 {
/* lis reg,(op)@highest */
patch_instruction((struct ppc_inst *)addr,
@@ -177,6 +187,14 @@ static void patch_imm64_load_insns(unsigned long val, int 
reg, kprobe_opcode_t *
   ___PPC_RS(reg) | (val & 0xffff)));
 }
 
+static void patch_imm_load_insns(unsigned long val, int reg, kprobe_opcode_t 
*addr)
+{
+   if (IS_ENABLED(CONFIG_PPC64))
+   patch_imm64_load_insns(val, reg, addr);
+   else
+   patch_imm32_load_insns(val, reg, addr);
+}
+
 int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe 
*p)
 {
struct ppc_inst branch_op_callback, branch_emulate_step, temp;
@@ -230,7 +248,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe 
*op, struct kprobe *p)
 * Fixup the template with instructions to:
 * 1. load the address of the actual probepoint
 */
-   patch_imm64_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX);
+   patch_imm_load_insns((unsigned long)op, 3, buff + TMPL_OP_IDX);
 
/*
 * 2. branch to optimized_callback() and emulate_step()
@@ -264,7 +282,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe 
*op, struct kprobe *p)
 * 3. load instruction to be emulated into relevant register, and
 */
temp = ppc_inst_read((struct ppc_inst *)p->ainsn.insn);
-   patch_imm64_load_insns(ppc_inst_as_ulong(temp), 4, buff + 
TMPL_INSN_IDX);
+   patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX);
 
/*
 * 4. branch back from trampoline
diff --git a/arch/powerpc/kernel/optprobes_head.S 
b/arch/powerpc/kernel/optprobes_head.S
index ff8ba4d3824d..49f31e554573 100644
--- a/arch/powerpc/kernel/optprobes_head.S
+++ b/arch/powerpc/kernel/optprobes_head.S
@@ -30,39 +30,47 @@ optinsn_slot:
.global optprobe_template_entry
 optprobe_template_entry:
/* Create an in-memory pt_regs */
-   stdur1,-INT_FRAME_SIZE(r1)
+   PPC_STLUr1,-INT_FRAME_SIZE(r1)
SAVE_GPR(0,r1)
/* Save the previous SP into stack */
addir0,r1,INT_FRAME_SIZE
-   std r0,GPR1(r1)
+   PPC_STL r0,GPR1(r1)
+#ifdef CONFIG_PPC64
SAVE_10GPRS(2,r1)
SAVE_10GPRS(12,r1)
SAVE_10GPRS(22,r1)
+#else
+   stmwr2, GPR2(r1)
+#endif
/* Save SPRS */
mfmsr   r5
-   std r5,_MSR(r1)
+   PPC_STL r5,_MSR(r1)
li  r5,0x700
-   std r5,_TRAP(r1)
+   PPC_STL r5,_TRAP(r1)
li  r5,0
-   std r5,ORIG_GPR3(r1)
-   std r5,RESULT(r1)
+   PPC_STL r5,ORIG_GPR3(r1)
+   PPC_STL r5,RESULT(r1)
mfctr   r5
-   std r5,_CTR(r1)
+   

[PATCH 9/9] irqdomain: Kill irq_domain_add_legacy_isa

2021-04-06 Thread Marc Zyngier
This helper doesn't have a user anymore, let's remove it.

Signed-off-by: Marc Zyngier 
---
 Documentation/core-api/irq/irq-domain.rst |  1 -
 include/linux/irqdomain.h | 11 ---
 2 files changed, 12 deletions(-)

diff --git a/Documentation/core-api/irq/irq-domain.rst 
b/Documentation/core-api/irq/irq-domain.rst
index a77c24c27f7b..84e561db468f 100644
--- a/Documentation/core-api/irq/irq-domain.rst
+++ b/Documentation/core-api/irq/irq-domain.rst
@@ -146,7 +146,6 @@ Legacy
 
irq_domain_add_simple()
irq_domain_add_legacy()
-   irq_domain_add_legacy_isa()
irq_domain_create_legacy()
 
 The Legacy mapping is a special case for drivers that already have a
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 3997ed9e4d7d..2a7ecf08d56e 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -45,9 +45,6 @@ struct cpumask;
 struct seq_file;
 struct irq_affinity_desc;
 
-/* Number of irqs reserved for a legacy isa controller */
-#define NUM_ISA_INTERRUPTS 16
-
 #define IRQ_DOMAIN_IRQ_SPEC_PARAMS 16
 
 /**
@@ -346,14 +343,6 @@ static inline struct irq_domain 
*irq_domain_add_nomap(struct device_node *of_nod
 {
return __irq_domain_add(of_node_to_fwnode(of_node), 0, max_irq, 
max_irq, ops, host_data);
 }
-static inline struct irq_domain *irq_domain_add_legacy_isa(
-   struct device_node *of_node,
-   const struct irq_domain_ops *ops,
-   void *host_data)
-{
-   return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops,
-host_data);
-}
 static inline struct irq_domain *irq_domain_add_tree(struct device_node 
*of_node,
 const struct irq_domain_ops *ops,
 void *host_data)
-- 
2.29.2



[PATCH 8/9] powerpc: Convert irq_domain_add_legacy_isa use to irq_domain_add_legacy

2021-04-06 Thread Marc Zyngier
irq_domain_add_legacy_isa is a pain. It only exists for the benefit of
two PPC-specific drivers, and creates an ugly dependency between asm/irq.h
and linux/irqdomain.h

Instead, let's convert these two drivers to irq_domain_add_legacy(),
stop using NUM_ISA_INTERRUPTS by directly setting NR_IRQS_LEGACY.

The dependency cannot be broken yet as there is a lot of PPC-related
code that depends on it, but that's the first step towards it.

A followup patch will remove irq_domain_add_legacy_isa.
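
Concretely, the conversion is a one-for-one replacement, as the hunks below show
(the ops pointer here is a placeholder name):

/* Before: 16 ISA interrupts implied by the helper. */
host = irq_domain_add_legacy_isa(node, &my_domain_ops, NULL);

/* After: the same mapping spelled out explicitly. */
host = irq_domain_add_legacy(node, NR_IRQS_LEGACY, 0, 0,
			     &my_domain_ops, NULL);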

Signed-off-by: Marc Zyngier 
---
 arch/powerpc/include/asm/irq.h | 4 ++--
 arch/powerpc/platforms/ps3/interrupt.c | 4 ++--
 arch/powerpc/sysdev/i8259.c| 3 ++-
 arch/powerpc/sysdev/mpic.c | 2 +-
 arch/powerpc/sysdev/tsi108_pci.c   | 3 ++-
 arch/powerpc/sysdev/xics/xics-common.c | 2 +-
 6 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index f3f264e441a7..aeb209144c68 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -23,8 +23,8 @@ extern atomic_t ppc_n_lost_interrupts;
 /* Total number of virq in the platform */
 #define NR_IRQSCONFIG_NR_IRQS
 
-/* Same thing, used by the generic IRQ code */
-#define NR_IRQS_LEGACY NUM_ISA_INTERRUPTS
+/* Number of irqs reserved for a legacy isa controller */
+#define NR_IRQS_LEGACY 16
 
 extern irq_hw_number_t virq_to_hw(unsigned int virq);
 
diff --git a/arch/powerpc/platforms/ps3/interrupt.c 
b/arch/powerpc/platforms/ps3/interrupt.c
index 78f2339ed5cb..93e367a00452 100644
--- a/arch/powerpc/platforms/ps3/interrupt.c
+++ b/arch/powerpc/platforms/ps3/interrupt.c
@@ -45,7 +45,7 @@
  * implementation equates HV plug value to Linux virq value, constrains each
  * interrupt to have a system wide unique plug number, and limits the range
  * of the plug values to map into the first dword of the bitmaps.  This
- * gives a usable range of plug values of  {NUM_ISA_INTERRUPTS..63}.  Note
+ * gives a usable range of plug values of  {NR_IRQS_LEGACY..63}.  Note
  * that there is no constraint on how many in this set an individual thread
  * can acquire.
  *
@@ -721,7 +721,7 @@ static unsigned int ps3_get_irq(void)
}
 
 #if defined(DEBUG)
-   if (unlikely(plug < NUM_ISA_INTERRUPTS || plug > PS3_PLUG_MAX)) {
+   if (unlikely(plug < NR_IRQS_LEGACY || plug > PS3_PLUG_MAX)) {
		dump_bmp(&per_cpu(ps3_private, 0));
		dump_bmp(&per_cpu(ps3_private, 1));
BUG();
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index c1d76c344351..dc1a151c63d7 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -260,7 +260,8 @@ void i8259_init(struct device_node *node, unsigned long 
intack_addr)
	raw_spin_unlock_irqrestore(&i8259_lock, flags);
 
/* create a legacy host */
-   i8259_host = irq_domain_add_legacy_isa(node, &i8259_host_ops, NULL);
+   i8259_host = irq_domain_add_legacy(node, NR_IRQS_LEGACY, 0, 0,
+  &i8259_host_ops, NULL);
if (i8259_host == NULL) {
printk(KERN_ERR "i8259: failed to allocate irq host !\n");
return;
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index b0426f28946a..995fb2ada507 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -602,7 +602,7 @@ static void __init mpic_scan_ht_pics(struct mpic *mpic)
 /* Find an mpic associated with a given linux interrupt */
 static struct mpic *mpic_find(unsigned int irq)
 {
-   if (irq < NUM_ISA_INTERRUPTS)
+   if (irq < NR_IRQS_LEGACY)
return NULL;
 
return irq_get_chip_data(irq);
diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c
index 49f9541954f8..042bb38fa5c2 100644
--- a/arch/powerpc/sysdev/tsi108_pci.c
+++ b/arch/powerpc/sysdev/tsi108_pci.c
@@ -404,7 +404,8 @@ void __init tsi108_pci_int_init(struct device_node *node)
 {
DBG("Tsi108_pci_int_init: initializing PCI interrupts\n");
 
-	pci_irq_host = irq_domain_add_legacy_isa(node, &pci_irq_domain_ops, NULL);
+	pci_irq_host = irq_domain_add_legacy(node, NR_IRQS_LEGACY, 0, 0,
+					     &pci_irq_domain_ops, NULL);
if (pci_irq_host == NULL) {
printk(KERN_ERR "pci_irq_host: failed to allocate irq 
domain!\n");
return;
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 7e4305c01bac..fdf8dbb6 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -201,7 +201,7 @@ void xics_migrate_irqs_away(void)
struct ics *ics;
 
/* We can't set affinity on ISA interrupts */
-   if (virq < NUM_ISA_INTERRUPTS)
+   if (virq < NR_IRQS_LEGACY)
continue;
/* We only need to migrate enabled IRQS 

[PATCH 5/9] irqdomain: Kill irq_create_strict_mappings()/irq_create_identity_mapping()

2021-04-06 Thread Marc Zyngier
No users of these APIs are left; remove them.

Signed-off-by: Marc Zyngier 
---
 include/linux/irqdomain.h |  9 -
 kernel/irq/irqdomain.c| 35 ---
 2 files changed, 44 deletions(-)

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index b9600f24878a..3997ed9e4d7d 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -411,15 +411,6 @@ static inline unsigned int irq_linear_revmap(struct irq_domain *domain,
 }
 
 extern unsigned int irq_create_direct_mapping(struct irq_domain *host);
-extern int irq_create_strict_mappings(struct irq_domain *domain,
- unsigned int irq_base,
- irq_hw_number_t hwirq_base, int count);
-
-static inline int irq_create_identity_mapping(struct irq_domain *host,
- irq_hw_number_t hwirq)
-{
-   return irq_create_strict_mappings(host, hwirq, hwirq, 1);
-}
 
 extern const struct irq_domain_ops irq_domain_simple_ops;
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index dfa716305ea9..0ba761e6b0a8 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -703,41 +703,6 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
 }
 EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
 
-/**
- * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
- * @domain: domain owning the interrupt range
- * @irq_base: beginning of linux IRQ range
- * @hwirq_base: beginning of hardware IRQ range
- * @count: Number of interrupts to map
- *
- * This routine is used for allocating and mapping a range of hardware
- * irqs to linux irqs where the linux irq numbers are at pre-defined
- * locations. For use by controllers that already have static mappings
- * to insert in to the domain.
- *
- * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
- * domain insertion.
- *
- * 0 is returned upon success, while any failure to establish a static
- * mapping is treated as an error.
- */
-int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
-  irq_hw_number_t hwirq_base, int count)
-{
-   struct device_node *of_node;
-   int ret;
-
-   of_node = irq_domain_get_of_node(domain);
-   ret = irq_alloc_descs(irq_base, irq_base, count,
- of_node_to_nid(of_node));
-   if (unlikely(ret < 0))
-   return ret;
-
-   irq_domain_associate_many(domain, irq_base, hwirq_base, count);
-   return 0;
-}
-EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
-
 static int irq_domain_translate(struct irq_domain *d,
struct irq_fwspec *fwspec,
irq_hw_number_t *hwirq, unsigned int *type)
-- 
2.29.2



[PATCH 7/9] irqdomain: Drop references to recursive irqdomain setup

2021-04-06 Thread Marc Zyngier
It was never completely implemented, and was removed a long time
ago. Adjust the documentation to reflect this.

Signed-off-by: Marc Zyngier 
---
 kernel/irq/irqdomain.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 0ba761e6b0a8..89474aa88220 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1659,12 +1659,10 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)
 
 /**
  * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain
+ * @domain:	Domain below which interrupts must be allocated
  * @irq_base:  Base IRQ number
  * @nr_irqs:   Number of IRQs to allocate
  * @arg:   Allocation data (arch/domain specific)
- *
- * Check whether the domain has been setup recursive. If not allocate
- * through the parent domain.
  */
 int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
 unsigned int irq_base, unsigned int nr_irqs,
@@ -1680,11 +1678,9 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
 
 /**
  * irq_domain_free_irqs_parent - Free interrupts from parent domain
+ * @domain:	Domain below which interrupts must be freed
  * @irq_base:  Base IRQ number
  * @nr_irqs:   Number of IRQs to free
- *
- * Check whether the domain has been setup recursive. If not free
- * through the parent domain.
  */
 void irq_domain_free_irqs_parent(struct irq_domain *domain,
 unsigned int irq_base, unsigned int nr_irqs)
-- 
2.29.2



[PATCH 6/9] mips: netlogic: Use irq_domain_simple_ops for XLP PIC

2021-04-06 Thread Marc Zyngier
Use the generic irq_domain_simple_ops structure instead of
a home-grown one.
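
For context, the generic ops structure being switched to lives in
kernel/irq/irqdomain.c and is, at the time of this series, essentially the
same thing the driver was open-coding (paraphrased here, not quoted from
the tree):

	const struct irq_domain_ops irq_domain_simple_ops = {
		.xlate = irq_domain_xlate_onetwocell,
	};
	EXPORT_SYMBOL_GPL(irq_domain_simple_ops);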

Signed-off-by: Marc Zyngier 
---
 arch/mips/netlogic/common/irq.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/mips/netlogic/common/irq.c b/arch/mips/netlogic/common/irq.c
index cf33dd8a487e..c25a2ce5e29f 100644
--- a/arch/mips/netlogic/common/irq.c
+++ b/arch/mips/netlogic/common/irq.c
@@ -276,10 +276,6 @@ asmlinkage void plat_irq_dispatch(void)
 }
 
 #ifdef CONFIG_CPU_XLP
-static const struct irq_domain_ops xlp_pic_irq_domain_ops = {
-   .xlate = irq_domain_xlate_onetwocell,
-};
-
 static int __init xlp_of_pic_init(struct device_node *node,
struct device_node *parent)
 {
@@ -324,7 +320,7 @@ static int __init xlp_of_pic_init(struct device_node *node,
 
xlp_pic_domain = irq_domain_add_legacy(node, n_picirqs,
nlm_irq_to_xirq(socid, PIC_IRQ_BASE), PIC_IRQ_BASE,
-			&xlp_pic_irq_domain_ops, NULL);
+			&irq_domain_simple_ops, NULL);
if (xlp_pic_domain == NULL) {
pr_err("PIC %pOFn: Creating legacy domain failed!\n", node);
return -EINVAL;
-- 
2.29.2



[PATCH 4/9] sh: intc: Drop the use of irq_create_identity_mapping()

2021-04-06 Thread Marc Zyngier
Instead of playing games with using irq_create_identity_mapping()
and irq_domain_associate(), drop the use of the former and only
use the latter, together with the allocation of the irq_desc
as needed.

It doesn't make the code less awful, but at least the intent
is clearer.

Signed-off-by: Marc Zyngier 
---
 drivers/sh/intc/core.c | 50 ++
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/drivers/sh/intc/core.c b/drivers/sh/intc/core.c
index a14684ffe4c1..6c57ee1ce6c4 100644
--- a/drivers/sh/intc/core.c
+++ b/drivers/sh/intc/core.c
@@ -179,6 +179,23 @@ static unsigned int __init save_reg(struct intc_desc_int *d,
return 0;
 }
 
+static bool __init intc_map(struct irq_domain *domain, int irq)
+{
+   int res;
+
+   if (!irq_to_desc(irq) && irq_alloc_desc_at(irq, NUMA_NO_NODE) != irq) {
+   pr_err("uname to allocate IRQ %d\n", irq);
+   return false;
+   }
+
+   if (irq_domain_associate(domain, irq, irq)) {
+   pr_err("domain association failure\n");
+   return false;
+   }
+
+   return true;
+}
+
 int __init register_intc_controller(struct intc_desc *desc)
 {
unsigned int i, k, smp;
@@ -316,19 +333,8 @@ int __init register_intc_controller(struct intc_desc *desc)
if (!vect->enum_id)
continue;
 
-   res = irq_create_identity_mapping(d->domain, irq);
-   if (unlikely(res)) {
-   if (res == -EEXIST) {
-   res = irq_domain_associate(d->domain, irq, irq);
-   if (unlikely(res)) {
-   pr_err("domain association failure\n");
-   continue;
-   }
-   } else {
-   pr_err("can't identity map IRQ %d\n", irq);
-   continue;
-   }
-   }
+   if (!intc_map(d->domain, irq))
+   continue;
 
intc_irq_xlate_set(irq, vect->enum_id, d);
intc_register_irq(desc, d, vect->enum_id, irq);
@@ -345,22 +351,8 @@ int __init register_intc_controller(struct intc_desc *desc)
 * IRQ support, each vector still needs to have
 * its own backing irq_desc.
 */
-   res = irq_create_identity_mapping(d->domain, irq2);
-   if (unlikely(res)) {
-   if (res == -EEXIST) {
-   res = irq_domain_associate(d->domain,
-  irq2, irq2);
-   if (unlikely(res)) {
-   pr_err("domain association "
-  "failure\n");
-   continue;
-   }
-   } else {
-   pr_err("can't identity map IRQ %d\n",
-  irq);
-   continue;
-   }
-   }
+   if (!intc_map(d->domain, irq2))
+   continue;
 
vect2->enum_id = 0;
 
-- 
2.29.2



[PATCH 0/9] Cleaning up some of the irqdomain features

2021-04-06 Thread Marc Zyngier
The irqdomain subsystem has grown quite a lot over the years, and some
of its features are either oddly used or just pretty useless. Some
other helpers expose internals that are likely to change soon.

Here are the bits that I'm trying to get rid of:

- irq_linear_revmap exposes the internals of the domains, and only
  works for linear domains. The supposed speed improvement really
  isn't an argument, as it gets in the way of more significant
  optimisations. Reimplemented in terms of irq_find_mapping, which
  always works; the old helper will eventually be removed.

- irq_create_strict_mappings is just a way to constrain the
  allocation of irqdescs into a given range, which is better served by
  creating a legacy irqdomain, and shows that the platform really
  needs to catch up with the 21st century.

- irq_create_identity_mapping is just a variation on the above, with
  irq==hwirq, although the way it is used is a gross hack in the SH
  code that needs to go.

- irq_domain_add_legacy_isa is, as the name shows, a variation on
  irq_domain_add_legacy with a reservation of 16 interrupts. This is
  only used in the PPC code.

The patches address all of the above, touching some of the ARM, PPC,
and SH code that is affected. Another couple of patches address a MIPS
platform that could use the generic code, and clean some of the
comments in the irqdomain code.

Unless anyone shouts, I'd like to take this into 5.13, as it is the
basis of some further (and deeper) changes in the way irqdomains work.

M.

Marc Zyngier (9):
  irqdomain: Reimplement irq_linear_revmap() with irq_find_mapping()
  ARM: PXA: Kill use of irq_create_strict_mappings()
  irqchip/jcore-aic: Kill use of irq_create_strict_mappings()
  sh: intc: Drop the use of irq_create_identity_mapping()
  irqdomain: Kill
irq_create_strict_mappings()/irq_create_identity_mapping()
  mips: netlogic: Use irq_domain_simple_ops for XLP PIC
  irqdomain: Drop references to recursive irqdomain setup
  powerpc: Convert irq_domain_add_legacy_isa use to
irq_domain_add_legacy
  irqdomain: Kill irq_domain_add_legacy_isa

 Documentation/core-api/irq/irq-domain.rst |  1 -
 arch/arm/mach-pxa/pxa_cplds_irqs.c| 24 +--
 arch/mips/netlogic/common/irq.c   |  6 +--
 arch/powerpc/include/asm/irq.h|  4 +-
 arch/powerpc/platforms/ps3/interrupt.c|  4 +-
 arch/powerpc/sysdev/i8259.c   |  3 +-
 arch/powerpc/sysdev/mpic.c|  2 +-
 arch/powerpc/sysdev/tsi108_pci.c  |  3 +-
 arch/powerpc/sysdev/xics/xics-common.c|  2 +-
 drivers/irqchip/irq-jcore-aic.c   |  4 +-
 drivers/sh/intc/core.c| 50 ++-
 include/linux/irqdomain.h | 42 ---
 kernel/irq/irqdomain.c| 49 +++---
 13 files changed, 59 insertions(+), 135 deletions(-)

-- 
2.29.2



[PATCH 1/9] irqdomain: Reimplement irq_linear_revmap() with irq_find_mapping()

2021-04-06 Thread Marc Zyngier
irq_linear_revmap() is supposed to be a fast path for domain
lookups, but it only exposes low-level details of the irqdomain
implementation, details which are better kept private.

The *overhead* between the two is only a function call and
a couple of tests, so it is likely that no one can show any
meaningful difference compared to the cost of taking an
interrupt.

Reimplement irq_linear_revmap() with irq_find_mapping()
in order to preserve source code compatibility, and
rename the internal field for good measure.

Signed-off-by: Marc Zyngier 
---
 include/linux/irqdomain.h | 22 +-
 kernel/irq/irqdomain.c|  6 +++---
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 33cacc8af26d..b9600f24878a 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -154,9 +154,9 @@ struct irq_domain_chip_generic;
  * Revmap data, used internally by irq_domain
 * @revmap_direct_max_irq: The largest hwirq that can be set for controllers that
  * support direct mapping
- * @revmap_size: Size of the linear map table @linear_revmap[]
+ * @revmap_size: Size of the linear map table @revmap[]
  * @revmap_tree: Radix map tree for hwirqs that don't fit in the linear map
- * @linear_revmap: Linear table of hwirq->virq reverse mappings
+ * @revmap: Linear table of hwirq->virq reverse mappings
  */
 struct irq_domain {
struct list_head link;
@@ -180,7 +180,7 @@ struct irq_domain {
unsigned int revmap_size;
struct radix_tree_root revmap_tree;
struct mutex revmap_tree_mutex;
-   unsigned int linear_revmap[];
+   unsigned int revmap[];
 };
 
 /* Irq domain flags */
@@ -396,24 +396,20 @@ static inline unsigned int irq_create_mapping(struct irq_domain *host,
return irq_create_mapping_affinity(host, hwirq, NULL);
 }
 
-
 /**
- * irq_linear_revmap() - Find a linux irq from a hw irq number.
+ * irq_find_mapping() - Find a linux irq from a hw irq number.
  * @domain: domain owning this hardware interrupt
  * @hwirq: hardware irq number in that domain space
- *
- * This is a fast path alternative to irq_find_mapping() that can be
- * called directly by irq controller code to save a handful of
- * instructions. It is always safe to call, but won't find irqs mapped
- * using the radix tree.
  */
+extern unsigned int irq_find_mapping(struct irq_domain *host,
+irq_hw_number_t hwirq);
+
 static inline unsigned int irq_linear_revmap(struct irq_domain *domain,
 irq_hw_number_t hwirq)
 {
-   return hwirq < domain->revmap_size ? domain->linear_revmap[hwirq] : 0;
+   return irq_find_mapping(domain, hwirq);
 }
-extern unsigned int irq_find_mapping(struct irq_domain *host,
-irq_hw_number_t hwirq);
+
 extern unsigned int irq_create_direct_mapping(struct irq_domain *host);
 extern int irq_create_strict_mappings(struct irq_domain *domain,
  unsigned int irq_base,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d10ab1d689d5..dfa716305ea9 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -486,7 +486,7 @@ static void irq_domain_clear_mapping(struct irq_domain *domain,
 irq_hw_number_t hwirq)
 {
if (hwirq < domain->revmap_size) {
-   domain->linear_revmap[hwirq] = 0;
+   domain->revmap[hwirq] = 0;
} else {
 		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_delete(&domain->revmap_tree, hwirq);
@@ -499,7 +499,7 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
   struct irq_data *irq_data)
 {
if (hwirq < domain->revmap_size) {
-   domain->linear_revmap[hwirq] = irq_data->irq;
+   domain->revmap[hwirq] = irq_data->irq;
} else {
 		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
@@ -920,7 +920,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
 
/* Check if the hwirq is in the linear revmap. */
if (hwirq < domain->revmap_size)
-   return domain->linear_revmap[hwirq];
+   return domain->revmap[hwirq];
 
rcu_read_lock();
 	data = radix_tree_lookup(&domain->revmap_tree, hwirq);
-- 
2.29.2



[PATCH 3/9] irqchip/jcore-aic: Kill use of irq_create_strict_mappings()

2021-04-06 Thread Marc Zyngier
irq_create_strict_mappings() is a poor way to allow the use of
a linear IRQ domain as a legacy one. Let's be upfront about it.

Signed-off-by: Marc Zyngier 
---
 drivers/irqchip/irq-jcore-aic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/irqchip/irq-jcore-aic.c b/drivers/irqchip/irq-jcore-aic.c
index 033bccb41455..5f47d8ee4ae3 100644
--- a/drivers/irqchip/irq-jcore-aic.c
+++ b/drivers/irqchip/irq-jcore-aic.c
@@ -100,11 +100,11 @@ static int __init aic_irq_of_init(struct device_node *node,
jcore_aic.irq_unmask = noop;
jcore_aic.name = "AIC";
 
-	domain = irq_domain_add_linear(node, dom_sz, &jcore_aic_irqdomain_ops,
+	domain = irq_domain_add_legacy(node, dom_sz - min_irq, min_irq, min_irq,
+				       &jcore_aic_irqdomain_ops,
 				       &jcore_aic);
if (!domain)
return -ENOMEM;
-   irq_create_strict_mappings(domain, min_irq, min_irq, dom_sz - min_irq);
 
return 0;
 }
-- 
2.29.2



[PATCH 2/9] ARM: PXA: Kill use of irq_create_strict_mappings()

2021-04-06 Thread Marc Zyngier
irq_create_strict_mappings() is a poor way to allow the use of
a linear IRQ domain as a legacy one. Let's be upfront about
it and use a legacy domain when appropriate.

Signed-off-by: Marc Zyngier 
---
 arch/arm/mach-pxa/pxa_cplds_irqs.c | 24 +++-
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/arm/mach-pxa/pxa_cplds_irqs.c b/arch/arm/mach-pxa/pxa_cplds_irqs.c
index 45c19ca96f7a..ec0d9b094744 100644
--- a/arch/arm/mach-pxa/pxa_cplds_irqs.c
+++ b/arch/arm/mach-pxa/pxa_cplds_irqs.c
@@ -147,22 +147,20 @@ static int cplds_probe(struct platform_device *pdev)
}
 
irq_set_irq_wake(fpga->irq, 1);
-   fpga->irqdomain = irq_domain_add_linear(pdev->dev.of_node,
-  CPLDS_NB_IRQ,
-					       &cplds_irq_domain_ops, fpga);
+   if (base_irq)
+   fpga->irqdomain = irq_domain_add_legacy(pdev->dev.of_node,
+   CPLDS_NB_IRQ,
+   base_irq, 0,
+							&cplds_irq_domain_ops,
+   fpga);
+   else
+   fpga->irqdomain = irq_domain_add_linear(pdev->dev.of_node,
+   CPLDS_NB_IRQ,
+							&cplds_irq_domain_ops,
+   fpga);
if (!fpga->irqdomain)
return -ENODEV;
 
-   if (base_irq) {
-   ret = irq_create_strict_mappings(fpga->irqdomain, base_irq, 0,
-CPLDS_NB_IRQ);
-   if (ret) {
-			dev_err(&pdev->dev, "couldn't create the irq mapping %d..%d\n",
-				base_irq, base_irq + CPLDS_NB_IRQ);
-   return ret;
-   }
-   }
-
return 0;
 }
 
-- 
2.29.2



Re: [PATCH v6 30/48] KVM: PPC: Book3S HV P9: Implement the rest of the P9 path in C

2021-04-06 Thread Nicholas Piggin
Excerpts from Paul Mackerras's message of April 6, 2021 5:27 pm:
> On Mon, Apr 05, 2021 at 11:19:30AM +1000, Nicholas Piggin wrote:
>> Almost all logic is moved to C, by introducing a new in_guest mode for
>> the P9 path that branches very early in the KVM interrupt handler to
>> P9 exit code.
>> 
>> The main P9 entry and exit assembly is now only about 160 lines of low
>> level stack setup and register save/restore, plus a bad-interrupt
>> handler.
>> 
>> There are two motivations for this, the first is just make the code more
>> maintainable being in C. The second is to reduce the amount of code
>> running in a special KVM mode, "realmode". In quotes because with radix
>> it is no longer necessarily real-mode in the MMU, but it still has to be
>> treated specially because it may be in real-mode, and has various
>> important registers like PID, DEC, TB, etc set to guest. This is hostile
>> to the rest of Linux and can't use arbitrary kernel functionality or be
>> instrumented well.
>> 
>> This initial patch is a reasonably faithful conversion of the asm code,
>> but it does lack any loop to return quickly back into the guest without
>> switching out of realmode in the case of unimportant or easily handled
>> interrupts. As explained in previous changes, handling HV interrupts
>> in real mode is not so important for P9.
>> 
>> Use of Linux 64s interrupt entry code register conventions including
>> paca EX_ save areas are brought into the KVM code. There is no point
>> shuffling things into different paca save areas and making up a
>> different calling convention for KVM.
>> 
>> Signed-off-by: Nicholas Piggin 
> 
> [snip]
> 
>> +/*
>> + * Took an interrupt somewhere right before HRFID to guest, so registers are
>> + * in a bad way. Return things hopefully enough to run host virtual code and
>> + * run the Linux interrupt handler (SRESET or MCE) to print something useful.
>> + *
>> + * We could be really clever and save all host registers in known locations
>> + * before setting HSTATE_IN_GUEST, then restoring them all here, and setting
>> + * return address to a fixup that sets them up again. But that's a lot of
>> + * effort for a small bit of code. Lots of other things to do first.
>> + */
>> +kvmppc_p9_bad_interrupt:
>> +/*
>> + * Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
>> + * MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
>> + */
>> +li  r10,KVM_GUEST_MODE_NONE
>> +stb r10,HSTATE_IN_GUEST(r13)
>> +li  r10,MSR_RI
>> +	andc	r12,r12,r10
>> +
>> +/*
>> + * Clean up guest registers to give host a chance to run.
>> + */
>> +li  r10,0
>> +mtspr   SPRN_AMR,r10
>> +mtspr   SPRN_IAMR,r10
>> +mtspr   SPRN_CIABR,r10
>> +mtspr   SPRN_DAWRX0,r10
>> +BEGIN_FTR_SECTION
>> +mtspr   SPRN_DAWRX1,r10
>> +END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
>> +mtspr   SPRN_PID,r10
>> +
>> +/*
>> + * Switch to host MMU mode
>> + */
>> +ld  r10, HSTATE_KVM_VCPU(r13)
>> +ld  r10, VCPU_KVM(r10)
>> +lwz r10, KVM_HOST_LPID(r10)
>> +mtspr   SPRN_LPID,r10
>> +
>> +ld  r10, HSTATE_KVM_VCPU(r13)
>> +ld  r10, VCPU_KVM(r10)
>> +ld  r10, KVM_HOST_LPCR(r10)
>> +mtspr   SPRN_LPCR,r10
>> +
>> +/*
>> + * Go back to interrupt handler
>> + */
>> +ld  r10,HSTATE_SCRATCH0(r13)
>> +cmpwi   r10,BOOK3S_INTERRUPT_MACHINE_CHECK
>> +beq machine_check_common
>> +
>> +ld  r10,HSTATE_SCRATCH0(r13)
>> +cmpwi   r10,BOOK3S_INTERRUPT_SYSTEM_RESET
>> +beq system_reset_common
>> +
>> +b   .
> 
> So you only handle machine check and system reset here?  I would think
> that program check would also be useful, for the cases where people
> put BUG_ON in sensitive places (see below).  DSI and ISI could also be
> useful for the null pointer dereference cases, I would think.

Those ones have their own stack, so they are a bit simpler to run (and
they obviously have to be handled, as they are NMIs). I'll see if we
can do something to improve the others a bit. Maybe just calling program
check for any other exception would work, making sure that it uses
the emergency stack rather than something that looks like a kernel
stack but is actually a guest value. I'll see what we can get to work.

>> +static inline void mtslb(unsigned int idx, u64 slbee, u64 slbev)
>> +{
>> +BUG_ON((slbee & 0xfff) != idx);
>> +
>> +asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
>> +}
> 
> Using BUG_ON here feels dangerous, and the condition it is testing is
> certainly not one where the host kernel is in such trouble that it
> can't continue to run.  If the index was wrong then at worst the guest
> kernel would be in trouble.  So I don't believe BUG_ON is appropriate.

Yeah good point, some of it was a bit of development paranoia but I 
do have to go through and tighten these up.

>> +
>> +/*
>> + * Malicious or buggy radix guests may have 

Re: [PATCH v6 6/9] openrisc: qspinlock: Add ARCH_USE_QUEUED_SPINLOCKS_XCHG32

2021-04-06 Thread Stafford Horne
On Wed, Mar 31, 2021 at 02:30:37PM +, guo...@kernel.org wrote:
> From: Guo Ren 
> 
> We don't have a native hw xchg16 instruction, so let the qspinlock
> generic code deal with it.
> 
> Using full-word atomic xchg instructions to implement xchg16 carries
> a semantic risk for atomic operations.
> 
> This patch removes the qspinlock generic code's dependency on the
> architecture's xchg16.
> 
> Signed-off-by: Guo Ren 
> Cc: Arnd Bergmann 
> Cc: Jonas Bonn 
> Cc: Stefan Kristiansson 
> Cc: Stafford Horne 
> Cc: openr...@lists.librecores.org

Acked-by: Stafford Horne 

> ---
>  arch/openrisc/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
> index 591acc5990dc..b299e409429f 100644
> --- a/arch/openrisc/Kconfig
> +++ b/arch/openrisc/Kconfig
> @@ -33,6 +33,7 @@ config OPENRISC
>   select OR1K_PIC
>   select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
>   select ARCH_USE_QUEUED_SPINLOCKS
> + select ARCH_USE_QUEUED_SPINLOCKS_XCHG32
>   select ARCH_USE_QUEUED_RWLOCKS
>   select OMPIC if SMP
>   select ARCH_WANT_FRAME_POINTERS
> -- 
> 2.17.1
> 


Re: [PATCH v6 30/48] KVM: PPC: Book3S HV P9: Implement the rest of the P9 path in C

2021-04-06 Thread Paul Mackerras
On Mon, Apr 05, 2021 at 11:19:30AM +1000, Nicholas Piggin wrote:
> Almost all logic is moved to C, by introducing a new in_guest mode for
> the P9 path that branches very early in the KVM interrupt handler to
> P9 exit code.
> 
> The main P9 entry and exit assembly is now only about 160 lines of low
> level stack setup and register save/restore, plus a bad-interrupt
> handler.
> 
> There are two motivations for this, the first is just make the code more
> maintainable being in C. The second is to reduce the amount of code
> running in a special KVM mode, "realmode". In quotes because with radix
> it is no longer necessarily real-mode in the MMU, but it still has to be
> treated specially because it may be in real-mode, and has various
> important registers like PID, DEC, TB, etc set to guest. This is hostile
> to the rest of Linux and can't use arbitrary kernel functionality or be
> instrumented well.
> 
> This initial patch is a reasonably faithful conversion of the asm code,
> but it does lack any loop to return quickly back into the guest without
> switching out of realmode in the case of unimportant or easily handled
> interrupts. As explained in previous changes, handling HV interrupts
> in real mode is not so important for P9.
> 
> Use of Linux 64s interrupt entry code register conventions including
> paca EX_ save areas are brought into the KVM code. There is no point
> shuffling things into different paca save areas and making up a
> different calling convention for KVM.
> 
> Signed-off-by: Nicholas Piggin 

[snip]

> +/*
> + * Took an interrupt somewhere right before HRFID to guest, so registers are
> + * in a bad way. Return things hopefully enough to run host virtual code and
> + * run the Linux interrupt handler (SRESET or MCE) to print something useful.
> + *
> + * We could be really clever and save all host registers in known locations
> + * before setting HSTATE_IN_GUEST, then restoring them all here, and setting
> + * return address to a fixup that sets them up again. But that's a lot of
> + * effort for a small bit of code. Lots of other things to do first.
> + */
> +kvmppc_p9_bad_interrupt:
> + /*
> +  * Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
> +  * MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
> +  */
> + li  r10,KVM_GUEST_MODE_NONE
> + stb r10,HSTATE_IN_GUEST(r13)
> + li  r10,MSR_RI
> +	andc	r12,r12,r10
> +
> + /*
> +  * Clean up guest registers to give host a chance to run.
> +  */
> + li  r10,0
> + mtspr   SPRN_AMR,r10
> + mtspr   SPRN_IAMR,r10
> + mtspr   SPRN_CIABR,r10
> + mtspr   SPRN_DAWRX0,r10
> +BEGIN_FTR_SECTION
> + mtspr   SPRN_DAWRX1,r10
> +END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
> + mtspr   SPRN_PID,r10
> +
> + /*
> +  * Switch to host MMU mode
> +  */
> + ld  r10, HSTATE_KVM_VCPU(r13)
> + ld  r10, VCPU_KVM(r10)
> + lwz r10, KVM_HOST_LPID(r10)
> + mtspr   SPRN_LPID,r10
> +
> + ld  r10, HSTATE_KVM_VCPU(r13)
> + ld  r10, VCPU_KVM(r10)
> + ld  r10, KVM_HOST_LPCR(r10)
> + mtspr   SPRN_LPCR,r10
> +
> + /*
> +  * Go back to interrupt handler
> +  */
> + ld  r10,HSTATE_SCRATCH0(r13)
> + cmpwi   r10,BOOK3S_INTERRUPT_MACHINE_CHECK
> + beq machine_check_common
> +
> + ld  r10,HSTATE_SCRATCH0(r13)
> + cmpwi   r10,BOOK3S_INTERRUPT_SYSTEM_RESET
> + beq system_reset_common
> +
> + b   .

So you only handle machine check and system reset here?  I would think
that program check would also be useful, for the cases where people
put BUG_ON in sensitive places (see below).  DSI and ISI could also be
useful for the null pointer dereference cases, I would think.

> +static inline void mtslb(unsigned int idx, u64 slbee, u64 slbev)
> +{
> + BUG_ON((slbee & 0xfff) != idx);
> +
> + asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
> +}

Using BUG_ON here feels dangerous, and the condition it is testing is
certainly not one where the host kernel is in such trouble that it
can't continue to run.  If the index was wrong then at worst the guest
kernel would be in trouble.  So I don't believe BUG_ON is appropriate.
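
A softer variant, purely illustrative and not taken from the series, would
be to warn once and skip the update instead of bringing the host down:

	static inline void mtslb(unsigned int idx, u64 slbee, u64 slbev)
	{
		/* Hypothetical alternative to BUG_ON: warn and bail out */
		if (WARN_ON_ONCE((slbee & 0xfff) != idx))
			return;

		asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
	}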

> +
> +/*
> + * Malicious or buggy radix guests may have inserted SLB entries
> + * (only 0..3 because radix always runs with UPRT=1), so these must
> + * be cleared here to avoid side-channels. slbmte is used rather
> + * than slbia, as it won't clear cached translations.
> + */
> +static void radix_clear_slb(void)
> +{
> + u64 slbee, slbev;
> + int i;
> +
> + for (i = 0; i < 4; i++) {
> +		mfslb(i, &slbee, &slbev);
> + if (unlikely(slbee || slbev)) {
> + slbee = i;
> + slbev = 0;
> + mtslb(i, slbee, slbev);
> + }
> + }

Are four slbmfee + slbmfev really faster than four slbmte?

Paul.


Re: [PATCH v6 16/48] KVM: PPC: Book3S 64: Move interrupt early register setup to KVM

2021-04-06 Thread Nicholas Piggin
Excerpts from Paul Mackerras's message of April 6, 2021 2:37 pm:
> On Mon, Apr 05, 2021 at 11:19:16AM +1000, Nicholas Piggin wrote:
>> Like the earlier patch for hcalls, KVM interrupt entry requires a
>> different calling convention than the Linux interrupt handlers
>> set up. Move the code that converts from one to the other into KVM.
> 
> I don't see where you do anything to enable the new KVM entry code to
> access the PACA_EXSLB area when handling DSegI and ISegI interrupts.
> Have I missed something, or are you not testing PR KVM at all?

We just got rid of PACA_EXSLB, commit ac7c5e9b08ac ("powerpc/64s:
Remove EXSLB interrupt save area").

Thanks,
Nick


[PATCH] swim3: support highmem

2021-04-06 Thread Christoph Hellwig
swim3 only uses the virtual address of a bio to stash it into the data
transfer using virt_to_bus.  But the ppc32 virt_to_bus just uses the
physical address with an offset.  Replace virt_to_bus with a local hack
that performs the equivalent transformation and stop asking for block
layer bounce buffering.
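
To make the equivalence explicit — a sketch, not part of the patch;
swim3_bio_bus_addr is a hypothetical helper composing the two functions
added below, and it assumes ppc32's virt_to_bus really is just the physical
address plus PCI_DRAM_OFFSET, as described above:

	/* Sketch only: a bio's bus address can be computed without ever
	 * needing a kernel virtual mapping, which is what makes highmem
	 * work and lets the BLK_BOUNCE_HIGH limit go away.
	 */
	static unsigned long swim3_bio_bus_addr(struct bio *bio)
	{
		return swim3_phys_to_bus(swim3_bio_phys(bio));
	}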

Signed-off-by: Christoph Hellwig 
---
 drivers/block/swim3.c | 34 +-
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c2d922d125e281..a515d0c1d2cb8e 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -234,7 +234,6 @@ static unsigned short write_postamble[] = {
 };
 
 static void seek_track(struct floppy_state *fs, int n);
-static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count);
 static void act(struct floppy_state *fs);
 static void scan_timeout(struct timer_list *t);
 static void seek_timeout(struct timer_list *t);
@@ -404,12 +403,28 @@ static inline void seek_track(struct floppy_state *fs, int n)
fs->settle_time = 0;
 }
 
+/*
+ * XXX: this is a horrible hack, but at least allows ppc32 to get
+ * out of defining virt_to_bus, and this driver out of using the
+ * deprecated block layer bounce buffering for highmem addresses
+ * for no good reason.
+ */
+static unsigned long swim3_phys_to_bus(phys_addr_t paddr)
+{
+   return paddr + PCI_DRAM_OFFSET;
+}
+
+static phys_addr_t swim3_bio_phys(struct bio *bio)
+{
+   return page_to_phys(bio_page(bio)) + bio_offset(bio);
+}
+
 static inline void init_dma(struct dbdma_cmd *cp, int cmd,
-   void *buf, int count)
+   phys_addr_t paddr, int count)
 {
cp->req_count = cpu_to_le16(count);
cp->command = cpu_to_le16(cmd);
-   cp->phy_addr = cpu_to_le32(virt_to_bus(buf));
+   cp->phy_addr = cpu_to_le32(swim3_phys_to_bus(paddr));
cp->xfer_status = 0;
 }
 
@@ -441,16 +456,18 @@ static inline void setup_transfer(struct floppy_state *fs)
 	out_8(&sw->sector, fs->req_sector);
 	out_8(&sw->nsect, n);
 	out_8(&sw->gap3, 0);
-	out_le32(&dr->cmdptr, virt_to_bus(cp));
+	out_le32(&dr->cmdptr, swim3_phys_to_bus(virt_to_phys(cp)));
if (rq_data_dir(req) == WRITE) {
/* Set up 3 dma commands: write preamble, data, postamble */
-		init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
+		init_dma(cp, OUTPUT_MORE, virt_to_phys(write_preamble),
+			 sizeof(write_preamble));
++cp;
-   init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
+   init_dma(cp, OUTPUT_MORE, swim3_bio_phys(req->bio), 512);
++cp;
-		init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
+		init_dma(cp, OUTPUT_LAST, virt_to_phys(write_postamble),
+			 sizeof(write_postamble));
} else {
-   init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
+   init_dma(cp, INPUT_LAST, swim3_bio_phys(req->bio), n * 512);
}
++cp;
 	out_le16(&cp->command, DBDMA_STOP);
@@ -1201,7 +1218,6 @@ static int swim3_attach(struct macio_dev *mdev,
disk->queue = NULL;
goto out_put_disk;
}
-   blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
disk->queue->queuedata = fs;
 
rc = swim3_add_device(mdev, floppy_count);
-- 
2.30.1