[PATCH 12/19] kvm/book3s_64_vio: Convert account_locked_vm() to vm_account_pinned()

2023-02-05 Thread Alistair Popple
book3s_64_vio currently accounts for pinned pages with
account_locked_vm() which charges the pages to mm->locked_vm. To make
this consistent with other drivers switch to using
vm_account_pinned().

Signed-off-by: Alistair Popple 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Paolo Bonzini 
Cc: Alexey Kardashevskiy 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-ker...@vger.kernel.org
Cc: k...@vger.kernel.org
---
 arch/powerpc/kvm/book3s_64_vio.c | 10 +-
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c  |  3 +++
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 95e738e..ecd1deb 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -273,8 +273,8 @@ static int kvm_spapr_tce_release(struct inode *inode, 
struct file *filp)
}
}
 
-   account_locked_vm(kvm->mm,
-   kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+   vm_unaccount_pinned(&kvm->vm_account,
+   kvmppc_stt_pages(kvmppc_tce_pages(stt->size)));
 
kvm_put_kvm(stt->kvm);
 
@@ -301,8 +301,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
return -EINVAL;
 
-   npages = kvmppc_tce_pages(args->size);
-   ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true);
+   npages = kvmppc_tce_pages(size);
+   ret = vm_account_pinned(&kvm->vm_account, kvmppc_stt_pages(npages));
if (ret)
return ret;
 
@@ -347,7 +347,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
kfree(stt);
  fail_acct:
-   account_locked_vm(mm, kvmppc_stt_pages(npages), false);
+   vm_unaccount_pinned(&kvm->vm_account, kvmppc_stt_pages(npages));
return ret;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4f26b24..bd7a7be 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include <linux/vm_account.h>
 #include 
 
 #include 
@@ -717,6 +718,7 @@ struct kvm {
 */
struct mutex slots_arch_lock;
struct mm_struct *mm; /* userspace tied to this vm */
+   struct vm_account vm_account;
unsigned long nr_memslot_pages;
/* The two memslot sets - active and inactive (per address space) */
struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2];
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9c60384..770d037 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1142,6 +1142,7 @@ static struct kvm *kvm_create_vm(unsigned long type, 
const char *fdname)
KVM_MMU_LOCK_INIT(kvm);
mmgrab(current->mm);
kvm->mm = current->mm;
+   vm_account_init_current(&kvm->vm_account);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
@@ -1258,6 +1259,7 @@ static struct kvm *kvm_create_vm(unsigned long type, 
const char *fdname)
 out_err_no_srcu:
kvm_arch_free_vm(kvm);
mmdrop(current->mm);
+   vm_account_release(&kvm->vm_account);
module_put(kvm_chardev_ops.owner);
return ERR_PTR(r);
 }
@@ -1327,6 +1329,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
preempt_notifier_dec();
hardware_disable_all();
mmdrop(mm);
+   vm_account_release(&kvm->vm_account);
module_put(kvm_chardev_ops.owner);
 }
 
-- 
git-series 0.9.1


[PATCH 08/19] vfio/spapr_tce: Convert accounting to pinned_vm

2023-02-05 Thread Alistair Popple
Convert from accounting pages against locked_vm to accounting them to
pinned_vm. This allows struct vm_account to be used to track the
mm_struct used to charge the pages. A future change also uses this to
track a cgroup for controlling pinned pages.

Signed-off-by: Alistair Popple 
Cc: Michael Ellerman 
Cc: Nicholas Piggin 
Cc: Christophe Leroy 
Cc: Alex Williamson 
Cc: Cornelia Huck 
Cc: Alexey Kardashevskiy 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-ker...@vger.kernel.org
Cc: k...@vger.kernel.org
---
 arch/powerpc/mm/book3s64/iommu_api.c | 30 ++---
 drivers/vfio/vfio_iommu_spapr_tce.c  | 16 ++-
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
b/arch/powerpc/mm/book3s64/iommu_api.c
index 7fcfba1..338b111 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include <linux/vm_account.h>
 
 static DEFINE_MUTEX(mem_list_mutex);
 
@@ -30,6 +31,7 @@ struct mm_iommu_table_group_mem_t {
unsigned long used;
atomic64_t mapped;
unsigned int pageshift;
+   struct vm_account vm_account;
u64 ua; /* userspace address */
u64 entries;/* number of entries in hpas/hpages[] */
/*
@@ -62,20 +64,24 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
unsigned long ua,
unsigned int pageshift;
unsigned long entry, chunk;
 
-   if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
-   ret = account_locked_vm(mm, entries, true);
-   if (ret)
-   return ret;
-
-   locked_entries = entries;
-   }
-
mem = kzalloc(sizeof(*mem), GFP_KERNEL);
if (!mem) {
ret = -ENOMEM;
goto unlock_exit;
}
 
+   vm_account_init_current(&mem->vm_account);
+   if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
+   ret = vm_account_pinned(&mem->vm_account, entries);
+   if (ret) {
+   vm_account_release(&mem->vm_account);
+   kfree(mem);
+   return ret;
+   }
+
+   locked_entries = entries;
+   }
+
if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
mem->dev_hpa = dev_hpa;
@@ -175,10 +181,11 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
unsigned long ua,
unpin_user_pages(mem->hpages, pinned);
 
vfree(mem->hpas);
-   kfree(mem);
 
 unlock_exit:
-   account_locked_vm(mm, locked_entries, false);
+   vm_unaccount_pinned(&mem->vm_account, locked_entries);
+   vm_account_release(&mem->vm_account);
+   kfree(mem);
 
return ret;
 }
@@ -229,6 +236,7 @@ static void mm_iommu_do_free(struct 
mm_iommu_table_group_mem_t *mem)
 
mm_iommu_unpin(mem);
vfree(mem->hpas);
+   vm_account_release(&mem->vm_account);
kfree(mem);
 }
 
@@ -279,7 +287,7 @@ long mm_iommu_put(struct mm_struct *mm, struct 
mm_iommu_table_group_mem_t *mem)
 unlock_exit:
mutex_unlock(&mem_list_mutex);
 
-   account_locked_vm(mm, unlock_entries, false);
+   vm_unaccount_pinned(&mem->vm_account, unlock_entries);
 
return ret;
 }
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 60a50ce..454ccc4 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <linux/vm_account.h>
 #include "vfio.h"
 
 #include 
@@ -67,6 +68,7 @@ struct tce_container {
bool def_window_pending;
unsigned long locked_pages;
struct mm_struct *mm;
+   struct vm_account vm_account;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct list_head group_list;
struct list_head prereg_list;
@@ -82,6 +84,7 @@ static long tce_iommu_mm_set(struct tce_container *container)
BUG_ON(!current->mm);
container->mm = current->mm;
mmgrab(container->mm);
+   vm_account_init_current(&container->vm_account);
 
return 0;
 }
@@ -291,7 +294,7 @@ static int tce_iommu_enable(struct tce_container *container)
return ret;
 
locked = table_group->tce32_size >> PAGE_SHIFT;
-   ret = account_locked_vm(container->mm, locked, true);
+   ret = vm_account_pinned(&container->vm_account, locked);
if (ret)
return ret;
 
@@ -310,7 +313,7 @@ static void tce_iommu_disable(struct tce_container 
*container)
container->enabled = false;
 
BUG_ON(!container->mm);
-   account_locked_vm(container->mm, container->locked_pages, false);
+   vm_unaccount_pinned(&container->vm_account, container->locked_pages);
 }
 
 static void *tce_iommu_open(unsigned long arg)
@@ -372,8 +375,10 @@ static void tce_iommu_release(void *iommu_data)
WARN_ON(tce_iommu_prereg_free(container, 

[PATCH 01/19] mm: Introduce vm_account

2023-02-05 Thread Alistair Popple
Kernel drivers that pin pages should account these pages against
user->locked_vm and/or mm->pinned_vm and fail the pinning if
RLIMIT_MEMLOCK is exceeded and CAP_IPC_LOCK isn't held.

Currently drivers open-code this accounting and use various methods to
update the atomic variables and check against the limits leading to
various bugs and inconsistencies. To fix this introduce a standard
interface for charging pinned and locked memory. As this involves
taking references on kernel objects such as mm_struct or user_struct
we introduce a new vm_account struct to hold these references. Several
helper functions are then introduced to grab references and check
limits.

As the way these limits are charged and enforced is visible to
userspace we need to be careful not to break existing applications by
charging to different counters. As a result the vm_account functions
support accounting to different counters as required.

A future change will extend this to also account against a cgroup for
pinned pages.

Signed-off-by: Alistair Popple 
Cc: linux-ker...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-f...@vger.kernel.org
Cc: linux-r...@vger.kernel.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: k...@vger.kernel.org
Cc: net...@vger.kernel.org
Cc: cgro...@vger.kernel.org
Cc: io-ur...@vger.kernel.org
Cc: linux...@kvack.org
Cc: b...@vger.kernel.org
Cc: rds-de...@oss.oracle.com
Cc: linux-kselft...@vger.kernel.org
---
 include/linux/vm_account.h |  56 +-
 mm/util.c  | 127 ++-
 2 files changed, 183 insertions(+)
 create mode 100644 include/linux/vm_account.h

diff --git a/include/linux/vm_account.h b/include/linux/vm_account.h
new file mode 100644
index 000..b4b2e90
--- /dev/null
+++ b/include/linux/vm_account.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_VM_ACCOUNT_H
+#define _LINUX_VM_ACCOUNT_H
+
+/**
+ * enum vm_account_flags - Determine how pinned/locked memory is accounted.
+ * @VM_ACCOUNT_TASK: Account pinned memory to mm->pinned_vm.
+ * @VM_ACCOUNT_BYPASS: Don't enforce rlimit on any charges.
+ * @VM_ACCOUNT_USER: Account locked memory to user->locked_vm.
+ *
+ * Determines which statistic pinned/locked memory is accounted
+ * against. All limits will be enforced against RLIMIT_MEMLOCK and the
+ * pins cgroup if CONFIG_CGROUP_PINS is enabled.
+ *
+ * New drivers should use VM_ACCOUNT_USER. VM_ACCOUNT_TASK is used by
+ * pre-existing drivers to maintain existing accounting against
+ * mm->pinned_vm rather than user->locked_vm.
+ *
+ * VM_ACCOUNT_BYPASS may also be specified to bypass rlimit
+ * checks. Typically this is used to cache CAP_IPC_LOCK from when a
+ * driver is first initialised. Note that this does not bypass cgroup
+ * limit checks.
+ */
+enum vm_account_flags {
+   VM_ACCOUNT_TASK = 0,
+   VM_ACCOUNT_BYPASS = 1,
+   VM_ACCOUNT_USER = 2,
+};
+
+struct vm_account {
+   struct task_struct *task;
+   struct mm_struct *mm;
+   struct user_struct *user;
+   enum vm_account_flags flags;
+};
+
+void vm_account_init(struct vm_account *vm_account, struct task_struct *task,
+   struct user_struct *user, enum vm_account_flags flags);
+
+/**
+ * vm_account_init_current - Initialise a new struct vm_account.
+ * @vm_account: pointer to uninitialised vm_account.
+ *
+ * Helper to initialise a vm_account for the common case of charging
+ * with VM_ACCOUNT_TASK against current.
+ */
+static inline void vm_account_init_current(struct vm_account *vm_account)
+{
+   vm_account_init(vm_account, current, NULL, VM_ACCOUNT_TASK);
+}
+
+void vm_account_release(struct vm_account *vm_account);
+int vm_account_pinned(struct vm_account *vm_account, unsigned long npages);
+void vm_unaccount_pinned(struct vm_account *vm_account, unsigned long npages);
+
+#endif /* _LINUX_VM_ACCOUNT_H */
diff --git a/mm/util.c b/mm/util.c
index b56c92f..d8c19f8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <linux/vm_account.h>
 
 #include 
 
@@ -431,6 +432,132 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct 
rlimit *rlim_stack)
 #endif
 
 /**
+ * vm_account_init - Initialise a new struct vm_account.
+ * @vm_account: pointer to uninitialised vm_account.
+ * @task: task to charge against.
+ * @user: user to charge against. Must be non-NULL for VM_ACCOUNT_USER.
+ * @flags: flags to use when charging to vm_account.
+ *
+ * Initialise a new uninitialised struct vm_account. Takes references
+ * on the task/mm/user/cgroup as required although callers must ensure
+ * any references passed in remain valid for the duration of this
+ * call.
+ */
+void vm_account_init(struct vm_account *vm_account, struct task_struct *task,
+   struct user_struct *user, enum vm_account_flags flags)
+{
+   vm_account->task = get_task_struct(task);
+
+   if (flags & VM_ACCOUNT_USER)
+   vm_account->user = get_uid(user);
+
+   

Re: [PATCH v8 4/8] crash: add phdr for possible CPUs in elfcorehdr

2023-02-05 Thread Sourabh Jain



On 03/02/23 02:31, Eric DeVolder wrote:



On 2/2/23 09:37, Eric DeVolder wrote:



On 2/1/23 00:38, Sourabh Jain wrote:

On architectures like PowerPC the crash notes are available for all
possible CPUs. So let's populate the elfcorehdr for all possible
CPUs having crash notes to avoid updating elfcorehdr during in-kernel
crash update on CPU hotplug events.

A similar technique is used in kexec-tools for the kexec_load case.

Signed-off-by: Sourabh Jain 
---
  kernel/crash_core.c | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 37c594858fd51..898d8d2fe2e2e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -364,8 +364,8 @@ int crash_prepare_elf64_headers(struct kimage 
*image, struct crash_mem *mem,

  ehdr->e_ehsize = sizeof(Elf64_Ehdr);
  ehdr->e_phentsize = sizeof(Elf64_Phdr);
-    /* Prepare one phdr of type PT_NOTE for each present CPU */
-    for_each_present_cpu(cpu) {
+    /* Prepare one phdr of type PT_NOTE for possible CPU with crash 
note. */

+    for_each_possible_cpu(cpu) {


Sourabh,
Thomas Gleixner is suggesting moving away from for_each_present_cpu() 
to for_each_online_cpu(). Using for_each_online_cpu() is going to the 
minimum number of needed, whereas your approach of 
for_each_possible_cpu() would be to the maximum number needed.


What would be the ramifications to ppc for moving towards 
for_each_online_cpu()?


In my next patch series, I have finally figured out how to use cpuhp 
framework to where it is possible to use for_each_online_cpu() here, 
but that is at odds with your changes here.


Thanks,
eric


Without knowing the ramifications of changing to 
for_each_online_cpu(), I currently am

using the following:

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index e1a3430f06f4..a019b691d974 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -366,6 +366,9 @@ int crash_prepare_elf64_headers(struct crash_mem 
*mem, int n


    /* Prepare one phdr of type PT_NOTE for each present CPU */
    for_each_present_cpu(cpu) {
+   if (IS_ENABLED(CONFIG_CRASH_HOTPLUG)) {
+   if (!cpu_online(cpu)) continue;
+   }


How about let the arch decide the list of CPUs they want to pack in? 
Because on
PowerPC the crash notes are created for possible CPUs and we can utilize 
this

to avoid re-generating elfcorehdr for every hotplug operation.



phdr->p_type = PT_NOTE;
    notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
    phdr->p_offset = phdr->p_paddr = notes_addr;

Thomas points out that the above can be simply the 
for_each_online_cpu(), but again

I'm not sure how that impacts ppc,

which appears to layout all possible cpus rather
than just online ones. How are present but not online cpus handled by 
crash analysis

utility?


As per my testing all worked fine if we replace for_each_present_cpu 
with for_each_online_cpu
but again I don't know the reason why it worked. I will investigate it 
and let you know.


How packing PT_LOAD for present CPU is impacting x86? Because on PowerPC
when system is on crash path it only populates the crash notes for 
online CPUs, and skip

all other CPUs.

- Sourabh Jain


Re: [PATCH v8 4/8] crash: add phdr for possible CPUs in elfcorehdr

2023-02-05 Thread Sourabh Jain



On 02/02/23 21:07, Eric DeVolder wrote:



On 2/1/23 00:38, Sourabh Jain wrote:

On architectures like PowerPC the crash notes are available for all
possible CPUs. So let's populate the elfcorehdr for all possible
CPUs having crash notes to avoid updating elfcorehdr during in-kernel
crash update on CPU hotplug events.

A similar technique is used in kexec-tools for the kexec_load case.

Signed-off-by: Sourabh Jain 
---
  kernel/crash_core.c | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 37c594858fd51..898d8d2fe2e2e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -364,8 +364,8 @@ int crash_prepare_elf64_headers(struct kimage 
*image, struct crash_mem *mem,

  ehdr->e_ehsize = sizeof(Elf64_Ehdr);
  ehdr->e_phentsize = sizeof(Elf64_Phdr);
  -    /* Prepare one phdr of type PT_NOTE for each present CPU */
-    for_each_present_cpu(cpu) {
+    /* Prepare one phdr of type PT_NOTE for possible CPU with crash 
note. */

+    for_each_possible_cpu(cpu) {


Sourabh,
Thomas Gleixner is suggesting moving away from for_each_present_cpu() 
to for_each_online_cpu(). Using for_each_online_cpu() is going to the 
minimum number of needed, whereas your approach of 
for_each_possible_cpu() would be to the maximum number needed.


What would be the ramifications to ppc for moving towards 
for_each_online_cpu()?


In my next patch series, I have finally figured out how to use cpuhp 
framework to where it is possible to use for_each_online_cpu() here, 
but that is at odds with your changes here.
I was under the impression that if CPU notes are missing for offline CPUs 
in the elfcorehdr then makedumpfile will mess up the

CPU IDs.

But somehow replacing for_each_present_cpu with for_each_online_cpu 
worked on PowerPC, even after disabling a couple of CPUs.


So things are fine if we pack PT_LOAD for online CPUs instead of present 
CPUs but,
I need to investigate how makedumpfile is able to map PT_LOAD to online 
CPUs.


- Sourabh Jain


Re: [PATCH v8 4/8] crash: add phdr for possible CPUs in elfcorehdr

2023-02-05 Thread Sourabh Jain



On 02/02/23 21:07, Eric DeVolder wrote:



On 2/1/23 00:38, Sourabh Jain wrote:

On architectures like PowerPC the crash notes are available for all
possible CPUs. So let's populate the elfcorehdr for all possible
CPUs having crash notes to avoid updating elfcorehdr during in-kernel
crash update on CPU hotplug events.

A similar technique is used in kexec-tools for the kexec_load case.

Signed-off-by: Sourabh Jain 
---
  kernel/crash_core.c | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 37c594858fd51..898d8d2fe2e2e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -364,8 +364,8 @@ int crash_prepare_elf64_headers(struct kimage 
*image, struct crash_mem *mem,

  ehdr->e_ehsize = sizeof(Elf64_Ehdr);
  ehdr->e_phentsize = sizeof(Elf64_Phdr);
  -    /* Prepare one phdr of type PT_NOTE for each present CPU */
-    for_each_present_cpu(cpu) {
+    /* Prepare one phdr of type PT_NOTE for possible CPU with crash 
note. */

+    for_each_possible_cpu(cpu) {


Sourabh,
Thomas Gleixner is suggesting moving away from for_each_present_cpu() 
to for_each_online_cpu(). Using for_each_online_cpu() is going to the 
minimum number of needed, whereas your approach of 
for_each_possible_cpu() would be to the maximum number needed.


What would be the ramifications to ppc for moving towards 
for_each_online_cpu()?


I was under the impression that if CPU notes are missing for offline CPUs 
in the elfcorehdr then makedumpfile will mess up the

CPU IDs.

But somehow replacing for_each_present_cpu with for_each_online_cpu 
worked on PowerPC, even after disabling a couple of CPUs.


So things are fine if we pack PT_LOAD for online CPUs instead of present 
CPUs but,
I need to investigate how makedumpfile is able to map PT_LOAD to online 
CPUs.


- Sourabh Jain



Re: [PATCH] tools/perf/tests: Add system wide check for perf bench workload in all metric test

2023-02-05 Thread Athira Rajeev



> On 02-Feb-2023, at 10:14 PM, Kajol Jain  wrote:
> 
> Testcase stat_all_metrics.sh fails in powerpc:
> 
> 92: perf all metrics test : FAILED!
> 
> Logs with verbose:
> 
> [command]# ./perf test 92 -vv
> 92: perf all metrics test   :
> --- start ---
> test child forked, pid 13262
> Testing BRU_STALL_CPI
> Testing COMPLETION_STALL_CPI
> 
> Testing TOTAL_LOCAL_NODE_PUMPS_P23
> Metric 'TOTAL_LOCAL_NODE_PUMPS_P23' not printed in:
> Error:
> Invalid event (hv_24x7/PM_PB_LNS_PUMP23,chip=3/) in per-thread mode, enable 
> system wide with '-a'.
> Testing TOTAL_LOCAL_NODE_PUMPS_RETRIES_P01
> Metric 'TOTAL_LOCAL_NODE_PUMPS_RETRIES_P01' not printed in:
> Error:
> Invalid event (hv_24x7/PM_PB_RTY_LNS_PUMP01,chip=3/) in per-thread mode, 
> enable system wide with '-a'.
> 
> 
> Based on above logs, we could see some of the hv-24x7 metric events fails,
> and logs suggest to run the metric event with -a option.
> This change happened after the commit a4b8cfcabb1d ("perf stat: Delay metric
> parsing"), which delayed the metric parsing phase and now before metric 
> parsing
> phase perf tool identifies, whether target is system-wide or not. With this
> change, perf_event_open will fails with workload monitoring for uncore events
> as expected.
> 
> The perf all metric test case fails as some of the hv-24x7 metric events
> may need bigger workload to get the data. And the added perf bench
> workload in 'perf all metric test case' will not run for hv-24x7 without 
> -a option.
> 
> Fix this issue by adding system wide check for perf bench workload.
> 
> Result with the patch changes in powerpc:
> 
> 92: perf all metrics test : Ok
> 
> Signed-off-by: Kajol Jain 

Looks good to me

Reviewed-by: Athira Rajeev 

> ---
> tools/perf/tests/shell/stat_all_metrics.sh | 7 +++
> 1 file changed, 7 insertions(+)
> 
> diff --git a/tools/perf/tests/shell/stat_all_metrics.sh 
> b/tools/perf/tests/shell/stat_all_metrics.sh
> index 6e79349e42be..d49832a316d9 100755
> --- a/tools/perf/tests/shell/stat_all_metrics.sh
> +++ b/tools/perf/tests/shell/stat_all_metrics.sh
> @@ -23,6 +23,13 @@ for m in $(perf list --raw-dump metrics); do
>   then
> continue
>   fi
> +  # Failed again, possibly the event is uncore pmu event which will need
> +  # system wide monitoring with workload, so retry with -a option
> +  result=$(perf stat -M "$m" -a perf bench internals synthesize 2>&1)
> +  if [[ "$result" =~ "${m:0:50}" ]]
> +  then
> +continue
> +  fi
>   echo "Metric '$m' not printed in:"
>   echo "$result"
>   if [[ "$err" != "1" ]]
> -- 
> 2.39.0
> 



[PATCH] powerpc/64s/interrupt: Fix interrupt exit race with security mitigation switch

2023-02-05 Thread Nicholas Piggin
The RFI and STF security mitigation options can flip the
interrupt_exit_not_reentrant static branch condition concurrently with
the interrupt exit code which tests that branch.

Interrupt exit tests this condition to set MSR[EE|RI] for exit, then
again in the case a soft-masked interrupt is found pending, to recover
the MSR so the interrupt can be replayed before attempting to exit
again. If the condition changes between these two tests, the MSR and irq
soft-mask state will become corrupted, leading to warnings and possible
crashes. For example, if the branch is initially true then false,
MSR[EE] will be 0 but PACA_IRQ_HARD_DIS clear and EE may not get
enabled, leading to warnings in irq_64.c.

Reported-by: Sachin Sant 
Tested-by: Sachin Sant 
Signed-off-by: Nicholas Piggin 
---
The static_branch condition should not be evaluated multiple times by
the caller when coded like this AFAIKS because if code patching is
disabled then it becomes an atomic_read of the key, and if it is enabled
then it should be done with asm volatile so the compiler should be
unable to expand it again.

Thanks,
Nick

 arch/powerpc/kernel/interrupt.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index fc6631a80527..0ec1581619db 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -50,16 +50,18 @@ static inline bool exit_must_hard_disable(void)
  */
 static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable)
 {
+   bool must_hard_disable = (exit_must_hard_disable() || !restartable);
+
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
 
-   if (exit_must_hard_disable() || !restartable)
+   if (must_hard_disable)
__hard_EE_RI_disable();
 
 #ifdef CONFIG_PPC64
/* This pattern matches prep_irq_for_idle */
if (unlikely(lazy_irq_pending_nocheck())) {
-   if (exit_must_hard_disable() || !restartable) {
+   if (must_hard_disable) {
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
__hard_RI_enable();
}
-- 
2.37.2



Re: [PATCH] tests/bpf: Fix the bpf test to check for libtraceevent support

2023-02-05 Thread Athira Rajeev



> On 02-Feb-2023, at 6:27 AM, Arnaldo Carvalho de Melo  wrote:
> 
> Em Tue, Jan 31, 2023 at 07:20:01PM +0530, Athira Rajeev escreveu:
>> "bpf" tests fails in environment with missing libtraceevent
>> support as below:
>> 
>> # ./perf test 36
>> 36: BPF filter  :
>> 36.1: Basic BPF filtering   : FAILED!
>> 36.2: BPF pinning   : FAILED!
>> 36.3: BPF prologue generation   : FAILED!
>> 
>> The environment has clang but missing the libtraceevent
>> devel. Hence perf is compiled without libtraceevent support.
> 
> Thanks, applied.

Hi,

Thanks for checking

Arnaldo, this is applied to tmp.perf/core branch ?

Athira 
> 
> - Arnaldo
> 
> 
>> Detailed logs:
>>  ./perf test -v "Basic BPF filtering"
>> 
>>  Failed to add BPF event syscalls:sys_enter_epoll_pwait
>>  bpf: tracepoint call back failed, stop iterate
>>  Failed to add events selected by BPF
>> 
>> The bpf tests tris to add probe event which fails
>> at "parse_events_add_tracepoint" function due to missing
>> libtraceevent. Add check for "HAVE_LIBTRACEEVENT" in the
>> "tests/bpf.c" before proceeding with the test.
>> 
>> With the change,
>> 
>>  # ./perf test 36
>>  36: BPF filter  :
>>  36.1: Basic BPF filtering   : 
>> Skip (not compiled in or missing libtraceevent support)
>>  36.2: BPF pinning   : 
>> Skip (not compiled in or missing libtraceevent support)
>>  36.3: BPF prologue generation   : 
>> Skip (not compiled in or missing libtraceevent support)
>> 
>> Signed-off-by: Athira Rajeev 
>> ---
>> tools/perf/tests/bpf.c | 22 +++---
>> 1 file changed, 11 insertions(+), 11 deletions(-)
>> 
>> diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c
>> index 17c023823713..4af39528f611 100644
>> --- a/tools/perf/tests/bpf.c
>> +++ b/tools/perf/tests/bpf.c
>> @@ -23,7 +23,7 @@
>> #define NR_ITERS   111
>> #define PERF_TEST_BPF_PATH "/sys/fs/bpf/perf_test"
>> 
>> -#ifdef HAVE_LIBBPF_SUPPORT
>> +#if defined(HAVE_LIBBPF_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
>> #include 
>> #include 
>> 
>> @@ -330,10 +330,10 @@ static int test__bpf(int i)
>> static int test__basic_bpf_test(struct test_suite *test __maybe_unused,
>>  int subtest __maybe_unused)
>> {
>> -#ifdef HAVE_LIBBPF_SUPPORT
>> +#if defined(HAVE_LIBBPF_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
>>  return test__bpf(0);
>> #else
>> -pr_debug("Skip BPF test because BPF support is not compiled\n");
>> +pr_debug("Skip BPF test because BPF or libtraceevent support is not 
>> compiled\n");
>>  return TEST_SKIP;
>> #endif
>> }
>> @@ -341,10 +341,10 @@ static int test__basic_bpf_test(struct test_suite 
>> *test __maybe_unused,
>> static int test__bpf_pinning(struct test_suite *test __maybe_unused,
>>   int subtest __maybe_unused)
>> {
>> -#ifdef HAVE_LIBBPF_SUPPORT
>> +#if defined(HAVE_LIBBPF_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
>>  return test__bpf(1);
>> #else
>> -pr_debug("Skip BPF test because BPF support is not compiled\n");
>> +pr_debug("Skip BPF test because BPF or libtraceevent support is not 
>> compiled\n");
>>  return TEST_SKIP;
>> #endif
>> }
>> @@ -352,17 +352,17 @@ static int test__bpf_pinning(struct test_suite *test 
>> __maybe_unused,
>> static int test__bpf_prologue_test(struct test_suite *test __maybe_unused,
>> int subtest __maybe_unused)
>> {
>> -#if defined(HAVE_LIBBPF_SUPPORT) && defined(HAVE_BPF_PROLOGUE)
>> +#if defined(HAVE_LIBBPF_SUPPORT) && defined(HAVE_BPF_PROLOGUE) && 
>> defined(HAVE_LIBTRACEEVENT)
>>  return test__bpf(2);
>> #else
>> -pr_debug("Skip BPF test because BPF support is not compiled\n");
>> +pr_debug("Skip BPF test because BPF or libtraceevent support is not 
>> compiled\n");
>>  return TEST_SKIP;
>> #endif
>> }
>> 
>> 
>> static struct test_case bpf_tests[] = {
>> -#ifdef HAVE_LIBBPF_SUPPORT
>> +#if defined(HAVE_LIBBPF_SUPPORT) && defined(HAVE_LIBTRACEEVENT)
>>  TEST_CASE("Basic BPF filtering", basic_bpf_test),
>>  TEST_CASE_REASON("BPF pinning", bpf_pinning,
>>  "clang isn't installed or environment missing BPF 
>> support"),
>> @@ -373,9 +373,9 @@ static struct test_case bpf_tests[] = {
>>  TEST_CASE_REASON("BPF prologue generation", bpf_prologue_test, "not 
>> compiled in"),
>> #endif
>> #else
>> -TEST_CASE_REASON("Basic BPF filtering", basic_bpf_test, "not compiled 
>> in"),
>> -TEST_CASE_REASON("BPF pinning", bpf_pinning, "not compiled in"),
>> -TEST_CASE_REASON("BPF prologue generation", bpf_prologue_test, "not 
>> compiled in"),
>> +TEST_CASE_REASON("Basic BPF filtering", 

Re: [PATCH 05/22] csky/cpu: Make sure arch_cpu_idle_dead() doesn't return

2023-02-05 Thread Guo Ren
On Sat, Feb 4, 2023 at 10:29 AM Josh Poimboeuf  wrote:
>
> On Sat, Feb 04, 2023 at 09:12:31AM +0800, Guo Ren wrote:
> > On Sat, Feb 4, 2023 at 6:05 AM Josh Poimboeuf  wrote:
> > >
> > > arch_cpu_idle_dead() doesn't return.  Make that more explicit with a
> > > BUG().
> > >
> > > BUG() is preferable to unreachable() because BUG() is a more explicit
> > > failure mode and avoids undefined behavior like falling off the edge of
> > > the function into whatever code happens to be next.
> > >
> > > Signed-off-by: Josh Poimboeuf 
> > > ---
> > >  arch/csky/kernel/smp.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > >
> > > diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
> > > index b45d1073307f..0ec20efaf5fd 100644
> > > --- a/arch/csky/kernel/smp.c
> > > +++ b/arch/csky/kernel/smp.c
> > > @@ -317,5 +317,7 @@ void arch_cpu_idle_dead(void)
> > > "jmpi   csky_start_secondary"
> > > :
> > > : "r" (secondary_stack));
> > > +
> > > +   BUG();
> > Why not:
> > diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> > index f26ab2675f7d..1d3bf903add2 100644
> > --- a/kernel/sched/idle.c
> > +++ b/kernel/sched/idle.c
> > @@ -285,6 +285,7 @@ static void do_idle(void)
> > tick_nohz_idle_stop_tick();
> > cpuhp_report_idle_dead();
> > arch_cpu_idle_dead();
> > +   BUG();
>
> Without the BUG() in csky arch_cpu_idle_dead(), the compiler will warn
> about arch_cpu_idle_dead() returning, because it's marked __noreturn but
> doesn't clearly return (as far as the compiler knows).
>
> And we want it marked __noreturn so we'll be more likely to catch such
> bugs at build time.
>
> And as a bonus we get better code generation and clearer code semantics
> which helps both humans and tooling understand the intent of the code.
Thx for the clarification.

Acked-by: Guo Ren 

>
> --
> Josh



-- 
Best Regards
 Guo Ren


[PATCH v3 5/5] powerpc: kcsan: Add KCSAN Support

2023-02-05 Thread Rohan McLure
Enable HAVE_ARCH_KCSAN on all powerpc platforms, permitting use of the
kernel concurrency sanitiser through the CONFIG_KCSAN_* kconfig options.
KCSAN requires the compiler's __atomic_* builtins for 64-bit values, and
so support is only reported on PPC64.

See documentation in Documentation/dev-tools/kcsan.rst for more
information.

Signed-off-by: Rohan McLure 
---
v3: Restrict support to 64-bit, as TSAN expects 64-bit __atomic_* compiler
built-ins.
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b8c4ac56bddc..55bc2d724c73 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -198,6 +198,7 @@ config PPC
select HAVE_ARCH_KASAN  if PPC_RADIX_MMU
select HAVE_ARCH_KASAN  if PPC_BOOK3E_64
select HAVE_ARCH_KASAN_VMALLOC  if HAVE_ARCH_KASAN
+   select HAVE_ARCH_KCSANif PPC64
select HAVE_ARCH_KFENCE if ARCH_SUPPORTS_DEBUG_PAGEALLOC
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_KGDB
-- 
2.37.2



[PATCH v3 2/5] powerpc: kcsan: Exclude udelay to prevent recursive instrumentation

2023-02-05 Thread Rohan McLure
In order for KCSAN to increase its likelihood of observing a data race,
it sets a watchpoint on memory accesses and stalls, allowing for
detection of conflicting accesses by other kernel threads or interrupts.

Stalls are implemented by injecting a call to udelay in instrumented code.
To prevent recursive instrumentation, exclude udelay from being instrumented.

Signed-off-by: Rohan McLure 
---
 arch/powerpc/kernel/time.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index d68de3618741..b894029f53db 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -356,7 +356,7 @@ void vtime_flush(struct task_struct *tsk)
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
-void __delay(unsigned long loops)
+void __no_kcsan __delay(unsigned long loops)
 {
unsigned long start;
 
@@ -377,7 +377,7 @@ void __delay(unsigned long loops)
 }
 EXPORT_SYMBOL(__delay);
 
-void udelay(unsigned long usecs)
+void __no_kcsan udelay(unsigned long usecs)
 {
__delay(tb_ticks_per_usec * usecs);
 }
-- 
2.37.2



[PATCH v3 4/5] powerpc: kcsan: Prevent recursive instrumentation with IRQ save/restores

2023-02-05 Thread Rohan McLure
Instrumented memory accesses provided by KCSAN will access core-local
memories (which will save and restore IRQs) as well as restoring IRQs
directly. Avoid recursive instrumentation by applying __no_kcsan
annotation to IRQ restore routines.

Signed-off-by: Rohan McLure 
---
 arch/powerpc/kernel/irq_64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/irq_64.c b/arch/powerpc/kernel/irq_64.c
index eb2b380e52a0..3a1e0bffe9e0 100644
--- a/arch/powerpc/kernel/irq_64.c
+++ b/arch/powerpc/kernel/irq_64.c
@@ -97,7 +97,7 @@ static inline bool irq_happened_test_and_clear(u8 irq)
return false;
 }
 
-void replay_soft_interrupts(void)
+__no_kcsan void replay_soft_interrupts(void)
 {
struct pt_regs regs;
 
@@ -185,7 +185,7 @@ void replay_soft_interrupts(void)
 }
 
 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_KUAP)
-static inline void replay_soft_interrupts_irqrestore(void)
+__no_kcsan static inline void replay_soft_interrupts_irqrestore(void)
 {
unsigned long kuap_state = get_kuap();
 
@@ -209,7 +209,7 @@ static inline void replay_soft_interrupts_irqrestore(void)
 #define replay_soft_interrupts_irqrestore() replay_soft_interrupts()
 #endif
 
-notrace void arch_local_irq_restore(unsigned long mask)
+notrace __no_kcsan void arch_local_irq_restore(unsigned long mask)
 {
unsigned char irq_happened;
 
-- 
2.37.2



[PATCH v3 0/5] powerpc: Add KCSAN support

2023-02-05 Thread Rohan McLure
Add Kernel Concurrency Sanitiser support for PPC64. Doing so involves
exclusion of a number of compilation units from instrumentation, as was
done with KASAN.

KCSAN uses watchpoints on memory accesses to enforce the semantics of
the Linux kernel memory model, notifying the user of observed data races
which have not been declared to be intended in source through the
data_race() macro, in order to remove false positives.

A number of such race conditions are identified. This patch series
provides support for the instrumentation, with bug fixes as well as
removal of false positives to be issued in future patches.

v3: Restrict support to PPC64 as kcsan code expects support for
__atomic* builtins for 64-bit atomic types.

v2: Implement __smp_mb() in terms of __mb() to avoid multiple calls to
kcsan_mb().
Link: 
https://lore.kernel.org/linuxppc-dev/20230201043438.1301212-4-rmcl...@linux.ibm.com/

v1: 
https://lore.kernel.org/linuxppc-dev/20230131234859.1275125-1-rmcl...@linux.ibm.com/

Rohan McLure (5):
  powerpc: kcsan: Add exclusions from instrumentation
  powerpc: kcsan: Exclude udelay to prevent recursive instrumentation
  powerpc: kcsan: Memory barriers semantics
  powerpc: kcsan: Prevent recursive instrumentation with IRQ
save/restores
  powerpc: kcsan: Add KCSAN Support

 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/barrier.h | 12 ++--
 arch/powerpc/kernel/Makefile   | 10 ++
 arch/powerpc/kernel/irq_64.c   |  6 +++---
 arch/powerpc/kernel/time.c |  4 ++--
 arch/powerpc/kernel/trace/Makefile |  1 +
 arch/powerpc/kernel/vdso/Makefile  |  1 +
 arch/powerpc/lib/Makefile  |  2 ++
 arch/powerpc/purgatory/Makefile|  1 +
 arch/powerpc/xmon/Makefile |  1 +
 10 files changed, 28 insertions(+), 11 deletions(-)

-- 
2.37.2



[PATCH v3 3/5] powerpc: kcsan: Memory barriers semantics

2023-02-05 Thread Rohan McLure
Annotate memory barriers *mb() with calls to kcsan_mb(), signaling to
compilers supporting KCSAN that the respective memory barrier has been
issued. Rename memory barrier *mb() to __*mb() to opt in for
asm-generic/barrier.h to generate the respective *mb() macro.

Signed-off-by: Rohan McLure 
---
v2: Implement __smp_mb() in terms of __mb() to avoid duplicate calls to
kcsan_mb()
---
 arch/powerpc/include/asm/barrier.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index e80b2c0e9315..b95b666f0374 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -35,9 +35,9 @@
  * However, on CPUs that don't support lwsync, lwsync actually maps to a
  * heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
  */
-#define mb()   __asm__ __volatile__ ("sync" : : : "memory")
-#define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
-#define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
+#define __mb()   __asm__ __volatile__ ("sync" : : : "memory")
+#define __rmb()  __asm__ __volatile__ ("sync" : : : "memory")
+#define __wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
 /* The sub-arch has lwsync */
 #if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
@@ -51,12 +51,12 @@
 /* clang defines this macro for a builtin, which will not work with runtime 
patching */
 #undef __lwsync
 #define __lwsync() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : 
:"memory")
-#define dma_rmb()  __lwsync()
-#define dma_wmb()  __asm__ __volatile__ (stringify_in_c(SMPWMB) : : 
:"memory")
+#define __dma_rmb()__lwsync()
+#define __dma_wmb()__asm__ __volatile__ (stringify_in_c(SMPWMB) : : 
:"memory")
 
 #define __smp_lwsync() __lwsync()
 
-#define __smp_mb() mb()
+#define __smp_mb() __mb()
 #define __smp_rmb()__lwsync()
 #define __smp_wmb()__asm__ __volatile__ (stringify_in_c(SMPWMB) : : 
:"memory")
 
-- 
2.37.2



[PATCH v3 1/5] powerpc: kcsan: Add exclusions from instrumentation

2023-02-05 Thread Rohan McLure
Exclude various incompatible compilation units from KCSAN
instrumentation.

Signed-off-by: Rohan McLure 
---
 arch/powerpc/kernel/Makefile   | 10 ++
 arch/powerpc/kernel/trace/Makefile |  1 +
 arch/powerpc/kernel/vdso/Makefile  |  1 +
 arch/powerpc/lib/Makefile  |  2 ++
 arch/powerpc/purgatory/Makefile|  1 +
 arch/powerpc/xmon/Makefile |  1 +
 6 files changed, 16 insertions(+)

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 9b6146056e48..9bf2be123093 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -54,6 +54,13 @@ CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING
 CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING
 endif
 
+KCSAN_SANITIZE_early_32.o := n
+KCSAN_SANITIZE_early_64.o := n
+KCSAN_SANITIZE_cputable.o := n
+KCSAN_SANITIZE_btext.o := n
+KCSAN_SANITIZE_paca.o := n
+KCSAN_SANITIZE_setup_64.o := n
+
 #ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET
 # Remove stack protector to avoid triggering unneeded stack canary
 # checks due to randomize_kstack_offset.
@@ -177,12 +184,15 @@ obj-$(CONFIG_PPC_SECVAR_SYSFS)+= secvar-sysfs.o
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
 KCOV_INSTRUMENT_prom_init.o := n
+KCSAN_SANITIZE_prom_init.o := n
 UBSAN_SANITIZE_prom_init.o := n
 GCOV_PROFILE_kprobes.o := n
 KCOV_INSTRUMENT_kprobes.o := n
+KCSAN_SANITIZE_kprobes.o := n
 UBSAN_SANITIZE_kprobes.o := n
 GCOV_PROFILE_kprobes-ftrace.o := n
 KCOV_INSTRUMENT_kprobes-ftrace.o := n
+KCSAN_SANITIZE_kprobes-ftrace.o := n
 UBSAN_SANITIZE_kprobes-ftrace.o := n
 GCOV_PROFILE_syscall_64.o := n
 KCOV_INSTRUMENT_syscall_64.o := n
diff --git a/arch/powerpc/kernel/trace/Makefile 
b/arch/powerpc/kernel/trace/Makefile
index af8527538fe4..b16a9f9c0b35 100644
--- a/arch/powerpc/kernel/trace/Makefile
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -23,4 +23,5 @@ obj-$(CONFIG_PPC32)   += $(obj32-y)
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_ftrace.o := n
 KCOV_INSTRUMENT_ftrace.o := n
+KCSAN_SANITIZE_ftrace.o := n
 UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/vdso/Makefile 
b/arch/powerpc/kernel/vdso/Makefile
index 6a977b0d8ffc..3a2f32929fcf 100644
--- a/arch/powerpc/kernel/vdso/Makefile
+++ b/arch/powerpc/kernel/vdso/Makefile
@@ -46,6 +46,7 @@ GCOV_PROFILE := n
 KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n
 KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
 
 ccflags-y := -shared -fno-common -fno-builtin -nostdlib -Wl,--hash-style=both
 ccflags-$(CONFIG_LD_IS_LLD) += $(call cc-option,--ld-path=$(LD),-fuse-ld=lld)
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 4de71cbf6e8e..c4db459d304a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,6 +16,8 @@ KASAN_SANITIZE_feature-fixups.o := n
 # restart_table.o contains functions called in the NMI interrupt path
 # which can be in real mode. Disable KASAN.
 KASAN_SANITIZE_restart_table.o := n
+KCSAN_SANITIZE_code-patching.o := n
+KCSAN_SANITIZE_feature-fixups.o := n
 
 ifdef CONFIG_KASAN
 CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
index a81d155b89ae..6f5e2727963c 100644
--- a/arch/powerpc/purgatory/Makefile
+++ b/arch/powerpc/purgatory/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
 
 targets += trampoline_$(BITS).o purgatory.ro
 
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
index eb25d7554ffd..d334de392e6c 100644
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -5,6 +5,7 @@ GCOV_PROFILE := n
 KCOV_INSTRUMENT := n
 UBSAN_SANITIZE := n
 KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
 
 # Disable ftrace for the entire directory
 ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
-- 
2.37.2



Re: [PATCH 0/2] powerpc: Fix livepatch module re-patching issue

2023-02-05 Thread Josh Poimboeuf
On Sun, Feb 05, 2023 at 11:46:12AM +1100, Michael Ellerman wrote:
> Josh Poimboeuf  writes:
> > On Tue, Jan 24, 2023 at 07:38:03PM -0800, Josh Poimboeuf wrote:
> >> Fix a livepatch bug seen when reloading a patched module.
> >> 
> >> This is the powerpc counterpart to Song Liu's fix for a similar issue on
> >> x86:
> >> 
> >>   https://lkml.kernel.org/lkml/20230121004945.697003-2-s...@kernel.org
> >> 
> >> Josh Poimboeuf (2):
> >>   powerpc/module_64: Improve restore_r2() return semantics
> >>   powerpc/module_64: Fix "expected nop" error on module re-patching
> >> 
> >>  arch/powerpc/kernel/module_64.c | 29 ++---
> >>  1 file changed, 18 insertions(+), 11 deletions(-)
> >
> > Hi Michael,
> >
> > Ping?  Any objections to this?
> >
> > The x86 counterpart to this is queued for 6.3, it would be nice if this
> > also landed.  We could take it through the livepatch tree if needed.
> 
> It's in my next since about a week. Sorry I forgot to send the
> "accepted" emails (which I still don't have automated :/ ).
> 
> 337251c7114e1 ("powerpc/module_64: Fix "expected nop" error on module 
> re-patching")

Ah, I didn't think to look in -next.  Thanks!

-- 
Josh


[PATCH v6 7/7] PCI: Work around PCIe link training failures

2023-02-05 Thread Maciej W. Rozycki
Attempt to handle cases such as with a downstream port of the ASMedia 
ASM2824 PCIe switch where link training never completes and the link 
continues switching between speeds indefinitely with the data link layer 
never reaching the active state.

It has been observed with a downstream port of the ASMedia ASM2824 Gen 3 
switch wired to the upstream port of the Pericom PI7C9X2G304 Gen 2 
switch, using a Delock Riser Card PCI Express x1 > 2 x PCIe x1 device, 
P/N 41433, wired to a SiFive HiFive Unmatched board.  In this setup the 
switches are supposed to negotiate the link speed of preferably 5.0GT/s, 
falling back to 2.5GT/s.

Instead the link continues oscillating between the two speeds, at the 
rate of 34-35 times per second, with link training reported repeatedly 
active ~84% of the time.  Forcibly limiting the target link speed to 
2.5GT/s with the upstream ASM2824 device however makes the two switches 
communicate correctly.  Removing the speed restriction afterwards makes 
the two devices switch to 5.0GT/s then.

Make use of these observations then and detect the inability to train 
the link, by checking for the Data Link Layer Link Active status bit 
being off while the Link Bandwidth Management Status indicating that 
hardware has changed the link speed or width in an attempt to correct 
unreliable link operation.

Restrict the speed to 2.5GT/s then with the Target Link Speed field, 
request a retrain and wait 200ms for the data link to go up.  If this 
turns out successful, then lift the restriction, letting the devices 
negotiate a higher speed.

Also check for a 2.5GT/s speed restriction the firmware may have already 
arranged and lift it too with ports of devices known to continue working 
afterwards, currently the ASM2824 only, that already report their data 
link being up.

Signed-off-by: Maciej W. Rozycki 
Link: 
https://lore.kernel.org/lkml/alpine.deb.2.21.2203022037020.56...@angie.orcam.me.uk/
Link: https://source.denx.de/u-boot/u-boot/-/commit/a398a51ccc68
---
Changes from v5:

- Move from a quirk into PCI core and call at device probing, hot-plug,
  reset and resume.  Keep the ASMedia part under CONFIG_PCI_QUIRKS.

- Rely on `dev->link_active_reporting' rather than re-retrieving the 
  capability.

Changes from v4:

- Remove  inclusion no longer needed.

- Make the quirk generic based on probing device features rather than 
  specific to the ASM2824 part only; take the Retrain Link bit erratum 
  into account.

- Still lift the 2.5GT/s speed restriction with the ASM2824 only.

- Increase retrain timeout from 200ms to 1s (PCIE_LINK_RETRAIN_TIMEOUT).

- Remove retrain success notification.

- Use PCIe helpers rather than generic PCI functions throughout.

- Trim down and update the wording of the change description for the 
  switch from an ASM2824-specific to a generic fixup.

Changes from v3:

- Remove the  entry for the ASM2824.

Changes from v2:

- Regenerate for 5.17-rc2 for a merge conflict.

- Replace BUG_ON for a missing PCI Express capability with WARN_ON and an
  early return.

Changes from v1:

- Regenerate for a merge conflict.
---
 drivers/pci/pci.c   |  154 ++--
 drivers/pci/pci.h   |1 
 drivers/pci/probe.c |2 
 3 files changed, 152 insertions(+), 5 deletions(-)

linux-pcie-asm2824-manual-retrain.diff
Index: linux-macro/drivers/pci/pci.c
===
--- linux-macro.orig/drivers/pci/pci.c
+++ linux-macro/drivers/pci/pci.c
@@ -859,6 +859,132 @@ int pci_wait_for_pending(struct pci_dev
return 0;
 }
 
+/*
+ * Retrain the link of a downstream PCIe port by hand if necessary.
+ *
+ * This is needed at least where a downstream port of the ASMedia ASM2824
+ * Gen 3 switch is wired to the upstream port of the Pericom PI7C9X2G304
+ * Gen 2 switch, and observed with the Delock Riser Card PCI Express x1 >
+ * 2 x PCIe x1 device, P/N 41433, plugged into the SiFive HiFive Unmatched
+ * board.
+ *
+ * In such a configuration the switches are supposed to negotiate the link
+ * speed of preferably 5.0GT/s, falling back to 2.5GT/s.  However the link
+ * continues switching between the two speeds indefinitely and the data
+ * link layer never reaches the active state, with link training reported
+ * repeatedly active ~84% of the time.  Forcing the target link speed to
+ * 2.5GT/s with the upstream ASM2824 device makes the two switches talk to
+ * each other correctly however.  And more interestingly retraining with a
+ * higher target link speed afterwards lets the two successfully negotiate
+ * 5.0GT/s.
+ *
+ * With the ASM2824 we can rely on the otherwise optional Data Link Layer
+ * Link Active status bit and in the failed link training scenario it will
+ * be off along with the Link Bandwidth Management Status indicating that
+ * hardware has changed the link speed or width in an attempt to correct
+ * unreliable link operation.  For a port that has been left unconnected
+ 

[PATCH v6 3/7] PCI: Initialize `link_active_reporting' earlier

2023-02-05 Thread Maciej W. Rozycki
Determine whether Data Link Layer Link Active Reporting is available 
ahead of calling any fixups so that the cached value can be used there 
and later on.

Signed-off-by: Maciej W. Rozycki 
---
New change in v6.
---
 drivers/pci/probe.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

linux-pcie-link-active-reporting-early.diff
Index: linux-macro/drivers/pci/probe.c
===
--- linux-macro.orig/drivers/pci/probe.c
+++ linux-macro/drivers/pci/probe.c
@@ -819,7 +819,6 @@ static void pci_set_bus_speed(struct pci
 
pcie_capability_read_dword(bridge, PCI_EXP_LNKCAP, );
bus->max_bus_speed = pcie_link_speed[linkcap & 
PCI_EXP_LNKCAP_SLS];
-   bridge->link_active_reporting = !!(linkcap & 
PCI_EXP_LNKCAP_DLLLARC);
 
pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, );
pcie_update_link_speed(bus, linksta);
@@ -1828,6 +1827,7 @@ int pci_setup_device(struct pci_dev *dev
int pos = 0;
struct pci_bus_region region;
struct resource *res;
+   u32 linkcap;
 
hdr_type = pci_hdr_type(dev);
 
@@ -1873,6 +1873,10 @@ int pci_setup_device(struct pci_dev *dev
/* "Unknown power state" */
dev->current_state = PCI_UNKNOWN;
 
+   /* Set it early to make it available to fixups, etc.  */
+   pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, );
+   dev->link_active_reporting = !!(linkcap & PCI_EXP_LNKCAP_DLLLARC);
+
/* Early fixups, before probing the BARs */
pci_fixup_device(pci_fixup_early, dev);
 


[PATCH v6 0/7] pci: Work around ASMedia ASM2824 PCIe link training failures

2023-02-05 Thread Maciej W. Rozycki
Hi,

 This is v6 of the change to work around a PCIe link training phenomenon 
where a pair of devices both capable of operating at a link speed above 
2.5GT/s seems unable to negotiate the link speed and continues training 
indefinitely with the Link Training bit switching on and off repeatedly 
and the data link layer never reaching the active state.

 Following Bjorn's suggestion from the previous iteration:
 I have 
moved the workaround into the PCI core.  I have kept the part specific to 
ASMedia (to lift the speed restriction after a successful retrain) within, 
although I find it a good candidate for a standalone quirk.  It seems to 
me we'd have to add additional classes of fixups however to move this part 
to drivers/pci/quirks.c, which I think would be an overkill.  So I've only 
made it explicitly guarded by CONFIG_PCI_QUIRKS; I can see there's prior 
art with this approach.

 In the course of the update I have realised that commit 6b2f1351af56 
("PCI: Wait for device to become ready after secondary bus reset") makes 
no sense and was about to figure out what to do here about it, but then 
found Lukas's recent patch series addressing this issue (thanks, Lukas, 
you made my life easier!), so I have rebased my patch set on top of 
Lukas's:
.

 This has resulted in mild ugliness in that `pcie_downstream_link_retrain' 
may be called from `pci_bridge_wait_for_secondary_bus' twice, first time 
via `pcie_wait_for_link_delay' and second time via `pci_dev_wait'.  This 
second call to `pcie_downstream_link_retrain' will do nothing, because for 
`link_active_reporting' devices `pcie_wait_for_link_delay' will have 
ensured the link has gone up or the second call won't have been reached.

 I have also decided to move the initialisation of `link_active_reporting' 
earlier on, so as to have a single way to check for the feature.  This has 
brought an extra patch and its 3 clean-up dependencies into the series.

 This was originally observed in a configuration featuring a downstream 
port of the ASMedia ASM2824 Gen 3 switch wired to the upstream port of the 
Pericom PI7C9X2G304 Gen 2 switch.  However in the course of review I have 
come to the conclusion that similarly to the earlier similar change to 
U-Boot it is indeed expected to be safe to apply this workaround to any 
downstream port that has failed link negotiation provided that:

1. the port is capable of reporting the data link layer link active 
   status (because unlike U-Boot we cannot busy-loop continuously polling 
   the link training bit),

and:

2. we don't attempt to lift the 2.5GT/s speed restriction, imposed as the
   basis of the workaround, for devices not explicitly known to continue 
   working in that case.

It is expected to be safe because the workaround is applied to a failed 
link, that is one that does not (at the time this code is executed) work 
anyway, so trying to bring it up cannot make the situation worse.

 This has been verified with a SiFive HiFive unmatched board, with and 
without CONFIG_PCI_QUIRKS enabled, booting with or without the workaround 
activated in U-Boot, which covered both the link retraining part of the 
quirk and the lifting of speed restriction already imposed by U-Boot.

 I have also issued resets via sysfs to see how this change behaves.  For 
the problematic link this required a hack to remove a `dev->subordinate' 
check from `pci_parent_bus_reset', which in turn triggered the workaround 
as expected and brought the link up (but otherwise clobbered downstream 
devices as one would inevitably expect).

 I have no way to verify these patches with power management or hot-plug 
events, but owing to Lukas's effort they get into the same infrastructure, 
so I expect the workaround to do its job as expected.  I note that there 
is an extra call to `pcie_wait_for_link' from `pciehp_check_link_status', 
but I expect it to work too.  For `link_active_reporting' devices it will 
call `pcie_downstream_link_retrain' and for `!link_active_reporting' ones 
we have no means to do anything anyway.

 The 3 extra clean-ups were only compile-tested (with PowerPC and x86-64 
configurations, as appropriate), because I have no suitable hardware 
available.

 Please see individual change descriptions for further details.

 Let me know if this is going in the right direction.

  Maciej


[PATCH v6 6/7] PCI: pciehp: Rely on `link_active_reporting'

2023-02-05 Thread Maciej W. Rozycki
Use `link_active_reporting' to determine whether Data Link Layer Link 
Active Reporting is available rather than re-retrieving the capability.

Signed-off-by: Maciej W. Rozycki 
---
NB this has been compile-tested only with PPC64LE and x86-64
configurations.

New change in v6.
---
 drivers/pci/hotplug/pciehp_hpc.c |7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

linux-pcie-link-active-reporting-hpc.diff
Index: linux-macro/drivers/pci/hotplug/pciehp_hpc.c
===
--- linux-macro.orig/drivers/pci/hotplug/pciehp_hpc.c
+++ linux-macro/drivers/pci/hotplug/pciehp_hpc.c
@@ -984,7 +984,7 @@ static inline int pcie_hotplug_depth(str
 struct controller *pcie_init(struct pcie_device *dev)
 {
struct controller *ctrl;
-   u32 slot_cap, slot_cap2, link_cap;
+   u32 slot_cap, slot_cap2;
u8 poweron;
struct pci_dev *pdev = dev->port;
struct pci_bus *subordinate = pdev->subordinate;
@@ -1030,9 +1030,6 @@ struct controller *pcie_init(struct pcie
if (dmi_first_match(inband_presence_disabled_dmi_table))
ctrl->inband_presence_disabled = 1;
 
-   /* Check if Data Link Layer Link Active Reporting is implemented */
-   pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, _cap);
-
/* Clear all remaining event bits in Slot Status register. */
pcie_capability_write_word(pdev, PCI_EXP_SLTSTA,
PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD |
@@ -1051,7 +1048,7 @@ struct controller *pcie_init(struct pcie
FLAG(slot_cap, PCI_EXP_SLTCAP_EIP),
FLAG(slot_cap, PCI_EXP_SLTCAP_NCCS),
FLAG(slot_cap2, PCI_EXP_SLTCAP2_IBPD),
-   FLAG(link_cap, PCI_EXP_LNKCAP_DLLLARC),
+   FLAG(pdev->link_active_reporting, true),
pdev->broken_cmd_compl ? " (with Cmd Compl erratum)" : "");
 
/*


[PATCH v6 5/7] net/mlx5: Rely on `link_active_reporting'

2023-02-05 Thread Maciej W. Rozycki
Use `link_active_reporting' to determine whether Data Link Layer Link 
Active Reporting is available rather than re-retrieving the capability.

Signed-off-by: Maciej W. Rozycki 
---
NB this has been compile-tested only with PPC64LE and x86-64 
configurations.

New change in v6.
---
 drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c |8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

linux-pcie-link-active-reporting-mlx5.diff
Index: linux-macro/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
===
--- linux-macro.orig/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ linux-macro/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -294,7 +294,6 @@ static int mlx5_pci_link_toggle(struct m
unsigned long timeout;
struct pci_dev *sdev;
int cap, err;
-   u32 reg32;
 
/* Check that all functions under the pci bridge are PFs of
 * this device otherwise fail this function.
@@ -333,11 +332,8 @@ static int mlx5_pci_link_toggle(struct m
return err;
 
/* Check link */
-   err = pci_read_config_dword(bridge, cap + PCI_EXP_LNKCAP, );
-   if (err)
-   return err;
-   if (!(reg32 & PCI_EXP_LNKCAP_DLLLARC)) {
-   mlx5_core_warn(dev, "No PCI link reporting capability 
(0x%08x)\n", reg32);
+   if (!bridge->link_active_reporting) {
+   mlx5_core_warn(dev, "No PCI link reporting capability\n");
msleep(1000);
goto restore;
}


[PATCH v6 4/7] powerpc/eeh: Rely on `link_active_reporting'

2023-02-05 Thread Maciej W. Rozycki
Use `link_active_reporting' to determine whether Data Link Layer Link 
Active Reporting is available rather than re-retrieving the capability.

Signed-off-by: Maciej W. Rozycki 
---
NB this has been compile-tested only with a PPC64LE configuration.

New change in v6.
---
 arch/powerpc/kernel/eeh_pe.c |5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

linux-pcie-link-active-reporting-eeh.diff
Index: linux-macro/arch/powerpc/kernel/eeh_pe.c
===
--- linux-macro.orig/arch/powerpc/kernel/eeh_pe.c
+++ linux-macro/arch/powerpc/kernel/eeh_pe.c
@@ -671,9 +671,8 @@ static void eeh_bridge_check_link(struct
eeh_ops->write_config(edev, cap + PCI_EXP_LNKCTL, 2, val);
 
/* Check link */
-   eeh_ops->read_config(edev, cap + PCI_EXP_LNKCAP, 4, );
-   if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
-   eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", 
val);
+   if (!edev->pdev->link_active_reporting) {
+   eeh_edev_dbg(edev, "No link reporting capability\n");
msleep(1000);
return;
}


[PATCH v6 2/7] PCI: Execute `quirk_enable_clear_retrain_link' earlier

2023-02-05 Thread Maciej W. Rozycki
Make `quirk_enable_clear_retrain_link' `pci_fixup_early' so that any later 
fixups can rely on `clear_retrain_link' to have been already initialised.

Signed-off-by: Maciej W. Rozycki 
---
No change from v5.

New change in v5.
---
 drivers/pci/quirks.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

linux-pcie-clear-retrain-link-early.diff
Index: linux-macro/drivers/pci/quirks.c
===
--- linux-macro.orig/drivers/pci/quirks.c
+++ linux-macro/drivers/pci/quirks.c
@@ -2407,9 +2407,9 @@ static void quirk_enable_clear_retrain_l
dev->clear_retrain_link = 1;
pci_info(dev, "Enable PCIe Retrain Link quirk\n");
 }
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PERICOM, 0xe110, 
quirk_enable_clear_retrain_link);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PERICOM, 0xe111, 
quirk_enable_clear_retrain_link);
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_PERICOM, 0xe130, 
quirk_enable_clear_retrain_link);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_PERICOM, 0xe110, 
quirk_enable_clear_retrain_link);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_PERICOM, 0xe111, 
quirk_enable_clear_retrain_link);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_PERICOM, 0xe130, 
quirk_enable_clear_retrain_link);
 
 static void fixup_rev1_53c810(struct pci_dev *dev)
 {


[PATCH v6 1/7] PCI: Export PCI link retrain timeout

2023-02-05 Thread Maciej W. Rozycki
Rename LINK_RETRAIN_TIMEOUT to PCIE_LINK_RETRAIN_TIMEOUT and make it
available via "pci.h" for PCI drivers to use.

Signed-off-by: Maciej W. Rozycki 
---
No change from v5.

New change in v5.
---
 drivers/pci/pci.h   |2 ++
 drivers/pci/pcie/aspm.c |4 +---
 2 files changed, 3 insertions(+), 3 deletions(-)

linux-pcie-link-retrain-timeout.diff
Index: linux-macro/drivers/pci/pci.h
===
--- linux-macro.orig/drivers/pci/pci.h
+++ linux-macro/drivers/pci/pci.h
@@ -11,6 +11,8 @@
 
 #define PCI_VSEC_ID_INTEL_TBT  0x1234  /* Thunderbolt */
 
+#define PCIE_LINK_RETRAIN_TIMEOUT HZ
+
 extern const unsigned char pcie_link_speed[];
 extern bool pci_early_dump;
 
Index: linux-macro/drivers/pci/pcie/aspm.c
===
--- linux-macro.orig/drivers/pci/pcie/aspm.c
+++ linux-macro/drivers/pci/pcie/aspm.c
@@ -90,8 +90,6 @@ static const char *policy_str[] = {
[POLICY_POWER_SUPERSAVE] = "powersupersave"
 };
 
-#define LINK_RETRAIN_TIMEOUT HZ
-
 /*
  * The L1 PM substate capability is only implemented in function 0 in a
  * multi function device.
@@ -213,7 +211,7 @@ static bool pcie_retrain_link(struct pci
}
 
/* Wait for link training end. Break out after waiting for timeout */
-   end_jiffies = jiffies + LINK_RETRAIN_TIMEOUT;
+   end_jiffies = jiffies + PCIE_LINK_RETRAIN_TIMEOUT;
do {
pcie_capability_read_word(parent, PCI_EXP_LNKSTA, );
if (!(reg16 & PCI_EXP_LNKSTA_LT))


Re: [merge] WARN arch/powerpc/kernel/irq_64.c:278

2023-02-05 Thread Sachin Sant



> On 03-Feb-2023, at 3:55 PM, Nicholas Piggin  wrote:
> 
> On Fri Feb 3, 2023 at 4:26 PM AEST, Sachin Sant wrote:
>> I am observing an intermittent crash while running powerpc/security
>> selftests on a Power10 LPAR booted with powerpc/merge branch code.
>> 
>> [ cut here ]
>> WARNING: CPU: 1 PID: 5644 at arch/powerpc/kernel/irq_64.c:278 
>> arch_local_irq_restore+0x254/0x260
> 
> Okay, I guess the static branch test changes from true to false both
> times it is tested and so it doesn't recover properly. It's a real bug.
> I don't know why I didn't change the static branch under stop machine,
> maybe it gets into some recursive issue, that would be ideal if we could
> though. But this might be a safer minimal fix?
> 
> Thanks,
> Nick
> —
> 

Thanks Nick. Since this failure was intermittent I ran the test 10 times
with and without this patch.

Without the patch failure rate was 3 out of 10
With the patch no failures were seen (0/10).

I also ran the remaining selftests and observed no additional failures.

Based on these tests this fixes the problem for me.

- Sachin



Re: [PATCH] powerpc/kexec_file: fix implicit decl error

2023-02-05 Thread Sourabh Jain



On 04/02/23 22:52, Randy Dunlap wrote:

kexec (PPC64) code calls memory_hotplug_max(). Add the header declaration
for it from <asm/mmzone.h>. Using <linux/mmzone.h> does not work since
the #include for <asm/mmzone.h> depends on CONFIG_NUMA=y, which is not
set in this kernel config file.


I didn't realize that linux/mmzone.h includes asm/mmzone.h under 
CONFIG_NUMA.


from linux/mmzone.h

#else /* CONFIG_NUMA */

#include <asm/mmzone.h>

#endif /* !CONFIG_NUMA */



Fixes this build error/warning:

../arch/powerpc/kexec/file_load_64.c: In function 'kexec_extra_fdt_size_ppc64':
../arch/powerpc/kexec/file_load_64.c:993:33: error: implicit declaration of 
function 'memory_hotplug_max' [-Werror=implicit-function-declaration]
   993 | usm_entries = ((memory_hotplug_max() / 
drmem_lmb_size()) +
   | ^~

Fixes: fc546faa5595 ("powerpc/kexec_file: Count hot-pluggable memory in FDT 
estimate")
Signed-off-by: Randy Dunlap 
Cc: Sourabh Jain 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
  arch/powerpc/kexec/file_load_64.c |1 +
  1 file changed, 1 insertion(+)

diff -- a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -26,6 +26,7 @@
  #include 
  #include 
  #include 
+#include <asm/mmzone.h>


Yes including the asm/mmzone.h will fix the build issue.

- Sourabh


  #include 
  
  struct umem_info {


[powerpc:fixes-test] BUILD SUCCESS e33416fca8a2313b8650bd5807aaf34354d39a4c

2023-02-05 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
fixes-test
branch HEAD: e33416fca8a2313b8650bd5807aaf34354d39a4c  powerpc: Don't select 
ARCH_WANTS_NO_INSTR

elapsed time: 723m

configs tested: 20
configs skipped: 108

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
powerpc   allnoconfig
powerpc  allmodconfig
x86_64  defconfig
x86_64   allyesconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-func
x86_64rhel-8.3-kselftests

clang tested configs:
riscvrandconfig-r042-20230205
s390 randconfig-r044-20230205
hexagon  randconfig-r045-20230205
hexagon  randconfig-r041-20230205
i386  randconfig-a002
i386  randconfig-a006
i386  randconfig-a004
x86_64randconfig-a005
x86_64randconfig-a003
x86_64randconfig-a001
x86_64randconfig-a012
x86_64randconfig-a014
x86_64randconfig-a016

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests


[powerpc:fixes] BUILD SUCCESS 1665c027afb225882a5a0b014c45e84290b826c2

2023-02-05 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
fixes
branch HEAD: 1665c027afb225882a5a0b014c45e84290b826c2  powerpc/64s: Reconnect 
tlb_flush() to hash__tlb_flush()

elapsed time: 723m

configs tested: 14
configs skipped: 108

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
powerpc   allnoconfig
powerpc  allmodconfig
x86_64  rhel-8.3-func
x86_64rhel-8.3-kselftests
x86_64  defconfig
x86_64   allyesconfig
x86_64   rhel-8.3

clang tested configs:
riscvrandconfig-r042-20230205
s390 randconfig-r044-20230205
hexagon  randconfig-r045-20230205
hexagon  randconfig-r041-20230205
i386  randconfig-a002
i386  randconfig-a006
i386  randconfig-a004

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests


Re: [PATCH v2] powerpc/kexec_file: account hot-pluggable memory while estimating FDT size

2023-02-05 Thread Michael Ellerman
On Tue, 31 Jan 2023 08:36:15 +0530, Sourabh Jain wrote:
> On Systems where online memory is lesser compared to max memory, the
> kexec_file_load system call may fail to load the kdump kernel with the
> below errors:
> 
> "Failed to update fdt with linux,drconf-usable-memory property"
> "Error setting up usable-memory property for kdump kernel"
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/kexec_file: account hot-pluggable memory while estimating FDT size
  https://git.kernel.org/powerpc/c/fc546faa559538fb312c77e055243ece18ab3288

cheers


Re: [PATCH] powerpc/kvm: Fix objtool warning for unannotated intra-function call in booke.o

2023-02-05 Thread Michael Ellerman
On Sat, 28 Jan 2023 18:11:58 +0530, Sathvika Vasireddy wrote:
> Objtool throws the following warning:
> arch/powerpc/kvm/booke.o: warning: objtool: kvmppc_fill_pt_regs+0x30: 
> unannotated intra-function call
> 
> Fix this warning by allowing the function to set the value of 'nip' field
> using _THIS_IP_ macro, without having to use an additional assembly
> instruction to save the instruction pointer.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/kvm: Fix objtool warning for unannotated intra-function call in 
booke.o
  https://git.kernel.org/powerpc/c/fe6de81b610e5d0b9d2231acff2de74a35482e7d

cheers


Re: [PATCH v2] powerpc/64: Fix perf profiling asynchronous interrupt handlers

2023-02-05 Thread Michael Ellerman
On Sat, 21 Jan 2023 20:01:56 +1000, Nicholas Piggin wrote:
> Interrupt entry sets the soft mask to IRQS_ALL_DISABLED to match the
> hard irq disabled state. So when should_hard_irq_enable() returns true
> because we want PMI interrupts in irq handlers, MSR[EE] is enabled but
> PMIs just get soft-masked. Fix this by clearing IRQS_PMI_DISABLED before
> enabling MSR[EE].
> 
> This also tidies some of the warnings, no need to duplicate them in
> both should_hard_irq_enable() and do_hard_irq_enable().
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/64: Fix perf profiling asynchronous interrupt handlers
  https://git.kernel.org/powerpc/c/c28548012ee2bac55772ef7685138bd1124b80c3

cheers


Re: [PATCH] powerpc/64s: Fix local irq disable when PMIs are disabled

2023-02-05 Thread Michael Ellerman
On Sat, 21 Jan 2023 19:53:52 +1000, Nicholas Piggin wrote:
> When PMI interrupts are soft-masked, local_irq_save() will clear the PMI
> mask bit, allowing PMIs in and causing a race condition. This causes a
> deadlock in native_hpte_insert via hash_preload, which depends on PMIs
> being disabled since commit 8b91cee5eadd ("powerpc/64s/hash: Make hash
> faults work in NMI context"). native_hpte_insert calls local_irq_save().
> It's possible the lpar hash code is also affected when tracing is
> enabled because __trace_hcall_entry() calls local_irq_save().
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/64s: Fix local irq disable when PMIs are disabled
  https://git.kernel.org/powerpc/c/bc88ef663265676419555df2dc469a471c0add31

cheers


Re: [PATCH] powerpc: Fix objtool warning for unannotated intra-function call in head_85xx.o

2023-02-05 Thread Michael Ellerman
On Sat, 28 Jan 2023 18:11:38 +0530, Sathvika Vasireddy wrote:
> Objtool throws the following warning:
> arch/powerpc/kernel/head_85xx.o: warning: objtool: .head.text+0x1a6c:
> unannotated intra-function call
> 
> Fix this warning by annotating KernelSPE symbol with SYM_FUNC_START_LOCAL
> and SYM_FUNC_END macros.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc: Fix objtool warning for unannotated intra-function call in 
head_85xx.o
  https://git.kernel.org/powerpc/c/8afffce6aa3bddc940ac1909627ff1e772b6cbf1

cheers


Re: [PATCH] powerpc/kexec_file: Fix division by zero in extra size estimation

2023-02-05 Thread Michael Ellerman
On Mon, 30 Jan 2023 12:47:07 +1100, Michael Ellerman wrote:
> In kexec_extra_fdt_size_ppc64() there's logic to estimate how much
> extra space will be needed in the device tree for some memory related
> properties.
> 
> That logic uses the size of RAM divided by drmem_lmb_size() to do the
> estimation. However drmem_lmb_size() can be zero if the machine has no
> hotpluggable memory configured, which is the case when booting with qemu
> and no maxmem=x parameter is passed (the default).
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/kexec_file: Fix division by zero in extra size estimation
  https://git.kernel.org/powerpc/c/7294194b47e994753a86eee8cf1c61f3f36458a3

cheers


Re: [PATCH] powerpc/imc-pmu: Revert nest_init_lock to being a mutex

2023-02-05 Thread Michael Ellerman
On Mon, 30 Jan 2023 12:44:01 +1100, Michael Ellerman wrote:
> The recent commit 76d588dddc45 ("powerpc/imc-pmu: Fix use of mutex in
> IRQs disabled section") fixed warnings (and possible deadlocks) in the
> IMC PMU driver by converting the locking to use spinlocks.
> 
> It also converted the init-time nest_init_lock to a spinlock, even
> though it's not used at runtime in IRQ disabled sections or while
> holding other spinlocks.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/imc-pmu: Revert nest_init_lock to being a mutex
  https://git.kernel.org/powerpc/c/ad53db4acb415976761d7302f5b02e97f2bd097e

cheers


Re: [PATCH] powerpc/64s: Reconnect tlb_flush() to hash__tlb_flush()

2023-02-05 Thread Michael Ellerman
On Tue, 31 Jan 2023 22:14:07 +1100, Michael Ellerman wrote:
> Commit baf1ed24b27d ("powerpc/mm: Remove empty hash__ functions")
> removed some empty hash MMU flushing routines, but got a bit overeager
> and also removed the call to hash__tlb_flush() from tlb_flush().
> 
> In regular use this doesn't lead to any noticable breakage, which is a
> little concerning. Presumably there are flushes happening via other
> paths such as arch_leave_lazy_mmu_mode(), and/or a bit of luck.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/64s: Reconnect tlb_flush() to hash__tlb_flush()
  https://git.kernel.org/powerpc/c/1665c027afb225882a5a0b014c45e84290b826c2

cheers


Re: [PATCH 1/2] powerpc/64s/radix: Fix crash with unaligned relocated kernel

2023-02-05 Thread Michael Ellerman
On Tue, 10 Jan 2023 23:47:52 +1100, Michael Ellerman wrote:
> If a relocatable kernel is loaded at an address that is not 2MB aligned
> and told not to relocate to zero, the kernel can crash due to
> mark_rodata_ro() incorrectly changing some read-write data to read-only.
> 
> Scenarios where the misalignment can occur are when the kernel is
> loaded by kdump or using the RELOCATABLE_TEST config option.
> 
> [...]

Applied to powerpc/fixes.

[1/2] powerpc/64s/radix: Fix crash with unaligned relocated kernel
  https://git.kernel.org/powerpc/c/98d0219e043e09013e883eacde3b93e0b2bf944d
[2/2] powerpc/64s/radix: Fix RWX mapping with relocated kernel
  https://git.kernel.org/powerpc/c/111bcb37385353f0510e5847d5abcd1c613dba23

cheers