[PATCH] KVM: PPC: Book3S HV: Fix TLB management on SMT8 POWER9 and POWER10 processors

2021-06-01 Thread Nicholas Piggin
From: Suraj Jitindar Singh 

The POWER9 vCPU TLB management code assumes all threads in a core share
a TLB, and that TLBIEL executed by one thread will invalidate TLBs for
all threads. This is not the case for SMT8 capable POWER9 and POWER10
(big core) processors, where the TLB is split between groups of threads.
This results in TLB multi-hits, random data corruption, etc.

Fix this by introducing cpu_first_tlb_thread_sibling etc., to determine
which siblings share TLBs, and use that in the guest TLB flushing code.

[npig...@gmail.com: add changelog and comment]

Signed-off-by: Paul Mackerras 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/cputhreads.h | 30 +++
 arch/powerpc/kvm/book3s_hv.c  | 13 ++--
 arch/powerpc/kvm/book3s_hv_builtin.c  |  2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   |  2 +-
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/cputhreads.h 
b/arch/powerpc/include/asm/cputhreads.h
index 98c8bd155bf9..b167186aaee4 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -98,6 +98,36 @@ static inline int cpu_last_thread_sibling(int cpu)
return cpu | (threads_per_core - 1);
 }
 
+/*
+ * tlb_thread_siblings are siblings which share a TLB. This is not
+ * architected, is not something a hypervisor could emulate and a future
+ * CPU may change behaviour even in compat mode, so this should only be
+ * used on PowerNV, and only with care.
+ */
+static inline int cpu_first_tlb_thread_sibling(int cpu)
+{
+   if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
+   return cpu & ~0x6;  /* Big Core */
+   else
+   return cpu_first_thread_sibling(cpu);
+}
+
+static inline int cpu_last_tlb_thread_sibling(int cpu)
+{
+   if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
+   return cpu | 0x6;   /* Big Core */
+   else
+   return cpu_last_thread_sibling(cpu);
+}
+
+static inline int cpu_tlb_thread_sibling_step(void)
+{
+   if (cpu_has_feature(CPU_FTR_ARCH_300) && (threads_per_core == 8))
+   return 2;   /* Big Core */
+   else
+   return 1;
+}
+
 static inline u32 get_tensr(void)
 {
 #ifdef CONFIG_BOOKE
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 28a80d240b76..0a8398a63f80 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2657,7 +2657,7 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, 
struct kvm_vcpu *vcpu)
cpumask_t *cpu_in_guest;
int i;
 
-   cpu = cpu_first_thread_sibling(cpu);
+   cpu = cpu_first_tlb_thread_sibling(cpu);
if (nested) {
cpumask_set_cpu(cpu, >need_tlb_flush);
cpu_in_guest = >cpu_in_guest;
@@ -2671,9 +2671,10 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, 
struct kvm_vcpu *vcpu)
 * the other side is the first smp_mb() in kvmppc_run_core().
 */
smp_mb();
-   for (i = 0; i < threads_per_core; ++i)
-   if (cpumask_test_cpu(cpu + i, cpu_in_guest))
-   smp_call_function_single(cpu + i, do_nothing, NULL, 1);
+   for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
+   i += cpu_tlb_thread_sibling_step())
+   if (cpumask_test_cpu(i, cpu_in_guest))
+   smp_call_function_single(i, do_nothing, NULL, 1);
 }
 
 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
@@ -2704,8 +2705,8 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu 
*vcpu, int pcpu)
 */
if (prev_cpu != pcpu) {
if (prev_cpu >= 0 &&
-   cpu_first_thread_sibling(prev_cpu) !=
-   cpu_first_thread_sibling(pcpu))
+   cpu_first_tlb_thread_sibling(prev_cpu) !=
+   cpu_first_tlb_thread_sibling(pcpu))
radix_flush_cpu(kvm, prev_cpu, vcpu);
if (nested)
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c 
b/arch/powerpc/kvm/book3s_hv_builtin.c
index 7a0e33a9c980..3edc25c89092 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -800,7 +800,7 @@ void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
 * Thus we make all 4 threads use the same bit.
 */
if (cpu_has_feature(CPU_FTR_ARCH_300))
-   pcpu = cpu_first_thread_sibling(pcpu);
+   pcpu = cpu_first_tlb_thread_sibling(pcpu);
 
if (nested)
need_tlb_flush = >need_tlb_flush;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 7af7c70f1468..407dbf21bcbc 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -67,7 +67,7 

[PATCH] powerpc: 52xx: add fallthrough in mpc52xx_wdt_ioctl()

2021-06-01 Thread trix
From: Tom Rix 

With gcc 10.3, there is this compiler error
compiler.h:56:26: error: this statement may
  fall through [-Werror=implicit-fallthrough=]

mpc52xx_gpt.c:586:2: note: here
  586 |  case WDIOC_GETTIMEOUT:
  |  ^~~~

So add the fallthrough pseudo keyword.

Signed-off-by: Tom Rix 
---
 arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c 
b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index 8c0d324f657e..3823df235f25 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -582,6 +582,7 @@ static long mpc52xx_wdt_ioctl(struct file *file, unsigned 
int cmd,
if (ret)
break;
/* fall through and return the timeout */
+   fallthrough;
 
case WDIOC_GETTIMEOUT:
/* we need to round here as to avoid e.g. the following
-- 
2.26.3



[PATCH] powerpc/8xx: select CPM1 for MPC8XXFADS

2021-06-01 Thread trix
From: Tom Rix 

With MPC8XXFADS, there is this linker error
ppc64-linux-ld: m8xx_setup.o: in function `cpm_cascade':
m8xx_setup.c: undefined reference to `cpm_get_irq'

cpm_get_irq() is conditionally compiled by CPM1.
So add a select, like the other platforms.

Signed-off-by: Tom Rix 
---
 arch/powerpc/platforms/8xx/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/8xx/Kconfig 
b/arch/powerpc/platforms/8xx/Kconfig
index 60cc5b537a98..919082cdb2f1 100644
--- a/arch/powerpc/platforms/8xx/Kconfig
+++ b/arch/powerpc/platforms/8xx/Kconfig
@@ -10,6 +10,7 @@ choice
 
 config MPC8XXFADS
bool "FADS"
+   select CPM1
 
 config MPC86XADS
bool "MPC86XADS"
-- 
2.26.3



[PATCH 2/5] powerpc/ps3: Warn on PS3 device errors

2021-06-01 Thread Geoff Levand
To aid debugging PS3 boot problems change the log level
of several PS3 device errors from pr_debug to pr_warn.

Signed-off-by: Geoff Levand 
---
 arch/powerpc/platforms/ps3/system-bus.c |  9 +
 drivers/ps3/ps3-vuart.c |  2 +-
 drivers/ps3/ps3av.c | 22 +++---
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/ps3/system-bus.c 
b/arch/powerpc/platforms/ps3/system-bus.c
index b431f41c6cb5..1a5665875165 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -64,9 +64,10 @@ static int ps3_open_hv_device_sb(struct 
ps3_system_bus_device *dev)
result = lv1_open_device(dev->bus_id, dev->dev_id, 0);
 
if (result) {
-   pr_debug("%s:%d: lv1_open_device failed: %s\n", __func__,
-   __LINE__, ps3_result(result));
-   result = -EPERM;
+   pr_warn("%s:%d: lv1_open_device dev=%u.%u(%s) failed: %s\n",
+   __func__, __LINE__, dev->match_id, dev->match_sub_id,
+   dev_name(>core), ps3_result(result));
+   result = -EPERM;
}
 
 done:
@@ -120,7 +121,7 @@ static int ps3_open_hv_device_gpu(struct 
ps3_system_bus_device *dev)
result = lv1_gpu_open(0);
 
if (result) {
-   pr_debug("%s:%d: lv1_gpu_open failed: %s\n", __func__,
+   pr_warn("%s:%d: lv1_gpu_open failed: %s\n", __func__,
__LINE__, ps3_result(result));
result = -EPERM;
}
diff --git a/drivers/ps3/ps3-vuart.c b/drivers/ps3/ps3-vuart.c
index e34ae6a442c7..6328abd51ffa 100644
--- a/drivers/ps3/ps3-vuart.c
+++ b/drivers/ps3/ps3-vuart.c
@@ -358,7 +358,7 @@ static int ps3_vuart_raw_write(struct ps3_system_bus_device 
*dev,
ps3_mm_phys_to_lpar(__pa(buf)), bytes, bytes_written);
 
if (result) {
-   dev_dbg(>core, "%s:%d: lv1_write_virtual_uart failed: "
+   dev_warn(>core, "%s:%d: lv1_write_virtual_uart failed: "
"%s\n", __func__, __LINE__, ps3_result(result));
return result;
}
diff --git a/drivers/ps3/ps3av.c b/drivers/ps3/ps3av.c
index 9d66257e1da5..516e6d14d32e 100644
--- a/drivers/ps3/ps3av.c
+++ b/drivers/ps3/ps3av.c
@@ -217,9 +217,9 @@ static int ps3av_send_cmd_pkt(const struct ps3av_send_hdr 
*send_buf,
/* send pkt */
res = ps3av_vuart_write(ps3av->dev, send_buf, write_len);
if (res < 0) {
-   dev_dbg(>dev->core,
-   "%s: ps3av_vuart_write() failed (result=%d)\n",
-   __func__, res);
+   dev_warn(>dev->core,
+   "%s:%d: ps3av_vuart_write() failed: %s\n", __func__,
+   __LINE__, ps3_result(res));
return res;
}
 
@@ -230,9 +230,9 @@ static int ps3av_send_cmd_pkt(const struct ps3av_send_hdr 
*send_buf,
res = ps3av_vuart_read(ps3av->dev, recv_buf, PS3AV_HDR_SIZE,
   timeout);
if (res != PS3AV_HDR_SIZE) {
-   dev_dbg(>dev->core,
-   "%s: ps3av_vuart_read() failed (result=%d)\n",
-   __func__, res);
+   dev_warn(>dev->core,
+   "%s:%d: ps3av_vuart_read() failed: %s\n", 
__func__,
+   __LINE__, ps3_result(res));
return res;
}
 
@@ -240,9 +240,9 @@ static int ps3av_send_cmd_pkt(const struct ps3av_send_hdr 
*send_buf,
res = ps3av_vuart_read(ps3av->dev, _buf->cid,
   recv_buf->size, timeout);
if (res < 0) {
-   dev_dbg(>dev->core,
-   "%s: ps3av_vuart_read() failed (result=%d)\n",
-   __func__, res);
+   dev_warn(>dev->core,
+   "%s:%d: ps3av_vuart_read() failed: %s\n", 
__func__,
+   __LINE__, ps3_result(res));
return res;
}
res += PS3AV_HDR_SIZE;  /* total len */
@@ -251,8 +251,8 @@ static int ps3av_send_cmd_pkt(const struct ps3av_send_hdr 
*send_buf,
} while (event);
 
if ((cmd | PS3AV_REPLY_BIT) != recv_buf->cid) {
-   dev_dbg(>dev->core, "%s: reply err (result=%x)\n",
-   __func__, recv_buf->cid);
+   dev_warn(>dev->core, "%s:%d: reply err: %x\n", __func__,
+   __LINE__, recv_buf->cid);
return -EINVAL;
}
 
-- 
2.25.1




[PATCH 1/5] powerpc/ps3: Add CONFIG_PS3_VERBOSE_RESULT option

2021-06-01 Thread Geoff Levand
To aid debugging, add a new PS3 kernel config option
PS3_VERBOSE_RESULT that, when enabled, will print more
verbose messages for the result of LV1 hypercalls.

Signed-off-by: Geoff Levand 
---
 arch/powerpc/include/asm/ps3.h | 2 +-
 arch/powerpc/platforms/ps3/Kconfig | 9 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/ps3.h b/arch/powerpc/include/asm/ps3.h
index e646c7f218bc..7df145901def 100644
--- a/arch/powerpc/include/asm/ps3.h
+++ b/arch/powerpc/include/asm/ps3.h
@@ -232,7 +232,7 @@ enum lv1_result {
 
 static inline const char* ps3_result(int result)
 {
-#if defined(DEBUG) || defined(PS3_VERBOSE_RESULT)
+#if defined(DEBUG) || defined(PS3_VERBOSE_RESULT) || 
defined(CONFIG_PS3_VERBOSE_RESULT)
switch (result) {
case LV1_SUCCESS:
return "LV1_SUCCESS (0)";
diff --git a/arch/powerpc/platforms/ps3/Kconfig 
b/arch/powerpc/platforms/ps3/Kconfig
index e32406e918d0..ebed94942d39 100644
--- a/arch/powerpc/platforms/ps3/Kconfig
+++ b/arch/powerpc/platforms/ps3/Kconfig
@@ -85,6 +85,15 @@ config PS3_SYS_MANAGER
  This support is required for PS3 system control.  In
  general, all users will say Y or M.
 
+config PS3_VERBOSE_RESULT
+   bool "PS3 Verbose LV1 hypercall results" if PS3_ADVANCED
+   depends on PPC_PS3
+   help
+ Enables more verbose log mesages for LV1 hypercall results.
+
+ If in doubt, say N here and reduce the size of the kernel by a
+ small amount.
+
 config PS3_REPOSITORY_WRITE
bool "PS3 Repository write support" if PS3_ADVANCED
depends on PPC_PS3
-- 
2.25.1




[PATCH 4/5] net/ps3_gelic: Add gelic_descr structures

2021-06-01 Thread Geoff Levand
Create two new structures, struct gelic_hw_regs and struct gelic_chain_link,
and replace the corresponding members of struct gelic_descr with the new
structures.  struct gelic_hw_regs holds the register variables used by the
gelic hardware device.  struct gelic_chain_link holds variables used to
manage the driver's linked list of gelic descr structures.

Fixes several DMA mapping problems with the PS3's gelic network driver:

 * Change from checking the return value of dma_map_single to using the
   dma_mapping_error routine.
 * Use the correct buffer length when mapping the RX skb.
 * Improved error checking and debug logging.

Fixes runtime errors like these, and also other randomly occurring errors:

  IP-Config: Complete:
  DMA-API: ps3_gelic_driver sb_05: device driver failed to check map error
  WARNING: CPU: 0 PID: 0 at kernel/dma/debug.c:1027 .check_unmap+0x888/0x8dc

Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 573 +++
 drivers/net/ethernet/toshiba/ps3_gelic_net.h |  24 +-
 2 files changed, 341 insertions(+), 256 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c 
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
index 55e652624bd7..e01938128882 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -96,9 +96,11 @@ static void gelic_card_get_ether_port_status(struct 
gelic_card *card,
  * returns the status as in the dmac_cmd_status field of the descriptor
  */
 static enum gelic_descr_dma_status
+
 gelic_descr_get_status(struct gelic_descr *descr)
 {
-   return be32_to_cpu(descr->dmac_cmd_status) & GELIC_DESCR_DMA_STAT_MASK;
+   return be32_to_cpu(descr->hw_regs.dmac_cmd_status)
+   & GELIC_DESCR_DMA_STAT_MASK;
 }
 
 static int gelic_card_set_link_mode(struct gelic_card *card, int mode)
@@ -146,24 +148,34 @@ static void gelic_card_disable_txdmac(struct gelic_card 
*card)
  */
 static void gelic_card_enable_rxdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;
+#if defined(DEBUG)
+   static const int debug_build = 1;
+#else
+   static const int debug_build = 0;
+#endif
 
-#ifdef DEBUG
-   if (gelic_descr_get_status(card->rx_chain.head) !=
-   GELIC_DESCR_DMA_CARDOWNED) {
-   printk(KERN_ERR "%s: status=%x\n", __func__,
-  be32_to_cpu(card->rx_chain.head->dmac_cmd_status));
-   printk(KERN_ERR "%s: nextphy=%x\n", __func__,
-  be32_to_cpu(card->rx_chain.head->next_descr_addr));
-   printk(KERN_ERR "%s: head=%p\n", __func__,
-  card->rx_chain.head);
+   if (debug_build
+   && (gelic_descr_get_status(card->rx_chain.head)
+   != GELIC_DESCR_DMA_CARDOWNED)) {
+   dev_err(dev, "%s:%d: status=%x\n", __func__, __LINE__,
+   be32_to_cpu(
+   card->rx_chain.head->hw_regs.dmac_cmd_status));
+   dev_err(dev, "%s:%d: nextphy=%x\n", __func__, __LINE__,
+   be32_to_cpu(
+   card->rx_chain.head->hw_regs.next_descr_addr));
+   dev_err(dev, "%s:%d: head=%px\n", __func__, __LINE__,
+   card->rx_chain.head);
}
-#endif
+
status = lv1_net_start_rx_dma(bus_id(card), dev_id(card),
-   card->rx_chain.head->bus_addr, 0);
-   if (status)
-   dev_info(ctodev(card),
-"lv1_net_start_rx_dma failed, status=%d\n", status);
+   card->rx_chain.head->link.cpu_addr, 0);
+
+   if (status) {
+   dev_err(dev, "%s:%d: lv1_net_start_rx_dma failed: %d\n",
+   __func__, __LINE__, status);
+   }
 }
 
 /**
@@ -193,11 +205,11 @@ static void gelic_card_disable_rxdmac(struct gelic_card 
*card)
  * in the status
  */
 static void gelic_descr_set_status(struct gelic_descr *descr,
-  enum gelic_descr_dma_status status)
+   enum gelic_descr_dma_status status)
 {
-   descr->dmac_cmd_status = cpu_to_be32(status |
-   (be32_to_cpu(descr->dmac_cmd_status) &
-~GELIC_DESCR_DMA_STAT_MASK));
+   descr->hw_regs.dmac_cmd_status = cpu_to_be32(status
+   | (be32_to_cpu(descr->hw_regs.dmac_cmd_status)
+   & ~GELIC_DESCR_DMA_STAT_MASK));
/*
 * dma_cmd_status field is used to indicate whether the descriptor
 * is valid or not.
@@ -224,13 +236,14 @@ static void gelic_card_reset_chain(struct gelic_card 
*card,
 
for (descr = start_descr; start_descr != descr->next; descr++) {
gelic_descr_set_status(descr, GELIC_DESCR_DMA_CARDOWNED);
-   descr->next_descr_addr = cpu_to_be32(descr->next->bus_addr);
+   descr->hw_regs.next_descr_addr
+   = 

[PATCH 3/5] powerpc/ps3: Add dma_mask to ps3_dma_region

2021-06-01 Thread Geoff Levand
Commit f959dcd6ddfd29235030e8026471ac1b022ad2b0 (dma-direct: Fix
potential NULL pointer dereference) added a null check on the
dma_mask pointer of the kernel's device structure.

Add a dma_mask variable to the ps3_dma_region structure and set
the device structure's dma_mask pointer to point to this new variable.

Fixes runtime errors like these:

  ps3_system_bus_match:349: dev=8.0(sb_01), drv=8.0(ps3flash): match
  WARNING: CPU: 0 PID: 1 at kernel/dma/mapping.c:151
  ps3flash sb_01: ps3stor_setup:193: map DMA region failed

Signed-off-by: Geoff Levand 
---
 arch/powerpc/include/asm/ps3.h  |  2 ++
 arch/powerpc/platforms/ps3/mm.c | 12 
 2 files changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/ps3.h b/arch/powerpc/include/asm/ps3.h
index 7df145901def..8a0d8fb35328 100644
--- a/arch/powerpc/include/asm/ps3.h
+++ b/arch/powerpc/include/asm/ps3.h
@@ -71,6 +71,7 @@ struct ps3_dma_region_ops;
  * @bus_addr: The 'translated' bus address of the region.
  * @len: The length in bytes of the region.
  * @offset: The offset from the start of memory of the region.
+ * @dma_mask: Device dma_mask.
  * @ioid: The IOID of the device who owns this region
  * @chunk_list: Opaque variable used by the ioc page manager.
  * @region_ops: struct ps3_dma_region_ops - dma region operations
@@ -85,6 +86,7 @@ struct ps3_dma_region {
enum ps3_dma_region_type region_type;
unsigned long len;
unsigned long offset;
+   u64 dma_mask;
 
/* driver variables  (set by ps3_dma_region_create) */
unsigned long bus_addr;
diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c
index d094321964fb..a81eac35d900 100644
--- a/arch/powerpc/platforms/ps3/mm.c
+++ b/arch/powerpc/platforms/ps3/mm.c
@@ -6,6 +6,7 @@
  *  Copyright 2006 Sony Corp.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -1118,6 +1119,7 @@ int ps3_dma_region_init(struct ps3_system_bus_device *dev,
enum ps3_dma_region_type region_type, void *addr, unsigned long len)
 {
unsigned long lpar_addr;
+   int result;
 
lpar_addr = addr ? ps3_mm_phys_to_lpar(__pa(addr)) : 0;
 
@@ -1129,6 +1131,16 @@ int ps3_dma_region_init(struct ps3_system_bus_device 
*dev,
r->offset -= map.r1.offset;
r->len = len ? len : ALIGN(map.total, 1 << r->page_size);
 
+   dev->core.dma_mask = >dma_mask;
+
+   result = dma_set_mask_and_coherent(>core, DMA_BIT_MASK(32));
+
+   if (result < 0) {
+   dev_err(>core, "%s:%d: dma_set_mask_and_coherent failed: 
%d\n",
+   __func__, __LINE__, result);
+   return result;
+   }
+
switch (dev->dev_type) {
case PS3_DEVICE_TYPE_SB:
r->region_ops =  (USE_DYNAMIC_DMA)
-- 
2.25.1




[PATCH 0/5] DMA fixes for PS3 device drivers

2021-06-01 Thread Geoff Levand
Hi,

This is a set of patches that fix various DMA related problems in the PS3
device drivers, and add better error checking and improved message logging.

The gelic network driver had a number of problems and most of the changes are
in its sources.

Please consider.

-Geoff

The following changes since commit 8124c8a6b35386f73523d27eacb71b5364a68c4c:

  Linux 5.13-rc4 (2021-05-30 11:58:25 -1000)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/geoff/ps3-linux.git 
for-merge-gelic

for you to fetch changes up to 4adcfc9735bf8d1987d2bc82e914be154f2ffad8:

  net/ps3_gelic: Cleanups, improve logging (2021-06-01 12:27:43 -0700)


Geoff Levand (5):
  powerpc/ps3: Add CONFIG_PS3_VERBOSE_RESULT option
  powerpc/ps3: Warn on PS3 device errors
  powerpc/ps3: Add dma_mask to ps3_dma_region
  net/ps3_gelic: Add gelic_descr structures
  net/ps3_gelic: Cleanups, improve logging

 arch/powerpc/include/asm/ps3.h   |   4 +-
 arch/powerpc/platforms/ps3/Kconfig   |   9 +
 arch/powerpc/platforms/ps3/mm.c  |  12 +
 arch/powerpc/platforms/ps3/system-bus.c  |   9 +-
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 968 +++
 drivers/net/ethernet/toshiba/ps3_gelic_net.h |  24 +-
 drivers/ps3/ps3-vuart.c  |   2 +-
 drivers/ps3/ps3av.c  |  22 +-
 8 files changed, 598 insertions(+), 452 deletions(-)

-- 
2.25.1



[PATCH 5/5] net/ps3_gelic: Cleanups, improve logging

2021-06-01 Thread Geoff Levand
General source cleanups and improved logging messages.

Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 395 ++-
 1 file changed, 216 insertions(+), 179 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c 
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
index e01938128882..9dbcb7c4ec80 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -44,17 +44,17 @@ MODULE_AUTHOR("SCE Inc.");
 MODULE_DESCRIPTION("Gelic Network driver");
 MODULE_LICENSE("GPL");
 
-
-/* set irq_mask */
 int gelic_card_set_irq_mask(struct gelic_card *card, u64 mask)
 {
+   struct device *dev = ctodev(card);
int status;
 
status = lv1_net_set_interrupt_mask(bus_id(card), dev_id(card),
mask, 0);
-   if (status)
-   dev_info(ctodev(card),
-"%s failed %d\n", __func__, status);
+   if (status) {
+   dev_err(dev, "%s:%d failed: %d\n", __func__, __LINE__, status);
+   }
+
return status;
 }
 
@@ -63,6 +63,7 @@ static void gelic_card_rx_irq_on(struct gelic_card *card)
card->irq_mask |= GELIC_CARD_RXINT;
gelic_card_set_irq_mask(card, card->irq_mask);
 }
+
 static void gelic_card_rx_irq_off(struct gelic_card *card)
 {
card->irq_mask &= ~GELIC_CARD_RXINT;
@@ -70,15 +71,14 @@ static void gelic_card_rx_irq_off(struct gelic_card *card)
 }
 
 static void gelic_card_get_ether_port_status(struct gelic_card *card,
-int inform)
+   int inform)
 {
u64 v2;
struct net_device *ether_netdev;
 
lv1_net_control(bus_id(card), dev_id(card),
-   GELIC_LV1_GET_ETH_PORT_STATUS,
-   GELIC_LV1_VLAN_TX_ETHERNET_0, 0, 0,
-   >ether_port_status, );
+   GELIC_LV1_GET_ETH_PORT_STATUS, GELIC_LV1_VLAN_TX_ETHERNET_0, 0,
+   0, >ether_port_status, );
 
if (inform) {
ether_netdev = card->netdev[GELIC_PORT_ETHERNET_0];
@@ -105,15 +105,17 @@ gelic_descr_get_status(struct gelic_descr *descr)
 
 static int gelic_card_set_link_mode(struct gelic_card *card, int mode)
 {
+   struct device *dev = ctodev(card);
int status;
u64 v1, v2;
 
status = lv1_net_control(bus_id(card), dev_id(card),
-GELIC_LV1_SET_NEGOTIATION_MODE,
-GELIC_LV1_PHY_ETHERNET_0, mode, 0, , );
+   GELIC_LV1_SET_NEGOTIATION_MODE, GELIC_LV1_PHY_ETHERNET_0, mode,
+   0, , );
+
if (status) {
-   pr_info("%s: failed setting negotiation mode %d\n", __func__,
-   status);
+   dev_err(dev, "%s:%d: Failed setting negotiation mode: %d\n",
+   __func__, __LINE__, status);
return -EBUSY;
}
 
@@ -130,13 +132,16 @@ static int gelic_card_set_link_mode(struct gelic_card 
*card, int mode)
  */
 static void gelic_card_disable_txdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;
 
/* this hvc blocks until the DMA in progress really stopped */
status = lv1_net_stop_tx_dma(bus_id(card), dev_id(card));
-   if (status)
-   dev_err(ctodev(card),
-   "lv1_net_stop_tx_dma failed, status=%d\n", status);
+
+   if (status) {
+   dev_err(dev, "%s:%d: lv1_net_stop_tx_dma failed: %d\n",
+   __func__, __LINE__, status);
+   }
 }
 
 /**
@@ -187,13 +192,16 @@ static void gelic_card_enable_rxdmac(struct gelic_card 
*card)
  */
 static void gelic_card_disable_rxdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;
 
/* this hvc blocks until the DMA in progress really stopped */
status = lv1_net_stop_rx_dma(bus_id(card), dev_id(card));
-   if (status)
-   dev_err(ctodev(card),
-   "lv1_net_stop_rx_dma failed, %d\n", status);
+
+   if (status) {
+   dev_err(dev, "%s:%d: lv1_net_stop_rx_dma failed: %d\n",
+   __func__, __LINE__, status);
+   }
 }
 
 /**
@@ -216,6 +224,7 @@ static void gelic_descr_set_status(struct gelic_descr 
*descr,
 * Usually caller of this function wants to inform that to the
 * hardware, so we assure here the hardware sees the change.
 */
+
wmb();
 }
 
@@ -229,8 +238,7 @@ static void gelic_descr_set_status(struct gelic_descr 
*descr,
  * and re-initialize the hardware chain for later use
  */
 static void gelic_card_reset_chain(struct gelic_card *card,
-  struct gelic_descr_chain *chain,
-  struct gelic_descr *start_descr)
+   struct gelic_descr_chain *chain, struct gelic_descr *start_descr)
 {
struct 

Re: [PATCH] powerpc/barrier: Avoid collision with clang's __lwsync macro

2021-06-01 Thread Nick Desaulniers
On Fri, May 28, 2021 at 11:29 AM Nathan Chancellor  wrote:
>
> A change in clang 13 results in the __lwsync macro being defined as
> __builtin_ppc_lwsync, which emits 'lwsync' or 'msync' depending on what
> the target supports.

Indeed,
$ clang --target=powerpc64le-linux-gnu -mcpu=e500 -m32
for example.

Thanks for the patch!
Reviewed-by: Nick Desaulniers 

> This breaks the build because of -Werror in
> arch/powerpc, along with thousands of warnings:
>
>  In file included from arch/powerpc/kernel/pmc.c:12:
>  In file included from include/linux/bug.h:5:
>  In file included from arch/powerpc/include/asm/bug.h:109:
>  In file included from include/asm-generic/bug.h:20:
>  In file included from include/linux/kernel.h:12:
>  In file included from include/linux/bitops.h:32:
>  In file included from arch/powerpc/include/asm/bitops.h:62:
>  arch/powerpc/include/asm/barrier.h:49:9: error: '__lwsync' macro redefined 
> [-Werror,-Wmacro-redefined]
>  #define __lwsync()  __asm__ __volatile__ (stringify_in_c(LWSYNC) : : 
> :"memory")
> ^
>  :308:9: note: previous definition is here
>  #define __lwsync __builtin_ppc_lwsync
> ^
>  1 error generated.
>
> Undefine this macro so that the runtime patching introduced by
> commit 2d1b2027626d ("powerpc: Fixup lwsync at runtime") continues to
> work properly with clang and the build no longer breaks.
>
> Cc: sta...@vger.kernel.org
> Link: https://github.com/ClangBuiltLinux/linux/issues/1386
> Link: 
> https://github.com/llvm/llvm-project/commit/62b5df7fe2b3fda1772befeda15598fbef96a614
> Signed-off-by: Nathan Chancellor 
> ---
>  arch/powerpc/include/asm/barrier.h | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/barrier.h 
> b/arch/powerpc/include/asm/barrier.h
> index 7ae29cfb06c0..f0e687236484 100644
> --- a/arch/powerpc/include/asm/barrier.h
> +++ b/arch/powerpc/include/asm/barrier.h
> @@ -46,6 +46,8 @@
>  #define SMPWMB  eieio
>  #endif
>
> +/* clang defines this macro for a builtin, which will not work with runtime 
> patching */
> +#undef __lwsync
>  #define __lwsync() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : 
> :"memory")
>  #define dma_rmb()  __lwsync()
>  #define dma_wmb()  __asm__ __volatile__ (stringify_in_c(SMPWMB) : : 
> :"memory")
>
> base-commit: 97e5bf604b7a0d6e1b3e00fe31d5fd4b9bffeaae
> --
> 2.32.0.rc0
>


-- 
Thanks,
~Nick Desaulniers


Re: [PATCH v2] lockdown,selinux: avoid bogus SELinux lockdown permission checks

2021-06-01 Thread Paul Moore
On Mon, May 31, 2021 at 4:24 AM Daniel Borkmann  wrote:
> On 5/29/21 8:48 PM, Paul Moore wrote:
> [...]
> > Daniel's patch side steps that worry by just doing the lockdown
> > permission check when the BPF program is loaded, but that isn't a
> > great solution if the policy changes afterward.  I was hoping there
> > might be some way to perform the permission check as needed, but the
> > more I look the more that appears to be difficult, if not impossible
> > (once again, corrections are welcome).
>
> Your observation is correct, will try to clarify below a bit.
>
> > I'm now wondering if the right solution here is to make use of the LSM
> > notifier mechanism.  I'm not yet entirely sure if this would work from
> > a BPF perspective, but I could envision the BPF subsystem registering
> > a LSM notification callback via register_blocking_lsm_notifier(), see
> > if Infiniband code as an example, and then when the LSM(s) policy
> > changes the BPF subsystem would get a notification and it could
> > revalidate the existing BPF programs and take block/remove/whatever
> > the offending BPF programs.  This obviously requires a few things
> > which I'm not sure are easily done, or even possible:
> >
> > 1. Somehow the BPF programs would need to be "marked" at
> > load/verification time with respect to their lockdown requirements so
> > that decisions can be made later.  Perhaps a flag in bpf_prog_aux?
> >
> > 2. While it looks like it should be possible to iterate over all of
> > the loaded BPF programs in the LSM notifier callback via
> > idr_for_each(prog_idr, ...), it is not clear to me if it is possible
> > to safely remove, or somehow disable, BPF programs once they have been
> > loaded.  Hopefully the BPF folks can help answer that question.
> >
> > 3. Disabling of BPF programs might be preferable to removing them
> > entirely on LSM policy changes as it would be possible to make the
> > lockdown state less restrictive at a future point in time, allowing
> > for the BPF program to be executed again.  Once again, not sure if
> > this is even possible.
>
> Part of why this gets really complex/impossible is that BPF programs in
> the kernel are reference counted from various sides, be it that there
> are references from user space to them (fd from application, BPF fs, or
> BPF links), hooks where they are attached to as well as tail call maps
> where one BPF prog calls into another. There is currently also no global
> infra of some sort where you could piggy back to atomically keep track of
> all the references in a list or such. And the other thing is that BPF progs
> have no ownership that is tied to a specific task after they have been
> loaded. Meaning, once they are loaded into the kernel by an application
> and attached to a specific hook, they can remain there potentially until
> reboot of the node, so lifecycle of the user space application != lifecycle
> of the BPF program.

I don't think the disjoint lifecycle or lack of task ownership is a
deal breaker from a LSM perspective as the LSMs can stash whatever
info they need in the security pointer during the program allocation
hook, e.g. selinux_bpf_prog_alloc() saves the security domain which
allocates/loads the BPF program.

The thing I'm worried about would be the case where a LSM policy
change requires that an existing BPF program be removed or disabled.
I'm guessing based on the refcounting that there is not presently a
clean way to remove a BPF program from the system, but is this
something we could resolve?  If we can't safely remove a BPF program
from the system, can we replace/swap it with an empty/NULL BPF
program?

> It's maybe best to compare this aspect to kernel modules in the sense that
> you have an application that loads it into the kernel (insmod, etc, where
> you could also enforce lockdown signature check), but after that, they can
> be managed by other entities as well (implicitly refcounted from kernel,
> removed by other applications, etc).

Well, I guess we could consider BPF programs as out-of-tree kernel
modules that potentially do very odd and dangerous things, e.g.
performing access control checks *inside* access control checks ...
but yeah, I get your point at a basic level, I just think that
comparing BPF programs to kernel modules is a not-so-great comparison
in general.

> My understanding of the lockdown settings are that users have options
> to select/enforce a lockdown level of 
> CONFIG_LOCK_DOWN_KERNEL_FORCE_{INTEGRITY,
> CONFIDENTIALITY} at compilation time, they have a lockdown={integrity|
> confidentiality} boot-time parameter, /sys/kernel/security/lockdown,
> and then more fine-grained policy via 59438b46471a 
> ("security,lockdown,selinux:
> implement SELinux lockdown"). Once you have set a global policy level,
> you cannot revert back to a less strict mode.

I don't recall there being anything in the SELinux lockdown support
that prevents a newly loaded policy from allowing a change in the
lockdown level, 

Re: [PATCH] ASoC: fsl-asoc-card: Set .owner attribute when registering card.

2021-06-01 Thread Mark Brown
On Thu, 27 May 2021 18:34:09 +0200, Nicolas Cavallari wrote:
> Otherwise, when compiled as module, a WARN_ON is triggered:
> 
> WARNING: CPU: 0 PID: 5 at sound/core/init.c:208 snd_card_new+0x310/0x39c [snd]
> [...]
> CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.10.39 #1
> Hardware name: Freescale i.MX6 Quad/DualLite (Device Tree)
> Workqueue: events deferred_probe_work_func
> [] (unwind_backtrace) from [] (show_stack+0x10/0x14)
> [] (show_stack) from [] (dump_stack+0xdc/0x104)
> [] (dump_stack) from [] (__warn+0xd8/0x114)
> [] (__warn) from [] (warn_slowpath_fmt+0x5c/0xc4)
> [] (warn_slowpath_fmt) from [] (snd_card_new+0x310/0x39c 
> [snd])
> [] (snd_card_new [snd]) from [] 
> (snd_soc_bind_card+0x334/0x9c4 [snd_soc_core])
> [] (snd_soc_bind_card [snd_soc_core]) from [] 
> (devm_snd_soc_register_card+0x30/0x6c [snd_soc_core])
> [] (devm_snd_soc_register_card [snd_soc_core]) from [] 
> (fsl_asoc_card_probe+0x550/0xcc8 [snd_soc_fsl_asoc_card])
> [] (fsl_asoc_card_probe [snd_soc_fsl_asoc_card]) from [] 
> (platform_drv_probe+0x48/0x98)
> [...]

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next

Thanks!

[1/1] ASoC: fsl-asoc-card: Set .owner attribute when registering card.
  commit: a8437f05384cb472518ec21bf4fffbe8f0a47378

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark


Re: [PATCH] Documentation PCI: Fix typo in pci-error-recovery.rst

2021-06-01 Thread Bjorn Helgaas
On Mon, May 31, 2021 at 04:12:15PM +0800, Wesley Sheng wrote:
> Replace "It" with "If", since it is a conditional statement.
> 
> Signed-off-by: Wesley Sheng 

Applied to pci/error for v5.14 with Krzysztof's reviewed-by and
subject "Documentation: PCI: Fix typo in pci-error-recovery.rst" to
match previous convention, thanks!

> ---
>  Documentation/PCI/pci-error-recovery.rst | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/Documentation/PCI/pci-error-recovery.rst 
> b/Documentation/PCI/pci-error-recovery.rst
> index 84ceebb08cac..187f43a03200 100644
> --- a/Documentation/PCI/pci-error-recovery.rst
> +++ b/Documentation/PCI/pci-error-recovery.rst
> @@ -295,7 +295,7 @@ and let the driver restart normal I/O processing.
>  A driver can still return a critical failure for this function if
>  it can't get the device operational after reset.  If the platform
>  previously tried a soft reset, it might now try a hard reset (power
> -cycle) and then call slot_reset() again.  It the device still can't
> +cycle) and then call slot_reset() again.  If the device still can't
>  be recovered, there is nothing more that can be done;  the platform
>  will typically report a "permanent failure" in such a case.  The
>  device will be considered "dead" in this case.
> -- 
> 2.25.1
> 


Re: [RFC PATCH 1/6] trace/stack: Move code to save the stack trace into a separate function

2021-06-01 Thread Steven Rostedt
On Fri, 21 May 2021 12:18:36 +0530
"Naveen N. Rao"  wrote:

> In preparation to add support for stack tracer to powerpc, move code to
> save stack trace and to calculate the frame sizes into a separate weak
> function. Also provide access to some of the data structures used by the
> stack trace code so that architectures can update those.
> 
> Signed-off-by: Naveen N. Rao 
> ---
>  include/linux/ftrace.h |  8 
>  kernel/trace/trace_stack.c | 98 --
>  2 files changed, 60 insertions(+), 46 deletions(-)
> 
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index a69f363b61bf73..8263427379f05c 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -368,10 +368,18 @@ static inline void arch_ftrace_set_direct_caller(struct 
> pt_regs *regs,
>  
>  #ifdef CONFIG_STACK_TRACER
>  
> +#define STACK_TRACE_ENTRIES 500
> +
> +extern unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
> +extern unsigned stack_trace_index[STACK_TRACE_ENTRIES];
> +extern unsigned int stack_trace_nr_entries;
> +extern unsigned long stack_trace_max_size;
>  extern int stack_tracer_enabled;
>  
>  int stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
>  size_t *lenp, loff_t *ppos);
> +void stack_get_trace(unsigned long traced_ip, unsigned long *stack_ref,
> + unsigned long stack_size, int 
> *tracer_frame);
>  
>  /* DO NOT MODIFY THIS VARIABLE DIRECTLY! */
>  DECLARE_PER_CPU(int, disable_stack_tracer);
> diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
> index 63c28504205162..5b63dbd37c8c25 100644
> --- a/kernel/trace/trace_stack.c
> +++ b/kernel/trace/trace_stack.c
> @@ -19,13 +19,11 @@
>  
>  #include "trace.h"
>  
> -#define STACK_TRACE_ENTRIES 500
> +unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
> +unsigned stack_trace_index[STACK_TRACE_ENTRIES];
>  
> -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
> -static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
> -
> -static unsigned int stack_trace_nr_entries;
> -static unsigned long stack_trace_max_size;
> +unsigned int stack_trace_nr_entries;
> +unsigned long stack_trace_max_size;
>  static arch_spinlock_t stack_trace_max_lock =
>   (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
>  
> @@ -152,49 +150,19 @@ static void print_max_stack(void)
>   * Although the entry function is not displayed, the first function (sys_foo)
>   * will still include the stack size of it.
>   */
> -static void check_stack(unsigned long ip, unsigned long *stack)

I just got back from PTO and have a ton of other obligations to attend
to before I can dig deeper into this. I'm not opposed to this change,
but the stack_tracer has not been getting the love that it deserves and
I think you hit one of the issues that needs to be addressed. I'm not
sure this is a PPC only issue, and would like to see if I can get more
time (or someone else can) to reevaluate the way stack tracer works,
and see if it can be made a bit more robust.

-- Steve


Re: [PATCH] powerpc: make show_stack's stack walking KASAN-safe

2021-06-01 Thread Daniel Axtens
"Naveen N. Rao"  writes:

> Daniel Axtens wrote:
>> Make our stack-walking code KASAN-safe by using READ_ONCE_NOCHECK -
>> generic code, arm64, s390 and x86 all do this for similar sorts of
>> reasons: when unwinding a stack, we might touch memory that KASAN has
>> marked as being out-of-bounds. In ppc64 KASAN development, I hit this
>> sometimes when checking for an exception frame - because we're checking
>> an arbitrary offset into the stack frame.
>> 
>> See commit 20955746320e ("s390/kasan: avoid false positives during stack
>> unwind"), commit bcaf669b4bdb ("arm64: disable kasan when accessing
>> frame->fp in unwind_frame"), commit 91e08ab0c851 ("x86/dumpstack:
>> Prevent KASAN false positive warnings") and commit 6e22c8366416
>> ("tracing, kasan: Silence Kasan warning in check_stack of stack_tracer").
>> 
>> Signed-off-by: Daniel Axtens 
>> ---
>>  arch/powerpc/kernel/process.c | 16 +---
>>  1 file changed, 9 insertions(+), 7 deletions(-)
>> 
>> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
>> index 89e34aa273e2..430cf06f9406 100644
>> --- a/arch/powerpc/kernel/process.c
>> +++ b/arch/powerpc/kernel/process.c
>> @@ -2151,8 +2151,8 @@ void show_stack(struct task_struct *tsk, unsigned long 
>> *stack,
>>  break;
>>  
>>  stack = (unsigned long *) sp;
>> -newsp = stack[0];
>> -ip = stack[STACK_FRAME_LR_SAVE];
>> +newsp = READ_ONCE_NOCHECK(stack[0]);
>> +ip = READ_ONCE_NOCHECK(stack[STACK_FRAME_LR_SAVE]);
>
> Just curious:
> Given that we validate the stack pointer before these accesses, can we 
> annotate show_stack() with __no_sanitize_address instead?
>
> I ask because we have other places where we walk the stack: 
> arch_stack_walk(), as well as in perf callchain. Similar changes will be 
> needed there as well.

Oh good points. Yes, it probably makes most sense to mark all the
functions with __no_sanitize_address, that resolves Christophe's issue
as well. I'll send a v2.

Kind regards,
Daniel

>
>
> - Naveen


Re: [RFC PATCH 2/6] powerpc/trace: Add support for stack tracer

2021-06-01 Thread Naveen N. Rao

Naveen N. Rao wrote:

+
+unsigned long ftrace_get_traced_func_if_no_stackframe(unsigned long ip, 
unsigned long *stack)
+{
+   if (!is_ftrace_entry(ip))
+   return 0;
+
+   if (IS_ENABLED(CONFIG_PPC32))
+   return stack[11]; /* see MCOUNT_SAVE_FRAME */
+
+   if (!IS_ENABLED(CONFIG_MPROFILE_KERNEL))
+   return 0;
+
+   return stack[(STACK_FRAME_OVERHEAD + offsetof(struct pt_regs, nip)) / 
sizeof(unsigned long)];


Looking at Daniel's patch to address KASAN errors with our stack walk 
code in show_stack() [*], I realized that I am not validating the stack 
pointer here for the above accesses...


[*] http://lkml.kernel.org/r/20210528074806.1311297-1-...@axtens.net


+}
+
+#ifdef CONFIG_STACK_TRACER
+void stack_get_trace(unsigned long traced_ip,
+unsigned long *stack_ref __maybe_unused,
+unsigned long stack_size __maybe_unused,
+int *tracer_frame)
+{
+   unsigned long sp, newsp, top, ip;
+   int ftrace_call_found = 0;
+   unsigned long *stack;
+   int i = 0;
+
+   sp = current_stack_frame();
+   top = (unsigned long)task_stack_page(current) + THREAD_SIZE;
+
+   while (validate_sp(sp, current, STACK_FRAME_OVERHEAD) && i < 
STACK_TRACE_ENTRIES) {
+   stack = (unsigned long *) sp;
+   newsp = stack[0];
+   ip = stack[STACK_FRAME_LR_SAVE];
+
+   if (ftrace_call_found) {
+   stack_dump_trace[i] = ip;
+   stack_trace_index[i++] = top - sp;
+   }


And I need to make the above accesses bypass KASAN as well.


- Naveen



Re: simplify gendisk and request_queue allocation for bio based drivers

2021-06-01 Thread Jens Axboe
On 5/20/21 11:50 PM, Christoph Hellwig wrote:
> Hi all,
> 
> this series is the first part of cleaning up lifetimes and allocation of
> the gendisk and request_queue structure.  It adds a new interface to
> allocate the disk and queue together for bio based drivers, and a helper
> for cleanup/free them when a driver is unloaded or a device is removed.
> 
> Together this removes the need to treat the gendisk and request_queue
> as separate entities for bio based drivers.

Applied, thanks.

-- 
Jens Axboe



Re: [PATCH] powerpc: make show_stack's stack walking KASAN-safe

2021-06-01 Thread Naveen N. Rao

Daniel Axtens wrote:

Make our stack-walking code KASAN-safe by using READ_ONCE_NOCHECK -
generic code, arm64, s390 and x86 all do this for similar sorts of
reasons: when unwinding a stack, we might touch memory that KASAN has
marked as being out-of-bounds. In ppc64 KASAN development, I hit this
sometimes when checking for an exception frame - because we're checking
an arbitrary offset into the stack frame.

See commit 20955746320e ("s390/kasan: avoid false positives during stack
unwind"), commit bcaf669b4bdb ("arm64: disable kasan when accessing
frame->fp in unwind_frame"), commit 91e08ab0c851 ("x86/dumpstack:
Prevent KASAN false positive warnings") and commit 6e22c8366416
("tracing, kasan: Silence Kasan warning in check_stack of stack_tracer").

Signed-off-by: Daniel Axtens 
---
 arch/powerpc/kernel/process.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 89e34aa273e2..430cf06f9406 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2151,8 +2151,8 @@ void show_stack(struct task_struct *tsk, unsigned long 
*stack,
break;
 
 		stack = (unsigned long *) sp;

-   newsp = stack[0];
-   ip = stack[STACK_FRAME_LR_SAVE];
+   newsp = READ_ONCE_NOCHECK(stack[0]);
+   ip = READ_ONCE_NOCHECK(stack[STACK_FRAME_LR_SAVE]);


Just curious:
Given that we validate the stack pointer before these accesses, can we 
annotate show_stack() with __no_sanitize_address instead?


I ask because we have other places where we walk the stack: 
arch_stack_walk(), as well as in perf callchain. Similar changes will be 
needed there as well.



- Naveen



[PATCH -next] powerpc/spufs: disp: Remove set but not used variable 'dummy'

2021-06-01 Thread Baokun Li
Fixes gcc '-Wunused-but-set-variable' warning:

arch/powerpc/platforms/cell/spufs/switch.c: In function 'check_ppu_mb_stat':
arch/powerpc/platforms/cell/spufs/switch.c:1660:6: warning:
variable ‘dummy’ set but not used [-Wunused-but-set-variable]

arch/powerpc/platforms/cell/spufs/switch.c: In function 'check_ppuint_mb_stat':
arch/powerpc/platforms/cell/spufs/switch.c:1675:6: warning:
variable ‘dummy’ set but not used [-Wunused-but-set-variable]

It has never been used since its introduction.

Signed-off-by: Baokun Li 
---
 arch/powerpc/platforms/cell/spufs/switch.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/switch.c 
b/arch/powerpc/platforms/cell/spufs/switch.c
index d56b4e3241cd..b41e81b22fdc 100644
--- a/arch/powerpc/platforms/cell/spufs/switch.c
+++ b/arch/powerpc/platforms/cell/spufs/switch.c
@@ -1657,14 +1657,13 @@ static inline void restore_spu_mb(struct spu_state 
*csa, struct spu *spu)
 static inline void check_ppu_mb_stat(struct spu_state *csa, struct spu *spu)
 {
struct spu_problem __iomem *prob = spu->problem;
-   u32 dummy = 0;
 
/* Restore, Step 66:
 * If CSA.MB_Stat[P]=0 (mailbox empty) then
 * read from the PPU_MB register.
 */
if ((csa->prob.mb_stat_R & 0xFF) == 0) {
-   dummy = in_be32(&prob->pu_mb_R);
+   in_be32(&prob->pu_mb_R);
eieio();
}
 }
@@ -1672,14 +1671,13 @@ static inline void check_ppu_mb_stat(struct spu_state 
*csa, struct spu *spu)
 static inline void check_ppuint_mb_stat(struct spu_state *csa, struct spu *spu)
 {
struct spu_priv2 __iomem *priv2 = spu->priv2;
-   u64 dummy = 0UL;
 
/* Restore, Step 66:
 * If CSA.MB_Stat[I]=0 (mailbox empty) then
 * read from the PPUINT_MB register.
 */
if ((csa->prob.mb_stat_R & 0xFF) == 0) {
-   dummy = in_be64(&priv2->puint_mb_R);
+   in_be64(&priv2->puint_mb_R);
eieio();
spu_int_stat_clear(spu, 2, CLASS2_ENABLE_MAILBOX_INTR);
eieio();
-- 
2.31.1



[PATCH -next] powerpc/spider-pci: Remove set but not used variable 'val'

2021-06-01 Thread Baokun Li
Fixes gcc '-Wunused-but-set-variable' warning:

arch/powerpc/platforms/cell/spider-pci.c: In function 'spiderpci_io_flush':
arch/powerpc/platforms/cell/spider-pci.c:28:6: warning:
variable ‘val’ set but not used [-Wunused-but-set-variable]

It has never been used since its introduction.

Signed-off-by: Baokun Li 
---
 arch/powerpc/platforms/cell/spider-pci.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spider-pci.c 
b/arch/powerpc/platforms/cell/spider-pci.c
index 93ea41680f54..a1c293f42a1f 100644
--- a/arch/powerpc/platforms/cell/spider-pci.c
+++ b/arch/powerpc/platforms/cell/spider-pci.c
@@ -25,10 +25,9 @@ struct spiderpci_iowa_private {
 static void spiderpci_io_flush(struct iowa_bus *bus)
 {
struct spiderpci_iowa_private *priv;
-   u32 val;
 
priv = bus->private;
-   val = in_be32(priv->regs + SPIDER_PCI_DUMMY_READ);
+   in_be32(priv->regs + SPIDER_PCI_DUMMY_READ);
iosync();
 }
 
-- 
2.31.1



Re: [PATCH -next] powerpc/spufs: disp: Remove set but not used variable 'dummy'

2021-06-01 Thread Arnd Bergmann
On Tue, Jun 1, 2021 at 10:43 AM Baokun Li  wrote:
>
> Fixes gcc '-Wunused-but-set-variable' warning:
>
> arch/powerpc/platforms/cell/spufs/switch.c: In function 'check_ppu_mb_stat':
> arch/powerpc/platforms/cell/spufs/switch.c:1660:6: warning:
> variable ‘dummy’ set but not used [-Wunused-but-set-variable]
>
> arch/powerpc/platforms/cell/spufs/switch.c: In function 
> 'check_ppuint_mb_stat':
> arch/powerpc/platforms/cell/spufs/switch.c:1675:6: warning:
> variable ‘dummy’ set but not used [-Wunused-but-set-variable]
>
> It never used since introduction.
>
> Signed-off-by: Baokun Li 

I think in_be64() used to cause a different warning if the result was
ignored, but this is no longer the case.

Acked-by: Arnd Bergmann 


[PATCH] powerpc/xmon: Add support for running a command on all cpus in xmon

2021-06-01 Thread Naveen N. Rao
It is sometimes desirable to run a command on all cpus in xmon. A
typical scenario is to obtain the backtrace from all cpus in xmon if
there is a soft lockup. Add rudimentary support for the same. The
command to be run on all cpus should be prefixed with 'c#'. As an
example, 'c#t' will run 't' command and produce a backtrace on all cpus
in xmon.

Since many xmon commands are not sensible for running in this manner, we
only allow a predefined list of commands -- 'r', 'S' and 't' for now.

Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/xmon/xmon.c | 148 +--
 1 file changed, 126 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index c8173e92f19d7b..a178b6654e3aec 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -70,6 +70,9 @@ static cpumask_t cpus_in_xmon = CPU_MASK_NONE;
 static unsigned long xmon_taken = 1;
 static int xmon_owner;
 static int xmon_gate;
+static int xmon_batch;
+static unsigned long xmon_batch_start_cpu;
+static cpumask_t xmon_batch_cpus = CPU_MASK_NONE;
 #else
 #define xmon_owner 0
 #endif /* CONFIG_SMP */
@@ -133,6 +136,12 @@ static void prdump(unsigned long, long);
 static int ppc_inst_dump(unsigned long, long, int);
 static void dump_log_buf(void);
 
+#ifdef CONFIG_SMP
+static int xmon_switch_cpu(unsigned long);
+static int xmon_batch_next_cpu(void);
+static int batch_cmds(struct pt_regs *);
+#endif
+
 #ifdef CONFIG_PPC_POWERNV
 static void dump_opal_msglog(void);
 #else
@@ -216,7 +225,8 @@ Commands:\n\
 #ifdef CONFIG_SMP
   "\
   cprint cpus stopped in xmon\n\
-  c#   try to switch to cpu number h (in hex)\n"
+  c#   try to switch to cpu number h (in hex)\n\
+  c# $ run command '$' (one of 'r','S' or 't') on all cpus in xmon\n"
 #endif
   "\
   Cchecksum\n\
@@ -644,7 +654,12 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
spin_cpu_relax();
touch_nmi_watchdog();
} else {
-   if (!locked_down)
+   cmd = 1;
+#ifdef CONFIG_SMP
+   if (xmon_batch)
+   cmd = batch_cmds(regs);
+#endif
+   if (!locked_down && cmd)
cmd = cmds(regs);
if (locked_down || cmd != 0) {
/* exiting xmon */
@@ -1243,11 +1258,112 @@ static void bootcmds(void)
}
 }
 
+#ifdef CONFIG_SMP
+static int xmon_switch_cpu(unsigned long cpu)
+{
+   int timeout;
+
+   xmon_taken = 0;
+   mb();
+   xmon_owner = cpu;
+   timeout = 1000;
+   while (!xmon_taken) {
+   if (--timeout == 0) {
+   if (test_and_set_bit(0, &xmon_taken))
+   break;
+   /* take control back */
+   mb();
+   xmon_owner = smp_processor_id();
+   printf("cpu 0x%lx didn't take control\n", cpu);
+   return 0;
+   }
+   barrier();
+   }
+   return 1;
+}
+
+static int xmon_batch_next_cpu(void)
+{
+   unsigned long cpu;
+
+   while (!cpumask_empty(&xmon_batch_cpus)) {
+   cpu = cpumask_next_wrap(smp_processor_id(), &xmon_batch_cpus,
+   xmon_batch_start_cpu, true);
+   if (cpu == nr_cpumask_bits)
+   break;
+   if (xmon_batch_start_cpu == -1)
+   xmon_batch_start_cpu = cpu;
+   if (xmon_switch_cpu(cpu))
+   return 0;
+   cpumask_clear_cpu(cpu, &xmon_batch_cpus);
+   }
+
+   xmon_batch = 0;
+   printf("%x:mon> \n", smp_processor_id());
+   return 1;
+}
+
+static int batch_cmds(struct pt_regs *excp)
+{
+   int cmd;
+
+   /* simulate command entry */
+   cmd = xmon_batch;
+   termch = '\n';
+
+   last_cmd = NULL;
+   xmon_regs = excp;
+
+   printf("%x:", smp_processor_id());
+   printf("mon> ");
+   printf("%c\n", (char)cmd);
+
+   switch (cmd) {
+   case 'r':
+   prregs(excp);   /* print regs */
+   break;
+   case 'S':
+   super_regs();
+   break;
+   case 't':
+   backtrace(excp);
+   break;
+   }
+
+   cpumask_clear_cpu(smp_processor_id(), &xmon_batch_cpus);
+
+   return xmon_batch_next_cpu();
+}
+
 static int cpu_cmd(void)
 {
-#ifdef CONFIG_SMP
unsigned long cpu, first_cpu, last_cpu;
-   int timeout;
+
+   cpu = skipbl();
+   if (cpu == '#') {
+   xmon_batch = skipbl();
+   if (xmon_batch) {
+   switch (xmon_batch) {
+   case 'r':
+   case 'S':
+   case 't':
+   cpumask_copy(&xmon_batch_cpus, &cpus_in_xmon);
+   

Re: [PATCH] Revert "powerpc: Switch to relative jump labels"

2021-06-01 Thread Michael Ellerman
Roman Bolshakov  writes:
> On Sat, May 29, 2021 at 09:39:49AM +1000, Michael Ellerman wrote:
>> Roman Bolshakov  writes:
>> > This reverts commit b0b3b2c78ec075cec4721986a95abbbac8c3da4f.
>> >
>> > Otherwise, direct kernel boot with initramfs no longer works in QEMU.
>> > It's broken in some bizarre way because a valid initramfs is not
>> > recognized anymore:
>> >
>> >   Found initrd at 0xc1f7:0xc3d61d64
>> >   rootfs image is not initramfs (XZ-compressed data is corrupt); looks 
>> > like an initrd
>> >
>> > The issue is observed on v5.13-rc3 if the kernel is built with
>> > defconfig, GCC 7.5.0 and GNU ld 2.32.0.
>> 
>> Are you able to try a different compiler?
>
> Hi Michael,
>
> I've just tried GCC 9.3.1 and the result is the same.
>
> The offending patch has assembly inlines, they typically go through
> binutils/GAS and it might also be a case when older binutils doesn't
> implement something properly (i've seen this on x86 and arm).

Jump labels use asm goto, which is a compiler feature, but you're right
that the binutils version could also be important.

What ld versions have you tried?

And are those the toolchains from kernel.org or somewhere else?

>> I test booting qemu constantly, but I don't use GCC 7.5.
>>
>> And what qemu version are you using?
>> 
>
> QEMU 3.1.1, but I've also tried 6.0.50 (QEMU master, 62c0ac5041e913) and
> it fails the same way.

OK.

>> I assume your initramfs is compressed with XZ? How large is it
>> compressed?
>> 
>
> Yes, XZ. initramfs size is 30 MB (around 100 MB cpio size).
>
> It's interesting that the issue doesn't happen if I pass initramfs from
> host (11MB), then the initramfs can be recognized. It might be related
to initramfs size then, and a bigger initramfs that used to work no longer
works with v5.13-rc3.

Are you using qemu's -initrd option to pass the initramfs, or are you
building the initramfs into the kernel?

> So, I've created a small initramfs using only static busybox (2.7M
> uncompressed, 960K compressed with xz). No error is produced and it
> boots fine.
>
> If I add a dummy file (11M off /dev/urandom) to the small busybox
> initramfs, it boots and the init is started but I'm seeing the error:
>
>   rootfs image is not initramfs (XZ-compressed data is corrupt); looks like 
> an initrd
>
> sha1sum of the file inside initramfs doesn't match sha1sum on the host.
>
>   guest # sha1sum dummy
>   407c347e671ddd00f69df12b3368048bad0ebf0c  dummy
>   # QEMU: Terminated
>   host $ sha1sum dummy
>   ed8494b3eecab804960ceba2c497270eed0b0cd1  dummy
>
> sha1sum is the same in the guest and on the host for 10M dummy file:
>
>   guest # sha1sum dummy
>   43855f7a772a28cce91da9eb8f86f53bc807631f  dummy
>   # QEMU: Terminated
>   host $ sha1sum dummy
>   43855f7a772a28cce91da9eb8f86f53bc807631f  dummy
>
> That might explain why bigger initramfs (or initramfs with bigger files)
> doesn't boot - because some files might appear corrupted inside the guest.
>
> Here're the sources of the initrd along with 11M dummy file:
>   https://drive.yadro.com/s/W8HdbPnaKmPPwK4
>
> I've compressed it with:
>   $ find . 2>/dev/null | cpio -ocR 0:0 | xz  --check=crc32 > 
> ../initrd-dummy.xz
>
> Hope this helps,

I haven't been able to reproduce any corruption, with various initramfs
sizes.

Can you send us your kernel .config & qemu command line.

And can you try the patch below?

cheers


diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c
index ce87dc5ea23c..3d9878124cde 100644
--- a/arch/powerpc/kernel/jump_label.c
+++ b/arch/powerpc/kernel/jump_label.c
@@ -13,6 +13,9 @@ void arch_jump_label_transform(struct jump_entry *entry,
 {
struct ppc_inst *addr = (struct ppc_inst *)jump_entry_code(entry);
 
+   if (!is_kernel_text((unsigned long)addr) && 
!is_kernel_inittext((unsigned long)addr))
+   printk("%s: addr %px %pS is not kernel text?\n", __func__, 
addr, addr);
+
if (type == JUMP_LABEL_JMP)
patch_branch(addr, jump_entry_target(entry), 0);
else


Re: [PATCH] powerpc: make show_stack's stack walking KASAN-safe

2021-06-01 Thread Christophe Leroy




Le 28/05/2021 à 09:48, Daniel Axtens a écrit :

Make our stack-walking code KASAN-safe by using READ_ONCE_NOCHECK -
generic code, arm64, s390 and x86 all do this for similar sorts of
reasons: when unwinding a stack, we might touch memory that KASAN has
marked as being out-of-bounds. In ppc64 KASAN development, I hit this
sometimes when checking for an exception frame - because we're checking
an arbitrary offset into the stack frame.

See commit 20955746320e ("s390/kasan: avoid false positives during stack
unwind"), commit bcaf669b4bdb ("arm64: disable kasan when accessing
frame->fp in unwind_frame"), commit 91e08ab0c851 ("x86/dumpstack:
Prevent KASAN false positive warnings") and commit 6e22c8366416
("tracing, kasan: Silence Kasan warning in check_stack of stack_tracer").

Signed-off-by: Daniel Axtens 
---
  arch/powerpc/kernel/process.c | 16 +---
  1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 89e34aa273e2..430cf06f9406 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2151,8 +2151,8 @@ void show_stack(struct task_struct *tsk, unsigned long 
*stack,
break;
  
  		stack = (unsigned long *) sp;

-   newsp = stack[0];
-   ip = stack[STACK_FRAME_LR_SAVE];
+   newsp = READ_ONCE_NOCHECK(stack[0]);
+   ip = READ_ONCE_NOCHECK(stack[STACK_FRAME_LR_SAVE]);
if (!firstframe || ip != lr) {
printk("%s["REG"] ["REG"] %pS",
loglvl, sp, ip, (void *)ip);
@@ -2170,17 +2170,19 @@ void show_stack(struct task_struct *tsk, unsigned long 
*stack,
 * See if this is an exception frame.
 * We look for the "regshere" marker in the current frame.
 */
-   if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS)
-   && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
+   if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS) &&
+   (READ_ONCE_NOCHECK(stack[STACK_FRAME_MARKER]) ==
+STACK_FRAME_REGS_MARKER)) {
struct pt_regs *regs = (struct pt_regs *)
(sp + STACK_FRAME_OVERHEAD);
  
-			lr = regs->link;

+   lr = READ_ONCE_NOCHECK(regs->link);
printk("%s--- interrupt: %lx at %pS\n",
-  loglvl, regs->trap, (void *)regs->nip);
+  loglvl, READ_ONCE_NOCHECK(regs->trap),
+  (void *)READ_ONCE_NOCHECK(regs->nip));
__show_regs(regs);
printk("%s--- interrupt: %lx\n",
-  loglvl, regs->trap);
+  loglvl, READ_ONCE_NOCHECK(regs->trap));


Actually you read regs->trap twice now. Can you use a local var and really read 
it only once ?

  
  			firstframe = 1;

}



[PATCH] selftests/powerpc: Remove the repeated declaration

2021-06-01 Thread Shaokun Zhang
Function 'event_ebb_init' and 'event_leader_ebb_init' are declared
twice in the header file, so remove the repeated declaration.

Cc: Michael Ellerman 
Signed-off-by: Shaokun Zhang 
---
 tools/testing/selftests/powerpc/pmu/ebb/ebb.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.h 
b/tools/testing/selftests/powerpc/pmu/ebb/ebb.h
index b5bc2b616075..2c803b5b48d6 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.h
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.h
@@ -55,8 +55,6 @@ void ebb_global_disable(void);
 bool ebb_is_supported(void);
 void ebb_freeze_pmcs(void);
 void ebb_unfreeze_pmcs(void);
-void event_ebb_init(struct event *e);
-void event_leader_ebb_init(struct event *e);
 int count_pmc(int pmc, uint32_t sample_period);
 void dump_ebb_state(void);
 void dump_summary_ebb_state(void);
-- 
2.7.4



[PATCH v3 4/4] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN

2021-06-01 Thread Nicholas Piggin
On a 16-socket 192-core POWER8 system, a context switching benchmark
with as many software threads as CPUs (so each switch will go in and
out of idle), upstream can achieve a rate of about 1 million context
switches per second. After this patch it goes up to 118 million.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 088dd2afcfe4..8a092eedc692 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -252,6 +252,7 @@ config PPC
select IRQ_FORCED_THREADING
select MMU_GATHER_PAGE_SIZE
select MMU_GATHER_RCU_TABLE_FREE
+   select MMU_LAZY_TLB_SHOOTDOWN   if PPC_BOOK3S_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE   if PPC64 || NOT_COHERENT_CACHE
select NEED_SG_DMA_LENGTH
-- 
2.23.0



[PATCH v3 3/4] lazy tlb: shoot lazies, a non-refcounting lazy tlb option

2021-06-01 Thread Nicholas Piggin
On big systems, the mm refcount can become highly contended when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).

Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.

Shootdown IPIs are some concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.

Signed-off-by: Nicholas Piggin 
---
 arch/Kconfig  | 14 +-
 kernel/fork.c | 52 +++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 276e1c1c0219..91e1882e3284 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -439,11 +439,23 @@ config NO_MMU_LAZY_TLB
def_bool n
 
 # Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
-# For now, this must be enabled if MMU_LAZY_TLB is enabled.
 config MMU_LAZY_TLB_REFCOUNT
def_bool y
depends on MMU_LAZY_TLB
 
+# Instead of refcounting the lazy mm struct for kernel thread references
+# (which can cause contention with multi-threaded apps on large multiprocessor
+# systems), this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+# switch to init_mm if they were using the to-be-freed mm as the lazy tlb. To
+# implement this, architectures must use _lazy_tlb variants of mm refcounting
+# when releasing kernel thread mm references, and mm_cpumask must include at
+# least all possible CPUs in which the mm might be lazy, at the time of the
+# final mmdrop. mmgrab/mmdrop in arch/ code must be switched to _lazy_tlb
+# postfix as necessary.
+config MMU_LAZY_TLB_SHOOTDOWN
+   bool
+   depends on MMU_LAZY_TLB
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
 
diff --git a/kernel/fork.c b/kernel/fork.c
index dc06afd725cb..d485c24426a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -674,6 +674,53 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+   struct mm_struct *mm = arg;
+
+   if (current->active_mm == mm) {
+   WARN_ON_ONCE(current->mm);
+   current->active_mm = &init_mm;
+   switch_mm(mm, &init_mm, current);
+   }
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+   struct mm_struct *mm = arg;
+
+   WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+   if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+   /*
+* IPI overheads have not been found to be expensive, but they could
+* be reduced in a number of possible ways, for example (in
+* roughly increasing order of complexity):
+* - A batch of mms requiring IPIs could be gathered and freed
+*   at once.
+* - CPUs could store their active mm somewhere that can be
+*   remotely checked without a lock, to filter out
+*   false-positives in the cpumask.
+* - After mm_users or mm_count reaches zero, switching away
+*   from the mm could clear mm_cpumask to reduce some IPIs
+*   (some batching or delaying would help).
+* - A delayed freeing and RCU-like quiescing sequence based on
+*   mm switching to avoid IPIs completely.
+*/
+   on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+   if (IS_ENABLED(CONFIG_DEBUG_VM))
+   on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+   } else {
+   /*
+* In this case, lazy tlb mms are refcounted and would not reach
+* __mmdrop until all CPUs have switched away and mmdrop()ed.
+*/
+   }
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -683,7 +730,12 @@ void __mmdrop(struct mm_struct *mm)
 {
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+   /* Ensure no CPUs are using this as their lazy tlb mm */
+   shoot_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
+
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
-- 
2.23.0



[PATCH v3 2/4] lazy tlb: allow lazy tlb mm switching to be configurable

2021-06-01 Thread Nicholas Piggin
Add CONFIG_MMU_LAZY_TLB which can be configured out to disable the lazy
tlb mechanism entirely, and switches to init_mm when switching to a
kernel thread.

NOMMU systems could easily go without this and save a bit of code and
the refcount atomics, because their mm switch is a no-op. They have not
been switched over by default because the arch code needs to be audited
and tested for lazy tlb mm refcounting and converted to _lazy_tlb
refcounting if necessary.

CONFIG_MMU_LAZY_TLB_REFCOUNT is also added, but it must always be
enabled if CONFIG_MMU_LAZY_TLB is enabled until the next patch which
provides an alternate scheme.

Signed-off-by: Nicholas Piggin 
---
 arch/Kconfig | 26 ++
 include/linux/sched/mm.h | 13 +--
 kernel/sched/core.c  | 75 ++--
 kernel/sched/sched.h |  4 ++-
 4 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index c45b770d3579..276e1c1c0219 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -418,6 +418,32 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
  irqs disabled over activate_mm. Architectures that do IPI based TLB
  shootdowns should enable this.
 
+# Enable "lazy TLB", which means a user->kernel thread context switch does not
+# switch the mm to init_mm and the kernel thread takes a reference to the user
+# mm to provide its kernel mapping. This is how Linux has traditionally worked
+# (see Documentation/vm/active_mm.rst), for performance. Switching to and from
+# idle thread is a performance-critical case.
+#
+# If mm context switches are inexpensive or free (in the case of NOMMU) then
+# this could be disabled.
+#
+# It would make sense to have this depend on MMU, but need to audit and test
+# the NOMMU architectures for lazy mm refcounting first.
+config MMU_LAZY_TLB
+   def_bool y
+   depends on !NO_MMU_LAZY_TLB
+
+# This allows archs to disable MMU_LAZY_TLB. mmgrab/mmdrop in arch/ code has
+# to be audited and switched to _lazy_tlb postfix as necessary.
+config NO_MMU_LAZY_TLB
+   def_bool n
+
+# Use normal mm refcounting for MMU_LAZY_TLB kernel thread references.
+# For now, this must be enabled if MMU_LAZY_TLB is enabled.
+config MMU_LAZY_TLB_REFCOUNT
+   def_bool y
+   depends on MMU_LAZY_TLB
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index bfd1baca5266..29e4638ad124 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -52,12 +52,21 @@ static inline void mmdrop(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-   mmgrab(mm);
+   if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+   mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-   mmdrop(mm);
+   if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+   mmdrop(mm);
+   } else {
+   /*
+* mmdrop_lazy_tlb must provide a full memory barrier, see the
+* membarrier comment in finish_task_switch, which relies on this.
+*/
+   smp_mb();
+   }
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e359c76ea2e2..299c3eb12b2b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4171,7 +4171,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
 {
struct rq *rq = this_rq();
-   struct mm_struct *mm = rq->prev_mm;
+   struct mm_struct *mm = NULL;
long prev_state;
 
/*
@@ -4190,7 +4190,10 @@ static struct rq *finish_task_switch(struct task_struct 
*prev)
  current->comm, current->pid, preempt_count()))
preempt_count_set(FORK_PREEMPT_COUNT);
 
-   rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+   mm = rq->prev_lazy_mm;
+   rq->prev_lazy_mm = NULL;
+#endif
 
/*
 * A task struct has one reference for the use as "current".
@@ -4282,22 +4285,10 @@ asmlinkage __visible void schedule_tail(struct 
task_struct *prev)
calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-  struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+  struct task_struct *next)
 {
-   prepare_task_switch(rq, prev, next);
-
-   /*
-* For paravirt, this is coupled with an exit in switch_to to
-* combine the page table reload and the switch backend into
-* one hypercall.
-*/
-   arch_start_context_switch(prev);
-
/*
 * kernel -> kernel   lazy + transfer active
 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -4326,11 

[PATCH v3 1/4] lazy tlb: introduce lazy mm refcount helper functions

2021-06-01 Thread Nicholas Piggin
Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes lazy mm references more obvious, and allows explicit
refcounting to be removed if it is not used.

Signed-off-by: Nicholas Piggin 
---
 arch/arm/mach-rpc/ecard.c|  2 +-
 arch/powerpc/kernel/smp.c|  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c|  4 ++--
 include/linux/sched/mm.h | 11 +++
 kernel/cpu.c |  2 +-
 kernel/exit.c|  2 +-
 kernel/kthread.c | 11 +++
 kernel/sched/core.c  | 15 ---
 9 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..1b4a41aad793 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
current->mm = mm;
current->active_mm = mm;
activate_mm(active_mm, mm);
-   mmdrop(active_mm);
+   mmdrop_lazy_tlb(active_mm);
ecard_init_pgtables(mm);
return 0;
 }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2e05c783440a..fb0bdfc67366 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1541,7 +1541,7 @@ void start_secondary(void *unused)
 {
unsigned int cpu = raw_smp_processor_id();
 
-   mmgrab(&init_mm);
+   mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
 
smp_store_cpu_info(cpu);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index 409e61210789..2962082787c0 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -663,10 +663,10 @@ void exit_lazy_flush_tlb(struct mm_struct *mm, bool 
always_flush)
if (current->active_mm == mm) {
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
-   mmgrab(&init_mm);
+   mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
-   mmdrop(mm);
+   mmdrop_lazy_tlb(mm);
}
 
/*
diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..ca0f8b1af23a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1029,9 +1029,9 @@ static int exec_mmap(struct mm_struct *mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
-   return 0;
+   } else {
+   mmdrop_lazy_tlb(active_mm);
}
-   mmdrop(active_mm);
return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index e24b1fe348e3..bfd1baca5266 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
 }
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+   mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+   mmdrop(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a  mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e538518556f4..e87a89824e6c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -602,7 +602,7 @@ static int finish_cpu(unsigned int cpu)
 */
if (mm != &init_mm)
idle->active_mm = &init_mm;
-   mmdrop(mm);
+   mmdrop_lazy_tlb(mm);
return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index fd1c04193e18..8e87ec5f6be2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -476,7 +476,7 @@ static void exit_mm(void)
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
-   mmgrab(mm);
+   mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fe3f2a40d61e..b70e28431a01 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1314,14 +1314,14 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
 
+   mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
-   if (active_mm != mm) {
-   mmgrab(mm);
+   if (active_mm != mm)
tsk->active_mm = mm;
-   }
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1341,7 +1341,7 @@ void kthread_use_mm(struct mm_struct *mm)
 * mmdrop(), or explicitly with smp_mb().
 */
if (active_mm != mm)
-   mmdrop(active_mm);
+   

[PATCH v3 0/4] shoot lazy tlbs

2021-06-01 Thread Nicholas Piggin
There haven't been objections to the series since last posting, this
is just a rebase and tidies up a few comments minor patch rearranging.

Thanks,
Nick

Nicholas Piggin (4):
  lazy tlb: introduce lazy mm refcount helper functions
  lazy tlb: allow lazy tlb mm switching to be configurable
  lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN

 arch/Kconfig | 38 
 arch/arm/mach-rpc/ecard.c|  2 +-
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/kernel/smp.c|  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 +-
 fs/exec.c|  4 +-
 include/linux/sched/mm.h | 20 +++
 kernel/cpu.c |  2 +-
 kernel/exit.c|  2 +-
 kernel/fork.c| 52 
 kernel/kthread.c | 11 ++--
 kernel/sched/core.c  | 88 
 kernel/sched/sched.h |  4 +-
 13 files changed, 192 insertions(+), 38 deletions(-)

-- 
2.23.0