Re: [PATCH v4 10/10] net/ps3_gelic: Fix DMA mapping problems

2021-07-25 Thread Christophe Leroy

Geoff Levand  a écrit :


Fixes several DMA mapping problems with the PS3's gelic network driver:

 * Change from checking the return value of dma_map_single to using the
   dma_mapping_error routine.
 * Use the correct buffer length when mapping the RX skb.
 * Improved error checking and debug logging.


The patch is quite big and probably deserves more explanation. For  
instance, explain why the buffer length is not correct today.


Also as it is a bug fixing patch, it should include a 'fixes' tag, and  
a Cc: to sta...@vger.kernel.org. Also, when possible, bug fixes should  
be one of the first patches in a series like that so that they can be  
applied to stable without applying the whole series.


Christophe



Fixes runtime errors like these, and also other randomly occurring errors:

  IP-Config: Complete:
  DMA-API: ps3_gelic_driver sb_05: device driver failed to check map error
  WARNING: CPU: 0 PID: 0 at kernel/dma/debug.c:1027 .check_unmap+0x888/0x8dc

Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 183 +++
 1 file changed, 108 insertions(+), 75 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c  
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c

index 42f4de9ad5fe..11ddeacb1159 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -336,22 +336,31 @@ static int gelic_card_init_chain(struct  
gelic_card *card,

struct gelic_descr_chain *chain, struct gelic_descr *start_descr,
int descr_count)
 {
-   int i;
-   struct gelic_descr *descr;
+   struct gelic_descr *descr = start_descr;
struct device *dev = ctodev(card);
+   unsigned int index;

-   descr = start_descr;
-   memset(descr, 0, sizeof(*descr) *descr_count);
+   memset(start_descr, 0, descr_count * sizeof(*start_descr));

-   for (i = 0; i < descr_count; i++, descr++) {
-   descr->link.size = sizeof(struct gelic_hw_regs);
+   for (index = 0, descr = start_descr; index < descr_count;
+   index++, descr++) {
gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE);
-   descr->link.cpu_addr =
-   dma_map_single(dev, descr, descr->link.size,
-   DMA_BIDIRECTIONAL);

-   if (!descr->link.cpu_addr)
-   goto iommu_error;
+   descr->link.size = sizeof(struct gelic_hw_regs);
+   descr->link.cpu_addr = dma_map_single(dev, descr,
+   descr->link.size, DMA_BIDIRECTIONAL);
+
+   if (unlikely(dma_mapping_error(dev, descr->link.cpu_addr))) {
+   dev_err(dev, "%s:%d: dma_mapping_error\n", __func__,
+   __LINE__);
+
+   for (index--, descr--; index > 0; index--, descr--) {
+   if (descr->link.cpu_addr) {
+   gelic_unmap_link(dev, descr);
+   }
+   }
+   return -ENOMEM;
+   }

descr->next = descr + 1;
descr->prev = descr - 1;
@@ -360,8 +369,9 @@ static int gelic_card_init_chain(struct gelic_card *card,
(descr - 1)->next = start_descr;
start_descr->prev = (descr - 1);

-   descr = start_descr;
-   for (i = 0; i < descr_count; i++, descr++) {
+   /* chain bus addr of hw descriptor */
+   for (index = 0, descr = start_descr; index < descr_count;
+   index++, descr++) {
descr->hw_regs.next_descr_addr =
cpu_to_be32(descr->next->link.cpu_addr);
}
@@ -373,12 +383,6 @@ static int gelic_card_init_chain(struct  
gelic_card *card,

(descr - 1)->hw_regs.next_descr_addr = 0;

return 0;
-
-iommu_error:
-   for (i--, descr--; 0 <= i; i--, descr--)
-   if (descr->link.cpu_addr)
-   gelic_unmap_link(dev, descr);
-   return -ENOMEM;
 }

 /**
@@ -395,49 +399,63 @@ static int gelic_descr_prepare_rx(struct  
gelic_card *card,

struct gelic_descr *descr)
 {
struct device *dev = ctodev(card);
-   int offset;
-   unsigned int bufsize;
+   struct aligned_buff {
+   unsigned int total_bytes;
+   unsigned int offset;
+   };
+   struct aligned_buff a_buf;
+   dma_addr_t cpu_addr;

if (gelic_descr_get_status(descr) !=  GELIC_DESCR_DMA_NOT_IN_USE) {
dev_err(dev, "%s:%d: ERROR status\n", __func__, __LINE__);
}

-   /* we need to round up the buffer size to a multiple of 128 */
-   bufsize = ALIGN(GELIC_NET_MAX_MTU, GELIC_NET_RXBUF_ALIGN);
+   a_buf.total_bytes = ALIGN(GELIC_NET_MAX_MTU, GELIC_NET_RXBUF_ALIGN)
+   + GELIC_NET_RXBUF_ALIGN;
+
+   descr->skb = dev_alloc_skb(a_buf.total_bytes);

-   /* and we 

Re: [PATCH v4 09/10] net/ps3_gelic: Add new routine gelic_work_to_card

2021-07-25 Thread Christophe Leroy

Geoff Levand  a écrit :


Add new helper routine gelic_work_to_card that converts a work_struct
to a gelic_card.


Is adding a function really needed when it is used only once?

Christophe



Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c  
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c

index 60fcca5d20dd..42f4de9ad5fe 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -1420,6 +1420,11 @@ static const struct ethtool_ops  
gelic_ether_ethtool_ops = {

.set_link_ksettings = gelic_ether_set_link_ksettings,
 };

+static struct gelic_card *gelic_work_to_card(struct work_struct *work)
+{
+   return container_of(work, struct gelic_card, tx_timeout_task);
+}
+
 /**
  * gelic_net_tx_timeout_task - task scheduled by the watchdog timeout
  * function (to be called not under interrupt status)
@@ -1429,8 +1434,7 @@ static const struct ethtool_ops  
gelic_ether_ethtool_ops = {

  */
 static void gelic_net_tx_timeout_task(struct work_struct *work)
 {
-   struct gelic_card *card =
-   container_of(work, struct gelic_card, tx_timeout_task);
+   struct gelic_card *card = gelic_work_to_card(work);
struct net_device *netdev = card->netdev[GELIC_PORT_ETHERNET_0];
struct device *dev = ctodev(card);

--
2.25.1





Re: [PATCH v4 08/10] net/ps3_gelic: Rename no to descr_count

2021-07-25 Thread Christophe Leroy

Geoff Levand  a écrit :


In an effort to make the PS3 gelic driver easier to maintain, rename
the gelic_card_init_chain parameter 'no' to 'descr_count'.


Not sure you really need a so long name. 'count' should be good enough.

Read https://www.kernel.org/doc/html/latest/process/coding-style.html#naming



Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c  
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c

index e55aa9fecfeb..60fcca5d20dd 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -325,7 +325,7 @@ static void gelic_card_free_chain(struct  
gelic_card *card,

  * @card: card structure
  * @chain: address of chain
  * @start_descr: address of descriptor array
- * @no: number of descriptors
+ * @descr_count: number of descriptors
  *
  * we manage a circular list that mirrors the hardware structure,
  * except that the hardware uses bus addresses.
@@ -334,16 +334,16 @@ static void gelic_card_free_chain(struct  
gelic_card *card,

  */
 static int gelic_card_init_chain(struct gelic_card *card,
struct gelic_descr_chain *chain, struct gelic_descr *start_descr,
-   int no)
+   int descr_count)
 {
int i;
struct gelic_descr *descr;
struct device *dev = ctodev(card);

descr = start_descr;
-   memset(descr, 0, sizeof(*descr) * no);
+   memset(descr, 0, sizeof(*descr) *descr_count);


You forgot the space after the *

Christophe



-   for (i = 0; i < no; i++, descr++) {
+   for (i = 0; i < descr_count; i++, descr++) {
descr->link.size = sizeof(struct gelic_hw_regs);
gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE);
descr->link.cpu_addr =
@@ -361,7 +361,7 @@ static int gelic_card_init_chain(struct gelic_card *card,
start_descr->prev = (descr - 1);

descr = start_descr;
-   for (i = 0; i < no; i++, descr++) {
+   for (i = 0; i < descr_count; i++, descr++) {
descr->hw_regs.next_descr_addr =
cpu_to_be32(descr->next->link.cpu_addr);
}
--
2.25.1





Re: [PATCH v4 04/10] net/ps3_gelic: Add new macro BUG_ON_DEBUG

2021-07-25 Thread Christophe Leroy

Geoff Levand  a écrit :


Add a new preprocessor macro BUG_ON_DEBUG, that expands to BUG_ON when
the preprocessor macro DEBUG is defined, or to WARN_ON when DEBUG is not
defined.  Also, replace all occurrences of BUG_ON with BUG_ON_DEBUG.


Why is BUG_ON() needed at all if WARN_ON() is enough ?

You just have to set panic_on_warn  to get the system to stop at first  
warning.


BUG_ON() should be avoided unless vital.

Please read  
https://www.kernel.org/doc/html/latest/process/deprecated.html#bug-and-bug-on


Christophe




Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c  
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c

index ded467d81f36..946e9bfa071b 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -44,6 +44,13 @@ MODULE_AUTHOR("SCE Inc.");
 MODULE_DESCRIPTION("Gelic Network driver");
 MODULE_LICENSE("GPL");

+#define BUG_ON_DEBUG(_cond) do { \
+   if (__is_defined(DEBUG)) \
+   BUG_ON(_cond); \
+   else \
+   WARN_ON(_cond); \
+} while (0)
+
 int gelic_card_set_irq_mask(struct gelic_card *card, u64 mask)
 {
struct device *dev = ctodev(card);
@@ -505,7 +512,7 @@ static void gelic_descr_release_tx(struct  
gelic_card *card,

struct sk_buff *skb = descr->skb;
struct device *dev = ctodev(card);

-   BUG_ON(!(be32_to_cpu(descr->hw_regs.data_status) &
+   BUG_ON_DEBUG(!(be32_to_cpu(descr->hw_regs.data_status) &
GELIC_DESCR_TX_TAIL));

dma_unmap_single(dev, be32_to_cpu(descr->hw_regs.payload.dev_addr),
@@ -1667,7 +1674,7 @@ static void gelic_card_get_vlan_info(struct  
gelic_card *card)

}

if (card->vlan[GELIC_PORT_ETHERNET_0].tx) {
-   BUG_ON(!card->vlan[GELIC_PORT_WIRELESS].tx);
+   BUG_ON_DEBUG(!card->vlan[GELIC_PORT_WIRELESS].tx);
card->vlan_required = 1;
} else
card->vlan_required = 0;
@@ -1709,7 +1716,7 @@ static int ps3_gelic_driver_probe(struct  
ps3_system_bus_device *sb_dev)

if (result) {
dev_err(dev, "%s:%d: ps3_dma_region_create failed: %d\n",
__func__, __LINE__, result);
-   BUG_ON("check region type");
+   BUG_ON_DEBUG("check region type");
goto fail_dma_region;
}

--
2.25.1





Re: [PATCH v4 02/10] net/ps3_gelic: Use local dev variable

2021-07-24 Thread Christophe Leroy

Geoff Levand  a écrit :


In an effort to make the PS3 gelic driver easier to maintain, add a
local variable dev to those routines that use the device structure that
makes the use of the device structure more consistent.

Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 340 +++
 1 file changed, 191 insertions(+), 149 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c  
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c

index cb45571573d7..ba008a98928a 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -48,13 +48,15 @@ MODULE_LICENSE("GPL");
 /* set irq_mask */
 int gelic_card_set_irq_mask(struct gelic_card *card, u64 mask)
 {
+   struct device *dev = ctodev(card);
int status;

status = lv1_net_set_interrupt_mask(bus_id(card), dev_id(card),
mask, 0);
-   if (status)
-   dev_info(ctodev(card),
-"%s failed %d\n", __func__, status);
+   if (status) {


There shall be no { } for single line statements. And anyway this  
change is not part of this patch as it is unrelated to the use of  
local dev variable.



+   dev_err(dev, "%s:%d failed: %d\n", __func__, __LINE__, status);
+   }
+
return status;
 }

@@ -103,6 +105,7 @@ gelic_descr_get_status(struct gelic_descr *descr)

 static int gelic_card_set_link_mode(struct gelic_card *card, int mode)
 {
+   struct device *dev = ctodev(card);
int status;
u64 v1, v2;

@@ -110,8 +113,8 @@ static int gelic_card_set_link_mode(struct  
gelic_card *card, int mode)

 GELIC_LV1_SET_NEGOTIATION_MODE,
 GELIC_LV1_PHY_ETHERNET_0, mode, 0, , );
if (status) {
-   pr_info("%s: failed setting negotiation mode %d\n", __func__,
-   status);
+   dev_err(dev, "%s:%d: Failed setting negotiation mode: %d\n",
+   __func__, __LINE__, status);


Changing from pr_info to dev_err is unrelated to the use of a dev local var


return -EBUSY;
}

@@ -128,13 +131,15 @@ static int gelic_card_set_link_mode(struct  
gelic_card *card, int mode)

  */
 static void gelic_card_disable_txdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;

/* this hvc blocks until the DMA in progress really stopped */
status = lv1_net_stop_tx_dma(bus_id(card), dev_id(card));
-   if (status)
-   dev_err(ctodev(card),
-   "lv1_net_stop_tx_dma failed, status=%d\n", status);
+
+   if (status) {
+   dev_err(dev, "lv1_net_stop_tx_dma failed, status=%d\n", status);
+   }
 }

 /**
@@ -146,6 +151,7 @@ static void gelic_card_disable_txdmac(struct  
gelic_card *card)

  */
 static void gelic_card_enable_rxdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;

 #ifdef DEBUG
@@ -161,9 +167,10 @@ static void gelic_card_enable_rxdmac(struct  
gelic_card *card)

 #endif
status = lv1_net_start_rx_dma(bus_id(card), dev_id(card),
card->rx_chain.head->link.cpu_addr, 0);
-   if (status)
-   dev_info(ctodev(card),
-"lv1_net_start_rx_dma failed, status=%d\n", status);
+   if (status) {
+   dev_err(dev, "lv1_net_start_rx_dma failed, status=%d\n",
+   status);
+   }
 }

 /**
@@ -175,13 +182,15 @@ static void gelic_card_enable_rxdmac(struct  
gelic_card *card)

  */
 static void gelic_card_disable_rxdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;

/* this hvc blocks until the DMA in progress really stopped */
status = lv1_net_stop_rx_dma(bus_id(card), dev_id(card));
-   if (status)
-   dev_err(ctodev(card),
-   "lv1_net_stop_rx_dma failed, %d\n", status);
+
+   if (status) {
+   dev_err(dev, "lv1_net_stop_rx_dma failed, %d\n", status);
+   }
 }

 /**
@@ -235,10 +244,11 @@ static void gelic_card_reset_chain(struct  
gelic_card *card,


 void gelic_card_up(struct gelic_card *card)
 {
-   pr_debug("%s: called\n", __func__);
+   struct device *dev = ctodev(card);
+
mutex_lock(>updown_lock);
if (atomic_inc_return(>users) == 1) {
-   pr_debug("%s: real do\n", __func__);
+   dev_dbg(dev, "%s:%d: Starting...\n", __func__, __LINE__);
/* enable irq */
gelic_card_set_irq_mask(card, card->irq_mask);
/* start rx */
@@ -247,16 +257,16 @@ void gelic_card_up(struct gelic_card *card)
napi_enable(>napi);
}
mutex_unlock(>updown_lock);
-   pr_debug("%s: done\n", __func__);
 }

 void gelic_card_down(struct gelic_card *card)

Re: [PATCH v4 03/10] net/ps3_gelic: Format cleanups

2021-07-24 Thread Christophe Leroy

Geoff Levand  a écrit :


In an effort to make the PS3 gelic driver easier to maintain, clean up
the driver source file formatting to be more consistent.


Many of your changes in this patch go in the wrong direction.

For instance, you shall not use { } in an if/else sequence with single  
lines in both the if and the else. See  
https://www.kernel.org/doc/html/latest/process/coding-style.html#placing-braces-and-spaces


In a multiline operation, the argument of the second line must be  
aligned to the matching parenthesis.


Christophe




Re: [PATCH 0/2] Fix arm64 boot regression in 5.14

2021-07-20 Thread Christophe Leroy

Will Deacon  a écrit :


Hi folks,

Jonathan reports [1] that commit c742199a014d ("mm/pgtable: add stubs
for {pmd/pub}_{set/clear}_huge") breaks the boot on arm64 when huge
mappings are used to map the kernel linear map but the VA size is
configured such that PUDs are folded. This is because the non-functional
pud_set_huge() stub is used to create the linear map, which results in
1GB holes and a fatal data abort when the kernel attempts to access them.

Digging further into the issue, it also transpired that huge-vmap is
silently disabled in these configurations as well [2], despite working
correctly in 5.13. The latter issue causes the pgtable selftests to
scream due to a failing consistency check [3].

Rather than leave mainline in a terminally broken state for arm64 while
we figure this out, revert the offending commit to get things working
again. Unfortunately, reverting the change in isolation causes a build
breakage for 32-bit PowerPC 8xx machines which recently started relying
on the problematic stubs to support pte-level huge-vmap entries [4].
Since Christophe is away at the moment, this series first reverts the
PowerPC 8xx change in order to avoid breaking the build.

I would really like this to land for -rc3 and I can take these via the
arm64 fixes queue if the PowerPC folks are alright with them.



If you can drop patch 1,

Change patch 2 to add the two following functions in  
arch/powerpc/mm/nohash/8xx.c :


int pud_clear_huge(pud_t *pud)
{
return 0;
}

int pmd_clear_huge(pmd_t *pmd)
{
return 0;
}

Then feel free to take it via ARM fixes with my acked-by as maintainer  
of PPC8XX.


Christophe



Cheers,

Will

[1] https://lore.kernel.org/r/20210717160118.9855-1-jonat...@marek.ca
[2] https://lore.kernel.org/r/20210719104918.GA6440@willie-the-truck
[3]  
https://lore.kernel.org/r/camuhmdxshordox-xxaeufdw3wx2peggfsqhvshvznkcgk-y...@mail.gmail.com/
[4]  
https://lore.kernel.org/r/8b972f1c03fb6bd59953035f0a3e4d26659de4f8.1620795204.git.christophe.le...@csgroup.eu/


Cc: Ard Biesheuvel 
Cc: Michael Ellerman 
Cc: Thomas Gleixner 
Cc: Benjamin Herrenschmidt 
Cc: Christophe Leroy 
Cc: Paul Mackerras 
Cc: Jonathan Marek 
Cc: Catalin Marinas 
Cc: Andrew Morton 
Cc: Nicholas Piggin 
Cc: Mark Rutland 
Cc: Geert Uytterhoeven 
Cc: Marc Zyngier 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-arm-ker...@lists.infradead.org

--->8

Jonathan Marek (1):
  Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge"

Will Deacon (1):
  Revert "powerpc/8xx: add support for huge pages on VMAP and VMALLOC"

 arch/arm64/mm/mmu.c  | 20 -
 arch/powerpc/Kconfig |  2 +-
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 43 
 arch/x86/mm/pgtable.c| 34 +++-
 include/linux/pgtable.h  | 26 +---
 5 files changed, 25 insertions(+), 100 deletions(-)

--
2.32.0.402.g57bb445576-goog





Re: [PATCH 1/2] Revert "powerpc/8xx: add support for huge pages on VMAP and VMALLOC"

2021-07-20 Thread Christophe Leroy

Will Deacon  a écrit :


This reverts commit a6a8f7c4aa7eb50304b5c4e68eccd24313f3a785.

Commit c742199a014d ("mm/pgtable: add stubs for
{pmd/pub}_{set/clear}_huge") breaks the boot for arm64 when block
mappings are used to create the linear map, as this relies on a working
implementation of pXd_set_huge() even if the corresponding page-table
levels have been folded.

Although the problematic patch reverts cleanly, doing so breaks the
build for 32-bit PowerPC 8xx machines, which rely on the default
function definitions when the corresponding page-table levels are
folded:

 | powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pud_range':
 | linux/mm/vmalloc.c:362: undefined reference to `pud_clear_huge'
 | powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pmd_range':
 | linux/mm/vmalloc.c:337: undefined reference to `pmd_clear_huge'
 | powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pud_range':
 | linux/mm/vmalloc.c:362: undefined reference to `pud_clear_huge'
 | powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pmd_range':
 | linux/mm/vmalloc.c:337: undefined reference to `pmd_clear_huge'
 | make: *** [Makefile:1177: vmlinux] Error 1

Although Christophe has kindly offered to look into the arm64 breakage,
he's on holiday for another 10 days and there isn't an obvious fix on
the arm64 side which allows us to continue using huge-vmap for affected
configurations.

In the interest of quickly getting things back to a working state as
they were in 5.13, revert the huge-vmap changes for PowerPC 8xx prior to
reverting the change which breaks arm64. We can then work on this
together for 5.15 once Christophe is back.


Instead of reverting this awaited functionality, could you please
just add the two following functions in arch/powerpc/mm/nohash/8xx.c :


int pud_clear_huge(pud_t *pud)
{
return 0;
}

int pmd_clear_huge(pmd_t *pmd)
{
return 0;
}

Thank you
Christophe



Cc: Ard Biesheuvel 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Christophe Leroy 
Cc: Paul Mackerras 
Cc: Catalin Marinas 
Cc: Andrew Morton 
Cc: Nicholas Piggin 
Cc: Mark Rutland 
Cc: Geert Uytterhoeven 
Cc: Marc Zyngier 
Link:  
https://lore.kernel.org/r/20210719170615.horde.qio1wp3k5eblo-d9xxhd...@messagerie.c-s.fr

Signed-off-by: Will Deacon 
---
 arch/powerpc/Kconfig |  2 +-
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 43 
 2 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d01e3401581d..5fc19ac62cb9 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -189,7 +189,7 @@ config PPC
select GENERIC_VDSO_TIME_NS
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMALLOC   if HAVE_ARCH_HUGE_VMAP
-   select HAVE_ARCH_HUGE_VMAP  if PPC_RADIX_MMU || PPC_8xx
+   select HAVE_ARCH_HUGE_VMAP  if PPC_BOOK3S_64 && 
PPC_RADIX_MMU
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN  if PPC32 && PPC_PAGE_SHIFT <= 14
diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h  
b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h

index 997cec973406..6e4faa0a9b35 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -178,7 +178,6 @@
 #ifndef __ASSEMBLY__

 #include 
-#include 

 void mmu_pin_tlb(unsigned long top, bool readonly);

@@ -226,48 +225,6 @@ static inline unsigned int  
mmu_psize_to_shift(unsigned int mmu_psize)

BUG();
 }

-static inline bool arch_vmap_try_size(unsigned long addr, unsigned  
long end, u64 pfn,

- unsigned int max_page_shift, unsigned 
long size)
-{
-   if (end - addr < size)
-   return false;
-
-   if ((1UL << max_page_shift) < size)
-   return false;
-
-   if (!IS_ALIGNED(addr, size))
-   return false;
-
-   if (!IS_ALIGNED(PFN_PHYS(pfn), size))
-   return false;
-
-   return true;
-}
-
-static inline unsigned long arch_vmap_pte_range_map_size(unsigned  
long addr, unsigned long end,

-u64 pfn, unsigned int 
max_page_shift)
-{
-   if (arch_vmap_try_size(addr, end, pfn, max_page_shift, SZ_512K))
-   return SZ_512K;
-   if (PAGE_SIZE == SZ_16K)
-   return SZ_16K;
-   if (arch_vmap_try_size(addr, end, pfn, max_page_shift, SZ_16K))
-   return SZ_16K;
-   return PAGE_SIZE;
-}
-#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
-
-static inline int arch_vmap_pte_supported_shift(unsigned long size)
-{
-   if (size >= SZ_512K)
-   return 19;
-   else if (size >= SZ_16K)
-   return 14;
-   else
-   return PAGE_SHIFT;
-}
-#define arch_vmap_pte_supported_shift arch_vmap_p

Re: [PATCH] replace if with min

2021-07-19 Thread Christophe Leroy

Salah Triki  a écrit :


Replace if with min in order to make code more clean.

Signed-off-by: Salah Triki 
---
 drivers/crypto/nx/nx-842.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/crypto/nx/nx-842.c b/drivers/crypto/nx/nx-842.c
index 2ab90ec10e61..0d1d5a463899 100644
--- a/drivers/crypto/nx/nx-842.c
+++ b/drivers/crypto/nx/nx-842.c
@@ -134,8 +134,7 @@ EXPORT_SYMBOL_GPL(nx842_crypto_exit);
 static void check_constraints(struct nx842_constraints *c)
 {
/* limit maximum, to always have enough bounce buffer to decompress */
-   if (c->maximum > BOUNCE_BUFFER_SIZE)
-   c->maximum = BOUNCE_BUFFER_SIZE;
+   c->maximum = min(c->maximum, BOUNCE_BUFFER_SIZE);


For me the code is less clear with this change, and in addition it  
slightly changes the behaviour. Before, the write was done only when  
the value was changing. Now you rewrite the value always, even when it  
doesn't change.



 }

 static int nx842_crypto_add_header(struct nx842_crypto_header *hdr, u8 *buf)
--
2.25.1





Re: [PATCH] powerpc/chrp: Revert "Move PHB discovery" and "Make hydra_init() static"

2021-07-17 Thread Christophe Leroy

Christophe Leroy  a écrit :


Guenter Roeck  a écrit :


This patch reverts commit 407d418f2fd4 ("powerpc/chrp: Move PHB
discovery") and commit 9634afa67bfd ("powerpc/chrp: Make hydra_init()
static").

Running the upstream kernel on Qemu's brand new "pegasos2" emulation
results in a variety of backtraces such as

Kernel attempted to write user page (a1) - exploit attempt? (uid: 0)
[ cut here ]
Bug: Write fault blocked by KUAP!
WARNING: CPU: 0 PID: 0 at arch/powerpc/mm/fault.c:230  
do_page_fault+0x4f4/0x920

CPU: 0 PID: 0 Comm: swapper Not tainted 5.13.2 #40
NIP:  c0021824 LR: c0021824 CTR: 
REGS: c1085d50 TRAP: 0700   Not tainted  (5.13.2)
MSR:  00021032   CR: 24042254  XER: 

GPR00: c0021824 c1085e10 c0f8c520 0021 3fffefff c1085c60  
c1085c58 
GPR08: 1032   c0ffb3ec 44042254   
 0004
GPR16:   00c4 00d0 0188c6e0 01006000  
0001 40b14000
GPR24: c0ec000c 0300 0200  4200 00a1  
 c1085e60

NIP [c0021824] do_page_fault+0x4f4/0x920
LR [c0021824] do_page_fault+0x4f4/0x920
Call Trace:
[c1085e10] [c0021824] do_page_fault+0x4f4/0x920 (unreliable)
[c1085e50] [c0004254] DataAccess_virt+0xd4/0xe4

and the system fails to boot. Bisect points to commit 407d418f2fd4
("powerpc/chrp: Move PHB discovery"). Reverting this patch together with
commit 9634afa67bfd ("powerpc/chrp: Make hydra_init() static") fixes
the problem.


Isn't there more than that in the backtrace ? If there is a fault  
blocked by Kuap, it means there is a fault. It should be visible in  
the traces.


Should we fix the problem instead of reverting the commit that made  
the problem visible ?




Also, as it is a KUAP fault, did you test without CONFIG_PPC_KUAP ?  
Does it boot ?






Cc: Oliver O'Halloran 
Cc: Geert Uytterhoeven 
Fixes: 407d418f2fd4 ("powerpc/chrp: Move PHB discovery")
Signed-off-by: Guenter Roeck 
---
arch/powerpc/include/asm/hydra.h|  2 ++
arch/powerpc/platforms/chrp/pci.c   | 11 ++-
arch/powerpc/platforms/chrp/setup.c | 12 +++-
3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/hydra.h  
b/arch/powerpc/include/asm/hydra.h

index d024447283a0..ae02eb53d6ef 100644
--- a/arch/powerpc/include/asm/hydra.h
+++ b/arch/powerpc/include/asm/hydra.h
@@ -94,6 +94,8 @@ extern volatile struct Hydra __iomem *Hydra;
#define HYDRA_INT_EXT7  18  /* Power Off Request */
#define HYDRA_INT_SPARE 19

+extern int hydra_init(void);
+
#endif /* __KERNEL__ */

#endif /* _ASMPPC_HYDRA_H */
diff --git a/arch/powerpc/platforms/chrp/pci.c  
b/arch/powerpc/platforms/chrp/pci.c

index 76e6256cb0a7..b2c2bf35b76c 100644
--- a/arch/powerpc/platforms/chrp/pci.c
+++ b/arch/powerpc/platforms/chrp/pci.c
@@ -131,7 +131,8 @@ static struct pci_ops rtas_pci_ops =

volatile struct Hydra __iomem *Hydra = NULL;

-static int __init hydra_init(void)
+int __init
+hydra_init(void)
{
struct device_node *np;
struct resource r;
@@ -313,14 +314,6 @@ chrp_find_bridges(void)
}
}
of_node_put(root);
-
-   /*
-*  "Temporary" fixes for PCI devices.
-*  -- Geert
-*/
-   hydra_init();   /* Mac I/O */
-
-   pci_create_OF_bus_map();
}

/* SL82C105 IDE Control/Status Register */
diff --git a/arch/powerpc/platforms/chrp/setup.c  
b/arch/powerpc/platforms/chrp/setup.c

index 3cfc382841e5..c45435aa5e36 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -334,11 +334,22 @@ static void __init chrp_setup_arch(void)
/* On pegasos, enable the L2 cache if not already done by OF */
pegasos_set_l2cr();

+   /* Lookup PCI host bridges */
+   chrp_find_bridges();
+
+   /*
+*  Temporary fixes for PCI devices.
+*  -- Geert
+*/
+   hydra_init();   /* Mac I/O */
+
/*
 *  Fix the Super I/O configuration
 */
sio_init();

+   pci_create_OF_bus_map();
+
/*
 * Print the banner, then scroll down so boot progress
 * can be printed.  -- Cort
@@ -571,7 +582,6 @@ define_machine(chrp) {
.name   = "CHRP",
.probe  = chrp_probe,
.setup_arch = chrp_setup_arch,
-   .discover_phbs  = chrp_find_bridges,
.init   = chrp_init2,
.show_cpuinfo   = chrp_show_cpuinfo,
.init_IRQ   = chrp_init_IRQ,
--
2.25.1





Re: [PATCH] powerpc/chrp: Revert "Move PHB discovery" and "Make hydra_init() static"

2021-07-17 Thread Christophe Leroy

Guenter Roeck  a écrit :


This patch reverts commit 407d418f2fd4 ("powerpc/chrp: Move PHB
discovery") and commit 9634afa67bfd ("powerpc/chrp: Make hydra_init()
static").

Running the upstream kernel on Qemu's brand new "pegasos2" emulation
results in a variety of backtraces such as

Kernel attempted to write user page (a1) - exploit attempt? (uid: 0)
[ cut here ]
Bug: Write fault blocked by KUAP!
WARNING: CPU: 0 PID: 0 at arch/powerpc/mm/fault.c:230  
do_page_fault+0x4f4/0x920

CPU: 0 PID: 0 Comm: swapper Not tainted 5.13.2 #40
NIP:  c0021824 LR: c0021824 CTR: 
REGS: c1085d50 TRAP: 0700   Not tainted  (5.13.2)
MSR:  00021032   CR: 24042254  XER: 

GPR00: c0021824 c1085e10 c0f8c520 0021 3fffefff c1085c60  
c1085c58 
GPR08: 1032   c0ffb3ec 44042254   
 0004
GPR16:   00c4 00d0 0188c6e0 01006000  
0001 40b14000
GPR24: c0ec000c 0300 0200  4200 00a1  
 c1085e60

NIP [c0021824] do_page_fault+0x4f4/0x920
LR [c0021824] do_page_fault+0x4f4/0x920
Call Trace:
[c1085e10] [c0021824] do_page_fault+0x4f4/0x920 (unreliable)
[c1085e50] [c0004254] DataAccess_virt+0xd4/0xe4

and the system fails to boot. Bisect points to commit 407d418f2fd4
("powerpc/chrp: Move PHB discovery"). Reverting this patch together with
commit 9634afa67bfd ("powerpc/chrp: Make hydra_init() static") fixes
the problem.


Isn't there more than that in the backtrace ? If there is a fault  
blocked by Kuap, it means there is a fault. It should be visible in  
the traces.


Should we fix the problem instead of reverting the commit that made  
the problem visible ?





Cc: Oliver O'Halloran 
Cc: Geert Uytterhoeven 
Fixes: 407d418f2fd4 ("powerpc/chrp: Move PHB discovery")
Signed-off-by: Guenter Roeck 
---
 arch/powerpc/include/asm/hydra.h|  2 ++
 arch/powerpc/platforms/chrp/pci.c   | 11 ++-
 arch/powerpc/platforms/chrp/setup.c | 12 +++-
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/hydra.h  
b/arch/powerpc/include/asm/hydra.h

index d024447283a0..ae02eb53d6ef 100644
--- a/arch/powerpc/include/asm/hydra.h
+++ b/arch/powerpc/include/asm/hydra.h
@@ -94,6 +94,8 @@ extern volatile struct Hydra __iomem *Hydra;
 #define HYDRA_INT_EXT7 18  /* Power Off Request */
 #define HYDRA_INT_SPARE19

+extern int hydra_init(void);
+
 #endif /* __KERNEL__ */

 #endif /* _ASMPPC_HYDRA_H */
diff --git a/arch/powerpc/platforms/chrp/pci.c  
b/arch/powerpc/platforms/chrp/pci.c

index 76e6256cb0a7..b2c2bf35b76c 100644
--- a/arch/powerpc/platforms/chrp/pci.c
+++ b/arch/powerpc/platforms/chrp/pci.c
@@ -131,7 +131,8 @@ static struct pci_ops rtas_pci_ops =

 volatile struct Hydra __iomem *Hydra = NULL;

-static int __init hydra_init(void)
+int __init
+hydra_init(void)
 {
struct device_node *np;
struct resource r;
@@ -313,14 +314,6 @@ chrp_find_bridges(void)
}
}
of_node_put(root);
-
-   /*
-*  "Temporary" fixes for PCI devices.
-*  -- Geert
-*/
-   hydra_init();   /* Mac I/O */
-
-   pci_create_OF_bus_map();
 }

 /* SL82C105 IDE Control/Status Register */
diff --git a/arch/powerpc/platforms/chrp/setup.c  
b/arch/powerpc/platforms/chrp/setup.c

index 3cfc382841e5..c45435aa5e36 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -334,11 +334,22 @@ static void __init chrp_setup_arch(void)
/* On pegasos, enable the L2 cache if not already done by OF */
pegasos_set_l2cr();

+   /* Lookup PCI host bridges */
+   chrp_find_bridges();
+
+   /*
+*  Temporary fixes for PCI devices.
+*  -- Geert
+*/
+   hydra_init();   /* Mac I/O */
+
/*
 *  Fix the Super I/O configuration
 */
sio_init();

+   pci_create_OF_bus_map();
+
/*
 * Print the banner, then scroll down so boot progress
 * can be printed.  -- Cort
@@ -571,7 +582,6 @@ define_machine(chrp) {
.name   = "CHRP",
.probe  = chrp_probe,
.setup_arch = chrp_setup_arch,
-   .discover_phbs  = chrp_find_bridges,
.init   = chrp_init2,
.show_cpuinfo   = chrp_show_cpuinfo,
.init_IRQ   = chrp_init_IRQ,
--
2.25.1





Re: [PATCH] ibmvfc: fix command state accounting and stale response detection

2021-07-17 Thread Christophe Leroy

Tyrel Datwyler  a écrit :


Prior to commit 1f4a4a19508d ("scsi: ibmvfc: Complete commands outside
the host/queue lock") responses to commands were completed sequentially
with the host lock held such that a command had a basic binary state of
active or free. It was therefore a simple affair of ensuring the
associated ibmvfc_event to a VIOS response was valid by testing that it
was not already free. The lock relaxation work to complete commands
outside the lock inadvertently made it a trinary command state such that
a command is either in flight, received and being completed, or
completed and now free. This breaks the stale command detection logic as
a command may be still marked active and been placed on the delayed
completion list when a second stale response for the same command
arrives. This can lead to double completions and list corruption. This
issue was exposed by a recent VIOS regression where a missing memory
barrier could occasionally result in the ibmvfc client receiving a
duplicate response for the same command.

Fix the issue by introducing the atomic ibmvfc_event.active to track the
trinary state of a command. The state is explicitly set to 1 when a
command is successfully sent. The CRQ response handlers use
atomic_dec_if_positive() to test for stale responses and correctly
transition to the completion state when an active command is received.
Finally, atomic_dec_and_test() is used to sanity check transitions
when commands are freed as a result of a completion, or moved to the
purge list as a result of error handling or adapter reset.

Cc: sta...@vger.kernel.org
Fixes: 1f4a4a19508d ("scsi: ibmvfc: Complete commands outside the  
host/queue lock")

Signed-off-by: Tyrel Datwyler 
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 19 +--
 drivers/scsi/ibmvscsi/ibmvfc.h |  1 +
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index bee1bec49c09..935b01ee44b7 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -807,6 +807,13 @@ static int ibmvfc_init_event_pool(struct  
ibmvfc_host *vhost,

for (i = 0; i < size; ++i) {
struct ibmvfc_event *evt = >events[i];

+   /*
+* evt->active states
+*  1 = in flight
+*  0 = being completed
+* -1 = free/freed
+*/
+   atomic_set(>active, -1);
atomic_set(>free, 1);
evt->crq.valid = 0x80;
evt->crq.ioba = cpu_to_be64(pool->iu_token + 
(sizeof(*evt->xfer_iu) * i));
@@ -1017,6 +1024,7 @@ static void ibmvfc_free_event(struct ibmvfc_event *evt)

BUG_ON(!ibmvfc_valid_event(pool, evt));
BUG_ON(atomic_inc_return(>free) != 1);
+   BUG_ON(atomic_dec_and_test(>active));


Avoid new BUG_ONs. See  
https://www.kernel.org/doc/html/latest/process/deprecated.html




spin_lock_irqsave(>queue->l_lock, flags);
list_add_tail(>queue_list, >queue->free);
@@ -1072,6 +1080,12 @@ static void ibmvfc_complete_purge(struct  
list_head *purge_list)

  **/
 static void ibmvfc_fail_request(struct ibmvfc_event *evt, int error_code)
 {
+   /*
+* Anything we are failing should still be active. Otherwise, it
+* implies we already got a response for the command and are doing
+* something bad like double completing it.
+*/
+   BUG_ON(!atomic_dec_and_test(>active));


Same




Re: [PATCH v2] powerpc/rtas_flash: fix a potential buffer overflow

2021-07-14 Thread Christophe Leroy

Yi Zhuang  a écrit :


Since snprintf() returns the possible output size instead of the
actual output size, the available flash_msg length returned by
get_validate_flash_msg may exceed the given buffer limit when
simple_read_from_buffer calls copy_to_user

Reported-by: kernel test robot 
Fixes: a94a14720eaf5 powerpc/rtas_flash: Fix validate_flash buffer  
overflow issue

Signed-off-by: Yi Zhuang 
---
 arch/powerpc/kernel/rtas_flash.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/rtas_flash.c  
b/arch/powerpc/kernel/rtas_flash.c

index a99179d83538..062f0724c2ff 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -470,9 +470,14 @@ static int get_validate_flash_msg(struct  
rtas_validate_flash_t *args_buf,

if (args_buf->status >= VALIDATE_TMP_UPDATE) {
n = sprintf(msg, "%d\n", args_buf->update_results);
if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) ||
-   (args_buf->update_results == VALIDATE_TMP_UPDATE))
+   (args_buf->update_results == VALIDATE_TMP_UPDATE)) {
n += snprintf(msg + n, msglen - n, "%s\n",
args_buf->buf);
+   if (n >= msglen) {


n cannot be greater than msglen



+   n = msglen;
+   printk(KERN_ERR "FLASH: msg too long.\n");
+   }
+   }
} else {
n = sprintf(msg, "%d\n", args_buf->status);
}
--
2.26.0.106.g9fadedd





Re: [PATCH v3 2/2] net/ps3_gelic: Cleanups, improve logging

2021-07-11 Thread Christophe Leroy

Geoff Levand  a écrit :


General source cleanups and improved logging messages.



Describe a bit more what you do to cleanup and improve.

Some of your changes are not cleanup , they increase the mess.


You should read kernel coding style


Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 395 ++-
 1 file changed, 216 insertions(+), 179 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c  
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c

index e01938128882..9dbcb7c4ec80 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -44,17 +44,17 @@ MODULE_AUTHOR("SCE Inc.");
 MODULE_DESCRIPTION("Gelic Network driver");
 MODULE_LICENSE("GPL");

-
-/* set irq_mask */
 int gelic_card_set_irq_mask(struct gelic_card *card, u64 mask)
 {
+   struct device *dev = ctodev(card);
int status;

status = lv1_net_set_interrupt_mask(bus_id(card), dev_id(card),
mask, 0);
-   if (status)
-   dev_info(ctodev(card),
-"%s failed %d\n", __func__, status);
+   if (status) {



No { } for single lines


+   dev_err(dev, "%s:%d failed: %d\n", __func__, __LINE__, status);
+   }
+
return status;
 }

@@ -63,6 +63,7 @@ static void gelic_card_rx_irq_on(struct gelic_card *card)
card->irq_mask |= GELIC_CARD_RXINT;
gelic_card_set_irq_mask(card, card->irq_mask);
 }
+
 static void gelic_card_rx_irq_off(struct gelic_card *card)
 {
card->irq_mask &= ~GELIC_CARD_RXINT;
@@ -70,15 +71,14 @@ static void gelic_card_rx_irq_off(struct  
gelic_card *card)

 }

 static void gelic_card_get_ether_port_status(struct gelic_card *card,
-int inform)
+   int inform)


Bad indent


 {
u64 v2;
struct net_device *ether_netdev;

lv1_net_control(bus_id(card), dev_id(card),
-   GELIC_LV1_GET_ETH_PORT_STATUS,
-   GELIC_LV1_VLAN_TX_ETHERNET_0, 0, 0,
-   >ether_port_status, );
+   GELIC_LV1_GET_ETH_PORT_STATUS, GELIC_LV1_VLAN_TX_ETHERNET_0, 0,
+   0, >ether_port_status, );


Bad indent



if (inform) {
ether_netdev = card->netdev[GELIC_PORT_ETHERNET_0];
@@ -105,15 +105,17 @@ gelic_descr_get_status(struct gelic_descr *descr)

 static int gelic_card_set_link_mode(struct gelic_card *card, int mode)
 {
+   struct device *dev = ctodev(card);
int status;
u64 v1, v2;

status = lv1_net_control(bus_id(card), dev_id(card),
-GELIC_LV1_SET_NEGOTIATION_MODE,
-GELIC_LV1_PHY_ETHERNET_0, mode, 0, , );
+   GELIC_LV1_SET_NEGOTIATION_MODE, GELIC_LV1_PHY_ETHERNET_0, mode,
+   0, , );
+
if (status) {
-   pr_info("%s: failed setting negotiation mode %d\n", __func__,
-   status);
+   dev_err(dev, "%s:%d: Failed setting negotiation mode: %d\n",
+   __func__, __LINE__, status);
return -EBUSY;
}

@@ -130,13 +132,16 @@ static int gelic_card_set_link_mode(struct  
gelic_card *card, int mode)

  */
 static void gelic_card_disable_txdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;

/* this hvc blocks until the DMA in progress really stopped */
status = lv1_net_stop_tx_dma(bus_id(card), dev_id(card));
-   if (status)
-   dev_err(ctodev(card),
-   "lv1_net_stop_tx_dma failed, status=%d\n", status);
+
+   if (status) {
+   dev_err(dev, "%s:%d: lv1_net_stop_tx_dma failed: %d\n",
+   __func__, __LINE__, status);
+   }
 }

 /**
@@ -187,13 +192,16 @@ static void gelic_card_enable_rxdmac(struct  
gelic_card *card)

  */
 static void gelic_card_disable_rxdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;

/* this hvc blocks until the DMA in progress really stopped */
status = lv1_net_stop_rx_dma(bus_id(card), dev_id(card));
-   if (status)
-   dev_err(ctodev(card),
-   "lv1_net_stop_rx_dma failed, %d\n", status);
+
+   if (status) {
+   dev_err(dev, "%s:%d: lv1_net_stop_rx_dma failed: %d\n",
+   __func__, __LINE__, status);
+   }
 }

 /**
@@ -216,6 +224,7 @@ static void gelic_descr_set_status(struct  
gelic_descr *descr,

 * Usually caller of this function wants to inform that to the
 * hardware, so we assure here the hardware sees the change.
 */
+


Bad blank line, it separates the comment from the commentee



wmb();
 }

@@ -229,8 +238,7 @@ static void gelic_descr_set_status(struct  
gelic_descr *descr,

  * and re-initialize the hardware chain 

Re: [PATCH v3 1/2] net/ps3_gelic: Add gelic_descr structures

2021-07-11 Thread Christophe Leroy

Geoff Levand  a écrit :


Create two new structures, struct gelic_hw_regs and struct gelic_chain_link,
and replace the corresponding members of struct gelic_descr with the new
structures.  struct gelic_hw_regs holds the register variables used by the
gelic hardware device.  struct gelic_chain_link holds variables used to
manage the driver's linked list of gelic descr structures.

Fixes several DMA mapping problems with the PS3's gelic network driver:

 * Change from checking the return value of dma_map_single to using the
   dma_mapping_error routine.
 * Use the correct buffer length when mapping the RX skb.
 * Improved error checking and debug logging.


Your patch has a lot of cosmetic changes. Several of them are just  
wrong. The other ones belong to another patch. This patch should focus  
only on the changes it targets.


Your patch is way too big and addresses several different topics.  
Should be split in several patches.


I suggest you run checkpatch.pl --strict on your patch




Fixes runtime errors like these, and also other randomly occurring errors:

  IP-Config: Complete:
  DMA-API: ps3_gelic_driver sb_05: device driver failed to check map error
  WARNING: CPU: 0 PID: 0 at kernel/dma/debug.c:1027 .check_unmap+0x888/0x8dc

Signed-off-by: Geoff Levand 
---
 drivers/net/ethernet/toshiba/ps3_gelic_net.c | 573 +++
 drivers/net/ethernet/toshiba/ps3_gelic_net.h |  24 +-
 2 files changed, 341 insertions(+), 256 deletions(-)

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c  
b/drivers/net/ethernet/toshiba/ps3_gelic_net.c

index 55e652624bd7..e01938128882 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -96,9 +96,11 @@ static void  
gelic_card_get_ether_port_status(struct gelic_card *card,

  * returns the status as in the dmac_cmd_status field of the descriptor
  */
 static enum gelic_descr_dma_status
+


This blank line is pointless and misleading


 gelic_descr_get_status(struct gelic_descr *descr)
 {
-   return be32_to_cpu(descr->dmac_cmd_status) & GELIC_DESCR_DMA_STAT_MASK;
+   return be32_to_cpu(descr->hw_regs.dmac_cmd_status)
+   & GELIC_DESCR_DMA_STAT_MASK;


The & should be at the end of previous line and the second line should  
be aligned to the open parenthesis



 }

 static int gelic_card_set_link_mode(struct gelic_card *card, int mode)
@@ -146,24 +148,34 @@ static void gelic_card_disable_txdmac(struct  
gelic_card *card)

  */
 static void gelic_card_enable_rxdmac(struct gelic_card *card)
 {
+   struct device *dev = ctodev(card);
int status;
+#if defined(DEBUG)
+   static const int debug_build = 1;
+#else
+   static const int debug_build = 0;
+#endif


Useless

You can directly use __is_defined(DEBUG) below



-#ifdef DEBUG
-   if (gelic_descr_get_status(card->rx_chain.head) !=
-   GELIC_DESCR_DMA_CARDOWNED) {
-   printk(KERN_ERR "%s: status=%x\n", __func__,
-  be32_to_cpu(card->rx_chain.head->dmac_cmd_status));
-   printk(KERN_ERR "%s: nextphy=%x\n", __func__,
-  be32_to_cpu(card->rx_chain.head->next_descr_addr));
-   printk(KERN_ERR "%s: head=%p\n", __func__,
-  card->rx_chain.head);
+   if (debug_build


&& must be at the end of the first line


+   && (gelic_descr_get_status(card->rx_chain.head)
+   != GELIC_DESCR_DMA_CARDOWNED)) {


!= must be at end of previous line


+   dev_err(dev, "%s:%d: status=%x\n", __func__, __LINE__,
+   be32_to_cpu(
+   card->rx_chain.head->hw_regs.dmac_cmd_status));


Alignment should match the open parenthesis. And lines can be 100 chars  
long when necessary




+   dev_err(dev, "%s:%d: nextphy=%x\n", __func__, __LINE__,
+   be32_to_cpu(
+   card->rx_chain.head->hw_regs.next_descr_addr));
+   dev_err(dev, "%s:%d: head=%px\n", __func__, __LINE__,
+   card->rx_chain.head);
}
-#endif
+
status = lv1_net_start_rx_dma(bus_id(card), dev_id(card),
-   card->rx_chain.head->bus_addr, 0);
-   if (status)
-   dev_info(ctodev(card),
-"lv1_net_start_rx_dma failed, status=%d\n", status);
+   card->rx_chain.head->link.cpu_addr, 0);
+
+   if (status) {


No { } for single lines



+   dev_err(dev, "%s:%d: lv1_net_start_rx_dma failed: %d\n",
+   __func__, __LINE__, status);
+   }
 }

 /**
@@ -193,11 +205,11 @@ static void gelic_card_disable_rxdmac(struct  
gelic_card *card)

  * in the status
  */
 static void gelic_descr_set_status(struct gelic_descr *descr,
-  enum gelic_descr_dma_status status)
+   enum gelic_descr_dma_status status)
 {
-   descr->dmac_cmd_status = cpu_to_be32(status 

[PATCH v2 2/2] powerpc/32s: Save content of sr0 to avoid 'mfsr'

2021-07-09 Thread Christophe Leroy
Calling 'mfsr' to get the content of segment registers is heavy,
in addition it requires clearing of the 'reserved' bits.

In order to avoid this operation, save it in mm context and in
thread struct.

The saved sr0 is the one used by kernel, this means that on
interrupt/syscall entry it can be used as is.

In interrupt/syscall exit, the only thing to do is to clear SR_NX.

This improves null_syscall selftest by 12 cycles, ie 4%.

Capability to deactivate KUEP at boot time is re-enabled by this patch.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/kup.h  |  3 ++
 arch/powerpc/include/asm/book3s/32/mmu-hash.h |  1 +
 arch/powerpc/include/asm/processor.h  |  1 +
 arch/powerpc/kernel/entry_32.S| 24 ---
 arch/powerpc/mm/book3s32/kuap.c   |  5 +++-
 arch/powerpc/mm/book3s32/kuep.c   | 30 ++-
 arch/powerpc/mm/book3s32/mmu_context.c| 15 +-
 arch/powerpc/mm/mmu_context.c |  3 ++
 8 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h 
b/arch/powerpc/include/asm/book3s/32/kup.h
index 2e0e87cf7d7a..05659fd01557 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -12,6 +12,9 @@
 extern struct static_key_false disable_kuap_key;
 extern struct static_key_false disable_kuep_key;
 
+extern s32 patch__kuep_interrupt_entry, patch__kuep_interrupt_exit;
+extern s32 patch__kuep_syscall_entry, patch__kuep_syscall_exit;
+
 static __always_inline bool kuap_is_disabled(void)
 {
return !IS_ENABLED(CONFIG_PPC_KUAP) || 
static_branch_unlikely(_kuap_key);
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index e6c90802de03..fa613693949d 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -196,6 +196,7 @@ struct hash_pte {
 
 typedef struct {
unsigned long id;
+   unsigned long sr0;
void __user *vdso;
 } mm_context_t;
 
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index f348e564f7dd..4b13f94a4f42 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -157,6 +157,7 @@ struct thread_struct {
 #ifdef CONFIG_PPC_BOOK3S_32
unsigned long   r0, r3, r4, r5, r6, r8, r9, r11;
unsigned long   lr, ctr;
+   unsigned long   sr0;
 #endif
 #endif /* CONFIG_PPC32 */
/* Debug Registers */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index e382c7fc952e..773ce90df360 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "head_32.h"
 
@@ -55,11 +56,10 @@ prepare_transfer_to_handler:
 #ifdef CONFIG_PPC_KUEP
beq 1f
 
-   mfsrr4,0
-   rlwinm  r4,r4,0,8,3
-   orisr4,r4,SR_NX@h
+0: blr /* lwz  r4, current->thread.sr0(r2) */
update_user_segments_by_6 r4, r5, r6, r7, r8, r9
blr
+   patch_site  0b, patch__kuep_interrupt_entry
 1:
 #endif
/* if from kernel, check interrupted DOZE/NAP mode */
@@ -104,10 +104,10 @@ transfer_to_syscall:
SAVE_2GPRS(7, r1)
addir2,r10,-THREAD
 #if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32)
-   mfsrr9,0
-   rlwinm  r9,r9,0,8,3
-   orisr9,r9,SR_NX@h
+0: b   1f  /* lwz  r9, (r10) */
update_user_segments_by_4 r9, r10, r11, r12
+   patch_site  0b, patch__kuep_syscall_entry
+1:
 #endif
SAVE_NVGPRS(r1)
 
@@ -127,9 +127,11 @@ ret_from_syscall:
bne-2f
 #endif /* CONFIG_PPC_47x */
 #if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32)
-   mfsrr7,0
-   rlwinm  r7,r7,0,8,2
+0: b   1f  /* lwz  r7, current->thread.sr0(r2) */
+   rlwinm  r7,r7,0,~SR_NX
update_user_segments_by_6 r7, r8, r9, r10, r11, r12
+   patch_site  0b, patch__kuep_syscall_exit
+1:
 #endif
lwz r4,_LINK(r1)
lwz r5,_CCR(r1)
@@ -295,9 +297,11 @@ interrupt_return:
bl  interrupt_exit_user_prepare
cmpwi   r3,0
 #if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32)
-   mfsrr7,0
-   rlwinm  r7,r7,0,8,2
+0: b   1f  /* lwz  r7, current->thread.sr0(r2) */
+   rlwinm  r7,r7,0,~SR_NX
update_user_segments_by_6 r7, r8, r9, r10, r11, r12
+   patch_site  0b, patch__kuep_interrupt_exit
+1:
 #endif
bne-.Lrestore_nvgprs
 
diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c
index 0f920f09af57..28676cabb005 100644
--- a/arch/powerpc/mm/book3s32/kuap.c
+++ b/arch/powerpc/mm/book3s32/kuap.c
@@ -20,8 +20,11 @@ EXPORT_SYMBOL(kuap_unlock_all_ool);
 
 void setup_kuap(bool disabled)
 {
-

[PATCH v2 1/2] powerpc/32s: Do kuep_lock() and kuep_unlock() in assembly

2021-07-09 Thread Christophe Leroy
When interrupt and syscall entries were converted to C, KUEP locking
and unlocking was also converted. It improved performance by unrolling
the loop, and allowed easily implementing boot time deactivation of
KUEP.

However, null_syscall selftest shows that KUEP is still heavy
(361 cycles with KUEP, 210 cycles without).

A way to improve more is to group 'mtsr's together, instead of
repeating 'addi' + 'mtsr' several times.

In order to do that, more registers need to be available. In C, GCC
will always be able to provide the requested number of registers, but
at the cost of saving some data on the stack, which is counter
performant here.

So let's do it in assembly, when we have full control of which
register can be used. It also has the advantage of locking earlier
and unlocking later and it helps GCC generating less tricky code.
The only drawback is to make boot time deactivation less straight
forward and require 'hand' instruction patching.

In syscall entry, there are only 4 registers available, so
group 'mtsr's by 4.

In interrupt entry, syscall exit and interrupt exit, we can group
by 6, which means 2 groups for a typical config with 12 user segments.

With this change, null_syscall selftest reports 334 cycles. Without
the change it was 361 cycles, that's a 7% reduction.

For the time being, capability to deactivate at boot time is disabled.
It will be re-enabled in following patch.

Signed-off-by: Christophe Leroy 
---
v2: Fixed build failure for non book3s/32
---
 arch/powerpc/include/asm/book3s/32/kup.h  | 16 ---
 arch/powerpc/include/asm/book3s/32/mmu-hash.h | 98 ++-
 arch/powerpc/include/asm/interrupt.h  |  6 +-
 arch/powerpc/include/asm/kup.h|  5 -
 arch/powerpc/kernel/entry_32.S| 26 +
 arch/powerpc/kernel/head_32.h |  2 +
 arch/powerpc/kernel/interrupt.c   |  3 -
 arch/powerpc/mm/book3s32/kuep.c   |  7 +-
 8 files changed, 133 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h 
b/arch/powerpc/include/asm/book3s/32/kup.h
index 64201125a287..2e0e87cf7d7a 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -22,22 +22,6 @@ static __always_inline bool kuep_is_disabled(void)
return !IS_ENABLED(CONFIG_PPC_KUEP) || 
static_branch_unlikely(_kuep_key);
 }
 
-static inline void kuep_lock(void)
-{
-   if (kuep_is_disabled())
-   return;
-
-   update_user_segments(mfsr(0) | SR_NX);
-}
-
-static inline void kuep_unlock(void)
-{
-   if (kuep_is_disabled())
-   return;
-
-   update_user_segments(mfsr(0) & ~SR_NX);
-}
-
 #ifdef CONFIG_PPC_KUAP
 
 #include 
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index f5be185cbdf8..e6c90802de03 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -64,7 +64,103 @@ struct ppc_bat {
 #define SR_KP  0x2000  /* User key */
 #define SR_KS  0x4000  /* Supervisor key */
 
-#ifndef __ASSEMBLY__
+#ifdef __ASSEMBLY__
+
+#include 
+
+.macro uus_addi sr reg1 reg2 imm
+   .if NUM_USER_SEGMENTS > \sr
+   addi\reg1,\reg2,\imm
+   .endif
+.endm
+
+.macro uus_mtsr sr reg1
+   .if NUM_USER_SEGMENTS > \sr
+   mtsr\sr, \reg1
+   .endif
+.endm
+
+.macro update_user_segments_by_4 tmp1 tmp2 tmp3 tmp4
+   uus_addi1, \tmp2, \tmp1, 0x111
+   uus_addi2, \tmp3, \tmp1, 0x222
+   uus_addi3, \tmp4, \tmp1, 0x333
+
+   uus_mtsr0, \tmp1
+   uus_mtsr1, \tmp2
+   uus_mtsr2, \tmp3
+   uus_mtsr3, \tmp4
+
+   uus_addi4, \tmp1, \tmp1, 0x444
+   uus_addi5, \tmp2, \tmp2, 0x444
+   uus_addi6, \tmp3, \tmp3, 0x444
+   uus_addi7, \tmp4, \tmp4, 0x444
+
+   uus_mtsr4, \tmp1
+   uus_mtsr5, \tmp2
+   uus_mtsr6, \tmp3
+   uus_mtsr7, \tmp4
+
+   uus_addi8, \tmp1, \tmp1, 0x444
+   uus_addi9, \tmp2, \tmp2, 0x444
+   uus_addi10, \tmp3, \tmp3, 0x444
+   uus_addi11, \tmp4, \tmp4, 0x444
+
+   uus_mtsr8, \tmp1
+   uus_mtsr9, \tmp2
+   uus_mtsr10, \tmp3
+   uus_mtsr11, \tmp4
+
+   uus_addi12, \tmp1, \tmp1, 0x444
+   uus_addi13, \tmp2, \tmp2, 0x444
+   uus_addi14, \tmp3, \tmp3, 0x444
+   uus_addi15, \tmp4, \tmp4, 0x444
+
+   uus_mtsr12, \tmp1
+   uus_mtsr13, \tmp2
+   uus_mtsr14, \tmp3
+   uus_mtsr15, \tmp4
+.endm
+
+.macro update_user_segments_by_6 tmp1 tmp2 tmp3 tmp4 tmp5 tmp6
+   uus_addi1, \tmp2, \tmp1, 0x111
+   uus_addi2, \tmp3, \tmp1, 0x222
+   uus_addi3, \tmp4, \tmp1, 0x333
+   uus_addi4, \tmp5, \t

[PATCH 2/2] powerpc/32s: Save content of sr0 to avoid 'mfsr'

2021-07-09 Thread Christophe Leroy
Calling 'mfsr' to get the content of segment registers is heavy,
in addition it requires clearing of the 'reserved' bits.

In order to avoid this operation, save it in mm context and in
thread struct.

The saved sr0 is the one used by kernel, this means that on
interrupt/syscall entry it can be used as is.

In interrupt/syscall exit, the only thing to do is to clear SR_NX.

This improves null_syscall selftest by 12 cycles, ie 4%.

Capability to deactivate KUEP at boot time is re-enabled by this patch.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/kup.h  |  3 ++
 arch/powerpc/include/asm/book3s/32/mmu-hash.h |  1 +
 arch/powerpc/include/asm/processor.h  |  1 +
 arch/powerpc/kernel/entry_32.S| 24 ---
 arch/powerpc/mm/book3s32/kuap.c   |  5 +++-
 arch/powerpc/mm/book3s32/kuep.c   | 30 ++-
 arch/powerpc/mm/book3s32/mmu_context.c| 15 +-
 arch/powerpc/mm/mmu_context.c |  3 ++
 8 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h 
b/arch/powerpc/include/asm/book3s/32/kup.h
index 2e0e87cf7d7a..05659fd01557 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -12,6 +12,9 @@
 extern struct static_key_false disable_kuap_key;
 extern struct static_key_false disable_kuep_key;
 
+extern s32 patch__kuep_interrupt_entry, patch__kuep_interrupt_exit;
+extern s32 patch__kuep_syscall_entry, patch__kuep_syscall_exit;
+
 static __always_inline bool kuap_is_disabled(void)
 {
return !IS_ENABLED(CONFIG_PPC_KUAP) || 
static_branch_unlikely(_kuap_key);
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index e6c90802de03..fa613693949d 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -196,6 +196,7 @@ struct hash_pte {
 
 typedef struct {
unsigned long id;
+   unsigned long sr0;
void __user *vdso;
 } mm_context_t;
 
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index f348e564f7dd..4b13f94a4f42 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -157,6 +157,7 @@ struct thread_struct {
 #ifdef CONFIG_PPC_BOOK3S_32
unsigned long   r0, r3, r4, r5, r6, r8, r9, r11;
unsigned long   lr, ctr;
+   unsigned long   sr0;
 #endif
 #endif /* CONFIG_PPC32 */
/* Debug Registers */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 84b51a387e95..582f9050a011 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "head_32.h"
 
@@ -55,11 +56,10 @@ prepare_transfer_to_handler:
 #ifdef CONFIG_PPC_KUEP
beq 1f
 
-   mfsrr4,0
-   rlwinm  r4,r4,0,8,3
-   orisr4,r4,SR_NX@h
+0: blr /* lwz  r4, current->thread.sr0(r2) */
update_user_segments_by_6 r4, r5, r6, r7, r8, r9
blr
+   patch_site  0b, patch__kuep_interrupt_entry
 1:
 #endif
/* if from kernel, check interrupted DOZE/NAP mode */
@@ -104,10 +104,10 @@ transfer_to_syscall:
SAVE_2GPRS(7, r1)
addir2,r10,-THREAD
 #ifdef CONFIG_PPC_KUEP
-   mfsrr9,0
-   rlwinm  r9,r9,0,8,3
-   orisr9,r9,SR_NX@h
+0: b   1f  /* lwz  r9, (r10) */
update_user_segments_by_4 r9, r10, r11, r12
+   patch_site  0b, patch__kuep_syscall_entry
+1:
 #endif
SAVE_NVGPRS(r1)
 
@@ -127,9 +127,11 @@ ret_from_syscall:
bne-2f
 #endif /* CONFIG_PPC_47x */
 #ifdef CONFIG_PPC_KUEP
-   mfsrr7,0
-   rlwinm  r7,r7,0,8,2
+0: b   1f  /* lwz  r7, current->thread.sr0(r2) */
+   rlwinm  r7,r7,0,~SR_NX
update_user_segments_by_6 r7, r8, r9, r10, r11, r12
+   patch_site  0b, patch__kuep_syscall_exit
+1:
 #endif
lwz r4,_LINK(r1)
lwz r5,_CCR(r1)
@@ -295,9 +297,11 @@ interrupt_return:
bl  interrupt_exit_user_prepare
cmpwi   r3,0
 #ifdef CONFIG_PPC_KUEP
-   mfsrr7,0
-   rlwinm  r7,r7,0,8,2
+0: b   1f  /* lwz  r7, current->thread.sr0(r2) */
+   rlwinm  r7,r7,0,~SR_NX
update_user_segments_by_6 r7, r8, r9, r10, r11, r12
+   patch_site  0b, patch__kuep_interrupt_exit
+1:
 #endif
bne-.Lrestore_nvgprs
 
diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c
index 0f920f09af57..28676cabb005 100644
--- a/arch/powerpc/mm/book3s32/kuap.c
+++ b/arch/powerpc/mm/book3s32/kuap.c
@@ -20,8 +20,11 @@ EXPORT_SYMBOL(kuap_unlock_all_ool);
 
 void setup_kuap(bool disabled)
 {
-   if (!disabled)
+   if (!disabled) {
kuap_lock_all_ool();
+   init_mm.context.sr0 |= SR_KS;
+   

[PATCH 1/2] powerpc/32s: Do kuep_lock() and kuep_unlock() in assembly

2021-07-09 Thread Christophe Leroy
When interrupt and syscall entries were converted to C, KUEP locking
and unlocking was also converted. It improved performance by unrolling
the loop, and allowed easily implementing boot time deactivation of
KUEP.

However, null_syscall selftest shows that KUEP is still heavy
(361 cycles with KUEP, 210 cycles without).

A way to improve more is to group 'mtsr's together, instead of
repeating 'addi' + 'mtsr' several times.

In order to do that, more registers need to be available. In C, GCC
will always be able to provide the requested number of registers, but
at the cost of saving some data on the stack, which is counter
performant here.

So let's do it in assembly, when we have full control of which
register can be used. It also has the advantage of locking earlier
and unlocking later and it helps GCC generating less tricky code.
The only drawback is to make boot time deactivation less straight
forward and require 'hand' instruction patching.

In syscall entry, there are only 4 registers available, so
group 'mtsr's by 4.

In interrupt entry, syscall exit and interrupt exit, we can group
by 6, which means 2 groups for a typical config with 12 user segments.

With this change, null_syscall selftest reports 334 cycles. Without
the change it was 361 cycles, that's a 7% reduction.

For the time being, capability to deactivate at boot time is disabled.
It will be re-enabled in following patch.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/kup.h  | 16 ---
 arch/powerpc/include/asm/book3s/32/mmu-hash.h | 98 ++-
 arch/powerpc/include/asm/interrupt.h  |  6 +-
 arch/powerpc/include/asm/kup.h|  5 -
 arch/powerpc/kernel/entry_32.S| 26 +
 arch/powerpc/kernel/head_32.h |  2 +
 arch/powerpc/kernel/interrupt.c   |  3 -
 arch/powerpc/mm/book3s32/kuep.c   |  7 +-
 8 files changed, 133 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/kup.h 
b/arch/powerpc/include/asm/book3s/32/kup.h
index 64201125a287..2e0e87cf7d7a 100644
--- a/arch/powerpc/include/asm/book3s/32/kup.h
+++ b/arch/powerpc/include/asm/book3s/32/kup.h
@@ -22,22 +22,6 @@ static __always_inline bool kuep_is_disabled(void)
return !IS_ENABLED(CONFIG_PPC_KUEP) || 
static_branch_unlikely(_kuep_key);
 }
 
-static inline void kuep_lock(void)
-{
-   if (kuep_is_disabled())
-   return;
-
-   update_user_segments(mfsr(0) | SR_NX);
-}
-
-static inline void kuep_unlock(void)
-{
-   if (kuep_is_disabled())
-   return;
-
-   update_user_segments(mfsr(0) & ~SR_NX);
-}
-
 #ifdef CONFIG_PPC_KUAP
 
 #include 
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index f5be185cbdf8..e6c90802de03 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -64,7 +64,103 @@ struct ppc_bat {
 #define SR_KP  0x2000  /* User key */
 #define SR_KS  0x4000  /* Supervisor key */
 
-#ifndef __ASSEMBLY__
+#ifdef __ASSEMBLY__
+
+#include 
+
+.macro uus_addi sr reg1 reg2 imm
+   .if NUM_USER_SEGMENTS > \sr
+   addi\reg1,\reg2,\imm
+   .endif
+.endm
+
+.macro uus_mtsr sr reg1
+   .if NUM_USER_SEGMENTS > \sr
+   mtsr\sr, \reg1
+   .endif
+.endm
+
+.macro update_user_segments_by_4 tmp1 tmp2 tmp3 tmp4
+   uus_addi1, \tmp2, \tmp1, 0x111
+   uus_addi2, \tmp3, \tmp1, 0x222
+   uus_addi3, \tmp4, \tmp1, 0x333
+
+   uus_mtsr0, \tmp1
+   uus_mtsr1, \tmp2
+   uus_mtsr2, \tmp3
+   uus_mtsr3, \tmp4
+
+   uus_addi4, \tmp1, \tmp1, 0x444
+   uus_addi5, \tmp2, \tmp2, 0x444
+   uus_addi6, \tmp3, \tmp3, 0x444
+   uus_addi7, \tmp4, \tmp4, 0x444
+
+   uus_mtsr4, \tmp1
+   uus_mtsr5, \tmp2
+   uus_mtsr6, \tmp3
+   uus_mtsr7, \tmp4
+
+   uus_addi8, \tmp1, \tmp1, 0x444
+   uus_addi9, \tmp2, \tmp2, 0x444
+   uus_addi10, \tmp3, \tmp3, 0x444
+   uus_addi11, \tmp4, \tmp4, 0x444
+
+   uus_mtsr8, \tmp1
+   uus_mtsr9, \tmp2
+   uus_mtsr10, \tmp3
+   uus_mtsr11, \tmp4
+
+   uus_addi12, \tmp1, \tmp1, 0x444
+   uus_addi13, \tmp2, \tmp2, 0x444
+   uus_addi14, \tmp3, \tmp3, 0x444
+   uus_addi15, \tmp4, \tmp4, 0x444
+
+   uus_mtsr12, \tmp1
+   uus_mtsr13, \tmp2
+   uus_mtsr14, \tmp3
+   uus_mtsr15, \tmp4
+.endm
+
+.macro update_user_segments_by_6 tmp1 tmp2 tmp3 tmp4 tmp5 tmp6
+   uus_addi1, \tmp2, \tmp1, 0x111
+   uus_addi2, \tmp3, \tmp1, 0x222
+   uus_addi3, \tmp4, \tmp1, 0x333
+   uus_addi4, \tmp5, \tmp1, 0x444
+   uus_addi5, \t

[PATCH v4 4/4] powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP

2021-07-08 Thread Christophe Leroy
This patch converts powerpc to the generic PTDUMP implementation.

Signed-off-by: Christophe Leroy 
---
v4: Reworked init of ptdump range
---
 arch/powerpc/Kconfig|   2 +
 arch/powerpc/Kconfig.debug  |  30 ---
 arch/powerpc/mm/Makefile|   2 +-
 arch/powerpc/mm/mmu_decl.h  |   2 +-
 arch/powerpc/mm/ptdump/Makefile |   9 +-
 arch/powerpc/mm/ptdump/ptdump.c | 146 
 6 files changed, 47 insertions(+), 144 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 0104345d0a65..dc1ab533a1cf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -123,6 +123,7 @@ config PPC
select ARCH_HAS_COPY_MC if PPC64
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
+   select ARCH_HAS_DEBUG_WXif STRICT_KERNEL_RWX
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_DMA_MAP_DIRECT  if PPC_PSERIES
select ARCH_HAS_ELF_RANDOMIZE
@@ -182,6 +183,7 @@ config PPC
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_PCI_IOMAPif PCI
+   select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 205cd77f321f..192f0ed0097f 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -365,36 +365,6 @@ config FAIL_IOMMU
 
  If you are unsure, say N.
 
-config PPC_PTDUMP
-   bool "Export kernel pagetable layout to userspace via debugfs"
-   depends on DEBUG_KERNEL && DEBUG_FS
-   help
- This option exports the state of the kernel pagetables to a
- debugfs file. This is only useful for kernel developers who are
- working in architecture specific areas of the kernel - probably
- not a good idea to enable this feature in a production kernel.
-
- If you are unsure, say N.
-
-config PPC_DEBUG_WX
-   bool "Warn on W+X mappings at boot"
-   depends on PPC_PTDUMP && STRICT_KERNEL_RWX
-   help
- Generate a warning if any W+X mappings are found at boot.
-
- This is useful for discovering cases where the kernel is leaving
- W+X mappings after applying NX, as such mappings are a security risk.
-
- Note that even if the check fails, your kernel is possibly
- still fine, as W+X mappings are not a security hole in
- themselves, what they do is that they make the exploitation
- of other unfixed kernel bugs easier.
-
- There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
-
- If in doubt, say "Y".
-
 config PPC_FAST_ENDIAN_SWITCH
bool "Deprecated fast endian-switch syscall"
depends on DEBUG_KERNEL && PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index eae4ec2988fc..df8172da2301 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES)   += slice.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
-obj-$(CONFIG_PPC_PTDUMP)   += ptdump/
+obj-$(CONFIG_PTDUMP_CORE)  += ptdump/
 obj-$(CONFIG_KASAN)+= kasan/
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7dac910c0b21..dd1cabc2ea0f 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { }
 void __init mmu_mapin_immr(void);
 #endif
 
-#ifdef CONFIG_PPC_DEBUG_WX
+#ifdef CONFIG_DEBUG_WX
 void ptdump_check_wx(void);
 #else
 static inline void ptdump_check_wx(void) { }
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
index 712762be3cb1..4050cbb55acf 100644
--- a/arch/powerpc/mm/ptdump/Makefile
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -5,5 +5,10 @@ obj-y  += ptdump.o
 obj-$(CONFIG_4xx)  += shared.o
 obj-$(CONFIG_PPC_8xx)  += 8xx.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += shared.o
-obj-$(CONFIG_PPC_BOOK3S_32)+= shared.o bats.o segment_regs.o
-obj-$(CONFIG_PPC_BOOK3S_64)+= book3s64.o hashpagetable.o
+obj-$(CONFIG_PPC_BOOK3S_32)+= shared.o
+obj-$(CONFIG_PPC_BOOK3S_64)+= book3s64.o
+
+ifdef CONFIG_PTDUMP_DEBUGFS
+obj-$(CONFIG_PPC_BOOK3S_32)+= bats.o segment_regs.o
+obj-$(CONFIG_PPC_BOOK3S_64)+= hashpagetable.o
+endif
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index fb531bc64fc5..2d80d775d15e 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +55,7 @@
  *
  */
 stru

[PATCH v4 3/4] powerpc/ptdump: Reduce level numbers by 1 in note_page() and add p4d level

2021-07-08 Thread Christophe Leroy
Do the same as commit f8f0d0b6fa20 ("mm: ptdump: reduce level numbers
by 1 in note_page()") and add missing p4d level.

This will align powerpc to the users of generic ptdump.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/ptdump/8xx.c  |  6 --
 arch/powerpc/mm/ptdump/book3s64.c |  6 --
 arch/powerpc/mm/ptdump/ptdump.c   | 17 +
 arch/powerpc/mm/ptdump/shared.c   |  6 --
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 86da2a669680..fac932eb8f9a 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-   {
-   }, { /* pgd */
+   { /* pgd */
+   .flag   = flag_array,
+   .num= ARRAY_SIZE(flag_array),
+   }, { /* p4d */
.flag   = flag_array,
.num= ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/book3s64.c 
b/arch/powerpc/mm/ptdump/book3s64.c
index 14f73868db66..5ad92d9dc5d1 100644
--- a/arch/powerpc/mm/ptdump/book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-   {
-   }, { /* pgd */
+   { /* pgd */
+   .flag   = flag_array,
+   .num= ARRAY_SIZE(flag_array),
+   }, { /* p4d */
.flag   = flag_array,
.num= ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 3eb8732641da..fb531bc64fc5 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -58,7 +58,7 @@ struct pg_state {
const struct addr_marker *marker;
unsigned long start_address;
unsigned long start_pa;
-   unsigned int level;
+   int level;
u64 current_flags;
bool check_wx;
unsigned long wx_pages;
@@ -188,10 +188,9 @@ static void note_prot_wx(struct pg_state *st, unsigned 
long addr)
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
 }
 
-static void note_page_update_state(struct pg_state *st, unsigned long addr,
-  unsigned int level, u64 val)
+static void note_page_update_state(struct pg_state *st, unsigned long addr, 
int level, u64 val)
 {
-   u64 flag = val & pg_level[level].mask;
+   u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
u64 pa = val & PTE_RPN_MASK;
 
st->level = level;
@@ -206,12 +205,12 @@ static void note_page_update_state(struct pg_state *st, 
unsigned long addr,
 }
 
 static void note_page(struct pg_state *st, unsigned long addr,
-  unsigned int level, u64 val, unsigned long page_size)
+ int level, u64 val, unsigned long page_size)
 {
-   u64 flag = val & pg_level[level].mask;
+   u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
 
/* At first no level is set */
-   if (!st->level) {
+   if (st->level == -1) {
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
note_page_update_state(st, addr, level, val);
/*
@@ -383,6 +382,7 @@ static int ptdump_show(struct seq_file *m, void *v)
struct pg_state st = {
.seq = m,
.marker = address_markers,
+   .level = -1,
.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : 
TASK_SIZE,
};
 
@@ -393,7 +393,7 @@ static int ptdump_show(struct seq_file *m, void *v)
 
/* Traverse kernel page tables */
walk_pagetables();
-   note_page(, 0, 0, 0, 0);
+   note_page(, 0, -1, 0, 0);
return 0;
 }
 
@@ -415,6 +415,7 @@ void ptdump_check_wx(void)
struct pg_state st = {
.seq = NULL,
.marker = address_markers,
+   .level = -1,
.check_wx = true,
.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : 
TASK_SIZE,
};
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index c005fe041c18..03607ab90c66 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-   {
-   }, { /* pgd */
+   { /* pgd */
+   .flag   = flag_array,
+   .num= ARRAY_SIZE(flag_array),
+   }, { /* p4d */
.flag   = flag_array,
.num= ARRAY_SIZE(flag_array),
}, { /* pud */
-- 
2.25.0



[PATCH v4 2/4] powerpc/ptdump: Remove unused 'page_size' parameter

2021-07-08 Thread Christophe Leroy
note_page_update_state() doesn't use page_size. Remove it.

It could also be removed from note_page(), but as a following patch
will remove all current users of note_page(), just leave it as
is for now.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/ptdump/ptdump.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 349fd8fe173f..3eb8732641da 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -189,7 +189,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long 
addr)
 }
 
 static void note_page_update_state(struct pg_state *st, unsigned long addr,
-  unsigned int level, u64 val, unsigned long 
page_size)
+  unsigned int level, u64 val)
 {
u64 flag = val & pg_level[level].mask;
u64 pa = val & PTE_RPN_MASK;
@@ -213,7 +213,7 @@ static void note_page(struct pg_state *st, unsigned long 
addr,
/* At first no level is set */
if (!st->level) {
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
-   note_page_update_state(st, addr, level, val, page_size);
+   note_page_update_state(st, addr, level, val);
/*
 * Dump the section of virtual memory when:
 *   - the PTE flags from one entry to the next differs.
@@ -242,7 +242,7 @@ static void note_page(struct pg_state *st, unsigned long 
addr,
 * Address indicates we have passed the end of the
 * current section of virtual memory
 */
-   note_page_update_state(st, addr, level, val, page_size);
+   note_page_update_state(st, addr, level, val);
}
 }
 
-- 
2.25.0



[PATCH v4 1/4] powerpc/ptdump: Use DEFINE_SHOW_ATTRIBUTE()

2021-07-08 Thread Christophe Leroy
Use DEFINE_SHOW_ATTRIBUTE() instead of open coding
open() and fops.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/ptdump/bats.c  | 14 ++
 arch/powerpc/mm/ptdump/hashpagetable.c | 12 +---
 arch/powerpc/mm/ptdump/ptdump.c| 13 +
 arch/powerpc/mm/ptdump/segment_regs.c  | 12 +---
 4 files changed, 5 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index c4c628b03cf8..4ed3418f07d9 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 
lower, u32 upper, bool
 
 #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), 
mfspr(_u), _d)
 
-static int bats_show_603(struct seq_file *m, void *v)
+static int bats_show(struct seq_file *m, void *v)
 {
seq_puts(m, "---[ Instruction Block Address Translation ]---\n");
 
@@ -88,17 +88,7 @@ static int bats_show_603(struct seq_file *m, void *v)
return 0;
 }
 
-static int bats_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, bats_show_603, NULL);
-}
-
-static const struct file_operations bats_fops = {
-   .open   = bats_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bats);
 
 static int __init bats_init(void)
 {
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c 
b/arch/powerpc/mm/ptdump/hashpagetable.c
index ad6df9a2e7c8..c7f824d294b2 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
 }
 
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
-   .open   = ptdump_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
 static int ptdump_init(void)
 {
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 5062c58b1e5b..349fd8fe173f 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -397,18 +397,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
 }
 
-
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
-   .open   = ptdump_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
 static void build_pgtable_complete_mask(void)
 {
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c 
b/arch/powerpc/mm/ptdump/segment_regs.c
index 565048a0c9be..3054944d3d7e 100644
--- a/arch/powerpc/mm/ptdump/segment_regs.c
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -41,17 +41,7 @@ static int sr_show(struct seq_file *m, void *v)
return 0;
 }
 
-static int sr_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, sr_show, NULL);
-}
-
-static const struct file_operations sr_fops = {
-   .open   = sr_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(sr);
 
 static int __init sr_init(void)
 {
-- 
2.25.0



[PATCH] powerpc/32: Fix critical and debug interrupts on BOOKE

2021-07-06 Thread Christophe Leroy
32 bits BOOKE have special interrupts for debug and other
critical events.

When handling those interrupts, dedicated registers are saved
in the stack frame in addition to the standard registers, leading
to a shift of the pt_regs struct.

Since commit db297c3b07af ("powerpc/32: Don't save thread.regs on
interrupt entry"), the pt_regs struct is expected to be at the
same place all the time.

Instead of handling a special struct in addition to pt_regs, just
add those special registers to struct pt_regs.

Reported-by: Radu Rendec 
Signed-off-by: Christophe Leroy 
Fixes: db297c3b07af ("powerpc/32: Don't save thread.regs on interrupt entry")
Cc: sta...@vger.kernel.org
---
 arch/powerpc/include/asm/ptrace.h | 16 
 arch/powerpc/kernel/asm-offsets.c | 31 ++-
 arch/powerpc/kernel/head_booke.h  | 27 +++
 3 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index 3e5d470a6155..14422e851494 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -70,6 +70,22 @@ struct pt_regs
unsigned long __pad[4]; /* Maintain 16 byte interrupt stack 
alignment */
};
 #endif
+#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE)
+   struct { /* Must be a multiple of 16 bytes */
+   unsigned long mas0;
+   unsigned long mas1;
+   unsigned long mas2;
+   unsigned long mas3;
+   unsigned long mas6;
+   unsigned long mas7;
+   unsigned long srr0;
+   unsigned long srr1;
+   unsigned long csrr0;
+   unsigned long csrr1;
+   unsigned long dsrr0;
+   unsigned long dsrr1;
+   };
+#endif
 };
 #endif
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index a47eefa09bcb..5bee245d832b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -309,24 +309,21 @@ int main(void)
STACK_PT_REGS_OFFSET(STACK_REGS_IAMR, iamr);
 #endif
 
-#if defined(CONFIG_PPC32)
-#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
-   DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE);
-   DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
mas0));
+#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE)
+   STACK_PT_REGS_OFFSET(MAS0, mas0);
/* we overload MMUCR for 44x on MAS0 since they are mutually exclusive 
*/
-   DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
mas0));
-   DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
mas1));
-   DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
mas2));
-   DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
mas3));
-   DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
mas6));
-   DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
mas7));
-   DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
srr0));
-   DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
srr1));
-   DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
csrr0));
-   DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
csrr1));
-   DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
dsrr0));
-   DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, 
dsrr1));
-#endif
+   STACK_PT_REGS_OFFSET(MMUCR, mas0);
+   STACK_PT_REGS_OFFSET(MAS1, mas1);
+   STACK_PT_REGS_OFFSET(MAS2, mas2);
+   STACK_PT_REGS_OFFSET(MAS3, mas3);
+   STACK_PT_REGS_OFFSET(MAS6, mas6);
+   STACK_PT_REGS_OFFSET(MAS7, mas7);
+   STACK_PT_REGS_OFFSET(_SRR0, srr0);
+   STACK_PT_REGS_OFFSET(_SRR1, srr1);
+   STACK_PT_REGS_OFFSET(_CSRR0, csrr0);
+   STACK_PT_REGS_OFFSET(_CSRR1, csrr1);
+   STACK_PT_REGS_OFFSET(_DSRR0, dsrr0);
+   STACK_PT_REGS_OFFSET(_DSRR1, dsrr1);
 #endif
 
/* About the CPU features table */
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 87b806e8eded..e5503420b6c6 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -168,20 +168,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV)
 /* only on e500mc */
 #define DBG_STACK_BASE dbgirq_ctx
 
-#define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE)
-
 #ifdef CONFIG_SMP
 #define BOOKE_LOAD_EXC_LEVEL_STACK(level)  \
mfspr   r8,SPRN_PIR;\
slwir8,r8,2;\
addis   r8,r8,level##_STACK_BASE@ha;\
lwz r8,level##_STACK_BASE@l(r8);\
-   addir8,r8,EXC_LVL_FRAME_OVERHEAD;
+   addir8,r8,THREAD_SIZE - INT_FRAME_SIZE;
 #else

Re: Hitting BUG_ON in do_notify_resume() with gdb and SIGTRAP

2021-07-06 Thread Christophe Leroy




Le 06/07/2021 à 16:05, Radu Rendec a écrit :

On Tue, 2021-07-06 at 15:53 +0200, Christophe Leroy wrote:

Le 06/07/2021 à 15:50, Radu Rendec a écrit :

On Tue, 2021-07-06 at 15:16 +0200, Christophe Leroy wrote:

Le 06/07/2021 à 13:56, Radu Rendec a écrit :

On Tue, 2021-07-06 at 12:43 +0200, Christophe Leroy wrote:

Le 04/07/2021 à 23:38, Radu Rendec a écrit :

I'm trying to set up my (virtual) environment to test an old bug in the
PPC32 ptrace() code. I came across a completely different problem,
which seems to make gdb pretty much unusable on PPC32. I'm not sure if
this is a real kernel bug or maybe something wrong with my
configuration.

I'm running kernel 5.13 in a qemu VM with one e500mc CPU. I am running
native gdb (inside the VM) and setting a breakpoint in main() in a test
"hello world" program. Upon running the test program, I am hitting the
BUG_ON in do_notify_resume() on line 292. The kernel bug log snippet is
included below at the end of the email.

FWIW, gdb says:
Program terminated with signal SIGTRAP, Trace/breakpoint trap.
The program no longer exists.

I also added a pr_info() to do_notify_resume() just to see how much
different 'regs' and 'current->thread.regs' are. Surprisingly, they are
just 0x30 apart: regs=c7955f10 cur=c7955f40. Also, 'current' seems to
be OK (pid and comm are consistent with the test program).


The TRAP = 0x7d8 is obviously wrong.

Need to know which 'TRAP' it is exactly.
Could you try to dump what we have at the correct regs ?
Something like 'show_regs(current->thread.regs)' should do it.


Sure, please see the output below. It looks to me like the "correct"
regs are just garbage. Either they are overwritten or current->thread.regs
is wrong. But in any case, r1 = 0 doesn't look good.


Yes indeed. I think I identified the problem. For Critical interrupts like 
DEBUG interrupt, struct
exception_regs is added, therefore the frame has 12x4 (0x30) more bytes. That's 
what you see.

Commit
https://github.com/linuxppc/linux/commit/db297c3b07af7856fb7c666fbc9792d8e37556be#diff-dd6b952a3980da19df4facccdb4f3dddeb8cef56ee384c7f03d02b23b0c6cb26

Need to find the best solution now to fix that.


Awesome, happy to see you figured it out so quickly.

I'm not sure if it makes any sense, but one thing that comes to mind is
to put struct exception_regs before struct pt_regs when the frame is
saved. Unless of course other parts of the code expect the opposite.


Yes I think it is a good idea. I think I won't have time to look at that before 
summer vacation though.


I can take a stab at it. I'm not familiar with that part of the code,
but the best way to learn is to get your hands dirty :) In the worst
case, I won't fix it.



Not that easy in fact.
After looking once more, the best solution I see now would be to move the content of struct 
exception_regs into the second part of struct pt_regs (the kernel one in asm/ptrace.h).


Changes should be limited to head_booke.h and asm-offsets.c
struct exception_regs and STACK_EXC_LVL_FRAME_SIZE should go away.

Christophe


Re: Hitting BUG_ON in do_notify_resume() with gdb and SIGTRAP

2021-07-06 Thread Christophe Leroy




Le 06/07/2021 à 15:50, Radu Rendec a écrit :

On Tue, 2021-07-06 at 15:16 +0200, Christophe Leroy wrote:

Le 06/07/2021 à 13:56, Radu Rendec a écrit :

On Tue, 2021-07-06 at 12:43 +0200, Christophe Leroy wrote:

Le 04/07/2021 à 23:38, Radu Rendec a écrit :

I'm trying to set up my (virtual) environment to test an old bug in the
PPC32 ptrace() code. I came across a completely different problem,
which seems to make gdb pretty much unusable on PPC32. I'm not sure if
this is a real kernel bug or maybe something wrong with my
configuration.

I'm running kernel 5.13 in a qemu VM with one e500mc CPU. I am running
native gdb (inside the VM) and setting a breakpoint in main() in a test
"hello world" program. Upon running the test program, I am hitting the
BUG_ON in do_notify_resume() on line 292. The kernel bug log snippet is
included below at the end of the email.

FWIW, gdb says:
Program terminated with signal SIGTRAP, Trace/breakpoint trap.
The program no longer exists.

I also added a pr_info() to do_notify_resume() just to see how much
different 'regs' and 'current->thread.regs' are. Surprisingly, they are
just 0x30 apart: regs=c7955f10 cur=c7955f40. Also, 'current' seems to
be OK (pid and comm are consistent with the test program).


The TRAP = 0x7d8 is obviously wrong.

Need to know which 'TRAP' it is exactly.
Could you try to dump what we have at the correct regs ?
Something like 'show_regs(current->thread.regs)' should do it.


Sure, please see the output below. It looks to me like the "correct"
regs are just garbage. Either they are overwritten or current->thread.regs
is wrong. But in any case, r1 = 0 doesn't look good.


Yes indeed. I think I identified the problem. For Critical interrupts like 
DEBUG interrupt, struct
exception_regs is added, therefore the frame has 12x4 (0x30) more bytes. That's 
what you see.

Commit
https://github.com/linuxppc/linux/commit/db297c3b07af7856fb7c666fbc9792d8e37556be#diff-dd6b952a3980da19df4facccdb4f3dddeb8cef56ee384c7f03d02b23b0c6cb26

Need to find the best solution now to fix that.


Awesome, happy to see you figured it out so quickly.

I'm not sure if it makes any sense, but one thing that comes to mind is
to put struct exception_regs before struct pt_regs when the frame is
saved. Unless of course other parts of the code expect the opposite.



Yes I think it is a good idea. I think I won't have time to look at that before 
summer vacation though.


Re: Hitting BUG_ON in do_notify_resume() with gdb and SIGTRAP

2021-07-06 Thread Christophe Leroy




Le 06/07/2021 à 13:56, Radu Rendec a écrit :

On Tue, 2021-07-06 at 12:43 +0200, Christophe Leroy wrote:

Le 04/07/2021 à 23:38, Radu Rendec a écrit :

I'm trying to set up my (virtual) environment to test an old bug in the
PPC32 ptrace() code. I came across a completely different problem,
which seems to make gdb pretty much unusable on PPC32. I'm not sure if
this is a real kernel bug or maybe something wrong with my
configuration.

I'm running kernel 5.13 in a qemu VM with one e500mc CPU. I am running
native gdb (inside the VM) and setting a breakpoint in main() in a test
"hello world" program. Upon running the test program, I am hitting the
BUG_ON in do_notify_resume() on line 292. The kernel bug log snippet is
included below at the end of the email.

FWIW, gdb says:
Program terminated with signal SIGTRAP, Trace/breakpoint trap.
The program no longer exists.

I also added a pr_info() to do_notify_resume() just to see how much
different 'regs' and 'current->thread.regs' are. Surprisingly, they are
just 0x30 apart: regs=c7955f10 cur=c7955f40. Also, 'current' seems to
be OK (pid and comm are consistent with the test program).


The TRAP = 0x7d8 is obviously wrong.

Need to know which 'TRAP' it is exactly.
Could you try to dump what we have at the correct regs ?
Something like 'show_regs(current->thread.regs)' should do it.


Sure, please see the output below. It looks to me like the "correct"
regs are just garbage. Either they are overwritten or current->thread.regs
is wrong. But in any case, r1 = 0 doesn't look good.


Yes indeed. I think I identified the problem. For Critical interrupts like DEBUG interrupt, struct 
exception_regs is added, therefore the frame has 12x4 (0x30) more bytes. That's what you see.


Commit 
https://github.com/linuxppc/linux/commit/db297c3b07af7856fb7c666fbc9792d8e37556be#diff-dd6b952a3980da19df4facccdb4f3dddeb8cef56ee384c7f03d02b23b0c6cb26


Need to find the best solution now to fix that.



regs=c7a0ff10 cur=c7a0ff40 pid=139 comm=test
CPU: 0 PID: 139 Comm: test Not tainted 5.13.0-dirty #4
NIP:  1338 LR: 0003 CTR: 0003
REGS: c7a0ff40 TRAP: 67   Not tainted  (5.13.0-dirty)
MSR:  1002d202   CR: 1004  XER: 80670100

GPR00: b7fc36d8   b7fe17b4  b7ffd588 b7ffe8b8 b7ffee10
GPR08: b7fff214 b7ffdf40 b7fff208 b858 b970 b7fff130 0001 b960
GPR16: b7fff2b0 b7ffd5f0 b7ffe8a8 b850 b7fc3714 1002d002 b7fff208 0003
GPR24: b7fc3714  22000284 b960 07d8 1338 0800 b850
NIP [1338] 0x1338
LR [0003] 0x3
Call Trace:
[c7a0fe40] [c0008eac] show_regs+0x4c/0x1b0 (unreliable)
[c7a0fe80] [c000969c] do_notify_resume+0x31c/0x320
[c7a0fee0] [c0010b94] interrupt_exit_user_prepare+0x94/0xc0
[c7a0ff00] [c00151e8] interrupt_return+0x14/0x13c
--- interrupt: 7d8 at 0xb7fc3714
NIP:  b7fc3714 LR: b7fc3714 CTR: 0003
REGS: c7a0ff10 TRAP: 07d8   Not tainted  (5.13.0-dirty)
MSR:  1002d002   CR: 22000284  XER: 

GPR00: b7fc3584 b850     00a0 6474e552
GPR08: b7fbe0d4 0001 b7fff230 b850 b7fc36d8   b7fe17b4
GPR16:  b7ffd588 b7ffe8b8 b7ffee10 b7fff214 b7ffdf40 b7fff208 b858
GPR24: b970 b7fff130 0001 b960 b7fff2b0 b7ffd5f0 b7ffe8a8 b850
NIP [b7fc3714] 0xb7fc3714
LR [b7fc3714] 0xb7fc3714
--- interrupt: 7d8
[ cut here ]
kernel BUG at arch/powerpc/kernel/signal.c:298!
Oops: Exception in kernel mode, sig: 5 [#1]
BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
Modules linked in:
CPU: 0 PID: 139 Comm: test Not tainted 5.13.0-dirty #4
NIP:  c000969c LR: c000969c CTR: c065f540
REGS: c7a0fdc0 TRAP: 0700   Not tainted  (5.13.0-dirty)
MSR:  00028002   CR: 28000282  XER: 2000

GPR00: c000969c c7a0fe80 c70ef500 c70efbd8 c70ef500 0010 c7a0fc38 0002
GPR08: 0001    28000282   b7fe17b4
GPR16:  b7ffd588 b7ffe8b8 b7ffee10 b7fff214 b7ffdf40 b7fff208 b858
GPR24: b970 b7fff130 0001 b960 c7a0ff10 0800 c70ef500 0102
NIP [c000969c] do_notify_resume+0x31c/0x320
LR [c000969c] do_notify_resume+0x31c/0x320
Call Trace:
[c7a0fe80] [c000969c] do_notify_resume+0x31c/0x320 (unreliable)
[c7a0fee0] [c0010b94] interrupt_exit_user_prepare+0x94/0xc0
[c7a0ff00] [c00151e8] interrupt_return+0x14/0x13c
--- interrupt: 7d8 at 0xb7fc3714
NIP:  b7fc3714 LR: b7fc3714 CTR: 0003
REGS: c7a0ff10 TRAP: 07d8   Not tainted  (5.13.0-dirty)
MSR:  1002d002   CR: 22000284  XER: 

GPR00: b7fc3584 b850     00a0 6474e552
GPR08: b7fbe0d4 0001 b7fff230 b850 b7fc36d8   b7fe17b4
GPR16:  b7ffd588 b7ffe8b8 b7ffee10 b7fff214 b7ffdf40 b7fff208 b858
GPR24: b970 b7fff130 0001 b960 b7fff2b0 b7ffd5f0 b7ffe8a8 b850
NIP [b7fc3714] 0xb7fc3714
LR [b7fc3714] 0xb7fc3714
--- interrupt: 7d8
Instruction dump:
93a10054 90010064 93c10058 48b95369 80c20398 3c60c0

Re: Hitting BUG_ON in do_notify_resume() with gdb and SIGTRAP

2021-07-06 Thread Christophe Leroy




Le 04/07/2021 à 23:38, Radu Rendec a écrit :

Hi Everyone,

I'm trying to set up my (virtual) environment to test an old bug in the
PPC32 ptrace() code. I came across a completely different problem,
which seems to make gdb pretty much unusable on PPC32. I'm not sure if
this is a real kernel bug or maybe something wrong with my
configuration.

I'm running kernel 5.13 in a qemu VM with one e500mc CPU. I am running
native gdb (inside the VM) and setting a breakpoint in main() in a test
"hello world" program. Upon running the test program, I am hitting the
BUG_ON in do_notify_resume() on line 292. The kernel bug log snippet is
included below at the end of the email.

FWIW, gdb says:
Program terminated with signal SIGTRAP, Trace/breakpoint trap.
The program no longer exists.

I also added a pr_info() to do_notify_resume() just to see how much
different 'regs' and 'current->thread.regs' are. Surprisingly, they are
just 0x30 apart: regs=c7955f10 cur=c7955f40. Also, 'current' seems to
be OK (pid and comm are consistent with the test program).


The TRAP = 0x7d8 is obviously wrong.

Need to know which 'TRAP' it is exactly.
Could you try to dump what we have at the correct regs ?
Something like 'show_regs(current->thread.regs)' should do it.




If I'm not missing something, the 'regs' pointer that is eventually
passed to do_notify_resume() is calculated in interrupt_return() by
adding STACK_FRAME_OVERHEAD (defined to 112) to the value of r1. That
means all registers are saved on the stack before entering the
interrupt handler and, upon returning, a pointer to the register
structure is calculated from the stack pointer. Either the offset
itself is wrong, or the stack pointer is off by 0x30.

This is as far as I have gone. Hopefully this rings a bell to someone
who is familiar with that part of the code and has a better
understanding of PPC32 interrupt handling in general.

Last but not least, my configuration is fairly standard. I'm using the
powerpc-e500mc--glibc--bleeding-edge-2020.08-1 toolchain from Bootlin
to compile everything (kernel and user-space). The qemu version is
5.2.0 (Fedora 34) and the user-space is a small Busybox based rootfs
that I built using Buildroot 2021.05. The gdb version is 9.2.

regs=c7955f10 cur=c7955f40 pid=138 comm=test
[ cut here ]
kernel BUG at arch/powerpc/kernel/signal.c:296!
Oops: Exception in kernel mode, sig: 5 [#1]
BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
Modules linked in:
CPU: 0 PID: 138 Comm: test Not tainted 5.13.0-dirty #3
NIP:  c0009694 LR: c0009694 CTR: c065f540
REGS: c7955dc0 TRAP: 0700   Not tainted  (5.13.0-dirty)
MSR:  00028002   CR: 28000282  XER: 2000

GPR00: c0009694 c7955e80 c7145100 002c dfbdc3d4 dfbe5d24 0027 dfbdc3d8
GPR08: c0ffe988    22000282   b7fe17b4
GPR16:  b7ffd588 b7ffe8b8 b7ffee10 b7fff214 b7ffdf40 b7fff208 b858
GPR24: b970 b7fff130 0001 b960 c7955f10 0800 c7145100 0102
NIP [c0009694] do_notify_resume+0x314/0x320
LR [c0009694] do_notify_resume+0x314/0x320
Call Trace:
[c7955e80] [c0009694] do_notify_resume+0x314/0x320 (unreliable)
[c7955ee0] [c0010b94] interrupt_exit_user_prepare+0x94/0xc0
[c7955f00] [c00151e8] interrupt_return+0x14/0x13c
--- interrupt: 7d8 at 0xb7fc3714
NIP:  b7fc3714 LR: b7fc3714 CTR: 0003
REGS: c7955f10 TRAP: 07d8   Not tainted  (5.13.0-dirty)
MSR:  1002d002   CR: 22000284  XER: 

GPR00: b7fc3584 b850     00a0 6474e552
GPR08: b7fbe0d4 0001 b7fff230 b850 b7fc36d8   b7fe17b4
GPR16:  b7ffd588 b7ffe8b8 b7ffee10 b7fff214 b7ffdf40 b7fff208 b858
GPR24: b970 b7fff130 0001 b960 b7fff2b0 b7ffd5f0 b7ffe8a8 b850
NIP [b7fc3714] 0xb7fc3714
LR [b7fc3714] 0xb7fc3714
--- interrupt: 7d8
Instruction dump:
4b04 7c0802a6 93a10054 90010064 93c10058 48b95369 80c20398 3c60c0dc
7f84e378 38e204b0 3863ce30 4809d819 <0fe0> 6000 6000 3d20c0ff
---[ end trace 065671519ba3d526 ]---

Note: the BUG() line is slightly different because I had made this
small change to print the pointers:

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 9ded046edb0e..57ea6e500a6f 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -289,7 +289,12 @@ void do_notify_resume(struct pt_regs *regs, unsigned long 
thread_info_flags)
klp_update_patch_state(current);
  
  	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {

-   BUG_ON(regs != current->thread.regs);
+   if (regs != current->thread.regs) {
+   pr_info("regs=%px cur=%px pid=%d comm=%s\n",
+   regs, current->thread.regs,
+   current->pid, current->comm);
+   BUG();
+   }
do_signal(current);
}
  



Re: [PATCH 4/4] bpf powerpc: Add addr > TASK_SIZE_MAX explicit check

2021-07-06 Thread Christophe Leroy




Le 06/07/2021 à 09:32, Ravi Bangoria a écrit :

On PowerPC with KUAP enabled, any kernel code which wants to
access userspace needs to be surrounded by disable-enable KUAP.
But that is not happening for BPF_PROBE_MEM load instruction.
So, when BPF program tries to access invalid userspace address,
page-fault handler considers it as bad KUAP fault:

   Kernel attempted to read user page (d000) - exploit attempt? (uid: 0)

Considering the fact that PTR_TO_BTF_ID (which uses BPF_PROBE_MEM
mode) could either be a valid kernel pointer or NULL but should
never be a pointer to userspace address, execute BPF_PROBE_MEM load
only if addr > TASK_SIZE_MAX, otherwise set dst_reg=0 and move on.

This will catch NULL, valid or invalid userspace pointers. Only bad
kernel pointer will be handled by BPF exception table.

[Alexei suggested for x86]
Suggested-by: Alexei Starovoitov 
Signed-off-by: Ravi Bangoria 
---
  arch/powerpc/net/bpf_jit_comp64.c | 38 +++
  1 file changed, 38 insertions(+)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 1884c6dca89a..46becae76210 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -753,6 +753,14 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
/* dst = *(u8 *)(ul) (src + off) */
case BPF_LDX | BPF_MEM | BPF_B:
case BPF_LDX | BPF_PROBE_MEM | BPF_B:
+   if (BPF_MODE(code) == BPF_PROBE_MEM) {
+   EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], src_reg, 
off));
+   PPC_LI64(b2p[TMP_REG_2], TASK_SIZE_MAX);
+   EMIT(PPC_RAW_CMPLD(b2p[TMP_REG_1], 
b2p[TMP_REG_2]));
+   PPC_BCC(COND_GT, (ctx->idx + 4) * 4);
+   EMIT(PPC_RAW_XOR(dst_reg, dst_reg, dst_reg));


Preferred way to clear a register is to do 'li reg, 0'


+   PPC_JMP((ctx->idx + 2) * 4);
+   }
EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off));
if (insn_is_zext([i + 1]))
addrs[++i] = ctx->idx * 4;
@@ -763,6 +771,14 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
/* dst = *(u16 *)(ul) (src + off) */
case BPF_LDX | BPF_MEM | BPF_H:
case BPF_LDX | BPF_PROBE_MEM | BPF_H:
+   if (BPF_MODE(code) == BPF_PROBE_MEM) {
+   EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], src_reg, 
off));
+   PPC_LI64(b2p[TMP_REG_2], TASK_SIZE_MAX);
+   EMIT(PPC_RAW_CMPLD(b2p[TMP_REG_1], 
b2p[TMP_REG_2]));
+   PPC_BCC(COND_GT, (ctx->idx + 4) * 4);
+   EMIT(PPC_RAW_XOR(dst_reg, dst_reg, dst_reg));
+   PPC_JMP((ctx->idx + 2) * 4);
+   }


That code seems strictly identical to the previous one and the next one.
Can you refactor it into a function?


EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off));
if (insn_is_zext([i + 1]))
addrs[++i] = ctx->idx * 4;
@@ -773,6 +789,14 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
/* dst = *(u32 *)(ul) (src + off) */
case BPF_LDX | BPF_MEM | BPF_W:
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
+   if (BPF_MODE(code) == BPF_PROBE_MEM) {
+   EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], src_reg, 
off));
+   PPC_LI64(b2p[TMP_REG_2], TASK_SIZE_MAX);
+   EMIT(PPC_RAW_CMPLD(b2p[TMP_REG_1], 
b2p[TMP_REG_2]));
+   PPC_BCC(COND_GT, (ctx->idx + 4) * 4);
+   EMIT(PPC_RAW_XOR(dst_reg, dst_reg, dst_reg));
+   PPC_JMP((ctx->idx + 2) * 4);
+   }
EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off));
if (insn_is_zext([i + 1]))
addrs[++i] = ctx->idx * 4;
@@ -783,6 +807,20 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
/* dst = *(u64 *)(ul) (src + off) */
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+   if (BPF_MODE(code) == BPF_PROBE_MEM) {
+   EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], src_reg, 
off));
+   PPC_LI64(b2p[TMP_REG_2], TASK_SIZE_MAX);
+   EMIT(PPC_RAW_CMPLD(b2p[TMP_REG_1], 
b2p[TMP_REG_2]));
+   if (off % 4)


That test is worth a comment.

And I'd prefer

if (off & 

Re: [PATCH 3/4] bpf powerpc: Add BPF_PROBE_MEM support for 64bit JIT

2021-07-06 Thread Christophe Leroy




Le 06/07/2021 à 09:32, Ravi Bangoria a écrit :

BPF load instruction with BPF_PROBE_MEM mode can cause a fault
inside kernel. Append exception table for such instructions
within BPF program.


Can you do the same for 32bit ?



Unlike other archs which uses extable 'fixup' field to pass dest_reg
and nip, BPF exception table on PowerPC follows the generic PowerPC
exception table design, where it populates both fixup and extable
sections within BPF program. fixup section contains two instructions,
first instruction clears dest_reg and 2nd jumps to next instruction
in the BPF code. extable 'insn' field contains relative offset of
the instruction and 'fixup' field contains relative offset of the
fixup entry. Example layout of BPF program with extable present:

  +--+
  |  |
  |  |
0x4020 -->| ld   r27,4(r3)   |
  |  |
  |  |
0x40ac -->| lwz  r3,0(r4)|
  |  |
  |  |
  |--|
0x4280 -->| xor r27,r27,r27  |  \ fixup entry
  | b   0x4024   |  /
0x4288 -->| xor r3,r3,r3 |
  | b   0x40b0   |
  |--|
0x4290 -->| insn=0xfd90  |  \ extable entry
  | fixup=0xffec |  /
0x4298 -->| insn=0xfe14  |
  | fixup=0xffec |
  +--+

(Addresses shown here are chosen random, not real)

Signed-off-by: Ravi Bangoria 
---
  arch/powerpc/net/bpf_jit.h|  5 ++-
  arch/powerpc/net/bpf_jit_comp.c   | 25 +
  arch/powerpc/net/bpf_jit_comp32.c |  2 +-
  arch/powerpc/net/bpf_jit_comp64.c | 60 ++-
  4 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 411c63d945c7..e9408ad190d3 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -141,8 +141,11 @@ struct codegen_context {
unsigned int idx;
unsigned int stack_size;
int b2p[ARRAY_SIZE(b2p)];
+   unsigned int exentry_idx;
  };
  
+#define BPF_FIXUP_LEN	8 /* Two instructions */

+
  static inline void bpf_flush_icache(void *start, void *end)
  {
smp_wmb();  /* smp write barrier */
@@ -166,7 +169,7 @@ static inline void bpf_clear_seen_register(struct 
codegen_context *ctx, int i)
  
  void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func);

  int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct 
codegen_context *ctx,
-  u32 *addrs);
+  u32 *addrs, int pass);
  void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
  void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
  void bpf_jit_realloc_regs(struct codegen_context *ctx);
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index a9585e52a88d..3ebd8897cf09 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -89,6 +89,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
  {
u32 proglen;
u32 alloclen;
+   u32 extable_len = 0;
+   u32 fixup_len = 0;


Setting those to 0 doesn't seem to be needed, as there doesn't seem to be any path that skips the 
setting below. You should not perform unnecessary init at declaration as it is error prone.



u8 *image = NULL;
u32 *code_base;
u32 *addrs;
@@ -131,7 +133,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
image = jit_data->image;
bpf_hdr = jit_data->header;
proglen = jit_data->proglen;
-   alloclen = proglen + FUNCTION_DESCR_SIZE;
extra_pass = true;
goto skip_init_ctx;
}
@@ -149,7 +150,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
  
  	/* Scouting faux-generate pass 0 */

-   if (bpf_jit_build_body(fp, 0, , addrs)) {
+   if (bpf_jit_build_body(fp, 0, , addrs, 0)) {
/* We hit something illegal or unsupported. */
fp = org_fp;
goto out_addrs;
@@ -162,7 +163,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 */
if (cgctx.seen & SEEN_TAILCALL) {
cgctx.idx = 0;
-   if (bpf_jit_build_body(fp, 0, , addrs)) {
+   if (bpf_jit_build_body(fp, 0, , addrs, 0)) {
fp = org_fp;
goto out_addrs;
}
@@ -177,8 +178,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
bpf_jit_build_prologue(0, );
bpf_jit_build_epilogue(0, );
  
+	fixup_len = fp->aux->num_exentries * BPF_FIXUP_LEN;

+   extable_len = fp->aux->num_exentries * sizeof(struct 

[PATCH] powerpc/non-smp: Inconditionaly call smp_mb() on switch_mm

2021-07-05 Thread Christophe Leroy
Commit 3ccfebedd8cf ("powerpc, membarrier: Skip memory barrier in
switch_mm()") added some logic to skip the smp_mb() in
switch_mm_irqs_off() before the call to switch_mmu_context().

However, on non SMP smp_mb() is just a compiler barrier and doing
it unconditionally is simpler than the logic used to check
whether the barrier is needed or not.

After the patch:

 :
...
   c:   7c 04 18 40 cmplw   r4,r3
  10:   81 24 00 24 lwz r9,36(r4)
  14:   91 25 04 c8 stw r9,1224(r5)
  18:   4d 82 00 20 beqlr
  1c:   48 00 00 00 b   1c 
1c: R_PPC_REL24 switch_mmu_context

Before the patch:

 :
...
   c:   7c 04 18 40 cmplw   r4,r3
  10:   81 24 00 24 lwz r9,36(r4)
  14:   91 25 04 c8 stw r9,1224(r5)
  18:   4d 82 00 20 beqlr
  1c:   81 24 00 28 lwz r9,40(r4)
  20:   71 29 00 0a andi.   r9,r9,10
  24:   40 82 00 34 bne 58 
  28:   48 00 00 00 b   28 
28: R_PPC_REL24 switch_mmu_context
...
  58:   2c 03 00 00 cmpwi   r3,0
  5c:   41 82 ff cc beq 28 
  60:   48 00 00 00 b   60 
60: R_PPC_REL24 switch_mmu_context

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/membarrier.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/membarrier.h 
b/arch/powerpc/include/asm/membarrier.h
index 6e20bb5c74ea..de7f79157918 100644
--- a/arch/powerpc/include/asm/membarrier.h
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -12,7 +12,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct 
*prev,
 * when switching from userspace to kernel is not needed after
 * store to rq->curr.
 */
-   if (likely(!(atomic_read(>membarrier_state) &
+   if (IS_ENABLED(CONFIG_SMP) &&
+   likely(!(atomic_read(>membarrier_state) &
 (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
  MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
return;
-- 
2.25.0



[PATCH v3 4/4] powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP

2021-07-05 Thread Christophe Leroy
This patch converts powerpc to the generic PTDUMP implementation.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig|   2 +
 arch/powerpc/Kconfig.debug  |  30 ---
 arch/powerpc/mm/Makefile|   2 +-
 arch/powerpc/mm/mmu_decl.h  |   2 +-
 arch/powerpc/mm/ptdump/Makefile |   9 ++-
 arch/powerpc/mm/ptdump/ptdump.c | 136 
 6 files changed, 46 insertions(+), 135 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 0104345d0a65..dc1ab533a1cf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -123,6 +123,7 @@ config PPC
select ARCH_HAS_COPY_MC if PPC64
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
+   select ARCH_HAS_DEBUG_WXif STRICT_KERNEL_RWX
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_DMA_MAP_DIRECT  if PPC_PSERIES
select ARCH_HAS_ELF_RANDOMIZE
@@ -182,6 +183,7 @@ config PPC
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_PCI_IOMAPif PCI
+   select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 205cd77f321f..192f0ed0097f 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -365,36 +365,6 @@ config FAIL_IOMMU
 
  If you are unsure, say N.
 
-config PPC_PTDUMP
-   bool "Export kernel pagetable layout to userspace via debugfs"
-   depends on DEBUG_KERNEL && DEBUG_FS
-   help
- This option exports the state of the kernel pagetables to a
- debugfs file. This is only useful for kernel developers who are
- working in architecture specific areas of the kernel - probably
- not a good idea to enable this feature in a production kernel.
-
- If you are unsure, say N.
-
-config PPC_DEBUG_WX
-   bool "Warn on W+X mappings at boot"
-   depends on PPC_PTDUMP && STRICT_KERNEL_RWX
-   help
- Generate a warning if any W+X mappings are found at boot.
-
- This is useful for discovering cases where the kernel is leaving
- W+X mappings after applying NX, as such mappings are a security risk.
-
- Note that even if the check fails, your kernel is possibly
- still fine, as W+X mappings are not a security hole in
- themselves, what they do is that they make the exploitation
- of other unfixed kernel bugs easier.
-
- There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
-
- If in doubt, say "Y".
-
 config PPC_FAST_ENDIAN_SWITCH
bool "Deprecated fast endian-switch syscall"
depends on DEBUG_KERNEL && PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index eae4ec2988fc..df8172da2301 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES)   += slice.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
-obj-$(CONFIG_PPC_PTDUMP)   += ptdump/
+obj-$(CONFIG_PTDUMP_CORE)  += ptdump/
 obj-$(CONFIG_KASAN)+= kasan/
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 7dac910c0b21..dd1cabc2ea0f 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { }
 void __init mmu_mapin_immr(void);
 #endif
 
-#ifdef CONFIG_PPC_DEBUG_WX
+#ifdef CONFIG_DEBUG_WX
 void ptdump_check_wx(void);
 #else
 static inline void ptdump_check_wx(void) { }
diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile
index 712762be3cb1..4050cbb55acf 100644
--- a/arch/powerpc/mm/ptdump/Makefile
+++ b/arch/powerpc/mm/ptdump/Makefile
@@ -5,5 +5,10 @@ obj-y  += ptdump.o
 obj-$(CONFIG_4xx)  += shared.o
 obj-$(CONFIG_PPC_8xx)  += 8xx.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)   += shared.o
-obj-$(CONFIG_PPC_BOOK3S_32)+= shared.o bats.o segment_regs.o
-obj-$(CONFIG_PPC_BOOK3S_64)+= book3s64.o hashpagetable.o
+obj-$(CONFIG_PPC_BOOK3S_32)+= shared.o
+obj-$(CONFIG_PPC_BOOK3S_64)+= book3s64.o
+
+ifdef CONFIG_PTDUMP_DEBUGFS
+obj-$(CONFIG_PPC_BOOK3S_32)+= bats.o segment_regs.o
+obj-$(CONFIG_PPC_BOOK3S_64)+= hashpagetable.o
+endif
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index fb531bc64fc5..8d0c724b0c18 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -54,6 +55,7 @@
  *
  */
 struct pg_state {
+   struct ptdump_

[PATCH v3 3/4] powerpc/ptdump: Reduce level numbers by 1 in note_page() and add p4d level

2021-07-05 Thread Christophe Leroy
Do the same as commit f8f0d0b6fa20 ("mm: ptdump: reduce level numbers
by 1 in note_page()") and add missing p4d level.

This will align powerpc to the users of generic ptdump.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/ptdump/8xx.c  |  6 --
 arch/powerpc/mm/ptdump/book3s64.c |  6 --
 arch/powerpc/mm/ptdump/ptdump.c   | 17 +
 arch/powerpc/mm/ptdump/shared.c   |  6 --
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c
index 86da2a669680..fac932eb8f9a 100644
--- a/arch/powerpc/mm/ptdump/8xx.c
+++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-   {
-   }, { /* pgd */
+   { /* pgd */
+   .flag   = flag_array,
+   .num= ARRAY_SIZE(flag_array),
+   }, { /* p4d */
.flag   = flag_array,
.num= ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/book3s64.c 
b/arch/powerpc/mm/ptdump/book3s64.c
index 14f73868db66..5ad92d9dc5d1 100644
--- a/arch/powerpc/mm/ptdump/book3s64.c
+++ b/arch/powerpc/mm/ptdump/book3s64.c
@@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-   {
-   }, { /* pgd */
+   { /* pgd */
+   .flag   = flag_array,
+   .num= ARRAY_SIZE(flag_array),
+   }, { /* p4d */
.flag   = flag_array,
.num= ARRAY_SIZE(flag_array),
}, { /* pud */
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 3eb8732641da..fb531bc64fc5 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -58,7 +58,7 @@ struct pg_state {
const struct addr_marker *marker;
unsigned long start_address;
unsigned long start_pa;
-   unsigned int level;
+   int level;
u64 current_flags;
bool check_wx;
unsigned long wx_pages;
@@ -188,10 +188,9 @@ static void note_prot_wx(struct pg_state *st, unsigned 
long addr)
st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
 }
 
-static void note_page_update_state(struct pg_state *st, unsigned long addr,
-  unsigned int level, u64 val)
+static void note_page_update_state(struct pg_state *st, unsigned long addr, 
int level, u64 val)
 {
-   u64 flag = val & pg_level[level].mask;
+   u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
u64 pa = val & PTE_RPN_MASK;
 
st->level = level;
@@ -206,12 +205,12 @@ static void note_page_update_state(struct pg_state *st, 
unsigned long addr,
 }
 
 static void note_page(struct pg_state *st, unsigned long addr,
-  unsigned int level, u64 val, unsigned long page_size)
+ int level, u64 val, unsigned long page_size)
 {
-   u64 flag = val & pg_level[level].mask;
+   u64 flag = level >= 0 ? val & pg_level[level].mask : 0;
 
/* At first no level is set */
-   if (!st->level) {
+   if (st->level == -1) {
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
note_page_update_state(st, addr, level, val);
/*
@@ -383,6 +382,7 @@ static int ptdump_show(struct seq_file *m, void *v)
struct pg_state st = {
.seq = m,
.marker = address_markers,
+   .level = -1,
.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : 
TASK_SIZE,
};
 
@@ -393,7 +393,7 @@ static int ptdump_show(struct seq_file *m, void *v)
 
/* Traverse kernel page tables */
walk_pagetables();
-   note_page(, 0, 0, 0, 0);
+   note_page(, 0, -1, 0, 0);
return 0;
 }
 
@@ -415,6 +415,7 @@ void ptdump_check_wx(void)
struct pg_state st = {
.seq = NULL,
.marker = address_markers,
+   .level = -1,
.check_wx = true,
.start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : 
TASK_SIZE,
};
diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c
index c005fe041c18..03607ab90c66 100644
--- a/arch/powerpc/mm/ptdump/shared.c
+++ b/arch/powerpc/mm/ptdump/shared.c
@@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = {
 };
 
 struct pgtable_level pg_level[5] = {
-   {
-   }, { /* pgd */
+   { /* pgd */
+   .flag   = flag_array,
+   .num= ARRAY_SIZE(flag_array),
+   }, { /* p4d */
.flag   = flag_array,
.num= ARRAY_SIZE(flag_array),
}, { /* pud */
-- 
2.25.0



[PATCH v3 2/4] powerpc/ptdump: Remove unused 'page_size' parameter

2021-07-05 Thread Christophe Leroy
note_page_update_state() doesn't use page_size. Remove it.

Could also be removed from note_page() but as a following patch
will remove all current users of note_page(), just leave it as
is for now.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/ptdump/ptdump.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 349fd8fe173f..3eb8732641da 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -189,7 +189,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long 
addr)
 }
 
 static void note_page_update_state(struct pg_state *st, unsigned long addr,
-  unsigned int level, u64 val, unsigned long 
page_size)
+  unsigned int level, u64 val)
 {
u64 flag = val & pg_level[level].mask;
u64 pa = val & PTE_RPN_MASK;
@@ -213,7 +213,7 @@ static void note_page(struct pg_state *st, unsigned long 
addr,
/* At first no level is set */
if (!st->level) {
pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
-   note_page_update_state(st, addr, level, val, page_size);
+   note_page_update_state(st, addr, level, val);
/*
 * Dump the section of virtual memory when:
 *   - the PTE flags from one entry to the next differs.
@@ -242,7 +242,7 @@ static void note_page(struct pg_state *st, unsigned long 
addr,
 * Address indicates we have passed the end of the
 * current section of virtual memory
 */
-   note_page_update_state(st, addr, level, val, page_size);
+   note_page_update_state(st, addr, level, val);
}
 }
 
-- 
2.25.0



[PATCH v3 1/4] powerpc/ptdump: Use DEFINE_SHOW_ATTRIBUTE()

2021-07-05 Thread Christophe Leroy
Use DEFINE_SHOW_ATTRIBUTE() instead of open coding
open() and fops.

Signed-off-by: Christophe Leroy 
---
v4: This series is following the partial merge of "Convert powerpc to 
GENERIC_PTDUMP".
Patches 1 2 and 3 of that series were merged in 5.13. This updated series is a
split of patch 4. The split helps reduce the size of the previous patch by only
keeping in it the actual conversion.

 arch/powerpc/mm/ptdump/bats.c  | 14 ++
 arch/powerpc/mm/ptdump/hashpagetable.c | 12 +---
 arch/powerpc/mm/ptdump/ptdump.c| 13 +
 arch/powerpc/mm/ptdump/segment_regs.c  | 12 +---
 4 files changed, 5 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c
index c4c628b03cf8..4ed3418f07d9 100644
--- a/arch/powerpc/mm/ptdump/bats.c
+++ b/arch/powerpc/mm/ptdump/bats.c
@@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 
lower, u32 upper, bool
 
 #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), 
mfspr(_u), _d)
 
-static int bats_show_603(struct seq_file *m, void *v)
+static int bats_show(struct seq_file *m, void *v)
 {
seq_puts(m, "---[ Instruction Block Address Translation ]---\n");
 
@@ -88,17 +88,7 @@ static int bats_show_603(struct seq_file *m, void *v)
return 0;
 }
 
-static int bats_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, bats_show_603, NULL);
-}
-
-static const struct file_operations bats_fops = {
-   .open   = bats_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bats);
 
 static int __init bats_init(void)
 {
diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c 
b/arch/powerpc/mm/ptdump/hashpagetable.c
index ad6df9a2e7c8..c7f824d294b2 100644
--- a/arch/powerpc/mm/ptdump/hashpagetable.c
+++ b/arch/powerpc/mm/ptdump/hashpagetable.c
@@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
 }
 
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
-   .open   = ptdump_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
 static int ptdump_init(void)
 {
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 5062c58b1e5b..349fd8fe173f 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -397,18 +397,7 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
 }
 
-
-static int ptdump_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
-   .open   = ptdump_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
 
 static void build_pgtable_complete_mask(void)
 {
diff --git a/arch/powerpc/mm/ptdump/segment_regs.c 
b/arch/powerpc/mm/ptdump/segment_regs.c
index 565048a0c9be..3054944d3d7e 100644
--- a/arch/powerpc/mm/ptdump/segment_regs.c
+++ b/arch/powerpc/mm/ptdump/segment_regs.c
@@ -41,17 +41,7 @@ static int sr_show(struct seq_file *m, void *v)
return 0;
 }
 
-static int sr_open(struct inode *inode, struct file *file)
-{
-   return single_open(file, sr_show, NULL);
-}
-
-static const struct file_operations sr_fops = {
-   .open   = sr_open,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(sr);
 
 static int __init sr_init(void)
 {
-- 
2.25.0



Re: [RFC PATCH] powerpc: flexible register range save/restore macros

2021-07-04 Thread Christophe Leroy




Le 03/07/2021 à 11:14, Nicholas Piggin a écrit :

Introduce macros that operate on a (start, end) range of registers,
which reduces lines of code and need to do mental arithmetic while
reading the code.


Looks like a nice patch.

Maybe you could split the patch in two parts, one part for GPRs and one patch 
for the FP/VR regs.

Christophe



Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/boot/crt0.S  | 31 ---
  arch/powerpc/crypto/md5-asm.S | 10 +--
  arch/powerpc/crypto/sha1-powerpc-asm.S|  6 +-
  arch/powerpc/include/asm/ppc_asm.h| 81 +--
  arch/powerpc/kernel/cpu_setup_6xx.S   |  2 +-
  arch/powerpc/kernel/entry_32.S| 23 +++---
  arch/powerpc/kernel/exceptions-64e.S  | 14 +---
  arch/powerpc/kernel/exceptions-64s.S  |  6 +-
  arch/powerpc/kernel/fpu.S | 28 +++
  arch/powerpc/kernel/head_32.h |  3 +-
  arch/powerpc/kernel/head_booke.h  |  3 +-
  arch/powerpc/kernel/interrupt_64.S| 34 +++-
  arch/powerpc/kernel/optprobes_head.S  |  4 +-
  arch/powerpc/kernel/tm.S  | 47 +--
  .../powerpc/kernel/trace/ftrace_64_mprofile.S | 14 ++--
  arch/powerpc/kernel/vector.S  |  8 +-
  arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  5 +-
  .../lib/test_emulate_step_exec_instr.S|  8 +-
  18 files changed, 140 insertions(+), 187 deletions(-)

diff --git a/arch/powerpc/boot/crt0.S b/arch/powerpc/boot/crt0.S
index 1d83966f5ef6..349279ba8ce7 100644
--- a/arch/powerpc/boot/crt0.S
+++ b/arch/powerpc/boot/crt0.S
@@ -226,16 +226,19 @@ p_base:   mflrr10 /* r10 now points to 
runtime addr of p_base */
  #ifdef __powerpc64__
  
  #define PROM_FRAME_SIZE 512

-#define SAVE_GPR(n, base)   std n,8*(n)(base)
-#define REST_GPR(n, base)   ld  n,8*(n)(base)
-#define SAVE_2GPRS(n, base) SAVE_GPR(n, base); SAVE_GPR(n+1, base)
-#define SAVE_4GPRS(n, base) SAVE_2GPRS(n, base); SAVE_2GPRS(n+2, base)
-#define SAVE_8GPRS(n, base) SAVE_4GPRS(n, base); SAVE_4GPRS(n+4, base)
-#define SAVE_10GPRS(n, base)SAVE_8GPRS(n, base); SAVE_2GPRS(n+8, base)
-#define REST_2GPRS(n, base) REST_GPR(n, base); REST_GPR(n+1, base)
-#define REST_4GPRS(n, base) REST_2GPRS(n, base); REST_2GPRS(n+2, base)
-#define REST_8GPRS(n, base) REST_4GPRS(n, base); REST_4GPRS(n+4, base)
-#define REST_10GPRS(n, base)REST_8GPRS(n, base); REST_2GPRS(n+8, base)
+
+.macro OP_REGS op, width, start, end, base, offset
+   .Lreg=\start
+   .rept (\end - \start + 1)
+   \op .Lreg,\offset+\width*.Lreg(\base)
+   .Lreg=.Lreg+1
+   .endr
+.endm
+
+#define SAVE_GPRS(start, end, base)OP_REGS std, 8, start, end, base, 0
+#define SAVE_GPR(n, base)  SAVE_GPRS(n, n, base)
+#define REST_GPRS(start, end, base)OP_REGS ld, 8, start, end, base, 0
+#define REST_GPR(n, base)  REST_GPRS(n, n, base)
  
  /* prom handles the jump into and return from firmware.  The prom args pointer

 is loaded in r3. */
@@ -246,9 +249,7 @@ prom:
stdur1,-PROM_FRAME_SIZE(r1) /* Save SP and create stack space */
  
  	SAVE_GPR(2, r1)

-   SAVE_GPR(13, r1)
-   SAVE_8GPRS(14, r1)
-   SAVE_10GPRS(22, r1)
+   SAVE_GPRS(13, 31, r1)
mfcrr10
std r10,8*32(r1)
mfmsr   r10
@@ -283,9 +284,7 @@ prom:
  
  	/* Restore other registers */

REST_GPR(2, r1)
-   REST_GPR(13, r1)
-   REST_8GPRS(14, r1)
-   REST_10GPRS(22, r1)
+   REST_GPRS(13, 31, r1)
ld  r10,8*32(r1)
mtcrr10
  
diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S

index 948d100a2934..8f335a3f8430 100644
--- a/arch/powerpc/crypto/md5-asm.S
+++ b/arch/powerpc/crypto/md5-asm.S
@@ -38,15 +38,11 @@
  
  #define INITIALIZE \

PPC_STLU r1,-INT_FRAME_SIZE(r1); \
-   SAVE_8GPRS(14, r1); /* push registers onto stack*/ \
-   SAVE_4GPRS(22, r1);\
-   SAVE_GPR(26, r1)
+   SAVE_GPRS(14, 26, r1)   /* push registers onto stack*/
  
  #define FINALIZE \

-   REST_8GPRS(14, r1); /* pop registers from stack */ \
-   REST_4GPRS(22, r1);\
-   REST_GPR(26, r1);  \
-   addir1,r1,INT_FRAME_SIZE;
+   REST_GPRS(14, 26, r1);  /* pop registers from stack */
+   addir1,r1,INT_FRAME_SIZE
  
  #ifdef __BIG_ENDIAN__

  #define LOAD_DATA(reg, off) \
diff --git a/arch/powerpc/crypto/sha1-powerpc-asm.S 
b/arch/powerpc/crypto/sha1-powerpc-asm.S
index 23e248beff71..f0d5ed557ab1 100644
--- a/arch/powerpc/crypto/sha1-powerpc-asm.S
+++ b/arch/powerpc/crypto/sha1-powerpc-asm.S
@@ -125,8 +125,7 @@
  
  _GLOBAL(powerpc_sha_transform)

PPC_STLU 

Re: [PATCH] powerpc/mm: Fix lockup on kernel exec fault

2021-07-01 Thread Christophe Leroy




Le 02/07/2021 à 03:25, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of July 1, 2021 9:17 pm:

The powerpc kernel is not prepared to handle exec faults from kernel.
Especially, the function is_exec_fault() will return 'false' when an
exec fault is taken by kernel, because the check is based on reading
current->thread.regs->trap which contains the trap from user.

For instance, when provoking a LKDTM EXEC_USERSPACE test,
current->thread.regs->trap is set to SYSCALL trap (0xc00), and
the fault taken by the kernel is not seen as an exec fault by
set_access_flags_filter().

Commit d7df2443cd5f ("powerpc/mm: Fix spurrious segfaults on radix
with autonuma") made it clear and handled it properly. But later on
commit d3ca587404b3 ("powerpc/mm: Fix reporting of kernel execute
faults") removed that handling, introducing test based on error_code.
And here is the problem, because on the 603 all upper bits of SRR1
get cleared when the TLB instruction miss handler bails out to ISI.


So the problem is 603 doesn't see the DSISR_NOEXEC_OR_G bit?


In a way yes. But the problem is also that the kernel doesn't see it as an exec fault in 
set_access_flags_filter() as explained above. If it could see it as an exec fault, it would set 
PAGE_EXEC and it would work (or maybe not because it seems it also checks for the dirtiness of the 
page, and here the page is also flagged as dirty).


603 will see DSISR_NOEXEC_OR_G if it's an access to a page which is in a 
segment flagged NX.



I don't see the problem with this for 64s, I don't think anything sane
can be done for any 0x400 interrupt in the kernel so it's probably
good to catch all here just in case. For 64s,

Acked-by: Nicholas Piggin 

Why is 32s clearing those top bits? And it seems to be setting DSISR
that AFAIKS it does not use. Seems like it would be good to add a
NOEXEC_OR_G bit into SRR1.


Probably for simplicity.

When taking the Instruction TLB Miss interrupt, SRR1 contains CR0 fields in bits 0-3 and some 
dedicated info in bits 12-15. That doesn't match SRR1 bits for ISI, so before falling back to the 
ISI handler, ITLB Miss handler error path clears upper SRR1 bits.


Maybe it could instead try to set the right bits, but it would make it more complicated because the 
error path can be taken for the following reasons:

- No page table
- Not PAGE_PRESENT
- Not PAGE_ACCESSED
- Not PAGE_EXEC
- Below TASK_SIZE and not PAGE_USER

At the time being the verification of the flags is done with a single 'andc' operation. If we wanted 
to set the proper bits, it would mean testing the flags separately, which would impact performance 
on the no-error path.


Or maybe it would be good enough to set the PROTFAULT bit in all cases but the lack of page table. 
The 8xx sets PROTFAULT when hitting non-exec pages, so the kernel is prepared for it anyway. Not 
sure about the lack of PAGE_PRESENT thought. The 8xx sets NOHPTE bit when PAGE_PRESENT is cleared.


But is it really worth doing ?

Christophe


Re: [PATCH 2/2] powerpc/bpf: Reject atomic ops in ppc32 JIT

2021-07-01 Thread Christophe Leroy




Le 01/07/2021 à 17:08, Naveen N. Rao a écrit :

Commit 91c960b0056672 ("bpf: Rename BPF_XADD and prepare to encode other
atomics in .imm") converted BPF_XADD to BPF_ATOMIC and updated all JIT
implementations to reject JIT'ing instructions with an immediate value
different from BPF_ADD. However, ppc32 BPF JIT was implemented around
the same time and didn't include the same change. Update the ppc32 JIT
accordingly.

Signed-off-by: Naveen N. Rao 


Shouldn't it also include a Fixes tag and stable Cc as PPC32 eBPF was added in 
5.13 ?

Fixes: 51c66ad849a7 ("powerpc/bpf: Implement extended BPF on PPC32")
Cc: sta...@vger.kernel.org


---
  arch/powerpc/net/bpf_jit_comp32.c | 14 +++---
  1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
b/arch/powerpc/net/bpf_jit_comp32.c
index cbe5b399ed869d..91c990335a16c9 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -773,9 +773,17 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
break;
  
  		/*

-* BPF_STX XADD (atomic_add)
+* BPF_STX ATOMIC (atomic ops)
 */
-   case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src 
*/
+   case BPF_STX | BPF_ATOMIC | BPF_W:
+   if (imm != BPF_ADD) {
+   pr_err_ratelimited(
+   "eBPF filter atomic op code %02x (@%d) 
unsupported\n", code, i);
+   return -ENOTSUPP;
+   }
+
+   /* *(u32 *)(dst + off) += src */
+
bpf_set_seen_register(ctx, tmp_reg);
/* Get offset into TMP_REG */
EMIT(PPC_RAW_LI(tmp_reg, off));
@@ -789,7 +797,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
PPC_BCC_SHORT(COND_NE, (ctx->idx - 3) * 4);
break;
  
-		case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */

+   case BPF_STX | BPF_ATOMIC | BPF_DW: /* *(u64 *)(dst + off) += 
src */
return -EOPNOTSUPP;
  
  		/*




Re: [PATCH] sched: Use WARN_ON

2021-07-01 Thread Christophe Leroy




Le 01/07/2021 à 14:50, Jason Wang a écrit :

The BUG_ON macro simplifies the if condition followed by BUG, but it
will lead to the kernel crashing. Therefore, we can try using WARN_ON
instead of if condition followed by BUG.


But are you sure it is ok to continue if spu_acquire(ctx) returned false ?
Shouldn't there be at least some fallback handling ?

Something like:

if (WARN_ON(spu_acquire(ctx)))
return;


Christophe




Signed-off-by: Jason Wang 
---
  arch/powerpc/platforms/cell/spufs/sched.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c 
b/arch/powerpc/platforms/cell/spufs/sched.c
index 369206489895..0f218d9e5733 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -904,8 +904,8 @@ static noinline void spusched_tick(struct spu_context *ctx)
struct spu_context *new = NULL;
struct spu *spu = NULL;
  
-	if (spu_acquire(ctx))

-   BUG();  /* a kernel thread never has signals pending */
+   /* a kernel thread never has signals pending */
+   WARN_ON(spu_acquire(ctx));
  
  	if (ctx->state != SPU_STATE_RUNNABLE)

goto out;



[PATCH] powerpc/mm: Fix lockup on kernel exec fault

2021-07-01 Thread Christophe Leroy
The powerpc kernel is not prepared to handle exec faults from kernel.
Especially, the function is_exec_fault() will return 'false' when an
exec fault is taken by kernel, because the check is based on reading
current->thread.regs->trap which contains the trap from user.

For instance, when provoking a LKDTM EXEC_USERSPACE test,
current->thread.regs->trap is set to SYSCALL trap (0xc00), and
the fault taken by the kernel is not seen as an exec fault by
set_access_flags_filter().

Commit d7df2443cd5f ("powerpc/mm: Fix spurrious segfaults on radix
with autonuma") made it clear and handled it properly. But later on
commit d3ca587404b3 ("powerpc/mm: Fix reporting of kernel execute
faults") removed that handling, introducing test based on error_code.
And here is the problem, because on the 603 all upper bits of SRR1
get cleared when the TLB instruction miss handler bails out to ISI.

Until commit cbd7e6ca0210 ("powerpc/fault: Avoid heavy
search_exception_tables() verification"), an exec fault from kernel
at a userspace address was indirectly caught by the lack of entry for
that address in the exception tables. But after that commit the
kernel mainly rely on KUAP or on core mm handling to catch wrong
user accesses. Here the access is not wrong, so mm handles it.
It is a minor fault because PAGE_EXEC is not set,
set_access_flags_filter() should set PAGE_EXEC and voila.
But as is_exec_fault() returns false as explained in the beginning,
set_access_flags_filter() bails out without setting PAGE_EXEC flag,
which leads to a forever minor exec fault.

As the kernel is not prepared to handle such exec faults, the thing
to do is to fire in bad_kernel_fault() for any exec fault taken by
the kernel, as it was prior to commit d3ca587404b3.

Fixes: d3ca587404b3 ("powerpc/mm: Fix reporting of kernel execute faults")
Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/fault.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 34f641d4a2fe..a8d0ce85d39a 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -199,9 +199,7 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned 
long error_code,
 {
int is_exec = TRAP(regs) == INTERRUPT_INST_STORAGE;
 
-   /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on 
others */
-   if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT |
- DSISR_PROTFAULT))) {
+   if (is_exec) {
pr_crit_ratelimited("kernel tried to execute %s page (%lx) - 
exploit attempt? (uid: %d)\n",
address >= TASK_SIZE ? "exec-protected" : 
"user",
address,
-- 
2.25.0



Re: [PATCH v3 3/9] powerpc/64e: remove implicit soft-masking and interrupt exit restart logic

2021-06-30 Thread Christophe Leroy




Le 30/06/2021 à 09:46, Nicholas Piggin a écrit :

The implicit soft-masking to speed up interrupt return was going to be
used by 64e as well, but it has not been extensively tested on that
platform and is not considered ready. It was intended to be disabled
before merge. Disable it for now.

Most of the restart code is common with 64s, so with more correctness
and performance testing this could be re-enabled again by adding the
extra soft-mask checks to interrupt handlers and flipping
exit_must_hard_disable().

Fixes: 9d1988ca87dd ("powerpc/64: treat low kernel text as irqs soft-masked")
Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/include/asm/interrupt.h | 33 
  arch/powerpc/kernel/exceptions-64e.S | 12 +-
  arch/powerpc/kernel/interrupt.c  |  2 +-
  arch/powerpc/kernel/interrupt_64.S   | 16 --
  4 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index 8b4b1e84e110..f13c93b033c7 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -73,20 +73,34 @@
  #include 
  #include 
  
-#ifdef CONFIG_PPC64

+#ifdef CONFIG_PPC_BOOK3S_64


Can we avoid that ifdef and use IS_ENABLED(CONFIG_PPC_BOOK3S_64) below ?


  extern char __end_soft_masked[];
  unsigned long search_kernel_restart_table(unsigned long addr);
-#endif
  
-#ifdef CONFIG_PPC_BOOK3S_64

  DECLARE_STATIC_KEY_FALSE(interrupt_exit_not_reentrant);
  
+static inline bool is_implicit_soft_masked(struct pt_regs *regs)

+{
+   if (regs->msr & MSR_PR)
+   return false;
+
+   if (regs->nip >= (unsigned long)__end_soft_masked)
+   return false;
+
+   return true;
+}
+
  static inline void srr_regs_clobbered(void)
  {
local_paca->srr_valid = 0;
local_paca->hsrr_valid = 0;
  }
  #else
+static inline bool is_implicit_soft_masked(struct pt_regs *regs)
+{
+   return false;
+}
+
  static inline void srr_regs_clobbered(void)
  {
  }
@@ -150,11 +164,13 @@ static inline void interrupt_enter_prepare(struct pt_regs 
*regs, struct interrup
 */
if (TRAP(regs) != INTERRUPT_PROGRAM) {
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
-   BUG_ON(regs->nip < (unsigned long)__end_soft_masked);
+   BUG_ON(is_implicit_soft_masked(regs));
}
+#ifdef CONFIG_PPC_BOOK3S


Although we are already in a PPC64 section, wouldn't it be better to use 
CONFIG_PPC_BOOK3S_64 ?

Can we use IS_ENABLED(CONFIG_PPC_BOOK3S_64) instead ?


/* Move this under a debugging check */
if (arch_irq_disabled_regs(regs))
BUG_ON(search_kernel_restart_table(regs->nip));
+#endif
}
  #endif
  
@@ -244,10 +260,9 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte

local_paca->irq_soft_mask = IRQS_ALL_DISABLED;
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
  
-	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !(regs->msr & MSR_PR) &&

-   regs->nip < (unsigned long)__end_soft_masked) {
-   // Kernel code running below __end_soft_masked is
-   // implicitly soft-masked.
+   if (is_implicit_soft_masked(regs)) {
+   // Adjust regs->softe soft implicit soft-mask, so
+   // arch_irq_disabled_regs(regs) behaves as expected.
regs->softe = IRQS_ALL_DISABLED;
}
  
@@ -282,6 +297,7 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter

 */
  
  #ifdef CONFIG_PPC64

+#ifdef CONFIG_PPC_BOOK3S


IS_ENABLED(CONFIG_PPC_BOOK3S_64) instead ?


if (arch_irq_disabled_regs(regs)) {
unsigned long rst = search_kernel_restart_table(regs->nip);
if (rst)
@@ -289,7 +305,6 @@ static inline void interrupt_nmi_exit_prepare(struct 
pt_regs *regs, struct inter
}
  #endif
  
-#ifdef CONFIG_PPC64

if (nmi_disables_ftrace(regs))
this_cpu_set_ftrace_enabled(state->ftrace_enabled);
  
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S

index d634bfceed2c..1401787b0b93 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -342,17 +342,7 @@ ret_from_mc_except:
  #define PROLOG_ADDITION_MASKABLE_GEN(n)   
\
lbz r10,PACAIRQSOFTMASK(r13);   /* are irqs soft-masked? */ \
andi.   r10,r10,IRQS_DISABLED;  /* yes -> go out of line */ \
-   bne masked_interrupt_book3e_##n;\
-   /* Kernel code below __end_soft_masked is implicitly masked */  \
-   andi.   r10,r11,MSR_PR; \
-   bne 1f; /* user -> not masked */ \
-   std 

[PATCH v2] powerpc/4xx: Fix setup_kuep() on SMP

2021-06-29 Thread Christophe Leroy
On SMP, setup_kuep() is also called from start_secondary() since
commit 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").

start_secondary() is not an __init function.

Remove the __init marker from setup_kuep() and bail out when
not called on the first CPU as the work is already done.

Reported-by: kernel test robot 
Fixes: 10248dcba120 ("powerpc/44x: Implement Kernel Userspace Exec Protection 
(KUEP)")
Fixes: 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").
Signed-off-by: Christophe Leroy 
---
v2: Add missing asm/smp.h to avoid build failure without CONFIG_SMP
---
 arch/powerpc/mm/nohash/44x.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index 7da6d1e9fc9b..e079f26b267e 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -241,8 +242,11 @@ void __init mmu_init_secondary(int cpu)
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
+   if (smp_processor_id() != boot_cpuid)
+   return;
+
if (disabled)
patch_instruction_site(__tlb_44x_kuep, 
ppc_inst(PPC_RAW_NOP()));
else
-- 
2.25.0



Re: [PATCH] powerpc/4xx: Fix setup_kuep() on SMP

2021-06-29 Thread Christophe Leroy




Le 29/06/2021 à 13:58, Michael Ellerman a écrit :

Christophe Leroy  writes:

On SMP, setup_kuep() is also called from start_secondary() since
commit 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").

start_secondary() is not an __init function.

Remove the __init marker from setup_kuep() and bail out when
not called on the first CPU as the work is already done.

Reported-by: kernel test robot 
Fixes: 10248dcba120 ("powerpc/44x: Implement Kernel Userspace Exec Protection 
(KUEP)")
Fixes: 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").
Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/nohash/44x.c | 5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index 7da6d1e9fc9b..20c18bd5b9a0 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -241,8 +241,11 @@ void __init mmu_init_secondary(int cpu)
  #endif /* CONFIG_SMP */
  
  #ifdef CONFIG_PPC_KUEP

-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
  {
+   if (smp_processor_id() != boot_cpuid)
+   return;
+
if (disabled)
patch_instruction_site(__tlb_44x_kuep, 
ppc_inst(PPC_RAW_NOP()));
else


Building ppc44x_defconfig gives me:

   /linux/arch/powerpc/mm/nohash/44x.c: In function 'setup_kuep':
   /linux/arch/powerpc/mm/nohash/44x.c:246:35: error: 'boot_cpuid' undeclared 
(first use in this function); did you mean 'boot_cpu_init'?
 246 | if (smp_processor_id() != boot_cpuid)
 |   ^~
 |   boot_cpu_init
   /linux/arch/powerpc/mm/nohash/44x.c:246:35: note: each undeclared identifier 
is reported only once for each function it appears in



Seems like we need  when we don't have CONFIG_SMP.

I tested it with akebono_defconfig, looks like it has CONFIG_SMP.


[PATCH] powerpc/4xx: Fix setup_kuep() on SMP

2021-06-28 Thread Christophe Leroy
On SMP, setup_kuep() is also called from start_secondary() since
commit 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").

start_secondary() is not an __init function.

Remove the __init marker from setup_kuep() and bail out when
not called on the first CPU as the work is already done.

Reported-by: kernel test robot 
Fixes: 10248dcba120 ("powerpc/44x: Implement Kernel Userspace Exec Protection 
(KUEP)")
Fixes: 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/44x.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c
index 7da6d1e9fc9b..20c18bd5b9a0 100644
--- a/arch/powerpc/mm/nohash/44x.c
+++ b/arch/powerpc/mm/nohash/44x.c
@@ -241,8 +241,11 @@ void __init mmu_init_secondary(int cpu)
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
+   if (smp_processor_id() != boot_cpuid)
+   return;
+
if (disabled)
patch_instruction_site(__tlb_44x_kuep, 
ppc_inst(PPC_RAW_NOP()));
else
-- 
2.25.0



[PATCH] powerpc/32s: Fix setup_{kuap/kuep}() on SMP

2021-06-28 Thread Christophe Leroy
On SMP, setup_kup() is also called from start_secondary().

start_secondary() is not an __init function.

Remove the __init marker from setup_kuep() and setup_kuap().

Reported-by: kernel test robot 
Fixes: 86f46f343272 ("powerpc/32s: Initialise KUAP and KUEP in C").
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/book3s32/kuap.c | 2 +-
 arch/powerpc/mm/book3s32/kuep.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/kuap.c b/arch/powerpc/mm/book3s32/kuap.c
index 9df6911b8fde..0f920f09af57 100644
--- a/arch/powerpc/mm/book3s32/kuap.c
+++ b/arch/powerpc/mm/book3s32/kuap.c
@@ -18,7 +18,7 @@ void kuap_unlock_all_ool(void)
 }
 EXPORT_SYMBOL(kuap_unlock_all_ool);
 
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
if (!disabled)
kuap_lock_all_ool();
diff --git a/arch/powerpc/mm/book3s32/kuep.c b/arch/powerpc/mm/book3s32/kuep.c
index 3f6eb6e23fca..c20733d6e02c 100644
--- a/arch/powerpc/mm/book3s32/kuep.c
+++ b/arch/powerpc/mm/book3s32/kuep.c
@@ -5,7 +5,7 @@
 
 struct static_key_false disable_kuep_key;
 
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
if (!disabled)
kuep_lock();
-- 
2.25.0



Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables

2021-06-28 Thread Christophe Leroy




Le 28/06/2021 à 08:03, Aneesh Kumar K.V a écrit :

Christophe Leroy  writes:


Pagewalk ignores hugepd entries and walks down the tables
as if they were traditional entries, leading to crazy results.


But we do handle hugetlb separately

if (vma && is_vm_hugetlb_page(vma)) {
if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);

Are we using hugepd format for non hugetlb entries?


Yes, on the 8xx we use hugepd for 8M pages for linear mapping and for kasan shadow mapping (See 
commit bb5f33c06940 ("Merge "Use hugepages to map kernel mem on 8xx" into next")


And I'm working on implementing huge VMAP with 8M pages, that will also make 
use of hugepd.





Add walk_hugepd_range() and use it to walk hugepage tables.

Signed-off-by: Christophe Leroy 
Reviewed-by: Steven Price 
---
v3:
- Rebased on next-20210624 (no change since v2)
- Added Steven's Reviewed-by
- Sent as standalone for merge via mm

v2:
- Add a guard for NULL ops->pte_entry
- Take mm->page_table_lock when walking hugepage table, as suggested by 
follow_huge_pd()
---
  mm/pagewalk.c | 58 ++-
  1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..9b3db11a4d1d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
unsigned long end,
return err;
  }
  
+#ifdef CONFIG_ARCH_HAS_HUGEPD

+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   int err = 0;
+   const struct mm_walk_ops *ops = walk->ops;
+   int shift = hugepd_shift(*phpd);
+   int page_size = 1 << shift;
+
+   if (!ops->pte_entry)
+   return 0;
+
+   if (addr & (page_size - 1))
+   return 0;
+
+   for (;;) {
+   pte_t *pte;
+
+   spin_lock(>mm->page_table_lock);
+   pte = hugepte_offset(*phpd, addr, pdshift);
+   err = ops->pte_entry(pte, addr, addr + page_size, walk);
+   spin_unlock(>mm->page_table_lock);
+
+   if (err)
+   break;
+   if (addr >= end - page_size)
+   break;
+   addr += page_size;
+   }
+   return err;
+}
+#else
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   return 0;
+}
+#endif
+
  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
  {
@@ -108,7 +147,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, 
unsigned long end,
goto again;
}
  
-		err = walk_pte_range(pmd, addr, next, walk);

+   if (is_hugepd(__hugepd(pmd_val(*pmd
+   err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
walk, PMD_SHIFT);
+   else
+   err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, 
unsigned long end,
if (pud_none(*pud))
goto again;
  
-		err = walk_pmd_range(pud, addr, next, walk);

+   if (is_hugepd(__hugepd(pud_val(*pud
+   err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
+   else
+   err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, 
unsigned long end,
if (err)
break;
}
-   if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+   if (is_hugepd(__hugepd(p4d_val(*p4d
+   err = walk_hugepd_range((hugepd_t *)p4d, addr, next, 
walk, P4D_SHIFT);
+   else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long 
end,
if (err)
break;
}
-   if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
-   ops->pte_entry)
+   if (is_hugepd(__hugepd(pgd_val(*pg

Re: [PATCH v3] mm: pagewalk: Fix walk for hugepage tables

2021-06-27 Thread Christophe Leroy




Le 28/06/2021 à 03:12, Andrew Morton a écrit :

On Fri, 25 Jun 2021 05:10:12 + (UTC) Christophe Leroy 
 wrote:


Pagewalk ignores hugepd entries and walks down the tables
as if they were traditional entries, leading to crazy results.

Add walk_hugepd_range() and use it to walk hugepage tables.


More details, please?  I assume "crazy result" is userspace visible?
For how long has this bug existed?  Is a -stable backport needed?  Has
a Fixes: commit been identified?  etcetera!



I discovered the problem while porting powerpc to generic page table dump.
The generic page table dump uses walk_page_range_novma() .

Yes, "crazy result" is that when dumping /sys/kernel/debug/kernel_page_tables, you get random 
entries because at the time being the pagewalk code sees huge page directories as standard page tables.


The bug has always existed as far as I can see, but as no other architectures than powerpc use huge 
page directories, it only pops up now when powerpc is trying to use that generic page walking code.


So I don't think it is worth a backport to -stable, and about a Fixes: tag I 
don't know.

IIUC, hugepd was introduced for the first time in mm by commit cbd34da7dc9a ("mm: move the powerpc 
hugepd code to mm/gup.c")


Before that, hugepd was internal to powerpc.

I guess you are asking about Fixes: tag and backporting because of the patch 
subject.
Should I reword the patch subject to something like "mm: enable the generic page walk code to walk 
huge page directories" ?


[PATCH] powerpc: Remove in_kernel_text()

2021-06-27 Thread Christophe Leroy
Last user of in_kernel_text() stopped using it with
commit 549e8152de80 ("powerpc: Make the 64-bit kernel as a
position-independent executable").

Generic function is_kernel_text() does the same.

So remove it.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/sections.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/include/asm/sections.h 
b/arch/powerpc/include/asm/sections.h
index 324d7b298ec3..6e4af4492a14 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -38,14 +38,6 @@ extern char start_virt_trampolines[];
 extern char end_virt_trampolines[];
 #endif
 
-static inline int in_kernel_text(unsigned long addr)
-{
-   if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end)
-   return 1;
-
-   return 0;
-}
-
 static inline unsigned long kernel_toc_addr(void)
 {
/* Defined by the linker, see vmlinux.lds.S */
-- 
2.25.0



[PATCH] powerpc/interrupt: Use names in check_return_regs_valid()

2021-06-25 Thread Christophe Leroy
trap->regs == 0x3000 is trap_is_scv()

trap 0x500 is INTERRUPT_EXTERNAL

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/interrupt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 1b4a99ecb7e5..0052702ee5ac 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -222,12 +222,12 @@ static void check_return_regs_valid(struct pt_regs *regs)
u8 *validp;
char *h;
 
-   if (regs->trap == 0x3000)
+   if (trap_is_scv(regs))
return;
 
trap = regs->trap;
// EE in HV mode sets HSRRs like 0xea0
-   if (cpu_has_feature(CPU_FTR_HVMODE) && trap == 0x500)
+   if (cpu_has_feature(CPU_FTR_HVMODE) && trap == INTERRUPT_EXTERNAL)
trap = 0xea0;
 
switch (trap) {
-- 
2.25.0



Re: [PATCH 1/2] powerpc/bug: Remove specific powerpc BUG_ON() and WARN_ON() on PPC32

2021-06-25 Thread Christophe Leroy

Hi Michael,

What happened to this series ? It has been flagged 'under review' in Patchwork since mid April but I 
never saw it in next-test.


Thanks
Christophe

Le 12/04/2021 à 18:26, Christophe Leroy a écrit :

powerpc BUG_ON() and WARN_ON() are based on using twnei instruction.

For catching simple conditions like a variable having value 0, this
is efficient because it does the test and the trap at the same time.
But most conditions used with BUG_ON or WARN_ON are more complex and
forces GCC to format the condition into a 0 or 1 value in a register.
This will usually require 2 to 3 instructions.

The most efficient solution would be to use __builtin_trap() because
GCC is able to optimise the use of the different trap instructions
based on the requested condition, but this is complex if not
impossible for the following reasons:
- __builtin_trap() is a non-recoverable instruction, so it can't be
used for WARN_ON
- Knowing which line of code generated the trap would require the
analysis of DWARF information. This is not a feature we have today.

As mentioned in commit 8d4fbcfbe0a4 ("Fix WARN_ON() on bitfield ops")
the way WARN_ON() is implemented is suboptimal. That commit also
mentions an issue with 'long long' condition. It fixed it for
WARN_ON() but the same problem still exists today with BUG_ON() on
PPC32. It will be fixed by using the generic implementation.

By using the generic implementation, gcc will naturally generate a
branch to the unconditional trap generated by BUG().

As modern powerpc implement zero-cycle branch,
that's even more efficient.

And for the functions using WARN_ON() and its return, the test
on return from WARN_ON() is now also used for the WARN_ON() itself.

On PPC64 we don't want it because we want to be able to use CFAR
register to track how we entered the code that trapped. The CFAR
register would be clobbered by the branch.

A simple test function:

unsigned long test9w(unsigned long a, unsigned long b)
{
if (WARN_ON(!b))
return 0;
return a / b;
}

Before the patch:

046c :
 46c:   7c 89 00 34 cntlzw  r9,r4
 470:   55 29 d9 7e rlwinm  r9,r9,27,5,31
 474:   0f 09 00 00 twnei   r9,0
 478:   2c 04 00 00 cmpwi   r4,0
 47c:   41 82 00 0c beq 488 
 480:   7c 63 23 96 divwu   r3,r3,r4
 484:   4e 80 00 20 blr

 488:   38 60 00 00 li  r3,0
 48c:   4e 80 00 20 blr

After the patch:

0468 :
 468:   2c 04 00 00 cmpwi   r4,0
 46c:   41 82 00 0c beq 478 
 470:   7c 63 23 96 divwu   r3,r3,r4
 474:   4e 80 00 20 blr

 478:   0f e0 00 00 twuir0,0
 47c:   38 60 00 00 li  r3,0
 480:   4e 80 00 20 blr

So we see before the patch we need 3 instructions on the likely path
to handle the WARN_ON(). With the patch the trap goes on the unlikely
path.

See below the difference at the entry of system_call_exception where
we have several BUG_ON(), although less impressive.

With the patch:

 :
   0:   81 6a 00 84 lwz r11,132(r10)
   4:   90 6a 00 88 stw r3,136(r10)
   8:   71 60 00 02 andi.   r0,r11,2
   c:   41 82 00 70 beq 7c 
  10:   71 60 40 00 andi.   r0,r11,16384
  14:   41 82 00 6c beq 80 
  18:   71 6b 80 00 andi.   r11,r11,32768
  1c:   41 82 00 68 beq 84 
  20:   94 21 ff e0 stwur1,-32(r1)
  24:   93 e1 00 1c stw r31,28(r1)
  28:   7d 8c 42 e6 mftbr12
...
  7c:   0f e0 00 00 twuir0,0
  80:   0f e0 00 00 twuir0,0
  84:   0f e0 00 00 twuir0,0

Without the patch:

 :
   0:   94 21 ff e0 stwur1,-32(r1)
   4:   93 e1 00 1c stw r31,28(r1)
   8:   90 6a 00 88 stw r3,136(r10)
   c:   81 6a 00 84 lwz r11,132(r10)
  10:   69 60 00 02 xorir0,r11,2
  14:   54 00 ff fe rlwinm  r0,r0,31,31,31
  18:   0f 00 00 00 twnei   r0,0
  1c:   69 60 40 00 xorir0,r11,16384
  20:   54 00 97 fe rlwinm  r0,r0,18,31,31
  24:   0f 00 00 00 twnei   r0,0
  28:   69 6b 80 00 xorir11,r11,32768
  2c:   55 6b 8f fe rlwinm  r11,r11,17,31,31
  30:   0f 0b 00 00 twnei   r11,0
  34:   7d 8c 42 e6 mftbr12

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/include/asm/bug.h | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index d1635ffbb179..101dea4eec8d 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -68,7 +68,11 @@
BUG_ENTRY(&qu

[PATCH] powerpc/interrupt: Also use exit_must_hard_disable() on PPC32

2021-06-25 Thread Christophe Leroy
Reduce #ifdefs a bit by making exit_must_hard_disable() return
true on PPC32.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/interrupt.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index cee12f2fd459..1b4a99ecb7e5 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -33,10 +33,10 @@ static inline bool exit_must_hard_disable(void)
 {
return static_branch_unlikely(_exit_not_reentrant);
 }
-#elif defined(CONFIG_PPC64)
+#else
 static inline bool exit_must_hard_disable(void)
 {
-   return false;
+   return IS_ENABLED(CONFIG_PPC32);
 }
 #endif
 
@@ -56,12 +56,10 @@ static notrace __always_inline bool 
prep_irq_for_enabled_exit(bool restartable)
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
 
-#ifdef CONFIG_PPC32
-   __hard_EE_RI_disable();
-#else
if (exit_must_hard_disable() || !restartable)
__hard_EE_RI_disable();
 
+#ifdef CONFIG_PPC64
/* This pattern matches prep_irq_for_idle */
if (unlikely(lazy_irq_pending_nocheck())) {
if (exit_must_hard_disable() || !restartable) {
-- 
2.25.0



[PATCH 2/2] powerpc/ptrace: Refactor regs_set_return_{msr/ip}

2021-06-25 Thread Christophe Leroy
regs_set_return_msr() and regs_set_return_ip() have a copy
of the code of set_return_regs_changed().

Call the later instead.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/ptrace.h | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index 14b8105a1e27..3e5d470a6155 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -146,19 +146,13 @@ static inline void set_return_regs_changed(void)
 static inline void regs_set_return_ip(struct pt_regs *regs, unsigned long ip)
 {
regs->nip = ip;
-#ifdef CONFIG_PPC_BOOK3S_64
-   local_paca->hsrr_valid = 0;
-   local_paca->srr_valid = 0;
-#endif
+   set_return_regs_changed();
 }
 
 static inline void regs_set_return_msr(struct pt_regs *regs, unsigned long msr)
 {
regs->msr = msr;
-#ifdef CONFIG_PPC_BOOK3S_64
-   local_paca->hsrr_valid = 0;
-   local_paca->srr_valid = 0;
-#endif
+   set_return_regs_changed();
 }
 
 static inline void regs_add_return_ip(struct pt_regs *regs, long offset)
-- 
2.25.0



[PATCH 1/2] powerpc/ptrace: Move set_return_regs_changed() before regs_set_return_{msr/ip}

2021-06-25 Thread Christophe Leroy
regs_set_return_msr() and regs_set_return_ip() have a copy
of the code of set_return_regs_changed().

Move up set_return_regs_changed() so it can be reused by
regs_set_return_{msr/ip}

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/ptrace.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index fcf63f559344..14b8105a1e27 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -135,26 +135,26 @@ extern unsigned long profile_pc(struct pt_regs *regs);
 long do_syscall_trace_enter(struct pt_regs *regs);
 void do_syscall_trace_leave(struct pt_regs *regs);
 
-static inline void regs_set_return_ip(struct pt_regs *regs, unsigned long ip)
+static inline void set_return_regs_changed(void)
 {
-   regs->nip = ip;
 #ifdef CONFIG_PPC_BOOK3S_64
local_paca->hsrr_valid = 0;
local_paca->srr_valid = 0;
 #endif
 }
 
-static inline void regs_set_return_msr(struct pt_regs *regs, unsigned long msr)
+static inline void regs_set_return_ip(struct pt_regs *regs, unsigned long ip)
 {
-   regs->msr = msr;
+   regs->nip = ip;
 #ifdef CONFIG_PPC_BOOK3S_64
local_paca->hsrr_valid = 0;
local_paca->srr_valid = 0;
 #endif
 }
 
-static inline void set_return_regs_changed(void)
+static inline void regs_set_return_msr(struct pt_regs *regs, unsigned long msr)
 {
+   regs->msr = msr;
 #ifdef CONFIG_PPC_BOOK3S_64
local_paca->hsrr_valid = 0;
local_paca->srr_valid = 0;
-- 
2.25.0



[PATCH] powerpc/syscalls: Simplify do_mmap2()

2021-06-25 Thread Christophe Leroy
When shift is zero, operations remain valid, so no test is needed.

And 'ret' is unnecessary.

And use IS_ALIGNED() to check alignment, that's more clear.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/syscalls.c | 15 ---
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index bf4ae0f0e36c..825931e400df 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -41,20 +41,13 @@ static inline long do_mmap2(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long off, int shift)
 {
-   long ret = -EINVAL;
-
if (!arch_validate_prot(prot, addr))
-   goto out;
+   return -EINVAL;
 
-   if (shift) {
-   if (off & ((1 << shift) - 1))
-   goto out;
-   off >>= shift;
-   }
+   if (!IS_ALIGNED(off, 1 << shift))
+   return -EINVAL;
 
-   ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off);
-out:
-   return ret;
+   return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> shift);
 }
 
 SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
-- 
2.25.0



[PATCH RFC 2/2] powerpc/signal: Retire signal trampoline on stack

2021-06-25 Thread Christophe Leroy
The signal trampoline is either:
- As specified by the caller via SA_RESTORER
- In VDSO if VDSO is properly mapped
- Fallback on user stack

However, the user stack is nowadays mapped non-executable by default,
so the fallback will generate an exec fault.

All other architectures having VDSO except x86 and sh don't install
any fallback trampoline on stack.

Simplify the code by doing the same: remove the signal trampoline
on the stack. If the VDSO is not mapped, a NULL-like address will be set
and the user app will gently fault.

If a user application explicitly wants to unmap the VDSO, it can
still provide an SA_RESTORER.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c  | 26 +-
 arch/powerpc/kernel/signal_64.c  | 38 
 arch/powerpc/perf/callchain_32.c |  5 -
 arch/powerpc/perf/callchain_64.c |  2 --
 4 files changed, 14 insertions(+), 57 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index cf3da1386595..366d07cb42da 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -769,16 +769,13 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
}
 
/* Save user registers on the stack */
-   if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+   if (ksig->ka.sa.sa_flags & SA_RESTORER)
tramp = (unsigned long)ksig->ka.sa.sa_restorer;
-   } else if (tsk->mm->context.vdso) {
+   else if (tsk->mm->context.vdso)
tramp = VDSO32_SYMBOL(tsk->mm->context.vdso, sigtramp_rt32);
-   } else {
-   tramp = (unsigned long)mctx->mc_pad;
-   unsafe_put_user(PPC_RAW_LI(_R0, __NR_rt_sigreturn), 
>mc_pad[0], failed);
-   unsafe_put_user(PPC_RAW_SC(), >mc_pad[1], failed);
-   asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0]));
-   }
+   else
+   tramp = 0;
+
unsafe_put_sigset_t(>uc.uc_sigmask, oldset, failed);
 
user_access_end();
@@ -867,16 +864,13 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
else
unsafe_save_user_regs(regs, mctx, tm_mctx, 1, failed);
 
-   if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+   if (ksig->ka.sa.sa_flags & SA_RESTORER)
tramp = (unsigned long)ksig->ka.sa.sa_restorer;
-   } else if (tsk->mm->context.vdso) {
+   else if (tsk->mm->context.vdso)
tramp = VDSO32_SYMBOL(tsk->mm->context.vdso, sigtramp32);
-   } else {
-   tramp = (unsigned long)mctx->mc_pad;
-   unsafe_put_user(PPC_RAW_LI(_R0, __NR_sigreturn), 
>mc_pad[0], failed);
-   unsafe_put_user(PPC_RAW_SC(), >mc_pad[1], failed);
-   asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0]));
-   }
+   else
+   tramp = 0;
+
user_access_end();
 
regs->link = tramp;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index fb31a334aca6..39522ebd1137 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -610,32 +610,6 @@ static long restore_tm_sigcontexts(struct task_struct 
*tsk, struct sigcontext __
 }
 #endif
 
-/*
- * Setup the trampoline code on the stack
- */
-static long setup_trampoline(unsigned int syscall, unsigned int __user *tramp)
-{
-   int i;
-   long err = 0;
-
-   /* Call the handler and pop the dummy stackframe*/
-   err |= __put_user(PPC_RAW_BCTRL(), [0]);
-   err |= __put_user(PPC_RAW_ADDI(_R1, _R1, __SIGNAL_FRAMESIZE), 
[1]);
-
-   err |= __put_user(PPC_RAW_LI(_R0, syscall), [2]);
-   err |= __put_user(PPC_RAW_SC(), [3]);
-
-   /* Minimal traceback info */
-   for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
-   err |= __put_user(0, [i]);
-
-   if (!err)
-   flush_icache_range((unsigned long) [0],
-  (unsigned long) [TRAMP_SIZE]);
-
-   return err;
-}
-
 /*
  * Userspace code may pass a ucontext which doesn't include VSX added
  * at the end.  We need to check for this case.
@@ -910,16 +884,12 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t 
*set,
tsk->thread.fp_state.fpscr = 0;
 
/* Set up to return from userspace. */
-   if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+   if (ksig->ka.sa.sa_flags & SA_RESTORER)
regs_set_return_ip(regs, (unsigned 
long)ksig->ka.sa.sa_restorer);
-   } else if (tsk->mm->context.vdso) {
+   else if (tsk->mm->context.vdso)
regs_set_return_ip(regs, VDSO64_SYMBOL(tsk->mm->context.vdso, 
sigtramp_rt64));
-   } else {
-   err |= setup_trampoline(__NR_rt_sigreturn, >tramp[0]);
-   if (err)
-   

[PATCH 1/2] powerpc/signal: Fix handling of SA_RESTORER sigaction flag

2021-06-25 Thread Christophe Leroy
powerpc advertises support of SA_RESTORER sigaction flag.

Make it the truth.

Cc: sta...@vger.kernel.org
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 8 ++--
 arch/powerpc/kernel/signal_64.c | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 0608581967f0..cf3da1386595 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -769,7 +769,9 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
}
 
/* Save user registers on the stack */
-   if (tsk->mm->context.vdso) {
+   if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+   tramp = (unsigned long)ksig->ka.sa.sa_restorer;
+   } else if (tsk->mm->context.vdso) {
tramp = VDSO32_SYMBOL(tsk->mm->context.vdso, sigtramp_rt32);
} else {
tramp = (unsigned long)mctx->mc_pad;
@@ -865,7 +867,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
else
unsafe_save_user_regs(regs, mctx, tm_mctx, 1, failed);
 
-   if (tsk->mm->context.vdso) {
+   if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+   tramp = (unsigned long)ksig->ka.sa.sa_restorer;
+   } else if (tsk->mm->context.vdso) {
tramp = VDSO32_SYMBOL(tsk->mm->context.vdso, sigtramp32);
} else {
tramp = (unsigned long)mctx->mc_pad;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 1831bba0582e..fb31a334aca6 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -910,7 +910,9 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
tsk->thread.fp_state.fpscr = 0;
 
/* Set up to return from userspace. */
-   if (tsk->mm->context.vdso) {
+   if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+   regs_set_return_ip(regs, (unsigned 
long)ksig->ka.sa.sa_restorer);
+   } else if (tsk->mm->context.vdso) {
regs_set_return_ip(regs, VDSO64_SYMBOL(tsk->mm->context.vdso, 
sigtramp_rt64));
} else {
err |= setup_trampoline(__NR_rt_sigreturn, >tramp[0]);
-- 
2.25.0



Re: [PATCH] powerpc: mark local variables around longjmp as volatile

2021-06-25 Thread Christophe Leroy




Le 29/04/2021 à 10:06, Arnd Bergmann a écrit :

From: Arnd Bergmann 

gcc-11 points out that modifying local variables next to a
longjmp/setjmp may cause undefined behavior:

arch/powerpc/kexec/crash.c: In function 'crash_kexec_prepare_cpus.constprop':
arch/powerpc/kexec/crash.c:108:22: error: variable 'ncpus' might be clobbered 
by 'longjmp' or 'vfork' [-Werror=clobbere
d]
arch/powerpc/kexec/crash.c:109:13: error: variable 'tries' might be clobbered 
by 'longjmp' or 'vfork' [-Werror=clobbere
d]
arch/powerpc/xmon/xmon.c: In function 'xmon_print_symbol':
arch/powerpc/xmon/xmon.c:3625:21: error: variable 'name' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c: In function 'stop_spus':
arch/powerpc/xmon/xmon.c:4057:13: error: variable 'i' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c: In function 'restart_spus':
arch/powerpc/xmon/xmon.c:4098:13: error: variable 'i' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c: In function 'dump_opal_msglog':
arch/powerpc/xmon/xmon.c:3008:16: error: variable 'pos' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c: In function 'show_pte':
arch/powerpc/xmon/xmon.c:3207:29: error: variable 'tsk' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c: In function 'show_tasks':
arch/powerpc/xmon/xmon.c:3302:29: error: variable 'tsk' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c: In function 'xmon_core':
arch/powerpc/xmon/xmon.c:494:13: error: variable 'cmd' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c:860:21: error: variable 'bp' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c:860:21: error: variable 'bp' might be clobbered by 
'longjmp' or 'vfork' [-Werror=clobbered]
arch/powerpc/xmon/xmon.c:492:48: error: argument 'fromipi' might be clobbered 
by 'longjmp' or 'vfork' [-Werror=clobbered]

According to the documentation, marking these as 'volatile' is
sufficient to avoid the problem, and it shuts up the warning.



I think this change deserves some comment in the code, and maybe also an update of 
https://www.kernel.org/doc/html/latest/process/volatile-considered-harmful.html


Otherwise, there's a risk that one day or another, someone removes those 
'volatile' markings.

Christophe




Signed-off-by: Arnd Bergmann 
---
  arch/powerpc/kexec/crash.c |  4 ++--
  arch/powerpc/xmon/xmon.c   | 22 +++---
  2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index 0196d0c211ac..10f997e6bb95 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -105,8 +105,8 @@ void crash_ipi_callback(struct pt_regs *regs)
  static void crash_kexec_prepare_cpus(int cpu)
  {
unsigned int msecs;
-   unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */
-   int tries = 0;
+   volatile unsigned int ncpus = num_online_cpus() - 1;/* Excluding the 
panic cpu */
+   volatile int tries = 0;
int (*old_handler)(struct pt_regs *regs);
  
  	printk(KERN_EMERG "Sending IPI to other CPUs\n");

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index c8173e92f19d..ce0eacf77645 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -489,10 +489,10 @@ static void xmon_touch_watchdogs(void)
touch_nmi_watchdog();
  }
  
-static int xmon_core(struct pt_regs *regs, int fromipi)

+static int xmon_core(struct pt_regs *regs, volatile int fromipi)
  {
-   int cmd = 0;
-   struct bpt *bp;
+   volatile int cmd = 0;
+   struct bpt *volatile bp;
long recurse_jmp[JMP_BUF_LEN];
bool locked_down;
unsigned long offset;
@@ -857,7 +857,7 @@ static inline void force_enable_xmon(void)
  static struct bpt *at_breakpoint(unsigned long pc)
  {
int i;
-   struct bpt *bp;
+   struct bpt *volatile bp;
  
  	bp = bpts;

for (i = 0; i < NBPTS; ++i, ++bp)
@@ -3005,7 +3005,7 @@ static void dump_opal_msglog(void)
  {
unsigned char buf[128];
ssize_t res;
-   loff_t pos = 0;
+   volatile loff_t pos = 0;
  
  	if (!firmware_has_feature(FW_FEATURE_OPAL)) {

printf("Machine is not running OPAL firmware.\n");
@@ -3160,7 +3160,7 @@ memzcan(void)
printf("%.8lx\n", a - mskip);
  }
  
-static void show_task(struct task_struct *tsk)

+static void show_task(struct task_struct *volatile tsk)
  {
char state;
  
@@ -3204,7 +3204,7 @@ static void format_pte(void *ptep, unsigned long pte)

  static void show_pte(unsigned long addr)
  {
unsigned long tskv = 0;
-   struct task_struct *tsk = NULL;
+   struct task_struct *volatile tsk = NULL;
struct mm_struct *mm;
pgd_t *pgdp;
p4d_t *p4dp;

[PATCH v3] mm: pagewalk: Fix walk for hugepage tables

2021-06-24 Thread Christophe Leroy
Pagewalk ignores hugepd entries and walks down the tables
as if they were traditional entries, leading to bogus results.

Add walk_hugepd_range() and use it to walk hugepage tables.

Signed-off-by: Christophe Leroy 
Reviewed-by: Steven Price 
---
v3:
- Rebased on next-20210624 (no change since v2)
- Added Steven's Reviewed-by
- Sent as standalone for merge via mm

v2:
- Add a guard for NULL ops->pte_entry
- Take mm->page_table_lock when walking hugepage table, as suggested by 
follow_huge_pd()
---
 mm/pagewalk.c | 58 ++-
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..9b3db11a4d1d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
unsigned long end,
return err;
 }
 
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   int err = 0;
+   const struct mm_walk_ops *ops = walk->ops;
+   int shift = hugepd_shift(*phpd);
+   int page_size = 1 << shift;
+
+   if (!ops->pte_entry)
+   return 0;
+
+   if (addr & (page_size - 1))
+   return 0;
+
+   for (;;) {
+   pte_t *pte;
+
+   spin_lock(>mm->page_table_lock);
+   pte = hugepte_offset(*phpd, addr, pdshift);
+   err = ops->pte_entry(pte, addr, addr + page_size, walk);
+   spin_unlock(>mm->page_table_lock);
+
+   if (err)
+   break;
+   if (addr >= end - page_size)
+   break;
+   addr += page_size;
+   }
+   return err;
+}
+#else
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   return 0;
+}
+#endif
+
 static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
 {
@@ -108,7 +147,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, 
unsigned long end,
goto again;
}
 
-   err = walk_pte_range(pmd, addr, next, walk);
+   if (is_hugepd(__hugepd(pmd_val(*pmd
+   err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
walk, PMD_SHIFT);
+   else
+   err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, 
unsigned long end,
if (pud_none(*pud))
goto again;
 
-   err = walk_pmd_range(pud, addr, next, walk);
+   if (is_hugepd(__hugepd(pud_val(*pud
+   err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
+   else
+   err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, 
unsigned long end,
if (err)
break;
}
-   if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+   if (is_hugepd(__hugepd(p4d_val(*p4d
+   err = walk_hugepd_range((hugepd_t *)p4d, addr, next, 
walk, P4D_SHIFT);
+   else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long 
end,
if (err)
break;
}
-   if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
-   ops->pte_entry)
+   if (is_hugepd(__hugepd(pgd_val(*pgd
+   err = walk_hugepd_range((hugepd_t *)pgd, addr, next, 
walk, PGDIR_SHIFT);
+   else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || 
ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
-- 
2.25.0



Re: [PATCH v2 1/4] mm: pagewalk: Fix walk for hugepage tables

2021-06-24 Thread Christophe Leroy




Le 25/06/2021 à 06:45, Michael Ellerman a écrit :

Christophe Leroy  writes:

Hi Michael,

Le 19/04/2021 à 12:47, Christophe Leroy a écrit :

Pagewalk ignores hugepd entries and walks down the tables
as if they were traditional entries, leading to bogus results.

Add walk_hugepd_range() and use it to walk hugepage tables.


I see you took patch 2 and 3 of the series.


Yeah I decided those were bug fixes so could be taken separately.


Do you expect Andrew to take patch 1 via mm tree, and then you'll take
patch 4 once mm tree is merged ?


I didn't feel I could take patch 1 via the powerpc tree without risking
conflicts.

Andrew could take patch 1 and 4 via mm, though he might not want to pick
them up this late.


Patch 4 needs patches 2 and 3 and doesn't apply without them so it is not that 
easy.

Maybe Andrew you can take patch 1 now and then Michael you can take patch 4 at anytime during 5.15 
preparation without any conflict risk ?




I guess step one would be to repost 1 and 4 as a new series. Either they
can go via mm, or for 5.15 I could probably take them both as long as I
pick them up early enough.



I'll first repost patch 1 as standalone and see what happens.

Christophe


Re: [PATCH v2 1/4] mm: pagewalk: Fix walk for hugepage tables

2021-06-24 Thread Christophe Leroy

Hi Michael,

Le 19/04/2021 à 12:47, Christophe Leroy a écrit :

Pagewalk ignores hugepd entries and walks down the tables
as if they were traditional entries, leading to bogus results.

Add walk_hugepd_range() and use it to walk hugepage tables.


I see you took patch 2 and 3 of the series.

Do you expect Andrew to take patch 1 via mm tree, and then you'll take patch 4 
once mm tree is merged ?

Christophe



Signed-off-by: Christophe Leroy 
---
v2:
- Add a guard for NULL ops->pte_entry
- Take mm->page_table_lock when walking hugepage table, as suggested by 
follow_huge_pd()
---
  mm/pagewalk.c | 58 ++-
  1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d9f177..9b3db11a4d1d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,6 +58,45 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, 
unsigned long end,
return err;
  }
  
+#ifdef CONFIG_ARCH_HAS_HUGEPD

+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   int err = 0;
+   const struct mm_walk_ops *ops = walk->ops;
+   int shift = hugepd_shift(*phpd);
+   int page_size = 1 << shift;
+
+   if (!ops->pte_entry)
+   return 0;
+
+   if (addr & (page_size - 1))
+   return 0;
+
+   for (;;) {
+   pte_t *pte;
+
+   spin_lock(>mm->page_table_lock);
+   pte = hugepte_offset(*phpd, addr, pdshift);
+   err = ops->pte_entry(pte, addr, addr + page_size, walk);
+   spin_unlock(>mm->page_table_lock);
+
+   if (err)
+   break;
+   if (addr >= end - page_size)
+   break;
+   addr += page_size;
+   }
+   return err;
+}
+#else
+static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr,
+unsigned long end, struct mm_walk *walk, int 
pdshift)
+{
+   return 0;
+}
+#endif
+
  static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  struct mm_walk *walk)
  {
@@ -108,7 +147,10 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, 
unsigned long end,
goto again;
}
  
-		err = walk_pte_range(pmd, addr, next, walk);

+   if (is_hugepd(__hugepd(pmd_val(*pmd
+   err = walk_hugepd_range((hugepd_t *)pmd, addr, next, 
walk, PMD_SHIFT);
+   else
+   err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
@@ -157,7 +199,10 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, 
unsigned long end,
if (pud_none(*pud))
goto again;
  
-		err = walk_pmd_range(pud, addr, next, walk);

+   if (is_hugepd(__hugepd(pud_val(*pud
+   err = walk_hugepd_range((hugepd_t *)pud, addr, next, 
walk, PUD_SHIFT);
+   else
+   err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -189,7 +234,9 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, 
unsigned long end,
if (err)
break;
}
-   if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
+   if (is_hugepd(__hugepd(p4d_val(*p4d
+   err = walk_hugepd_range((hugepd_t *)p4d, addr, next, 
walk, P4D_SHIFT);
+   else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -224,8 +271,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long 
end,
if (err)
break;
}
-   if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
-   ops->pte_entry)
+   if (is_hugepd(__hugepd(pgd_val(*pgd
+   err = walk_hugepd_range((hugepd_t *)pgd, addr, next, 
walk, PGDIR_SHIFT);
+   else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || 
ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;



Re: [PATCH v2] powerpc/kprobes: Fix Oops by passing ppc_inst as a pointer to emulate_step() on ppc32

2021-06-24 Thread Christophe Leroy




Le 24/06/2021 à 12:59, Naveen N. Rao a écrit :

Christophe Leroy wrote:

From: Naveen N. Rao 

Trying to use a kprobe on ppc32 results in the below splat:
    BUG: Unable to handle kernel data access on read at 0x7c0802a6
    Faulting instruction address: 0xc002e9f0
    Oops: Kernel access of bad area, sig: 11 [#1]
    BE PAGE_SIZE=4K PowerPC 44x Platform
    Modules linked in:
    CPU: 0 PID: 89 Comm: sh Not tainted 5.13.0-rc1-01824-g3a81c0495fdb #7
    NIP:  c002e9f0 LR: c0011858 CTR: 8a47
    REGS: c292fd50 TRAP: 0300   Not tainted  (5.13.0-rc1-01824-g3a81c0495fdb)
    MSR:  9000   CR: 24002002  XER: 2000
    DEAR: 7c0802a6 ESR: 
    
    NIP [c002e9f0] emulate_step+0x28/0x324
    LR [c0011858] optinsn_slot+0x128/0x1
    Call Trace:
 opt_pre_handler+0x7c/0xb4 (unreliable)
 optinsn_slot+0x128/0x1
 ret_from_syscall+0x0/0x28

The offending instruction is:
    81 24 00 00 lwz r9,0(r4)

Here, we are trying to load the second argument to emulate_step():
struct ppc_inst, which is the instruction to be emulated. On ppc64,
structures are passed in registers when passed by value. However, per
the ppc32 ABI, structures are always passed to functions as pointers.
This isn't being adhered to when setting up the call to emulate_step()
in the optprobe trampoline. Fix the same.

Fixes: eacf4c0202654a ("powerpc: Enable OPTPROBES on PPC32")
Cc: sta...@vger.kernel.org
Signed-off-by: Naveen N. Rao 
---
v2: Rebased on powerpc/merge 7f030e9d57b8
Signed-off-by: Christophe Leroy 


Thanks for rebasing this!

I think git am drops everything after three dashes, so applying this patch will drop your SOB. The 
recommended style (*) for adding a changelog is to include it within [] before the second SOB.




Yes, I saw that after sending the mail. Usually I add a signed-off-by with 'git --amend -s' when I 
add the history, so it goes before the '---' I'm adding.


This time I must have forgotten the '-s' so it was added by the 'git format-patch -s' which is in my 
submit script, and so it was added at the end.


It's not really a big deal, up to Michael to either move it at the right place or discard it, I 
don't really mind. Do the easiest for you.


Thanks
Christophe


Re: nand: WARNING: a0000000.nand: the ECC used on your system (1b/256B) is too weak compared to the one required by the NAND chip (4b/512B)

2021-06-23 Thread Christophe Leroy




Le 19/06/2021 à 20:40, Miquel Raynal a écrit :

Hi Christophe,


Now and then I'm using one of the latest kernels (Today is 5.13-rc6), and 
sometime in one of the 5.x releases, I started to get errors like:

[5.098265] ecc_sw_hamming_correct: uncorrectable ECC error
[5.103859] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading 60
bytes from PEB 99:59824, read only 60 bytes, retry
[5.525843] ecc_sw_hamming_correct: uncorrectable ECC error
[5.531571] ecc_sw_hamming_correct: uncorrectable ECC error
[5.537490] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading 30
73 bytes from PEB 107:108976, read only 3073 bytes, retry
[5.691121] ecc_sw_hamming_correct: uncorrectable ECC error
[5.696709] ecc_sw_hamming_correct: uncorrectable ECC error
[5.702426] ecc_sw_hamming_correct: uncorrectable ECC error
[5.708141] ecc_sw_hamming_correct: uncorrectable ECC error
[5.714103] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading 30
35 bytes from PEB 107:25144, read only 3035 bytes, retry
[   20.523689] random: crng init done
[   21.892130] ecc_sw_hamming_correct: uncorrectable ECC error
[   21.897730] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading 13
94 bytes from PEB 116:75776, read only 1394 bytes, retry

Most of the time, when the reading of the file fails, I just have to read it 
once more and it gets read without that error.


It really looks like a regular bitflip happening "sometimes". Is this a
board which already had a life? What are the usage counters (UBI should
tell you this) compared to the official endurance of your chip (see the
datasheet)?


The board had a peaceful life:

UBI reports "ubi0: max/mean erase counter: 49/20, WL threshold: 4096"


Mmmh. Indeed.



I have tried with half a dozen of boards and all have the issue.

   

What am I supposed to do to avoid the ECC weakness warning at startup and to 
fix that ECC error issue ?


I honestly don't think the errors come from the 5.1x kernels given the
above logs. If you flash back your old 4.14 I am pretty sure you'll
have the same errors at some point.


I don't have any problem like that with 4.14 with any of the board.

When booting a 4.14 kernel I don't get any problem on the same board.



If you can reliably show that when returning to a 4.14 kernel the ECC
weakness disappears, then there is certainly something new. What driver
are you using? Maybe you can do a bisection?


Using the GPIO driver, and the NAND chip is a HYNIX.

I can say that the ECC weakness does not exist up to and including v5.5. The weakness
appears with v5.6.

I have tried bisecting between those two versions but I couldn't reach a reliable result. The
closer you get to v5.5, the more difficult it is to reproduce the issue.


So I looked at what was done around the places, and in fact that's mainly optimisation in the 
powerpc code. It seems that the more powerpc is optimised, the more the problem occurs.


Looking at the GPIO nand driver, I saw that no-op gpio_nand_dosync() function. By adding a memory 
barrier in that function, the ECC weakness disappeared completely.


Not sure what the final solution has to be.

Christophe


[PATCH v2] powerpc/kprobes: Fix Oops by passing ppc_inst as a pointer to emulate_step() on ppc32

2021-06-22 Thread Christophe Leroy
From: Naveen N. Rao 

Trying to use a kprobe on ppc32 results in the below splat:
BUG: Unable to handle kernel data access on read at 0x7c0802a6
Faulting instruction address: 0xc002e9f0
Oops: Kernel access of bad area, sig: 11 [#1]
BE PAGE_SIZE=4K PowerPC 44x Platform
Modules linked in:
CPU: 0 PID: 89 Comm: sh Not tainted 5.13.0-rc1-01824-g3a81c0495fdb #7
NIP:  c002e9f0 LR: c0011858 CTR: 8a47
REGS: c292fd50 TRAP: 0300   Not tainted  (5.13.0-rc1-01824-g3a81c0495fdb)
MSR:  9000   CR: 24002002  XER: 2000
DEAR: 7c0802a6 ESR: 

NIP [c002e9f0] emulate_step+0x28/0x324
LR [c0011858] optinsn_slot+0x128/0x1
Call Trace:
 opt_pre_handler+0x7c/0xb4 (unreliable)
 optinsn_slot+0x128/0x1
 ret_from_syscall+0x0/0x28

The offending instruction is:
81 24 00 00 lwz r9,0(r4)

Here, we are trying to load the second argument to emulate_step():
struct ppc_inst, which is the instruction to be emulated. On ppc64,
structures are passed in registers when passed by value. However, per
the ppc32 ABI, structures are always passed to functions as pointers.
This isn't being adhered to when setting up the call to emulate_step()
in the optprobe trampoline. Fix the same.

Fixes: eacf4c0202654a ("powerpc: Enable OPTPROBES on PPC32")
Cc: sta...@vger.kernel.org
Signed-off-by: Naveen N. Rao 
---
v2: Rebased on powerpc/merge 7f030e9d57b8
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/optprobes.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 2b8fe40069ad..53facb4b377f 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -228,8 +228,12 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe 
*op, struct kprobe *p)
/*
 * 3. load instruction to be emulated into relevant register, and
 */
-   temp = ppc_inst_read(p->ainsn.insn);
-   patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + TMPL_INSN_IDX);
+   if (IS_ENABLED(CONFIG_PPC64)) {
+   temp = ppc_inst_read(p->ainsn.insn);
+   patch_imm_load_insns(ppc_inst_as_ulong(temp), 4, buff + 
TMPL_INSN_IDX);
+   } else {
+   patch_imm_load_insns((unsigned long)p->ainsn.insn, 4, buff + 
TMPL_INSN_IDX);
+   }
 
/*
 * 4. branch back from trampoline
-- 
2.25.0



Re: [PATCH 2/2] powerpc/64s/interrupt: Check and fix srr_valid without crashing

2021-06-22 Thread Christophe Leroy




Le 22/06/2021 à 10:54, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of June 22, 2021 4:47 pm:



Le 22/06/2021 à 08:04, Nicholas Piggin a écrit :

The PPC_RFI_SRR_DEBUG check added by patch "powerpc/64s: avoid reloading
(H)SRR registers if they are still valid" has a few deficiencies. It
does not fix the actual problem, it's not enabled by default, and it
causes a program check interrupt which can cause more difficulties.

However there are a lot of paths which may clobber SRRs or change return
regs, and difficult to have a high confidence that all paths are covered
without wider testing.

Add a relatively low overhead always-enabled check that catches most
such cases, reports once, and fixes it so the kernel can continue.

Signed-off-by: Nicholas Piggin 
---
   arch/powerpc/kernel/interrupt.c | 58 +
   1 file changed, 58 insertions(+)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 05fa3ae56e25..5920a3e8d1d5 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -231,6 +231,56 @@ static notrace void booke_load_dbcr0(void)
   #endif
   }
   
+#include  /* for show_regs */

+static void check_return_regs_valid(struct pt_regs *regs)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   static bool warned = false;
+
+   if (regs->trap == 0x980 || regs->trap == 0xe00 || regs->trap == 0xe20 ||
+   regs->trap == 0xe40 || regs->trap == 0xe60 || regs->trap == 0xe80 ||
+   regs->trap == 0xea0 || regs->trap == 0xf80 || regs->trap == 0x1200 
||
+   regs->trap == 0x1500 || regs->trap == 0x1600 || regs->trap == 
0x1800) {


Can you use names defined in asm/interrupt.h instead of raw values ?
Some are already there, others can be added.


Good idea. Could go into a helper too actually.

I wanted to clean up the KVM definitions and unify them with interrupt.h
defs but that's a bit of churn. Can I get to that in the next merge or
so?




Sure

Christophe


Re: [PATCH 2/2] powerpc/64s/interrupt: Check and fix srr_valid without crashing

2021-06-22 Thread Christophe Leroy




Le 22/06/2021 à 08:04, Nicholas Piggin a écrit :

The PPC_RFI_SRR_DEBUG check added by patch "powerpc/64s: avoid reloading
(H)SRR registers if they are still valid" has a few deficiencies. It
does not fix the actual problem, it's not enabled by default, and it
causes a program check interrupt which can cause more difficulties.

However there are a lot of paths which may clobber SRRs or change return
regs, and difficult to have a high confidence that all paths are covered
without wider testing.

Add a relatively low overhead always-enabled check that catches most
such cases, reports once, and fixes it so the kernel can continue.

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/kernel/interrupt.c | 58 +
  1 file changed, 58 insertions(+)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 05fa3ae56e25..5920a3e8d1d5 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -231,6 +231,56 @@ static notrace void booke_load_dbcr0(void)
  #endif
  }
  
+#include  /* for show_regs */

+static void check_return_regs_valid(struct pt_regs *regs)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   static bool warned = false;
+
+   if (regs->trap == 0x980 || regs->trap == 0xe00 || regs->trap == 0xe20 ||
+   regs->trap == 0xe40 || regs->trap == 0xe60 || regs->trap == 0xe80 ||
+   regs->trap == 0xea0 || regs->trap == 0xf80 || regs->trap == 0x1200 
||
+   regs->trap == 0x1500 || regs->trap == 0x1600 || regs->trap == 
0x1800) {


Can you use names defined in asm/interrupt.h instead of raw values ?
Some are already there, others can be added.



+   if (local_paca->hsrr_valid) {
+   unsigned long hsrr0 = mfspr(SPRN_HSRR0);
+   unsigned long hsrr1 = mfspr(SPRN_HSRR1);
+
+   if (hsrr0 == regs->nip && hsrr1 == regs->msr)
+   return;
+
+   if (!warned) {
+   warned = true;
+   printk("HSRR0 was: %lx should be: %lx\n",
+   hsrr0, regs->nip);
+   printk("HSRR1 was: %lx should be: %lx\n",
+   hsrr1, regs->msr);
+   show_regs(regs);
+   }
+   local_paca->hsrr_valid = 0; /* fixup */
+   }
+
+   } else if (regs->trap != 0x3000) {
+   if (local_paca->srr_valid) {
+   unsigned long srr0 = mfspr(SPRN_SRR0);
+   unsigned long srr1 = mfspr(SPRN_SRR1);
+
+   if (srr0 == regs->nip && srr1 == regs->msr)
+   return;
+
+   if (!warned) {
+   warned = true;
+   printk("SRR0 was: %lx should be: %lx\n",
+   srr0, regs->nip);
+   printk("SRR1 was: %lx should be: %lx\n",
+   srr1, regs->msr);
+   show_regs(regs);
+   }
+   local_paca->srr_valid = 0; /* fixup */
+   }
+   }
+#endif
+}
+
  /*
   * This should be called after a syscall returns, with r3 the return value
   * from the syscall. If this function returns non-zero, the system call
@@ -327,6 +377,8 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
}
}
  
+	check_return_regs_valid(regs);

+
user_enter_irqoff();
  
  	/* scv need not set RI=0 because SRRs are not used */

@@ -405,6 +457,8 @@ notrace unsigned long interrupt_exit_user_prepare(struct 
pt_regs *regs)
}
}
  
+	check_return_regs_valid(regs);

+
user_enter_irqoff();
  
  	if (unlikely(!__prep_irq_for_enabled_exit(true))) {

@@ -469,9 +523,13 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct 
pt_regs *regs)
}
}
  
+		check_return_regs_valid(regs);

+
if (unlikely(!prep_irq_for_enabled_exit(true, 
!irqs_disabled_flags(flags
goto again;
} else {
+   check_return_regs_valid(regs);
+
/* Returning to a kernel context with local irqs disabled. */
__hard_EE_RI_disable();
  #ifdef CONFIG_PPC64



Re: [PATCH for 4.16 v7 02/11] powerpc: membarrier: Skip memory barrier in switch_mm()

2021-06-21 Thread Christophe Leroy




Le 19/06/2021 à 17:02, Segher Boessenkool a écrit :

On Sat, Jun 19, 2021 at 11:35:34AM +0200, Christophe Leroy wrote:



Le 18/06/2021 à 19:26, Mathieu Desnoyers a écrit :

- On Jun 18, 2021, at 1:13 PM, Christophe Leroy
christophe.le...@csgroup.eu wrote:
[...]


I don't understand all that complexity to just replace a simple
'smp_mb__after_unlock_lock()'.

#define smp_mb__after_unlock_lock() smp_mb()
#define smp_mb()barrier()
# define barrier() __asm__ __volatile__("": : :"memory")


Am I missing some subtility ?


On powerpc CONFIG_SMP, smp_mb() is actually defined as:

#define smp_mb()__smp_mb()
#define __smp_mb()  mb()
#define mb()   __asm__ __volatile__ ("sync" : : : "memory")

So the original motivation here was to skip a "sync" instruction whenever
switching between threads which are part of the same process. But based on
recent discussions, I suspect my implementation may be inaccurately doing
so though.



I see.

Then, if you think a 'sync' is a concern, shouldn't we try and remove the
forest of 'sync' in the I/O accessors ?

I can't really understand why we need all those 'sync' and 'isync' and
'twi' around the accesses whereas I/O memory is usually mapped as 'Guarded'
so memory access ordering is already garantied.

I'm sure we'll save a lot with that.


The point of the twi in the I/O accessors was to make things easier to
debug if the accesses fail: for the twi insn to complete the load will
have to have completed as well.  On a correctly working system you never
should need this (until something fails ;-) )

Without the twi you might need to enforce ordering in some cases still.
The twi is a very heavy hammer, but some of that that gives us is no
doubt actually needed.



Well, I've always been quite perplex about that. According to the documentation of the 8xx, if a bus 
error or something happens on an I/O access, the exception will be accounted on the instruction 
which does the access. But based on the following function, I understand that some version of 
powerpc do generate the trap on the instruction which was being executed at the time the I/O access 
failed, not the instruction that does the access itself ?


/*
 * I/O accesses can cause machine checks on powermacs.
 * Check if the NIP corresponds to the address of a sync
 * instruction for which there is an entry in the exception
 * table.
 *  -- paulus.
 */
static inline int check_io_access(struct pt_regs *regs)
{
#ifdef CONFIG_PPC32
unsigned long msr = regs->msr;
const struct exception_table_entry *entry;
unsigned int *nip = (unsigned int *)regs->nip;

        if (((msr & 0xffff0000) == 0 || (msr & (0x80000 | 0x40000)))
&& (entry = search_exception_tables(regs->nip)) != NULL) {
/*
 * Check that it's a sync instruction, or somewhere
 * in the twi; isync; nop sequence that inb/inw/inl uses.
 * As the address is in the exception table
 * we should be able to read the instr there.
 * For the debug message, we look at the preceding
 * load or store.
 */
if (*nip == PPC_INST_NOP)
nip -= 2;
else if (*nip == PPC_INST_ISYNC)
--nip;
if (*nip == PPC_INST_SYNC || (*nip >> 26) == OP_TRAP) {
unsigned int rb;

--nip;
rb = (*nip >> 11) & 0x1f;
printk(KERN_DEBUG "%s bad port %lx at %p\n",
   (*nip & 0x100)? "OUT to": "IN from",
   regs->gpr[rb] - _IO_BASE, nip);
regs->msr |= MSR_RI;
regs->nip = extable_fixup(entry);
return 1;
}
}
#endif /* CONFIG_PPC32 */
return 0;
}

Am I right ?

It is not only the twi which bother's me in the I/O accessors but also the 
sync/isync and stuff.

A write typically is

sync
stw

A read is

sync
lwz
twi
isync

Taking into account that HW ordering is garanteed by the fact that __iomem is guarded, isn't the 
'memory' clobber enough as a barrier ?


Thanks
Christophe


Re: [PATCH for 4.16 v7 02/11] powerpc: membarrier: Skip memory barrier in switch_mm()

2021-06-19 Thread Christophe Leroy




Le 18/06/2021 à 19:26, Mathieu Desnoyers a écrit :

- On Jun 18, 2021, at 1:13 PM, Christophe Leroy christophe.le...@csgroup.eu 
wrote:
[...]


I don't understand all that complexity to just replace a simple
'smp_mb__after_unlock_lock()'.

#define smp_mb__after_unlock_lock() smp_mb()
#define smp_mb()barrier()
# define barrier() __asm__ __volatile__("": : :"memory")


Am I missing some subtility ?


On powerpc CONFIG_SMP, smp_mb() is actually defined as:

#define smp_mb()__smp_mb()
#define __smp_mb()  mb()
#define mb()   __asm__ __volatile__ ("sync" : : : "memory")

So the original motivation here was to skip a "sync" instruction whenever
switching between threads which are part of the same process. But based on
recent discussions, I suspect my implementation may be inaccurately doing
so though.



I see.

Then, if you think a 'sync' is a concern, shouldn't we try and remove the forest of 'sync' in the 
I/O accessors ?


I can't really understand why we need all those 'sync' and 'isync' and 'twi' around the accesses 
whereas I/O memory is usually mapped as 'Guarded' so memory access ordering is already garantied.


I'm sure we'll save a lot with that.

Christophe


Re: [PATCH for 4.16 v7 02/11] powerpc: membarrier: Skip memory barrier in switch_mm()

2021-06-18 Thread Christophe Leroy




Le 29/01/2018 à 21:20, Mathieu Desnoyers a écrit :

Allow PowerPC to skip the full memory barrier in switch_mm(), and
only issue the barrier when scheduling into a task belonging to a
process that has registered to use expedited private.

Threads targeting the same VM but which belong to different thread
groups is a tricky case. It has a few consequences:

It turns out that we cannot rely on get_nr_threads(p) to count the
number of threads using a VM. We can use
(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)
instead to skip the synchronize_sched() for cases where the VM only has
a single user, and that user only has a single thread.

It also turns out that we cannot use for_each_thread() to set
thread flags in all threads using a VM, as it only iterates on the
thread group.

Therefore, test the membarrier state variable directly rather than
relying on thread flags. This means
membarrier_register_private_expedited() needs to set the
MEMBARRIER_STATE_PRIVATE_EXPEDITED flag, issue synchronize_sched(), and
only then set MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY which allows
private expedited membarrier commands to succeed.
membarrier_arch_switch_mm() now tests for the
MEMBARRIER_STATE_PRIVATE_EXPEDITED flag.


Looking at switch_mm_irqs_off(), I found it more complex than expected and found that this patch is 
the reason for that complexity.


Before the patch (ie in kernel 4.14), we have:

 :
   0:   81 24 01 c8 lwz r9,456(r4)
   4:   71 29 00 01 andi.   r9,r9,1
   8:   40 82 00 1c bne 24 
   c:   39 24 01 c8 addir9,r4,456
  10:   39 40 00 01 li  r10,1
  14:   7d 00 48 28 lwarx   r8,0,r9
  18:   7d 08 53 78 or  r8,r8,r10
  1c:   7d 00 49 2d stwcx.  r8,0,r9
  20:   40 c2 ff f4 bne-14 
  24:   7c 04 18 40 cmplw   r4,r3
  28:   81 24 00 24 lwz r9,36(r4)
  2c:   91 25 04 4c stw r9,1100(r5)
  30:   4d 82 00 20 beqlr
  34:   48 00 00 00 b   34 
34: R_PPC_REL24 switch_mmu_context


After the patch (ie in 5.13-rc6), that now is:

 :
   0:   81 24 02 18 lwz r9,536(r4)
   4:   71 29 00 01 andi.   r9,r9,1
   8:   41 82 00 24 beq 2c 
   c:   7c 04 18 40 cmplw   r4,r3
  10:   81 24 00 24 lwz r9,36(r4)
  14:   91 25 04 d0 stw r9,1232(r5)
  18:   4d 82 00 20 beqlr
  1c:   81 24 00 28 lwz r9,40(r4)
  20:   71 29 00 0a andi.   r9,r9,10
  24:   40 82 00 34 bne 58 
  28:   48 00 00 00 b   28 
28: R_PPC_REL24 switch_mmu_context
  2c:   39 24 02 18 addir9,r4,536
  30:   39 40 00 01 li  r10,1
  34:   7d 00 48 28 lwarx   r8,0,r9
  38:   7d 08 53 78 or  r8,r8,r10
  3c:   7d 00 49 2d stwcx.  r8,0,r9
  40:   40 a2 ff f4 bne 34 
  44:   7c 04 18 40 cmplw   r4,r3
  48:   81 24 00 24 lwz r9,36(r4)
  4c:   91 25 04 d0 stw r9,1232(r5)
  50:   4d 82 00 20 beqlr
  54:   48 00 00 00 b   54 
54: R_PPC_REL24 switch_mmu_context
  58:   2c 03 00 00 cmpwi   r3,0
  5c:   41 82 ff cc beq 28 
  60:   48 00 00 00 b   60 
60: R_PPC_REL24 switch_mmu_context


Especially, the comparison of 'prev' to 0 is pointless as both cases end up with just branching to 
'switch_mmu_context'


I don't understand all that complexity to just replace a simple 
'smp_mb__after_unlock_lock()'.

#define smp_mb__after_unlock_lock() smp_mb()
#define smp_mb()barrier()
# define barrier() __asm__ __volatile__("": : :"memory")


Am I missing some subtility ?

Thanks
Christophe


Re: [PATCH 8/8] membarrier: Rewrite sync_core_before_usermode() and improve documentation

2021-06-18 Thread Christophe Leroy




Le 16/06/2021 à 20:52, Andy Lutomirski a écrit :

On 6/15/21 9:45 PM, Nicholas Piggin wrote:

Excerpts from Andy Lutomirski's message of June 16, 2021 1:21 pm:

The old sync_core_before_usermode() comments suggested that a non-icache-syncing
return-to-usermode instruction is x86-specific and that all other
architectures automatically notice cross-modified code on return to
userspace.



+/*
+ * XXX: can a powerpc person put an appropriate comment here?
+ */
+static inline void membarrier_sync_core_before_usermode(void)
+{
+}
+
+#endif /* _ASM_POWERPC_SYNC_CORE_H */


powerpc's can just go in asm/membarrier.h


$ ls arch/powerpc/include/asm/membarrier.h
ls: cannot access 'arch/powerpc/include/asm/membarrier.h': No such file
or directory


https://github.com/torvalds/linux/blob/master/arch/powerpc/include/asm/membarrier.h


Was added by 
https://github.com/torvalds/linux/commit/3ccfebedd8cf54e291c809c838d8ad5cc00f5688






/*
  * The RFI family of instructions are context synchronising, and
  * that is how we return to userspace, so nothing is required here.
  */


Thanks!



Re: Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 08:36, Athira Rajeev a écrit :




On 16-Jun-2021, at 11:56 AM, Christophe Leroy  
wrote:



Le 16/06/2021 à 05:40, Athira Rajeev a écrit :

On 16-Jun-2021, at 8:53 AM, Madhavan Srinivasan  wrote:


On 6/15/21 8:35 PM, Christophe Leroy wrote:
For your information, I'm getting the following Oops. Detected with 5.13-rc6, it also oopses on 
5.12 and 5.11.

Runs ok on 5.10. I'm starting bisecting now.



Thanks for reporting, got the issue. What has happened in this case is that, pmu device is not 
registered
and trying to access the instruction point which will land in perf_instruction_pointer(). And 
recently I have added
a workaround patch for power10 DD1 which has caused this breakage. My bad. We are working on a 
fix patch

for the same and will post it out. Sorry again.


Hi Christophe,
Can you please try with below patch in your environment and test if it works 
for you.
From 55d3afc9369dfbe28a7152c8e9f856c11c7fe43d Mon Sep 17 00:00:00 2001
From: Athira Rajeev 
Date: Tue, 15 Jun 2021 22:28:11 -0400
Subject: [PATCH] powerpc/perf: Fix crash with 'perf_instruction_pointer' when
pmu is not set
On systems without any specific PMU driver support registered, running
perf record causes oops:
[   38.841073] NIP [c013af54] perf_instruction_pointer+0x24/0x100
[   38.841079] LR [c03c7358] perf_prepare_sample+0x4e8/0x820
[   38.841085] --- interrupt: 300
[   38.841088] [c0001cf03440] [c03c6ef8] 
perf_prepare_sample+0x88/0x820 (unreliable)
[   38.841096] [c0001cf034a0] [c03c76d0] 
perf_event_output_forward+0x40/0xc0
[   38.841104] [c0001cf03520] [c03b45e8] 
__perf_event_overflow+0x88/0x1b0
[   38.841112] [c0001cf03570] [c03b480c] 
perf_swevent_hrtimer+0xfc/0x1a0
[   38.841119] [c0001cf03740] [c02399cc] 
__hrtimer_run_queues+0x17c/0x380
[   38.841127] [c0001cf037c0] [c023a5f8] 
hrtimer_interrupt+0x128/0x2f0
[   38.841135] [c0001cf03870] [c002962c] timer_interrupt+0x13c/0x370
[   38.841143] [c0001cf038d0] [c0009ba4] 
decrementer_common_virt+0x1a4/0x1b0
[   38.841151] --- interrupt: 900 at copypage_power7+0xd4/0x1c0
During perf record session, perf_instruction_pointer() is called to
capture the sample ip. This function in core-book3s accesses ppmu->flags.
If a platform specific PMU driver is not registered, ppmu is set to NULL
and accessing its members results in a crash. Fix this crash by checking
if ppmu is set.
Signed-off-by: Athira Rajeev 
Reported-by: Christophe Leroy 


Fixes: 2ca13a4cc56c ("powerpc/perf: Use regs->nip when SIAR is zero")
Cc: sta...@vger.kernel.org
Tested-by: Christophe Leroy 


Hi Christophe,

Thanks for testing with the change. I have a newer version where I have added 
braces around the check.
Can you please check once and can I add your tested-by for the below patch.


Yes it works, you can add my Tested-by:
Please also add Cc: sta...@vger.kernel.org, this needs to be backported as soon 
as possible.

Thanks
Christophe


Re: [PATCH v3 1/5] powerpc/interrupt: Rename and lightly change syscall_exit_prepare_main()

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 13:25, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of June 15, 2021 6:33 pm:

Rename syscall_exit_prepare_main() into interrupt_exit_prepare_main()

Make it static as it is not used anywhere else.

Pass it the 'ret' so that it can 'or' it directly instead of
oring twice, once inside the function and once outside.

And remove 'r3' parameter which is not used.

Also fix a typo where CONFIG_PPC_BOOK3S should be CONFIG_PPC_BOOK3S_64.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
This series applies on top of Nic's series speeding up interrupt return on 64s

  arch/powerpc/kernel/interrupt.c | 11 +--
  1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 74c995a42399..ba2d602d2da6 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -243,11 +243,10 @@ static notrace void booke_load_dbcr0(void)
  #endif
  }
  
-notrace unsigned long syscall_exit_prepare_main(unsigned long r3,

-   struct pt_regs *regs)
+static notrace unsigned long
+interrupt_exit_user_prepare_main(struct pt_regs *regs, unsigned long ret)


Hmm, I tried switching the order of the arguments thinking it would
match caller and return registers better but didn't seem to help
generated code. Yet I think I will make that change to your patch if
you don't mind.


That's a static function that most likely gets inlined so the order of 
parameters makes no difference.
I tend to like that almost all functions dealing with interrupts take regs as first param, but I 
have no strong opinion about it so you can change it if that's better for you.


Christophe


Re: [PATCH] selftests/powerpc: Add a test of sigreturn vs VDSO

2021-06-17 Thread Christophe Leroy




Le 26/03/2020 à 13:06, Michael Ellerman a écrit :

On Wed, 2020-03-04 at 11:04:02 UTC, Michael Ellerman wrote:

There's two different paths through the sigreturn code, depending on
whether the VDSO is mapped or not. We recently discovered a bug in the
unmapped case, because it's not commonly used these days.

So add a test that sends itself a signal, then moves the VDSO, takes
another signal and finally unmaps the VDSO before sending itself
another signal. That tests the standard signal path, the code that
handles the VDSO being moved, and also the signal path in the case
where the VDSO is unmapped.

Signed-off-by: Michael Ellerman 


Applied to powerpc next.

https://git.kernel.org/powerpc/c/a0968a025c04702427a4aee2c618f451a5098cd8

cheers



Doesn't work anymore since the split of VDSO and VVAR.

Christophe


Re: [PATCH v14 3/4] mm: define default MAX_PTRS_PER_* in include/pgtable.h

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 08:39, Daniel Axtens a écrit :

Commit c65e774fb3f6 ("x86/mm: Make PGDIR_SHIFT and PTRS_PER_P4D variable")
made PTRS_PER_P4D variable on x86 and introduced MAX_PTRS_PER_P4D as a
constant for cases which need a compile-time constant (e.g. fixed-size
arrays).

powerpc likewise has boot-time selectable MMU features which can cause
other mm "constants" to vary. For KASAN, we have some static
PTE/PMD/PUD/P4D arrays so we need compile-time maximums for all these
constants. Extend the MAX_PTRS_PER_ idiom, and place default definitions
in include/pgtable.h. These define MAX_PTRS_PER_x to be PTRS_PER_x unless
an architecture has defined MAX_PTRS_PER_x in its arch headers.

Clean up pgtable-nop4d.h and s390's MAX_PTRS_PER_P4D definitions while
we're at it: both can just pick up the default now.

Signed-off-by: Daniel Axtens 


Reviewed-by: Christophe Leroy 



---

s390 was compile tested only.
---
  arch/s390/include/asm/pgtable.h |  2 --
  include/asm-generic/pgtable-nop4d.h |  1 -
  include/linux/pgtable.h | 22 ++
  3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7c66ae5d7e32..cf05954ce013 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -342,8 +342,6 @@ static inline int is_module_addr(void *addr)
  #define PTRS_PER_P4D  _CRST_ENTRIES
  #define PTRS_PER_PGD  _CRST_ENTRIES
  
-#define MAX_PTRS_PER_P4D	PTRS_PER_P4D

-
  /*
   * Segment table and region3 table entry encoding
   * (R = read-only, I = invalid, y = young bit):
diff --git a/include/asm-generic/pgtable-nop4d.h 
b/include/asm-generic/pgtable-nop4d.h
index ce2cbb3c380f..2f6b1befb129 100644
--- a/include/asm-generic/pgtable-nop4d.h
+++ b/include/asm-generic/pgtable-nop4d.h
@@ -9,7 +9,6 @@
  typedef struct { pgd_t pgd; } p4d_t;
  
  #define P4D_SHIFT		PGDIR_SHIFT

-#define MAX_PTRS_PER_P4D   1
  #define PTRS_PER_P4D  1
  #define P4D_SIZE  (1UL << P4D_SHIFT)
  #define P4D_MASK  (~(P4D_SIZE-1))
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 9e6f71265f72..69700e3e615f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1625,4 +1625,26 @@ typedef unsigned int pgtbl_mod_mask;
  #define pte_leaf_size(x) PAGE_SIZE
  #endif
  
+/*

+ * Some architectures have MMUs that are configurable or selectable at boot
+ * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
+ * helps to have a static maximum value.
+ */
+
+#ifndef MAX_PTRS_PER_PTE
+#define MAX_PTRS_PER_PTE PTRS_PER_PTE
+#endif
+
+#ifndef MAX_PTRS_PER_PMD
+#define MAX_PTRS_PER_PMD PTRS_PER_PMD
+#endif
+
+#ifndef MAX_PTRS_PER_PUD
+#define MAX_PTRS_PER_PUD PTRS_PER_PUD
+#endif
+
+#ifndef MAX_PTRS_PER_P4D
+#define MAX_PTRS_PER_P4D PTRS_PER_P4D
+#endif
+
  #endif /* _LINUX_PGTABLE_H */



Re: [PATCH v14 2/4] kasan: allow architectures to provide an outline readiness check

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 08:39, Daniel Axtens a écrit :

Allow architectures to define a kasan_arch_is_ready() hook that bails
out of any function that's about to touch the shadow unless the arch
says that it is ready for the memory to be accessed. This is fairly
uninvasive and should have a negligible performance penalty.

This will only work in outline mode, so an arch must specify
ARCH_DISABLE_KASAN_INLINE if it requires this.

Cc: Balbir Singh 
Cc: Aneesh Kumar K.V 
Suggested-by: Christophe Leroy 
Signed-off-by: Daniel Axtens 

--

Both previous RFCs for ppc64 - by 2 different people - have
needed this trick! See:
  - https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series
  - https://patchwork.ozlabs.org/patch/795211/  # ppc radix series

I haven't been able to exercise the arch hook error for !GENERIC as I
don't have a particularly modern aarch64 toolchain or a lot of experience
cross-compiling with clang. But it does fire for GENERIC + INLINE on x86.
---
  mm/kasan/common.c  | 4 
  mm/kasan/generic.c | 3 +++
  mm/kasan/kasan.h   | 8 
  mm/kasan/shadow.c  | 8 
  4 files changed, 23 insertions(+)

diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8f450bc28045..b18abaf8c78e 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -449,6 +449,14 @@ static inline void kasan_poison_last_granule(const void 
*address, size_t size) {
  
  #endif /* CONFIG_KASAN_GENERIC */
  
+#ifndef kasan_arch_is_ready

+static inline bool kasan_arch_is_ready(void)   { return true; }
+#else
+#if !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif
+#endif


Would be cleaner and more readable as

+#ifndef kasan_arch_is_ready
+static inline bool kasan_arch_is_ready(void)   { return true; }
+#elif !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE)
+#error kasan_arch_is_ready only works in KASAN generic outline mode!
+#endif


+
  /*
   * Exported functions for interfaces called from assembly or from generated
   * code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 082ee5b6d9a1..3c7f7efe6f68 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -73,6 +73,10 @@ void kasan_poison(const void *addr, size_t size, u8 value, 
bool init)
  {
void *shadow_start, *shadow_end;
  
+	/* Don't touch the shadow memory if arch isn't ready */

+   if (!kasan_arch_is_ready())
+   return;
+
/*
 * Perform shadow offset calculation based on untagged address, as
 * some of the callers (e.g. kasan_poison_object_data) pass tagged
@@ -99,6 +103,10 @@ EXPORT_SYMBOL(kasan_poison);
  #ifdef CONFIG_KASAN_GENERIC
  void kasan_poison_last_granule(const void *addr, size_t size)
  {
+   /* Don't touch the shadow memory if arch isn't ready */
+   if (!kasan_arch_is_ready())
+   return;
+
if (size & KASAN_GRANULE_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
*shadow = size & KASAN_GRANULE_MASK;



Re: [PATCH v14 2/4] kasan: allow architectures to provide an outline readiness check

2021-06-17 Thread Christophe Leroy




Le 17/06/2021 à 08:39, Daniel Axtens a écrit :

Allow architectures to define a kasan_arch_is_ready() hook that bails
out of any function that's about to touch the shadow unless the arch
says that it is ready for the memory to be accessed. This is fairly
uninvasive and should have a negligible performance penalty.

This will only work in outline mode, so an arch must specify
ARCH_DISABLE_KASAN_INLINE if it requires this.

Cc: Balbir Singh 
Cc: Aneesh Kumar K.V 
Suggested-by: Christophe Leroy 
Signed-off-by: Daniel Axtens 

--

Both previous RFCs for ppc64 - by 2 different people - have
needed this trick! See:
  - https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series
  - https://patchwork.ozlabs.org/patch/795211/  # ppc radix series

I haven't been able to exercise the arch hook error for !GENERIC as I
don't have a particularly modern aarch64 toolchain or a lot of experience
cross-compiling with clang. But it does fire for GENERIC + INLINE on x86.


Modern toolchains are available here 
https://mirrors.edge.kernel.org/pub/tools/crosstool/



Re: [PATCH v13 3/3] kasan: define and use MAX_PTRS_PER_* for early shadow tables

2021-06-16 Thread Christophe Leroy




Le 16/06/2021 à 11:07, Marco Elver a écrit :

On Wed, 16 Jun 2021 at 10:03, Daniel Axtens  wrote:
[...]

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 768d7d342757..fd65f477ac92 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -40,10 +40,22 @@ struct kunit_kasan_expectation {
  #define PTE_HWTABLE_PTRS 0
  #endif

+#ifndef MAX_PTRS_PER_PTE
+#define MAX_PTRS_PER_PTE PTRS_PER_PTE
+#endif
+
+#ifndef MAX_PTRS_PER_PMD
+#define MAX_PTRS_PER_PMD PTRS_PER_PMD
+#endif
+
+#ifndef MAX_PTRS_PER_PUD
+#define MAX_PTRS_PER_PUD PTRS_PER_PUD
+#endif


This is introducing new global constants in a  header. It
feels like this should be in  together with a
comment. Because  is actually included in
, most of the kernel will get these new definitions.
That in itself is fine, but it feels wrong that the KASAN header
introduces these.

Thoughts?

Sorry for only realizing this now.


My idea here was to follow the same road as MAX_PTRS_PER_P4D, added by commit 
https://github.com/linuxppc/linux/commit/c65e774f


That commit spread MAX_PTRS_PER_P4D everywhere.

Instead of doing the same, we found that it would be better to define a fallback for when the 
architecture doesn't define MAX_PTRS_PER_PxD . Now, it can be made more global in pgtable.h, in that 
case I'd suggest to also include MAX_PTRS_PER_P4D in the dance and avoid architectures like s390 
having to define it, or even not defining it either in asm-generic/pgtable-nop4d.h


Christophe



Thanks,
-- Marco


  extern unsigned char kasan_early_shadow_page[PAGE_SIZE];
-extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS];
-extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD];
-extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD];
+extern pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS];
+extern pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD];
+extern pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD];
  extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D];

  int kasan_populate_early_shadow(const void *shadow_start,
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 348f31d15a97..cc64ed6858c6 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -41,7 +41,7 @@ static inline bool kasan_p4d_table(pgd_t pgd)
  }
  #endif
  #if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss;
  static inline bool kasan_pud_table(p4d_t p4d)
  {
 return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
@@ -53,7 +53,7 @@ static inline bool kasan_pud_table(p4d_t p4d)
  }
  #endif
  #if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss;
  static inline bool kasan_pmd_table(pud_t pud)
  {
 return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
@@ -64,7 +64,7 @@ static inline bool kasan_pmd_table(pud_t pud)
 return false;
  }
  #endif
-pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
+pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS]
 __page_aligned_bss;

  static inline bool kasan_pte_table(pmd_t pmd)
--
2.30.2

--
You received this message because you are subscribed to the Google Groups 
"kasan-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to kasan-dev+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/kasan-dev/20210616080244.51236-4-dja%40axtens.net.


Re: Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'

2021-06-16 Thread Christophe Leroy




Le 16/06/2021 à 08:33, Madhavan Srinivasan a écrit :


On 6/16/21 11:56 AM, Christophe Leroy wrote:



Le 16/06/2021 à 05:40, Athira Rajeev a écrit :




On 16-Jun-2021, at 8:53 AM, Madhavan Srinivasan  wrote:


On 6/15/21 8:35 PM, Christophe Leroy wrote:
For your information, I'm getting the following Oops. Detected with 5.13-rc6, it also oopses on 
5.12 and 5.11.

Runs ok on 5.10. I'm starting bisecting now.



Thanks for reporting, got the issue. What has happened in this case is that, pmu device is not 
registered
and trying to access the instruction point which will land in perf_instruction_pointer(). And 
recently I have added
a workaround patch for power10 DD1 which has caused this breakage. My bad. We are working on a 
fix patch

for the same and will post it out. Sorry again.



Hi Christophe,

Can you please try with below patch in your environment and test if it works 
for you.

 From 55d3afc9369dfbe28a7152c8e9f856c11c7fe43d Mon Sep 17 00:00:00 2001
From: Athira Rajeev 
Date: Tue, 15 Jun 2021 22:28:11 -0400
Subject: [PATCH] powerpc/perf: Fix crash with 'perf_instruction_pointer' when
  pmu is not set

On systems without any specific PMU driver support registered, running
perf record causes oops:

[   38.841073] NIP [c013af54] perf_instruction_pointer+0x24/0x100
[   38.841079] LR [c03c7358] perf_prepare_sample+0x4e8/0x820
[   38.841085] --- interrupt: 300
[   38.841088] [c0001cf03440] [c03c6ef8] 
perf_prepare_sample+0x88/0x820 (unreliable)
[   38.841096] [c0001cf034a0] [c03c76d0] 
perf_event_output_forward+0x40/0xc0
[   38.841104] [c0001cf03520] [c03b45e8] 
__perf_event_overflow+0x88/0x1b0
[   38.841112] [c0001cf03570] [c03b480c] 
perf_swevent_hrtimer+0xfc/0x1a0
[   38.841119] [c0001cf03740] [c02399cc] 
__hrtimer_run_queues+0x17c/0x380
[   38.841127] [c0001cf037c0] [c023a5f8] 
hrtimer_interrupt+0x128/0x2f0
[   38.841135] [c0001cf03870] [c002962c] timer_interrupt+0x13c/0x370
[   38.841143] [c0001cf038d0] [c0009ba4] 
decrementer_common_virt+0x1a4/0x1b0
[   38.841151] --- interrupt: 900 at copypage_power7+0xd4/0x1c0

During perf record session, perf_instruction_pointer() is called to
capture the sample ip. This function in core-book3s accesses ppmu->flags.
If a platform specific PMU driver is not registered, ppmu is set to NULL
and accessing its members results in a crash. Fix this crash by checking
if ppmu is set.

Signed-off-by: Athira Rajeev 
Reported-by: Christophe Leroy 


Fixes: 2ca13a4cc56c ("powerpc/perf: Use regs->nip when SIAR is zero")
Cc: sta...@vger.kernel.org
Tested-by: Christophe Leroy 

Thanks, but just wonder what is the system config and processor version in 
which you got this fail.
Reason, we do have generic-pmu which should kick-in in absence of a platform 
specific driver.




It's an mpc8321 (book3s/32)

Christophe


Re: Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'

2021-06-16 Thread Christophe Leroy




Le 16/06/2021 à 05:40, Athira Rajeev a écrit :




On 16-Jun-2021, at 8:53 AM, Madhavan Srinivasan  wrote:


On 6/15/21 8:35 PM, Christophe Leroy wrote:

For your information, I'm getting the following Oops. Detected with 5.13-rc6, 
it also oopses on 5.12 and 5.11.
Runs ok on 5.10. I'm starting bisecting now.



Thanks for reporting, got the issue. What has happened in this case is that, 
pmu device is not registered
and trying to access the instruction point which will land in 
perf_instruction_pointer(). And recently I have added
a workaround patch for power10 DD1 which has caused this breakage. My bad. We 
are working on a fix patch
for the same and will post it out. Sorry again.



Hi Christophe,

Can you please try with below patch in your environment and test if it works 
for you.

 From 55d3afc9369dfbe28a7152c8e9f856c11c7fe43d Mon Sep 17 00:00:00 2001
From: Athira Rajeev 
Date: Tue, 15 Jun 2021 22:28:11 -0400
Subject: [PATCH] powerpc/perf: Fix crash with 'perf_instruction_pointer' when
  pmu is not set

On systems without any specific PMU driver support registered, running
perf record causes oops:

[   38.841073] NIP [c013af54] perf_instruction_pointer+0x24/0x100
[   38.841079] LR [c03c7358] perf_prepare_sample+0x4e8/0x820
[   38.841085] --- interrupt: 300
[   38.841088] [c0001cf03440] [c03c6ef8] 
perf_prepare_sample+0x88/0x820 (unreliable)
[   38.841096] [c0001cf034a0] [c03c76d0] 
perf_event_output_forward+0x40/0xc0
[   38.841104] [c0001cf03520] [c03b45e8] 
__perf_event_overflow+0x88/0x1b0
[   38.841112] [c0001cf03570] [c03b480c] 
perf_swevent_hrtimer+0xfc/0x1a0
[   38.841119] [c0001cf03740] [c02399cc] 
__hrtimer_run_queues+0x17c/0x380
[   38.841127] [c0001cf037c0] [c023a5f8] 
hrtimer_interrupt+0x128/0x2f0
[   38.841135] [c0001cf03870] [c002962c] timer_interrupt+0x13c/0x370
[   38.841143i] [c0001cf038d0] [c0009ba4] 
decrementer_common_virt+0x1a4/0x1b0
[   38.841151] --- interrupt: 900 at copypage_power7+0xd4/0x1c0

During perf record session, perf_instruction_pointer() is called to
capture the sample ip. This function in core-book3s accesses ppmu->flags.
If a platform specific PMU driver is not registered, ppmu is set to NULL
and accessing its members results in a crash. Fix this crash by checking
if ppmu is set.

Signed-off-by: Athira Rajeev 
Reported-by: Christophe Leroy 


Fixes: 2ca13a4cc56c ("powerpc/perf: Use regs->nip when SIAR is zero")
Cc: sta...@vger.kernel.org
Tested-by: Christophe Leroy 


---
  arch/powerpc/perf/core-book3s.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 16d4d1b6a1ff..816756588cb7 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2254,7 +2254,7 @@ unsigned long perf_instruction_pointer(struct pt_regs 
*regs)
bool use_siar = regs_use_siar(regs);
unsigned long siar = mfspr(SPRN_SIAR);
  
-	if (ppmu->flags & PPMU_P10_DD1) {

+   if (ppmu && ppmu->flags & PPMU_P10_DD1) {
if (siar)
return siar;
else



Oops (NULL pointer) with 'perf record' of selftest 'null_syscall'

2021-06-15 Thread Christophe Leroy
For your information, I'm getting the following Oops. Detected with 5.13-rc6, it also oopses on 5.12 
and 5.11.

Runs ok on 5.10. I'm starting bisecting now.

root@vgoippro:/tmp# perf record /root/null_syscall
[  285.559987] BUG: Kernel NULL pointer dereference on read at 0x0040
[  285.566533] Faulting instruction address: 0xc0021f0c
[  285.571486] Oops: Kernel access of bad area, sig: 11 [#1]
[  285.576872] BE PAGE_SIZE=4K PREEMPT CMPCPRO
[  285.581080] SAF3000 DIE NOTIFICATION
[  285.584661] CPU: 0 PID: 442 Comm: null_syscall Not tainted 5.13.0-rc6-s3k-dev-01645-g7649ee3d2957 
#5164

[  285.594035] NIP:  c0021f0c LR: c00e8ad8 CTR: c00d8a5c
[  285.599074] REGS: e67757d0 TRAP: 0300   Not tainted  
(5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
[  285.607576] MSR:  1032   CR: 44775b18  XER: 2000
[  285.614063] DAR: 0040 DSISR: 2000
[  285.614063] GPR00: c00e8810 e6775880 c1c52640 e6775b20 7cb36ae0 f028 
43ebeedc 5ccc47d0
[  285.614063] GPR08:  0900 e6775b20 0001  1025b2c0 
10013088 10012ee0
[  285.614063] GPR16: b000 0007 0001 c00deb64 0042 0001 
78db7b23 c0b13200
[  285.614063] GPR24:    e6775b20 c13b8560 0107 
e6775940 e67758e8
[  285.651693] NIP [c0021f0c] perf_instruction_pointer+0x10/0x60
[  285.657460] LR [c00e8ad8] perf_prepare_sample+0x344/0x674
[  285.662859] Call Trace:
[  285.665301] [e6775880] [c00e8810] perf_prepare_sample+0x7c/0x674 (unreliable)
[  285.672452] [e67758c0] [c00e8e44] perf_event_output_forward+0x3c/0x94
[  285.678903] [e6775910] [c00dea8c] __perf_event_overflow+0x74/0x14c
[  285.685108] [e6775930] [c00dec5c] perf_swevent_hrtimer+0xf8/0x170
[  285.691217] [e6775a40] [c008c8d0] 
__hrtimer_run_queues.constprop.0+0x160/0x318
[  285.698456] [e6775a90] [c008d94c] hrtimer_interrupt+0x148/0x3b0
[  285.704394] [e6775ae0] [c000c0c0] timer_interrupt+0xc4/0x22c
[  285.710067] [e6775b10] [c00046f0] Decrementer_virt+0xb8/0xbc
[  285.715744] --- interrupt: 900 at pagecache_get_page+0x210/0x430
[  285.721764] NIP:  c00f52a8 LR: c00f5408 CTR: c00f59d8
[  285.726805] REGS: e6775b20 TRAP: 0900   Not tainted  
(5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
[  285.735306] MSR:  9032   CR: 28422d68  XER: 
[  285.742056]
[  285.742056] GPR00: c00f513c e6775bd0 c1c52640 c1c52640   
 c1382c38
[  285.742056] GPR08:   0001  88482d68 1025b2c0 
10013088 10012ee0
[  285.742056] GPR16: b000 0007 0001 10012ee0 c18187ac c0b87800 
61c88647 c0c18c00
[  285.742056] GPR24: 0001 0003  0002 c18187a8 00100cca 
0044 
[  285.777079] NIP [c00f52a8] pagecache_get_page+0x210/0x430
[  285.782482] LR [c00f5408] pagecache_get_page+0x370/0x430
[  285.787796] --- interrupt: 900
[  285.790843] [e6775bd0] [c00f513c] pagecache_get_page+0xa4/0x430 (unreliable)
[  285.797910] [e6775c30] [c00f5ca8] filemap_fault+0x2d0/0x8e8
[  285.803500] [e6775ca0] [c012d244] __do_fault+0x4c/0xd8
[  285.808666] [e6775cb0] [c0130f64] handle_mm_fault+0x274/0x10b8
[  285.814517] [e6775d30] [c0014f58] do_page_fault+0x1d4/0x67c
[  285.820117] [e6775d60] [c000424c] DataAccess_virt+0xd4/0xe4
[  285.825707] --- interrupt: 300 at __arch_clear_user+0x10/0xcc
[  285.831458] NIP:  c001a3cc LR: c01d5cfc CTR: 
[  285.836497] REGS: e6775d70 TRAP: 0300   Not tainted  
(5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
[  285.845000] MSR:  9032   CR: 48004264  XER: 2000
[  285.851751] DAR: 10012ee0 DSISR: 2200
[  285.851751] GPR00: c01d53fc e6775e20 c1c52640  0120 0008 
c136241c 
[  285.851751] GPR08:  9ffed120 10012ee0 0004 28004868 1025b2c0 
10013088 10012ee0
[  285.851751] GPR16: b000 0007 0001 10012ee0 1000 10012d0c 
1000 c1d74240
[  285.851751] GPR24: 10012ee0  c1345e80 c1343dc0 1b38  
c132ec00 c1386a00
[  285.889384] NIP [c001a3cc] __arch_clear_user+0x10/0xcc
[  285.894527] LR [c01d5cfc] load_elf_binary+0xec4/0x1340
[  285.899682] --- interrupt: 300
[  285.902730] [e6775e20] [c01d53fc] load_elf_binary+0x5c4/0x1340 (unreliable)
[  285.909713] [e6775ea0] [c0163258] bprm_execve+0x200/0x55c
[  285.915138] [e6775ef0] [c0163e00] do_execveat_common+0x178/0x1f4
[  285.921162] [e6775f20] [c0165558] sys_execve+0x40/0x58
[  285.926321] [e6775f40] [c001404c] ret_from_syscall+0x0/0x28
[  285.931917] --- interrupt: c00 at 0xfc3ce78
[  285.936097] NIP:  0fc3ce78 LR: 0fc3d7cc CTR: c01657cc
[  285.941135] REGS: e6775f50 TRAP: 0c00   Not tainted  
(5.13.0-rc6-s3k-dev-01645-g7649ee3d2957)
[  285.949636] MSR:  d032   CR: 22004868  XER: 2000
[  285.956655]
[  285.956655] GPR00: 000b afab1bf0 a7d77a50 afab6ee1 afab64c8 104bd9b0 
fefefeff 7f7f7f7f
[  285.956655] GPR08: afab6ee0  006df8f9 011d 24004864 1025b2c0 
10231a50 10249108
[  285.956655] GPR16: 104beeb0 10254830 105dd3f4 1025 1018a124 10188448 
10234e58 
[  285.956655] GPR24: 10231ae0 0003 0001 104bd9b0 

Re: [PATCH v2 2/4] powerpc/interrupt: Refactor prep_irq_for_user_exit()

2021-06-15 Thread Christophe Leroy




Le 11/06/2021 à 04:30, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of June 5, 2021 12:56 am:

prep_irq_for_user_exit() is a superset of
prep_irq_for_kernel_enabled_exit().

Refactor it.


I like the refactoring, but now prep_irq_for_user_exit() is calling
prep_irq_for_kernel_enabled_exit(), which seems like the wrong naming.

You could re-name prep_irq_for_kernel_enabled_exit() to
prep_irq_for_enabled_exit() maybe? Or it could be
__prep_irq_for_enabled_exit() then prep_irq_for_kernel_enabled_exit()
and prep_irq_for_user_exit() would both call it.


I renamed it prep_irq_for_enabled_exit().

And I realised that after patch 4, prep_irq_for_enabled_exit() has become a trivial function used 
only once.


So I swapped patches 1/2 with patches 3/4 and added a 5th one to squash prep_irq_for_enabled_exit() 
into its caller.


You didn't have any comment on patch 4 (that is now patch 2) ?

Thanks for the review
Christophe


[PATCH v3 5/5] powerpc/interrupt: Remove prep_irq_for_user_exit()

2021-06-15 Thread Christophe Leroy
prep_irq_for_user_exit() has only one caller, squash it
inside that caller.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/interrupt.c | 16 +++-
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 05831d99bf26..de335da7ab52 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -75,18 +75,6 @@ static notrace __always_inline bool 
prep_irq_for_enabled_exit(bool restartable)
return true;
 }
 
-static notrace __always_inline bool prep_irq_for_user_exit(void)
-{
-   bool ret;
-
-   user_enter_irqoff();
-   ret = prep_irq_for_enabled_exit(true);
-   if (!ret)
-   user_exit_irqoff();
-
-   return ret;
-}
-
 /* Has to run notrace because it is entered not completely "reconciled" */
 notrace long system_call_exception(long r3, long r4, long r5,
   long r6, long r7, long r8,
@@ -276,7 +264,9 @@ interrupt_exit_user_prepare_main(struct pt_regs *regs, 
unsigned long ret)
}
}
 
-   if (!prep_irq_for_user_exit()) {
+   user_enter_irqoff();
+   if (!prep_irq_for_enabled_exit(true)) {
+   user_exit_irqoff();
local_irq_enable();
local_irq_disable();
goto again;
-- 
2.25.0



[PATCH v3 4/5] powerpc/interrupt: Refactor prep_irq_for_{user/kernel_enabled}_exit()

2021-06-15 Thread Christophe Leroy
prep_irq_for_user_exit() is a superset of
prep_irq_for_kernel_enabled_exit().

Rename prep_irq_for_kernel_enabled_exit() as prep_irq_for_enabled_exit()
and have prep_irq_for_user_exit() use it.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 29 +++--
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 9780c26f19cf..05831d99bf26 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -50,7 +50,7 @@ static inline bool exit_must_hard_disable(void)
  * restartable is true then EE/RI can be left on because interrupts are handled
  * with a restart sequence.
  */
-static notrace __always_inline bool prep_irq_for_kernel_enabled_exit(bool 
restartable)
+static notrace __always_inline bool prep_irq_for_enabled_exit(bool restartable)
 {
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
@@ -77,29 +77,14 @@ static notrace __always_inline bool 
prep_irq_for_kernel_enabled_exit(bool restar
 
 static notrace __always_inline bool prep_irq_for_user_exit(void)
 {
-   user_enter_irqoff();
-   /* This must be done with RI=1 because tracing may touch vmaps */
-   trace_hardirqs_on();
-
-#ifdef CONFIG_PPC32
-   __hard_EE_RI_disable();
-#else
-   if (exit_must_hard_disable())
-   __hard_EE_RI_disable();
+   bool ret;
 
-   /* This pattern matches prep_irq_for_idle */
-   if (unlikely(lazy_irq_pending_nocheck())) {
-   if (exit_must_hard_disable()) {
-   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
-   __hard_RI_enable();
-   }
-   trace_hardirqs_off();
+   user_enter_irqoff();
+   ret = prep_irq_for_enabled_exit(true);
+   if (!ret)
user_exit_irqoff();
 
-   return false;
-   }
-#endif
-   return true;
+   return ret;
 }
 
 /* Has to run notrace because it is entered not completely "reconciled" */
@@ -465,7 +450,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct 
pt_regs *regs)
 * Stack store exit can't be restarted because the interrupt
 * stack frame might have been clobbered.
 */
-   if (!prep_irq_for_kernel_enabled_exit(unlikely(stack_store))) {
+   if (!prep_irq_for_enabled_exit(unlikely(stack_store))) {
/*
 * Replay pending soft-masked interrupts now. Don't
 * just local_irq_enabe(); local_irq_disable(); because
-- 
2.25.0



[PATCH v3 3/5] powerpc/interrupt: Interchange prep_irq_for_{kernel_enabled/user}_exit()

2021-06-15 Thread Christophe Leroy
prep_irq_for_user_exit() is a superset of
prep_irq_for_kernel_enabled_exit(). In order to allow refactoring in
following patch, interchange the two. This will allow
prep_irq_for_user_exit() to call a renamed version of
prep_irq_for_kernel_enabled_exit().

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/interrupt.c | 23 +++
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index b9558372adc0..9780c26f19cf 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -46,27 +46,28 @@ static inline bool exit_must_hard_disable(void)
  * This should be called with local irqs disabled, but if they were previously
  * enabled when the interrupt handler returns (indicating a process-context /
  * synchronous interrupt) then irqs_enabled should be true.
+ *
+ * restartable is true then EE/RI can be left on because interrupts are handled
+ * with a restart sequence.
  */
-static notrace __always_inline bool prep_irq_for_user_exit(void)
+static notrace __always_inline bool prep_irq_for_kernel_enabled_exit(bool 
restartable)
 {
-   user_enter_irqoff();
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
 
 #ifdef CONFIG_PPC32
__hard_EE_RI_disable();
 #else
-   if (exit_must_hard_disable())
+   if (exit_must_hard_disable() || !restartable)
__hard_EE_RI_disable();
 
/* This pattern matches prep_irq_for_idle */
if (unlikely(lazy_irq_pending_nocheck())) {
-   if (exit_must_hard_disable()) {
+   if (exit_must_hard_disable() || !restartable) {
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
__hard_RI_enable();
}
trace_hardirqs_off();
-   user_exit_irqoff();
 
return false;
}
@@ -74,28 +75,26 @@ static notrace __always_inline bool 
prep_irq_for_user_exit(void)
return true;
 }
 
-/*
- * restartable is true then EE/RI can be left on because interrupts are handled
- * with a restart sequence.
- */
-static notrace __always_inline bool prep_irq_for_kernel_enabled_exit(bool 
restartable)
+static notrace __always_inline bool prep_irq_for_user_exit(void)
 {
+   user_enter_irqoff();
/* This must be done with RI=1 because tracing may touch vmaps */
trace_hardirqs_on();
 
 #ifdef CONFIG_PPC32
__hard_EE_RI_disable();
 #else
-   if (exit_must_hard_disable() || !restartable)
+   if (exit_must_hard_disable())
__hard_EE_RI_disable();
 
/* This pattern matches prep_irq_for_idle */
if (unlikely(lazy_irq_pending_nocheck())) {
-   if (exit_must_hard_disable() || !restartable) {
+   if (exit_must_hard_disable()) {
local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
__hard_RI_enable();
}
trace_hardirqs_off();
+   user_exit_irqoff();
 
return false;
}
-- 
2.25.0



[PATCH v3 2/5] powerpc/interrupt: Refactor interrupt_exit_user_prepare()

2021-06-15 Thread Christophe Leroy
interrupt_exit_user_prepare() is a superset of
interrupt_exit_user_prepare_main().

Refactor to avoid code duplication.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/interrupt.c | 57 ++---
 1 file changed, 3 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index ba2d602d2da6..b9558372adc0 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -400,9 +400,7 @@ notrace unsigned long syscall_exit_restart(unsigned long 
r3, struct pt_regs *reg
 
 notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs)
 {
-   unsigned long ti_flags;
-   unsigned long flags;
-   unsigned long ret = 0;
+   unsigned long ret;
 
if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x))
BUG_ON(!(regs->msr & MSR_RI));
@@ -416,63 +414,14 @@ notrace unsigned long interrupt_exit_user_prepare(struct 
pt_regs *regs)
 */
kuap_assert_locked();
 
-   local_irq_save(flags);
-
-again:
-   ti_flags = READ_ONCE(current_thread_info()->flags);
-   while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) {
-   local_irq_enable(); /* returning to user: may enable */
-   if (ti_flags & _TIF_NEED_RESCHED) {
-   schedule();
-   } else {
-   if (ti_flags & _TIF_SIGPENDING)
-   ret |= _TIF_RESTOREALL;
-   do_notify_resume(regs, ti_flags);
-   }
-   local_irq_disable();
-   ti_flags = READ_ONCE(current_thread_info()->flags);
-   }
-
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
-   if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
-   unlikely((ti_flags & _TIF_RESTORE_TM))) {
-   restore_tm_state(regs);
-   } else {
-   unsigned long mathflags = MSR_FP;
-
-   if (cpu_has_feature(CPU_FTR_VSX))
-   mathflags |= MSR_VEC | MSR_VSX;
-   else if (cpu_has_feature(CPU_FTR_ALTIVEC))
-   mathflags |= MSR_VEC;
-
-   /* See above restore_math comment */
-   if ((regs->msr & mathflags) != mathflags)
-   restore_math(regs);
-   }
-   }
-
-   if (!prep_irq_for_user_exit()) {
-   local_irq_enable();
-   local_irq_disable();
-   goto again;
-   }
-
-   booke_load_dbcr0();
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   local_paca->tm_scratch = regs->msr;
-#endif
+   local_irq_disable();
 
-   account_cpu_user_exit();
+   ret = interrupt_exit_user_prepare_main(regs, 0);
 
 #ifdef CONFIG_PPC64
regs->exit_result = ret;
 #endif
 
-   /* Restore user access locks last */
-   kuap_user_restore(regs);
-   kuep_unlock();
-
return ret;
 }
 
-- 
2.25.0



[PATCH v3 1/5] powerpc/interrupt: Rename and lightly change syscall_exit_prepare_main()

2021-06-15 Thread Christophe Leroy
Rename syscall_exit_prepare_main() into interrupt_exit_prepare_main()

Make it static as it is not used anywhere else.

Pass it the 'ret' so that it can 'or' it directly instead of
oring twice, once inside the function and once outside.

And remove 'r3' parameter which is not used.

Also fix a typo where CONFIG_PPC_BOOK3S should be CONFIG_PPC_BOOK3S_64.

Signed-off-by: Christophe Leroy 
Reviewed-by: Nicholas Piggin 
---
This series applies on top of Nic's series speeding up interrupt return on 64s

 arch/powerpc/kernel/interrupt.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c
index 74c995a42399..ba2d602d2da6 100644
--- a/arch/powerpc/kernel/interrupt.c
+++ b/arch/powerpc/kernel/interrupt.c
@@ -243,11 +243,10 @@ static notrace void booke_load_dbcr0(void)
 #endif
 }
 
-notrace unsigned long syscall_exit_prepare_main(unsigned long r3,
-   struct pt_regs *regs)
+static notrace unsigned long
+interrupt_exit_user_prepare_main(struct pt_regs *regs, unsigned long ret)
 {
unsigned long ti_flags;
-   unsigned long ret = 0;
 
 again:
ti_flags = READ_ONCE(current_thread_info()->flags);
@@ -269,7 +268,7 @@ notrace unsigned long syscall_exit_prepare_main(unsigned 
long r3,
ti_flags = READ_ONCE(current_thread_info()->flags);
}
 
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) {
+   if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && IS_ENABLED(CONFIG_PPC_FPU)) {
if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
unlikely((ti_flags & _TIF_RESTORE_TM))) {
restore_tm_state(regs);
@@ -365,7 +364,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,
}
 
local_irq_disable();
-   ret |= syscall_exit_prepare_main(r3, regs);
+   ret = interrupt_exit_user_prepare_main(regs, ret);
 
 #ifdef CONFIG_PPC64
regs->exit_result = ret;
@@ -393,7 +392,7 @@ notrace unsigned long syscall_exit_restart(unsigned long 
r3, struct pt_regs *reg
 
BUG_ON(!user_mode(regs));
 
-   regs->exit_result |= syscall_exit_prepare_main(r3, regs);
+   regs->exit_result = interrupt_exit_user_prepare_main(regs, 
regs->exit_result);
 
return regs->exit_result;
 }
-- 
2.25.0



Re: [PATCH 5/7] signal: Add unsafe_copy_siginfo_to_user()

2021-06-15 Thread Christophe Leroy




Le 15/06/2021 à 09:21, Christoph Hellwig a écrit :

On Tue, Jun 15, 2021 at 09:03:42AM +0200, Christophe Leroy wrote:



Le 15/06/2021 à 08:52, Christoph Hellwig a écrit :

On Tue, Jun 15, 2021 at 06:41:01AM +0000, Christophe Leroy wrote:

+   unsafe_copy_to_user(__ucs_to, __ucs_from,   \
+   sizeof(struct kernel_siginfo), label);  \
+   unsafe_clear_user(__ucs_expansion, SI_EXPANSION_SIZE, label);   \
+} while (0)


unsafe_clear_user does not exist at this point, and even your later
patch only adds it for powerpc.



You missed below chunck I guess:


diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index c05e903cef02..37073caac474 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -398,6 +398,7 @@ long strnlen_user_nofault(const void __user *unsafe_addr, 
long count);
   #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
   #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
   #define unsafe_copy_from_user(d,s,l,e) 
unsafe_op_wrap(__copy_from_user(d,s,l),e)
+#define unsafe_clear_user(d, l, e) unsafe_op_wrap(__clear_user(d, l), e)


That doesn't help with architectures that define user_access_begin but
do not define unsafe_clear_user. (i.e. x86).



Yes, the day they want to use unsafe_copy_siginfo_to_user() they'll have to implement 
unsafe_clear_user().


Until that day, they don't need unsafe_clear_user() and I'm sure the result would be disastrous if a 
poor powerpc guy like me was trying to implement some low level x86 code.


Similar to unsafe_get_compat_sigset(), an arch wanting to use it has to implement 
unsafe_copy_from_user().


Re: [PATCH v2 00/12] powerpc: Cleanup use of 'struct ppc_inst'

2021-06-15 Thread Christophe Leroy




Le 15/06/2021 à 09:18, Michael Ellerman a écrit :

Christophe Leroy  writes:

This series is a cleanup of the use of 'struct ppc_inst'.

A confusion is made between internal representation of powerpc
instructions with 'struct ppc_inst' and in-memory code which is
and will always be an array of 'unsigned int'.


Why don't we use u32 *, to make it even more explicit what the expected
size is?



I guess that's historical, we could use u32 *

We can convert it incrementaly maybe ?


Re: [PATCH 6/7] powerpc/uaccess: Add unsafe_clear_user()

2021-06-15 Thread Christophe Leroy




Le 15/06/2021 à 08:53, Christoph Hellwig a écrit :

On Tue, Jun 15, 2021 at 06:41:02AM +0000, Christophe Leroy wrote:

Implement unsafe_clear_user() for powerpc.
It's a copy/paste of unsafe_copy_to_user() with value 0 as source.

It may be improved in a later patch by using 'dcbz' instruction
to zeroize full cache lines at once.


Please add this to common code insted of making it powerpc specific.



A common version is added in previous patch.

Just like unsafe_copy_to_user(), unsafe_clear_user() needs to be arch defined.

unsafe_copy_to_user() has both an x86 implementation and a powerpc 
implementation, why do different ?

I can't see how it could be not powerpc specific. At the end we want to use 'dcbz' to zeroize full 
cachelines at once, even if at the time being that's a simple write of 0.


Re: [PATCH 5/7] signal: Add unsafe_copy_siginfo_to_user()

2021-06-15 Thread Christophe Leroy




Le 15/06/2021 à 08:52, Christoph Hellwig a écrit :

On Tue, Jun 15, 2021 at 06:41:01AM +0000, Christophe Leroy wrote:

+   unsafe_copy_to_user(__ucs_to, __ucs_from,   \
+   sizeof(struct kernel_siginfo), label);  \
+   unsafe_clear_user(__ucs_expansion, SI_EXPANSION_SIZE, label);   \
+} while (0)


unsafe_clear_user does not exist at this point, and even your later
patch only adds it for powerpc.



You missed below chunck I guess:

> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index c05e903cef02..37073caac474 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -398,6 +398,7 @@ long strnlen_user_nofault(const void __user *unsafe_addr, 
long count);
>   #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
>   #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
>   #define unsafe_copy_from_user(d,s,l,e) 
unsafe_op_wrap(__copy_from_user(d,s,l),e)
> +#define unsafe_clear_user(d, l, e) unsafe_op_wrap(__clear_user(d, l), e)
>   static inline unsigned long user_access_save(void) { return 0UL; }
>   static inline void user_access_restore(unsigned long flags) { }
>   #endif


Re: [PATCH 7/7] powerpc/signal: Use unsafe_copy_siginfo_to_user()

2021-06-15 Thread Christophe Leroy




Le 15/06/2021 à 08:55, Christoph Hellwig a écrit :

@@ -836,14 +830,19 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0]));
}
unsafe_put_sigset_t(&frame->uc.uc_sigmask, oldset, failed);
+#ifndef CONFIG_COMPAT
+   unsafe_copy_siginfo_to_user(&frame->info, &ksig->info, failed);
+#endif
  
  	/* create a stack frame for the caller of the handler */

unsafe_put_user(regs->gpr[1], newsp, failed);
  
  	user_access_end();
  
-	if (copy_siginfo_to_user(&frame->info, &ksig->info))

+#ifdef CONFIG_COMPAT
+   if (copy_siginfo_to_user32(&frame->info, &ksig->info))
goto badframe;
+#endif


Shouldn't the compat case be handled the same way?



It would be best, but it is not that easy to convert. So for the time being it is left aside, anyway 
compat is for compatibility, so performance doesn't matter so much.


[PATCH 7/7] powerpc/signal: Use unsafe_copy_siginfo_to_user()

2021-06-15 Thread Christophe Leroy
Use unsafe_copy_siginfo_to_user() in order to do the copy
within the user access block.

On an mpc 8321 (book3s/32) the improvement is about 5% on a process
sending a signal to itself.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 13 ++---
 arch/powerpc/kernel/signal_64.c |  5 +
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 621de6e457b3..f3276cf05c8a 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -765,12 +765,6 @@ static long restore_tm_user_regs(struct pt_regs *regs, 
struct mcontext __user *s
 }
 #endif
 
-#ifdef CONFIG_PPC64
-
-#define copy_siginfo_to_user   copy_siginfo_to_user32
-
-#endif /* CONFIG_PPC64 */
-
 /*
  * Set up a signal frame for a "real-time" signal handler
  * (one which gets siginfo).
@@ -836,14 +830,19 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0]));
}
unsafe_put_sigset_t(&frame->uc.uc_sigmask, oldset, failed);
+#ifndef CONFIG_COMPAT
+   unsafe_copy_siginfo_to_user(&frame->info, &ksig->info, failed);
+#endif
 
/* create a stack frame for the caller of the handler */
unsafe_put_user(regs->gpr[1], newsp, failed);
 
user_access_end();
 
-   if (copy_siginfo_to_user(&frame->info, &ksig->info))
+#ifdef CONFIG_COMPAT
+   if (copy_siginfo_to_user32(&frame->info, &ksig->info))
goto badframe;
+#endif
 
regs->link = tramp;
 
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 35c301457fbf..47cf7462e0d6 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -901,15 +901,12 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t 
*set,
}
 
unsafe_copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set), 
badframe_block);
+   unsafe_copy_siginfo_to_user(&frame->info, &ksig->info, badframe_block);
/* Allocate a dummy caller frame for the signal handler. */
unsafe_put_user(regs->gpr[1], newsp, badframe_block);
 
user_write_access_end();
 
-   /* Save the siginfo outside of the unsafe block. */
-   if (copy_siginfo_to_user(&frame->info, &ksig->info))
-   goto badframe;
-
/* Make sure signal handler doesn't get spurious FP exceptions */
tsk->thread.fp_state.fpscr = 0;
 
-- 
2.25.0



[PATCH 6/7] powerpc/uaccess: Add unsafe_clear_user()

2021-06-15 Thread Christophe Leroy
Implement unsafe_clear_user() for powerpc.
It's a copy/paste of unsafe_copy_to_user() with value 0 as source.

It may be improved in a later patch by using 'dcbz' instruction
to zeroize full cache lines at once.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/uaccess.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 22c79ab40006..962b675485ff 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -467,6 +467,26 @@ do {   
\
unsafe_put_user(*(u8*)(_src + _i), (u8 __user *)(_dst + _i), 
e); \
 } while (0)
 
+#define unsafe_clear_user(d, l, e) \
+do {   \
+   u8 __user *_dst = (u8 __user *)(d); \
+   size_t _len = (l);  \
+   int _i; \
+   \
+   for (_i = 0; _i < (_len & ~(sizeof(u64) - 1)); _i += sizeof(u64)) \
+   unsafe_put_user(0, (u64 __user *)(_dst + _i), e);   \
+   if (_len & 4) { \
+   unsafe_put_user(0, (u32 __user *)(_dst + _i), e);   \
+   _i += 4;\
+   }   \
+   if (_len & 2) { \
+   unsafe_put_user(0, (u16 __user *)(_dst + _i), e);   \
+   _i += 2;\
+   }   \
+   if (_len & 1)   \
+   unsafe_put_user(0, (u8 __user *)(_dst + _i), e);\
+} while (0)
+
 #define HAVE_GET_KERNEL_NOFAULT
 
 #define __get_kernel_nofault(dst, src, type, err_label)
\
-- 
2.25.0



[PATCH 5/7] signal: Add unsafe_copy_siginfo_to_user()

2021-06-15 Thread Christophe Leroy
In the same spirit as commit fb05121fd6a2 ("signal: Add
unsafe_get_compat_sigset()"), implement an 'unsafe' version of
copy_siginfo_to_user() in order to use it within user access blocks.

For that, also add an 'unsafe' version of clear_user().

Signed-off-by: Christophe Leroy 
---
 include/linux/signal.h  | 15 +++
 include/linux/uaccess.h |  1 +
 kernel/signal.c |  5 -
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 201f88e3738b..beac7b5e4acc 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -35,6 +35,21 @@ static inline void copy_siginfo_to_external(siginfo_t *to,
 int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
 int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);
 
+static __always_inline char __user *si_expansion(const siginfo_t __user *info)
+{
+   return ((char __user *)info) + sizeof(struct kernel_siginfo);
+}
+
+#define unsafe_copy_siginfo_to_user(to, from, label) do {  \
+   siginfo_t __user *__ucs_to = to;\
+   const kernel_siginfo_t *__ucs_from = from;  \
+   char __user *__ucs_expansion = si_expansion(__ucs_to);  \
+   \
+   unsafe_copy_to_user(__ucs_to, __ucs_from,   \
+   sizeof(struct kernel_siginfo), label);  \
+   unsafe_clear_user(__ucs_expansion, SI_EXPANSION_SIZE, label);   \
+} while (0)
+
 enum siginfo_layout {
SIL_KILL,
SIL_TIMER,
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index c05e903cef02..37073caac474 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -398,6 +398,7 @@ long strnlen_user_nofault(const void __user *unsafe_addr, 
long count);
 #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
 #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
 #define unsafe_copy_from_user(d,s,l,e) 
unsafe_op_wrap(__copy_from_user(d,s,l),e)
+#define unsafe_clear_user(d, l, e) unsafe_op_wrap(__clear_user(d, l), e)
 static inline unsigned long user_access_save(void) { return 0UL; }
 static inline void user_access_restore(unsigned long flags) { }
 #endif
diff --git a/kernel/signal.c b/kernel/signal.c
index f7c6ffcbd044..7a366331d2b7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3286,11 +3286,6 @@ enum siginfo_layout siginfo_layout(unsigned sig, int 
si_code)
return layout;
 }
 
-static inline char __user *si_expansion(const siginfo_t __user *info)
-{
-   return ((char __user *)info) + sizeof(struct kernel_siginfo);
-}
-
 int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
 {
char __user *expansion = si_expansion(to);
-- 
2.25.0



[PATCH 4/7] powerpc/signal: Include the new stack frame inside the user access block

2021-06-15 Thread Christophe Leroy
Include the new stack frame inside the user access block and set it up
using unsafe_put_user().

On an mpc 8321 (book3s/32) the improvement is about 4% on a process
sending a signal to itself.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 29 +
 arch/powerpc/kernel/signal_64.c | 14 +++---
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 8f05ed0da292..621de6e457b3 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -781,7 +781,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
struct rt_sigframe __user *frame;
struct mcontext __user *mctx;
struct mcontext __user *tm_mctx = NULL;
-   unsigned long newsp = 0;
+   unsigned long __user *newsp;
unsigned long tramp;
struct pt_regs *regs = tsk->thread.regs;
/* Save the thread's msr before get_tm_stackpointer() changes it */
@@ -789,6 +789,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+   newsp = (unsigned long __user *)((unsigned long)frame - 
(__SIGNAL_FRAMESIZE + 16));
mctx = &frame->uc.uc_mcontext;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
tm_mctx = &frame->uc_transact.uc_mcontext;
@@ -798,7 +799,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
else
prepare_save_user_regs(1);
 
-   if (!user_access_begin(frame, sizeof(*frame)))
+   if (!user_access_begin(newsp, __SIGNAL_FRAMESIZE + 16 + sizeof(*frame)))
goto badframe;
 
/* Put the siginfo & fill in most of the ucontext */
@@ -836,6 +837,9 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
}
unsafe_put_sigset_t(&frame->uc.uc_sigmask, oldset, failed);
 
+   /* create a stack frame for the caller of the handler */
+   unsafe_put_user(regs->gpr[1], newsp, failed);
+
user_access_end();
 
if (copy_siginfo_to_user(&frame->info, &ksig->info))
@@ -847,13 +851,8 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
tsk->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */
 #endif
 
-   /* create a stack frame for the caller of the handler */
-   newsp = ((unsigned long)frame) - (__SIGNAL_FRAMESIZE + 16);
-   if (put_user(regs->gpr[1], (u32 __user *)newsp))
-   goto badframe;
-
/* Fill registers for signal handler */
-   regs->gpr[1] = newsp;
+   regs->gpr[1] = (unsigned long)newsp;
regs->gpr[3] = ksig->sig;
regs->gpr[4] = (unsigned long)>info;
regs->gpr[5] = (unsigned long)>uc;
@@ -883,7 +882,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
struct sigframe __user *frame;
struct mcontext __user *mctx;
struct mcontext __user *tm_mctx = NULL;
-   unsigned long newsp = 0;
+   unsigned long __user *newsp;
unsigned long tramp;
struct pt_regs *regs = tsk->thread.regs;
/* Save the thread's msr before get_tm_stackpointer() changes it */
@@ -891,6 +890,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+   newsp = (unsigned long __user *)((unsigned long)frame - 
__SIGNAL_FRAMESIZE);
mctx = >mctx;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
tm_mctx = >mctx_transact;
@@ -900,7 +900,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
else
prepare_save_user_regs(1);
 
-   if (!user_access_begin(frame, sizeof(*frame)))
+   if (!user_access_begin(newsp, __SIGNAL_FRAMESIZE + sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) >sctx;
 
@@ -931,6 +931,8 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
unsafe_put_user(PPC_INST_SC, >mc_pad[1], failed);
asm("dcbst %y0; sync; icbi %y0; sync" :: "Z" (mctx->mc_pad[0]));
}
+   /* create a stack frame for the caller of the handler */
+   unsafe_put_user(regs->gpr[1], newsp, failed);
user_access_end();
 
regs->link = tramp;
@@ -939,12 +941,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
tsk->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */
 #endif
 
-   /* create a stack frame for the caller of the handler */
-   newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
-   if (put_user(regs->gpr[1], (u32 __user *)newsp))
-   goto badframe;
-
-   regs->gpr[1] = newsp;
+   regs->gpr[1] = (unsigned long)newsp;
regs->gpr[3] = ksig->sig;
regs->gpr[4] = (unsigned long) sc;
 

[PATCH 3/7] powerpc/signal64: Access function descriptor with user access block

2021-06-15 Thread Christophe Leroy
Access the function descriptor of the handler within a
user access block.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_64.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 8b2eb758131c..9ca97b4366df 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -936,8 +936,18 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
func_descr_t __user *funct_desc_ptr =
(func_descr_t __user *) ksig->ka.sa.sa_handler;
 
-   err |= get_user(regs->ctr, _desc_ptr->entry);
-   err |= get_user(regs->gpr[2], _desc_ptr->toc);
+   if (user_read_access_begin(funct_desc_ptr, 
sizeof(func_descr_t))) {
+   unsafe_get_user(regs->ctr, _desc_ptr->entry, 
bad_funct_desc_block);
+   unsafe_get_user(regs->gpr[2], _desc_ptr->toc, 
bad_funct_desc_block);
+   } else {
+   goto bad_funct_desc;
+bad_funct_desc_block:
+   user_read_access_end();
+bad_funct_desc:
+   signal_fault(current, regs, __func__, funct_desc_ptr);
+   return 1;
+   }
+   user_read_access_end();
}
 
/* enter the signal handler in native-endian mode */
-- 
2.25.0



[PATCH 2/7] powerpc/signal64: Don't read sigaction arguments back from user memory

2021-06-15 Thread Christophe Leroy
From: Michael Ellerman 

When delivering a signal to a sigaction style handler (SA_SIGINFO), we
pass pointers to the siginfo and ucontext via r4 and r5.

Currently we populate the values in those registers by reading the
pointers out of the sigframe in user memory, even though the values in
user memory were written by the kernel just prior:

  unsafe_put_user(>info, >pinfo, badframe_block);
  unsafe_put_user(>uc, >puc, badframe_block);
  ...
  if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
err |= get_user(regs->gpr[4], (unsigned long __user *)>pinfo);
err |= get_user(regs->gpr[5], (unsigned long __user *)>puc);

ie. we write >info into frame->pinfo, and then read frame->pinfo
back into r4, and similarly for >uc.

The code has always been like this, since linux-fullhistory commit
d4f2d95eca2c ("Forward port of 2.4 ppc64 signal changes.").

There's no reason for us to read the values back from user memory,
rather than just setting the value in the gpr[4/5] directly. In fact
reading the value back from user memory opens up the possibility of
another user thread changing the values before we read them back.
Although any process doing that would be racing against the kernel
delivering the signal, and would risk corrupting the stack, so that
would be a userspace bug.

Note that this is 64-bit only code, so there's no subtlety with the size
of pointers differing between kernel and user. Also the frame variable
is not modified to point elsewhere during the function.

In the past reading the values back from user memory was not costly, but
now that we have KUAP on some CPUs it is, so we'd rather avoid it for
that reason too.

So change the code to just set the values directly, using the same
values we have written to the sigframe previously in the function.

Note also that this matches what our 32-bit signal code does.

Using a version of will-it-scale's signal1_threads that sets SA_SIGINFO,
this results in a ~4% increase in signals per second on a Power9, from
229,777 to 239,766.

Signed-off-by: Michael Ellerman 
Reviewed-by: Nicholas Piggin 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f9e1f5428b9e..8b2eb758131c 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -947,8 +947,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
regs->gpr[3] = ksig->sig;
regs->result = 0;
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-   err |= get_user(regs->gpr[4], (unsigned long __user 
*)>pinfo);
-   err |= get_user(regs->gpr[5], (unsigned long __user 
*)>puc);
+   regs->gpr[4] = (unsigned long)>info;
+   regs->gpr[5] = (unsigned long)>uc;
regs->gpr[6] = (unsigned long) frame;
} else {
regs->gpr[4] = (unsigned long)>uc.uc_mcontext;
-- 
2.25.0



[PATCH 1/7] powerpc/signal64: Copy siginfo before changing regs->nip

2021-06-15 Thread Christophe Leroy
From: Michael Ellerman 

In commit 96d7a4e06fab ("powerpc/signal64: Rewrite handle_rt_signal64()
to minimise uaccess switches") the 64-bit signal code was rearranged to
use user_write_access_begin/end().

As part of that change the call to copy_siginfo_to_user() was moved
later in the function, so that it could be done after the
user_write_access_end().

In particular it was moved after we modify regs->nip to point to the
signal trampoline. That means if copy_siginfo_to_user() fails we exit
handle_rt_signal64() with an error but with regs->nip modified, whereas
previously we would not modify regs->nip until the copy succeeded.

Returning an error from signal delivery but with regs->nip updated
leaves the process in a sort of half-delivered state. We do immediately
force a SEGV in signal_setup_done(), called from do_signal(), so the
process should never run in the half-delivered state.

However that SEGV is not delivered until we've gone around to
do_notify_resume() again, so it's possible some tracing could observe
the half-delivered state.

There are other cases where we fail signal delivery with regs partly
updated, eg. the write to newsp and SA_SIGINFO, but the latter at least
is very unlikely to fail as it reads back from the frame we just wrote
to.

Looking at other arches they seem to be more careful about leaving regs
unchanged until the copy operations have succeeded, and in general that
seems like good hygiene.

So although the current behaviour is not clearly buggy, it's also not
clearly correct. So move the call to copy_siginfo_to_user() up prior to
the modification of regs->nip, which is closer to the old behaviour, and
easier to reason about.

Signed-off-by: Michael Ellerman 
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_64.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index dca66481d0c2..f9e1f5428b9e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -902,6 +902,10 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
unsafe_copy_to_user(>uc.uc_sigmask, set, sizeof(*set), 
badframe_block);
user_write_access_end();
 
+   /* Save the siginfo outside of the unsafe block. */
+   if (copy_siginfo_to_user(>info, >info))
+   goto badframe;
+
/* Make sure signal handler doesn't get spurious FP exceptions */
tsk->thread.fp_state.fpscr = 0;
 
@@ -915,11 +919,6 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
regs->nip = (unsigned long) >tramp[0];
}
 
-
-   /* Save the siginfo outside of the unsafe block. */
-   if (copy_siginfo_to_user(>info, >info))
-   goto badframe;
-
/* Allocate a dummy caller frame for the signal handler. */
newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);
-- 
2.25.0



Re: [PATCH] powerpc/signal64: Don't read sigaction arguments back from user memory

2021-06-15 Thread Christophe Leroy




Le 14/06/2021 à 13:49, Christophe Leroy a écrit :



Le 14/06/2021 à 07:49, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of June 14, 2021 3:30 pm:



Le 14/06/2021 à 03:32, Nicholas Piggin a écrit :

Excerpts from Michael Ellerman's message of June 10, 2021 5:29 pm:

When delivering a signal to a sigaction style handler (SA_SIGINFO), we
pass pointers to the siginfo and ucontext via r4 and r5.

Currently we populate the values in those registers by reading the
pointers out of the sigframe in user memory, even though the values in
user memory were written by the kernel just prior:

    unsafe_put_user(>info, >pinfo, badframe_block);
    unsafe_put_user(>uc, >puc, badframe_block);
    ...
    if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
    err |= get_user(regs->gpr[4], (unsigned long __user *)>pinfo);
    err |= get_user(regs->gpr[5], (unsigned long __user *)>puc);

ie. we write >info into frame->pinfo, and then read frame->pinfo
back into r4, and similarly for >uc.

The code has always been like this, since linux-fullhistory commit
d4f2d95eca2c ("Forward port of 2.4 ppc64 signal changes.").

There's no reason for us to read the values back from user memory,
rather than just setting the value in the gpr[4/5] directly. In fact
reading the value back from user memory opens up the possibility of
another user thread changing the values before we read them back.
Although any process doing that would be racing against the kernel
delivering the signal, and would risk corrupting the stack, so that
would be a userspace bug.

Note that this is 64-bit only code, so there's no subtlety with the size
of pointers differing between kernel and user. Also the frame variable
is not modified to point elsewhere during the function.

In the past reading the values back from user memory was not costly, but
now that we have KUAP on some CPUs it is, so we'd rather avoid it for
that reason too.

So change the code to just set the values directly, using the same
values we have written to the sigframe previously in the function.

Note also that this matches what our 32-bit signal code does.

Using a version of will-it-scale's signal1_threads that sets SA_SIGINFO,
this results in a ~4% increase in signals per second on a Power9, from
229,777 to 239,766.


Good find, nice improvement. Will make it possible to make the error
handling much nicer too I think.

Reviewed-by: Nicholas Piggin 

You've moved copy_siginfo_to_user right up to the user access unlock,
could save 2 more KUAP lock/unlocks if we had an unsafe_clear_user. If
we can move the other user access stuff up as well, the stack frame
put_user could use unsafe_put_user as well, saving 1 more. Another few
percent?


I'm looking at making an 'unsafe' version of copy_siginfo_to_user().
That's straight forward for 'native' signals, but for compat signals that's 
more tricky.


Ah nice. Native is most important at the moment.



Finally not so easy. We have a quite efficient clear_user() which uses 'dcbz'. When replacing that 
by a simplistic unsafe_clear_user() on the same model as unsafe_copy_to_user(), performance is 
degraded on 32s. Need to implement it more efficiently.




Don't know what I did yesterday. Performance is _not_ degraded, it is improved by 5%. I'll send out 
a series soon.


Re: [PATCH] powerpc/signal64: Don't read sigaction arguments back from user memory

2021-06-14 Thread Christophe Leroy




Le 14/06/2021 à 07:49, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of June 14, 2021 3:30 pm:



Le 14/06/2021 à 03:32, Nicholas Piggin a écrit :

Excerpts from Michael Ellerman's message of June 10, 2021 5:29 pm:

When delivering a signal to a sigaction style handler (SA_SIGINFO), we
pass pointers to the siginfo and ucontext via r4 and r5.

Currently we populate the values in those registers by reading the
pointers out of the sigframe in user memory, even though the values in
user memory were written by the kernel just prior:

unsafe_put_user(>info, >pinfo, badframe_block);
unsafe_put_user(>uc, >puc, badframe_block);
...
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
err |= get_user(regs->gpr[4], (unsigned long __user *)>pinfo);
err |= get_user(regs->gpr[5], (unsigned long __user *)>puc);

ie. we write >info into frame->pinfo, and then read frame->pinfo
back into r4, and similarly for >uc.

The code has always been like this, since linux-fullhistory commit
d4f2d95eca2c ("Forward port of 2.4 ppc64 signal changes.").

There's no reason for us to read the values back from user memory,
rather than just setting the value in the gpr[4/5] directly. In fact
reading the value back from user memory opens up the possibility of
another user thread changing the values before we read them back.
Although any process doing that would be racing against the kernel
delivering the signal, and would risk corrupting the stack, so that
would be a userspace bug.

Note that this is 64-bit only code, so there's no subtlety with the size
of pointers differing between kernel and user. Also the frame variable
is not modified to point elsewhere during the function.

In the past reading the values back from user memory was not costly, but
now that we have KUAP on some CPUs it is, so we'd rather avoid it for
that reason too.

So change the code to just set the values directly, using the same
values we have written to the sigframe previously in the function.

Note also that this matches what our 32-bit signal code does.

Using a version of will-it-scale's signal1_threads that sets SA_SIGINFO,
this results in a ~4% increase in signals per second on a Power9, from
229,777 to 239,766.


Good find, nice improvement. Will make it possible to make the error
handling much nicer too I think.

Reviewed-by: Nicholas Piggin 

You've moved copy_siginfo_to_user right up to the user access unlock,
could save 2 more KUAP lock/unlocks if we had an unsafe_clear_user. If
we can move the other user access stuff up as well, the stack frame
put_user could use unsafe_put_user as well, saving 1 more. Another few
percent?


I'm looking at making an 'unsafe' version of copy_siginfo_to_user().
That's straight forward for 'native' signals, but for compat signals that's 
more tricky.


Ah nice. Native is most important at the moment.



Finally not so easy. We have a quite efficient clear_user() which uses 'dcbz'. When replacing that 
by a simplistic unsafe_clear_user() on the same model as unsafe_copy_to_user(), performance is 
degraded on 32s. Need to implement it more efficiently.


Christophe


Re: [PATCH] powerpc/signal64: Copy siginfo before changing regs->nip

2021-06-14 Thread Christophe Leroy




Le 14/06/2021 à 07:55, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of June 14, 2021 3:31 pm:



Le 14/06/2021 à 03:29, Nicholas Piggin a écrit :

Excerpts from Nicholas Piggin's message of June 14, 2021 10:47 am:

Excerpts from Michael Ellerman's message of June 8, 2021 11:46 pm:

In commit 96d7a4e06fab ("powerpc/signal64: Rewrite handle_rt_signal64()
to minimise uaccess switches") the 64-bit signal code was rearranged to
use user_write_access_begin/end().

As part of that change the call to copy_siginfo_to_user() was moved
later in the function, so that it could be done after the
user_write_access_end().

In particular it was moved after we modify regs->nip to point to the
signal trampoline. That means if copy_siginfo_to_user() fails we exit
handle_rt_signal64() with an error but with regs->nip modified, whereas
previously we would not modify regs->nip until the copy succeeded.

Returning an error from signal delivery but with regs->nip updated
leaves the process in a sort of half-delivered state. We do immediately
force a SEGV in signal_setup_done(), called from do_signal(), so the
process should never run in the half-delivered state.

However that SEGV is not delivered until we've gone around to
do_notify_resume() again, so it's possible some tracing could observe
the half-delivered state.

There are other cases where we fail signal delivery with regs partly
updated, eg. the write to newsp and SA_SIGINFO, but the latter at least
is very unlikely to fail as it reads back from the frame we just wrote
to.

Looking at other arches they seem to be more careful about leaving regs
unchanged until the copy operations have succeeded, and in general that
seems like good hygiene.

So although the current behaviour is not clearly buggy, it's also not
clearly correct. So move the call to copy_siginfo_to_user() up prior to
the modification of regs->nip, which is closer to the old behaviour, and
easier to reason about.


Good catch, should it still have a Fixes: tag though? Even if it's not
clearly buggy we want it to be patched.


Also...



Signed-off-by: Michael Ellerman 
---
   arch/powerpc/kernel/signal_64.c | 9 -
   1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index dca66481d0c2..f9e1f5428b9e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -902,6 +902,10 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
unsafe_copy_to_user(>uc.uc_sigmask, set, sizeof(*set), 
badframe_block);
user_write_access_end();
   
+	/* Save the siginfo outside of the unsafe block. */

+   if (copy_siginfo_to_user(>info, >info))
+   goto badframe;
+
/* Make sure signal handler doesn't get spurious FP exceptions */
tsk->thread.fp_state.fpscr = 0;
   
@@ -915,11 +919,6 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,

regs->nip = (unsigned long) >tramp[0];
}
   
-

-   /* Save the siginfo outside of the unsafe block. */
-   if (copy_siginfo_to_user(>info, >info))
-   goto badframe;
-
/* Allocate a dummy caller frame for the signal handler. */
newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
err |= put_user(regs->gpr[1], (unsigned long __user *)newsp);


Does the same reasoning apply to this one and the ELF V1 function
descriptor thing? It seems like you could move all of that block
up instead. With your other SA_SIGINFO get_user patch, there would
then be no possibility of error after you start modifying regs.



To move the above in the user access block, we need to open a larger window. At 
the time being the
window opened only contains the 'frame'. 'newsp' points before the 'frame'.



Only by 64/128 bytes though. Is that a problem? Not for 64s. Could it
cause more overhead than it saves on other platforms?


No it is not a problem at all, just need to not be forgotten, on ppc64 it may go unnoticed, on 32s 
it will blow up if we forget to enlarge the access window and the access involves a different 256M 
segment (Very unlikely for sure but ...)




For protection, it looks like all the important control data is in the
signal frame anyway, this frame is just for stack unwinding?


That's my understanding as well.

Christophe


  1   2   3   4   5   6   7   8   9   10   >