Re: [RFC 5/5] ipmi:bt-bmc: Add Microwatt

2021-10-05 Thread Anton Blanchard
Hi Joel,

> The series looks good.
> 
> I've got a couple of patches on the ipmi list that this will conflict
> with:
> 
>  https://sourceforge.net/p/openipmi/mailman/message/37345391/
>  https://lore.kernel.org/all/20210903015314.177987-1-j...@jms.id.au/
> 
> If there's no feedback from others I suggest basing your series on top
> of those, and sending them to Corey on the ipmi list to merge.

Looks good, will do.

Thanks,
Anton

> Cheers,
> 
> Joel

[RFC 5/5] ipmi:bt-bmc: Add Microwatt

2021-10-05 Thread Anton Blanchard
This adds the Microwatt specific bits, including interrupt support.

Signed-off-by: Anton Blanchard 
---
 .../devicetree/bindings/ipmi/ibt-bmc.txt  |  1 +
 drivers/char/ipmi/Kconfig |  8 ++-
 drivers/char/ipmi/bt-bmc.c| 69 +++
 3 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/ipmi/ibt-bmc.txt b/Documentation/devicetree/bindings/ipmi/ibt-bmc.txt
index 78ee716a950e..1b661daf0193 100644
--- a/Documentation/devicetree/bindings/ipmi/ibt-bmc.txt
+++ b/Documentation/devicetree/bindings/ipmi/ibt-bmc.txt
@@ -9,6 +9,7 @@ Required properties:
 - compatible : should be one of
"aspeed,ast2400-ibt-bmc"
"aspeed,ast2500-ibt-bmc"
+   "ibm,microwatt-ibt-bmc"
 - reg: physical address and size of the registers
 
 Optional properties:
diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig
index 8b2f0f675e5f..079302f4eef2 100644
--- a/drivers/char/ipmi/Kconfig
+++ b/drivers/char/ipmi/Kconfig
@@ -152,13 +152,15 @@ config IPMI_KCS_BMC_SERIO
  called kcs_bmc_serio.
 
 config BT_IPMI_BMC
-   depends on ARCH_ASPEED || COMPILE_TEST
+   depends on ARCH_ASPEED || PPC_MICROWATT || COMPILE_TEST
depends on REGMAP && REGMAP_MMIO && MFD_SYSCON
tristate "BT IPMI bmc driver"
help
  Provides a driver for the BT (Block Transfer) IPMI interface
- found on Aspeed SOCs (AST2400 and AST2500). The driver
- implements the BMC side of the BT interface.
+ found on Aspeed SOCs (AST2400 and AST2500) as well as the OpenPOWER
+ LPC peripheral macro at
+ <https://github.com/OpenPOWERFoundation/lpcperipheral>
+ The driver implements the BMC side of the BT interface.
 
 config IPMB_DEVICE_INTERFACE
tristate 'IPMB Interface handler'
diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index b48e04405ac4..24327b57c60b 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -41,6 +41,11 @@
 #define   BT_CR2_IRQ_HBUSY 0x40
 #define ASPEED_BT_CR3  0xc
 
+#define MICROWATT_IRQ_MASK 0x0
+#define MICROWATT_IRQ_STATUS   0x4
+#define   IRQ_HOST_TO_BMC_ATTN 0x1
+#define   IRQ_HOST_NOT_BUSY0x2
+
 #define BT_CTRL0x10
 #define   BT_CTRL_B_BUSY   0x80
 #define   BT_CTRL_H_BUSY   0x40
@@ -395,6 +400,27 @@ static irqreturn_t aspeed_bt_bmc_irq(int irq, void *arg)
return IRQ_HANDLED;
 }
 
+static irqreturn_t microwatt_bt_bmc_irq(int irq, void *arg)
+{
+   struct bt_bmc *bt_bmc = arg;
+   u32 reg;
+   int rc;
+
+   rc = regmap_read(bt_bmc->map, bt_bmc->offset + MICROWATT_IRQ_STATUS, &reg);
+   if (rc)
+   return IRQ_NONE;
+
+   /* Interrupt wasn't something we knew about */
+   if (!(reg & (IRQ_HOST_TO_BMC_ATTN | IRQ_HOST_NOT_BUSY)))
+   return IRQ_NONE;
+
+   /* ack all pending IRQs */
+   regmap_write(bt_bmc->map, bt_bmc->offset + MICROWATT_IRQ_STATUS, 0);
+
+   wake_up(&bt_bmc->queue);
+   return IRQ_HANDLED;
+}
+
 static int aspeed_bt_bmc_config_irq(struct bt_bmc *bt_bmc,
 struct platform_device *pdev)
 {
@@ -446,6 +472,48 @@ static const struct bt_bmc_ops aspeed_bt_bmc_ops = {
.enable_bt = aspeed_enable_bt,
 };
 
+static int microwatt_bt_bmc_config_irq(struct bt_bmc *bt_bmc,
+struct platform_device *pdev)
+{
+   struct device *dev = &pdev->dev;
+   int rc;
+
+   bt_bmc->irq = platform_get_irq_optional(pdev, 0);
+   if (bt_bmc->irq < 0)
+   return bt_bmc->irq;
+
+   rc = devm_request_irq(dev, bt_bmc->irq, microwatt_bt_bmc_irq, IRQF_SHARED,
+ DEVICE_NAME, bt_bmc);
+   if (rc < 0) {
+   dev_warn(dev, "Unable to request IRQ %d\n", bt_bmc->irq);
+   bt_bmc->irq = rc;
+   return rc;
+   }
+
+   /*
+* Configure the hardware to give us an interrupt whenever the H2B
+* bit is set or the HBUSY bit is cleared.
+*
+* H2B will be asserted when the bmc has data for us; HBUSY
+* will be cleared (along with B2H) when we can write the next
+* message to the BT buffer
+*/
+   rc = regmap_update_bits(bt_bmc->map, bt_bmc->offset + MICROWATT_IRQ_MASK,
+   (IRQ_HOST_TO_BMC_ATTN | IRQ_HOST_NOT_BUSY),
+   (IRQ_HOST_TO_BMC_ATTN | IRQ_HOST_NOT_BUSY));
+
+   return rc;
+}
+
+static void microwatt_enable_bt(struct bt_bmc *bt_bmc)
+{
+}
+
+static const struct bt_bmc_ops microwatt_bt_bmc_ops = {
+   .config_irq = microwatt_bt_bmc_config_irq,
+   .enable_bt = microwatt_enable_bt,
+};
+
 static int bt_bmc_probe(struct platform_device *pdev)
 {
	struct bt_bmc *bt_bmc;

[RFC 4/5] ipmi:bt-bmc: No longer ASPEED specific

2021-10-05 Thread Anton Blanchard
The driver is no longer specific to ASPEED, so rename the config option
and remove the dependency on ARCH_ASPEED.

Signed-off-by: Anton Blanchard 
---
 .../bindings/ipmi/{aspeed,ast2400-ibt-bmc.txt => ibt-bmc.txt}   | 2 +-
 arch/arm/configs/aspeed_g4_defconfig| 2 +-
 arch/arm/configs/aspeed_g5_defconfig| 2 +-
 arch/arm/configs/multi_v5_defconfig | 2 +-
 arch/arm/configs/multi_v7_defconfig | 2 +-
 drivers/char/ipmi/Kconfig   | 2 +-
 drivers/char/ipmi/Makefile  | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)
 rename Documentation/devicetree/bindings/ipmi/{aspeed,ast2400-ibt-bmc.txt => ibt-bmc.txt} (93%)

diff --git a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt b/Documentation/devicetree/bindings/ipmi/ibt-bmc.txt
similarity index 93%
rename from Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
rename to Documentation/devicetree/bindings/ipmi/ibt-bmc.txt
index 028268fd99ee..78ee716a950e 100644
--- a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt
+++ b/Documentation/devicetree/bindings/ipmi/ibt-bmc.txt
@@ -1,4 +1,4 @@
-* Aspeed BT (Block Transfer) IPMI interface
+* BT (Block Transfer) IPMI interface
 
 The Aspeed SOCs (AST2400 and AST2500) are commonly used as BMCs
 (BaseBoard Management Controllers) and the BT interface can be used to
diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig
index acaafa351d08..51696ba49c80 100644
--- a/arch/arm/configs/aspeed_g4_defconfig
+++ b/arch/arm/configs/aspeed_g4_defconfig
@@ -124,7 +124,7 @@ CONFIG_SERIAL_8250_ASPEED_VUART=y
 CONFIG_SERIAL_8250_SHARE_IRQ=y
 CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_ASPEED_KCS_IPMI_BMC=y
-CONFIG_ASPEED_BT_IPMI_BMC=y
+CONFIG_BT_IPMI_BMC=y
 CONFIG_HW_RANDOM_TIMERIOMEM=y
 # CONFIG_I2C_COMPAT is not set
 CONFIG_I2C_CHARDEV=y
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 480dbbb4ff91..758dac62f34f 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -141,7 +141,7 @@ CONFIG_SERIAL_8250_DW=y
 CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_ASPEED_KCS_IPMI_BMC=y
 CONFIG_IPMI_KCS_BMC_SERIO=y
-CONFIG_ASPEED_BT_IPMI_BMC=y
+CONFIG_BT_IPMI_BMC=y
 CONFIG_HW_RANDOM_TIMERIOMEM=y
 # CONFIG_I2C_COMPAT is not set
 CONFIG_I2C_CHARDEV=y
diff --git a/arch/arm/configs/multi_v5_defconfig b/arch/arm/configs/multi_v5_defconfig
index 80a3ae02d759..f3ed5da74dfa 100644
--- a/arch/arm/configs/multi_v5_defconfig
+++ b/arch/arm/configs/multi_v5_defconfig
@@ -150,7 +150,7 @@ CONFIG_SERIAL_ATMEL_TTYAT=y
 CONFIG_SERIAL_IMX=y
 CONFIG_SERIAL_IMX_CONSOLE=y
 CONFIG_ASPEED_KCS_IPMI_BMC=m
-CONFIG_ASPEED_BT_IPMI_BMC=m
+CONFIG_BT_IPMI_BMC=m
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_TIMERIOMEM=m
 # CONFIG_I2C_COMPAT is not set
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index ba67c4717dcc..03e97d95c251 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -384,7 +384,7 @@ CONFIG_SERIAL_OWL_CONSOLE=y
 CONFIG_SERIAL_DEV_BUS=y
 CONFIG_VIRTIO_CONSOLE=y
 CONFIG_ASPEED_KCS_IPMI_BMC=m
-CONFIG_ASPEED_BT_IPMI_BMC=m
+CONFIG_BT_IPMI_BMC=m
 CONFIG_HW_RANDOM=y
 CONFIG_HW_RANDOM_ST=y
 CONFIG_TCG_TPM=m
diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig
index 249b31197eea..8b2f0f675e5f 100644
--- a/drivers/char/ipmi/Kconfig
+++ b/drivers/char/ipmi/Kconfig
@@ -151,7 +151,7 @@ config IPMI_KCS_BMC_SERIO
  This support is also available as a module. The module will be
  called kcs_bmc_serio.
 
-config ASPEED_BT_IPMI_BMC
+config BT_IPMI_BMC
depends on ARCH_ASPEED || COMPILE_TEST
depends on REGMAP && REGMAP_MMIO && MFD_SYSCON
tristate "BT IPMI bmc driver"
diff --git a/drivers/char/ipmi/Makefile b/drivers/char/ipmi/Makefile
index 84f47d18007f..75c71cbd568b 100644
--- a/drivers/char/ipmi/Makefile
+++ b/drivers/char/ipmi/Makefile
@@ -25,7 +25,7 @@ obj-$(CONFIG_IPMI_POWEROFF) += ipmi_poweroff.o
 obj-$(CONFIG_IPMI_KCS_BMC) += kcs_bmc.o
 obj-$(CONFIG_IPMI_KCS_BMC_SERIO) += kcs_bmc_serio.o
 obj-$(CONFIG_IPMI_KCS_BMC_CDEV_IPMI) += kcs_bmc_cdev_ipmi.o
-obj-$(CONFIG_ASPEED_BT_IPMI_BMC) += bt-bmc.o
+obj-$(CONFIG_BT_IPMI_BMC) += bt-bmc.o
 obj-$(CONFIG_ASPEED_KCS_IPMI_BMC) += kcs_bmc_aspeed.o
 obj-$(CONFIG_NPCM7XX_KCS_IPMI_BMC) += kcs_bmc_npcm7xx.o
 obj-$(CONFIG_IPMB_DEVICE_INTERFACE) += ipmb_dev_int.o
-- 
2.31.1



[RFC 3/5] ipmi:bt-bmc: Put arch specific function into bt_bmc_ops

2021-10-05 Thread Anton Blanchard
While most of the driver is arch agnostic, setting up and handling
interrupts, and enabling the hardware is not. Create bt_bmc_ops to
handle these functions.

Signed-off-by: Anton Blanchard 
---
 drivers/char/ipmi/bt-bmc.c | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index 2b0fe1255026..b48e04405ac4 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include <linux/of_device.h>
 
 /*
  * This is a BMC device used to communicate to the host
@@ -435,15 +436,30 @@ static void aspeed_enable_bt(struct bt_bmc *bt_bmc)
 BT_CR0_ENABLE_IBT);
 }
 
+struct bt_bmc_ops {
+   int (*config_irq)(struct bt_bmc *bt_bmc, struct platform_device *pdev);
+   void (*enable_bt)(struct bt_bmc *bt_bmc);
+};
+
+static const struct bt_bmc_ops aspeed_bt_bmc_ops = {
+   .config_irq = aspeed_bt_bmc_config_irq,
+   .enable_bt = aspeed_enable_bt,
+};
+
 static int bt_bmc_probe(struct platform_device *pdev)
 {
struct bt_bmc *bt_bmc;
struct device *dev;
int rc;
+   const struct bt_bmc_ops *ops;
 
	dev = &pdev->dev;
dev_info(dev, "Found bt bmc device\n");
 
	ops = of_device_get_match_data(&pdev->dev);
+   if (!ops)
+   return -ENODEV;
+
bt_bmc = devm_kzalloc(dev, sizeof(*bt_bmc), GFP_KERNEL);
if (!bt_bmc)
return -ENOMEM;
@@ -483,7 +499,7 @@ static int bt_bmc_probe(struct platform_device *pdev)
return rc;
}
 
-   aspeed_bt_bmc_config_irq(bt_bmc, pdev);
+   ops->config_irq(bt_bmc, pdev);
 
if (bt_bmc->irq >= 0) {
dev_info(dev, "Using IRQ %d\n", bt_bmc->irq);
@@ -494,7 +510,7 @@ static int bt_bmc_probe(struct platform_device *pdev)
		add_timer(&bt_bmc->poll_timer);
}
 
-   aspeed_enable_bt(bt_bmc);
+   ops->enable_bt(bt_bmc);
 
clr_b_busy(bt_bmc);
 
@@ -512,8 +528,8 @@ static int bt_bmc_remove(struct platform_device *pdev)
 }
 
 static const struct of_device_id bt_bmc_match[] = {
-   { .compatible = "aspeed,ast2400-ibt-bmc" },
-   { .compatible = "aspeed,ast2500-ibt-bmc" },
+   { .compatible = "aspeed,ast2400-ibt-bmc", .data = _bt_bmc_ops },
+   { .compatible = "aspeed,ast2500-ibt-bmc", .data = _bt_bmc_ops },
{ },
 };
 
-- 
2.31.1



[RFC 2/5] ipmi:bt-bmc: Prefix ASPEED specific registers with ASPEED_

2021-10-05 Thread Anton Blanchard
Signed-off-by: Anton Blanchard 
---
 drivers/char/ipmi/bt-bmc.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index f85fafc96ef6..2b0fe1255026 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -26,19 +26,19 @@
 #define BT_IO_BASE 0xe4
 #define BT_IRQ 10
 
-#define BT_CR0 0x0
+#define ASPEED_BT_CR0  0x0
 #define   BT_CR0_IO_BASE   16
 #define   BT_CR0_IRQ   12
 #define   BT_CR0_EN_CLR_SLV_RDP0x8
 #define   BT_CR0_EN_CLR_SLV_WRP0x4
 #define   BT_CR0_ENABLE_IBT0x1
-#define BT_CR1 0x4
+#define ASPEED_BT_CR1  0x4
 #define   BT_CR1_IRQ_H2B   0x01
 #define   BT_CR1_IRQ_HBUSY 0x40
-#define BT_CR2 0x8
+#define ASPEED_BT_CR2  0x8
 #define   BT_CR2_IRQ_H2B   0x01
 #define   BT_CR2_IRQ_HBUSY 0x40
-#define BT_CR3 0xc
+#define ASPEED_BT_CR3  0xc
 
 #define BT_CTRL0x10
 #define   BT_CTRL_B_BUSY   0x80
@@ -379,7 +379,7 @@ static irqreturn_t aspeed_bt_bmc_irq(int irq, void *arg)
u32 reg;
int rc;
 
-   rc = regmap_read(bt_bmc->map, bt_bmc->offset + BT_CR2, &reg);
+   rc = regmap_read(bt_bmc->map, bt_bmc->offset + ASPEED_BT_CR2, &reg);
if (rc)
return IRQ_NONE;
 
@@ -388,7 +388,7 @@ static irqreturn_t aspeed_bt_bmc_irq(int irq, void *arg)
return IRQ_NONE;
 
/* ack pending IRQs */
-   regmap_write(bt_bmc->map, bt_bmc->offset + BT_CR2, reg);
+   regmap_write(bt_bmc->map, bt_bmc->offset + ASPEED_BT_CR2, reg);
 
	wake_up(&bt_bmc->queue);
return IRQ_HANDLED;
@@ -418,7 +418,7 @@ static int aspeed_bt_bmc_config_irq(struct bt_bmc *bt_bmc,
 * will be cleared (along with B2H) when we can write the next
 * message to the BT buffer
 */
-   rc = regmap_update_bits(bt_bmc->map, bt_bmc->offset + BT_CR1,
+   rc = regmap_update_bits(bt_bmc->map, bt_bmc->offset + ASPEED_BT_CR1,
(BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY),
(BT_CR1_IRQ_H2B | BT_CR1_IRQ_HBUSY));
 
@@ -427,7 +427,7 @@ static int aspeed_bt_bmc_config_irq(struct bt_bmc *bt_bmc,
 
 static void aspeed_enable_bt(struct bt_bmc *bt_bmc)
 {
-   regmap_write(bt_bmc->map, bt_bmc->offset + BT_CR0,
+   regmap_write(bt_bmc->map, bt_bmc->offset + ASPEED_BT_CR0,
 (BT_IO_BASE << BT_CR0_IO_BASE) |
 (BT_IRQ << BT_CR0_IRQ) |
 BT_CR0_EN_CLR_SLV_RDP |
-- 
2.31.1



[RFC 1/5] ipmi:bt-bmc: Separate out ASPEED specific bits

2021-10-05 Thread Anton Blanchard
Most of the IPMI BT BMC driver is architecture agnostic - it deals with
architected registers and behaviour in the IPMI specification.

Separate out the few ASPEED specific bits into their own functions
so we can use this driver on other architectures.

Signed-off-by: Anton Blanchard 
---
 drivers/char/ipmi/bt-bmc.c | 26 --
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index 6e3d247b55d1..f85fafc96ef6 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -39,6 +39,7 @@
 #define   BT_CR2_IRQ_H2B   0x01
 #define   BT_CR2_IRQ_HBUSY 0x40
 #define BT_CR3 0xc
+
 #define BT_CTRL0x10
 #define   BT_CTRL_B_BUSY   0x80
 #define   BT_CTRL_H_BUSY   0x40
@@ -372,7 +373,7 @@ static void poll_timer(struct timer_list *t)
	add_timer(&bt_bmc->poll_timer);
 }
 
-static irqreturn_t bt_bmc_irq(int irq, void *arg)
+static irqreturn_t aspeed_bt_bmc_irq(int irq, void *arg)
 {
struct bt_bmc *bt_bmc = arg;
u32 reg;
@@ -393,7 +394,7 @@ static irqreturn_t bt_bmc_irq(int irq, void *arg)
return IRQ_HANDLED;
 }
 
-static int bt_bmc_config_irq(struct bt_bmc *bt_bmc,
+static int aspeed_bt_bmc_config_irq(struct bt_bmc *bt_bmc,
 struct platform_device *pdev)
 {
	struct device *dev = &pdev->dev;
@@ -403,7 +404,7 @@ static int bt_bmc_config_irq(struct bt_bmc *bt_bmc,
if (bt_bmc->irq < 0)
return bt_bmc->irq;
 
-   rc = devm_request_irq(dev, bt_bmc->irq, bt_bmc_irq, IRQF_SHARED,
+   rc = devm_request_irq(dev, bt_bmc->irq, aspeed_bt_bmc_irq, IRQF_SHARED,
  DEVICE_NAME, bt_bmc);
if (rc < 0) {
dev_warn(dev, "Unable to request IRQ %d\n", bt_bmc->irq);
@@ -424,6 +425,16 @@ static int bt_bmc_config_irq(struct bt_bmc *bt_bmc,
return rc;
 }
 
+static void aspeed_enable_bt(struct bt_bmc *bt_bmc)
+{
+   regmap_write(bt_bmc->map, bt_bmc->offset + BT_CR0,
+(BT_IO_BASE << BT_CR0_IO_BASE) |
+(BT_IRQ << BT_CR0_IRQ) |
+BT_CR0_EN_CLR_SLV_RDP |
+BT_CR0_EN_CLR_SLV_WRP |
+BT_CR0_ENABLE_IBT);
+}
+
 static int bt_bmc_probe(struct platform_device *pdev)
 {
struct bt_bmc *bt_bmc;
@@ -472,7 +483,7 @@ static int bt_bmc_probe(struct platform_device *pdev)
return rc;
}
 
-   bt_bmc_config_irq(bt_bmc, pdev);
+   aspeed_bt_bmc_config_irq(bt_bmc, pdev);
 
if (bt_bmc->irq >= 0) {
dev_info(dev, "Using IRQ %d\n", bt_bmc->irq);
@@ -483,12 +494,7 @@ static int bt_bmc_probe(struct platform_device *pdev)
		add_timer(&bt_bmc->poll_timer);
}
 
-   regmap_write(bt_bmc->map, bt_bmc->offset + BT_CR0,
-(BT_IO_BASE << BT_CR0_IO_BASE) |
-(BT_IRQ << BT_CR0_IRQ) |
-BT_CR0_EN_CLR_SLV_RDP |
-BT_CR0_EN_CLR_SLV_WRP |
-BT_CR0_ENABLE_IBT);
+   aspeed_enable_bt(bt_bmc);
 
clr_b_busy(bt_bmc);
 
-- 
2.31.1



[PATCH] powerpc/configs: Disable legacy ptys on microwatt defconfig

2021-08-04 Thread Anton Blanchard
We shouldn't need legacy ptys, and disabling the option improves boot
time by about 0.5 seconds.

Signed-off-by: Anton Blanchard 
---

diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig
index a08b739123da..ebc90aefbc0c 100644
--- a/arch/powerpc/configs/microwatt_defconfig
+++ b/arch/powerpc/configs/microwatt_defconfig
@@ -57,6 +57,7 @@ CONFIG_NETDEVICES=y
 # CONFIG_INPUT is not set
 # CONFIG_SERIO is not set
 # CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
 CONFIG_SERIAL_8250=y
 # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
 CONFIG_SERIAL_8250_CONSOLE=y



[PATCH] powerpc/configs: Add BLK_DEV_NVME to pseries_defconfig

2020-07-28 Thread Anton Blanchard
I've forgotten to manual enable NVME when building pseries kernels
for machines with NVME adapters. Since it's a reasonably common
configuration, enable it by default.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/configs/pseries_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index dfa4a726333b..358642d6f46d 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -94,6 +94,7 @@ CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=65536
 CONFIG_VIRTIO_BLK=m
+CONFIG_BLK_DEV_NVME=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
 CONFIG_BLK_DEV_SR=y
-- 
2.26.2



Re: [PATCH v2 1/2] powerpc/drmem: accelerate memory_add_physaddr_to_nid() with LMB xarray

2020-07-22 Thread Anton Blanchard
Hi Scott,

I'm hitting this issue and Rick just pointed my at your patch. Any
chance we could get it upstream?

Thanks,
Anton

> On PowerPC, memory_add_physaddr_to_nid() uses a linear search to find
> an LMB matching the given address.  This scales very poorly when there
> are many LMBs.  The poor scaling cripples drmem_init() during boot:
> lmb_set_nid(), which calls memory_add_physaddr_to_nid(), is called for
> each LMB.
> 
> If we index each LMB in an xarray by its base address we can achieve
> O(log n) search during memory_add_physaddr_to_nid(), which scales much
> better.
> 
> For example, in the lab we have a 64TB P9 machine with 256MB LMBs.
> So, during drmem_init() we instantiate 249854 LMBs.  On a vanilla
> kernel it completes drmem_init() in ~35 seconds with a soft lockup
> trace.  On the patched kernel it completes drmem_init() in ~0.5
> seconds.
> 
> Before:
> [   53.721639] drmem: initializing drmem v2
> [   80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s!
> [swapper/0:1] [   80.604377] Modules linked in:
> [   80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+
> #4 [   80.604397] NIP:  c00a4980 LR: c00a4940 CTR:
>  [   80.604407] REGS: c0002dbff8493830 TRAP: 0901
> Not tainted  (5.6.0-rc2+) [   80.604412] MSR:  82009033
>   CR: 44000248  XER: 000d [
> 80.604431] CFAR: c00a4a38 IRQMASK: 0 [   80.604431] GPR00:
> c00a4940 c0002dbff8493ac0 c1904400 c0003cfede30 [
>   80.604431] GPR04:  c0f4095a
> 002f 1000 [   80.604431] GPR08:
> cbf7ecdb7fb8 cbf7ecc2d3c8 0008 c00c0002fdfb2001 [
>   80.604431] GPR12:  c0001e8ec200 [   80.604477]
> NIP [c00a4980] hot_add_scn_to_nid+0xa0/0x3e0 [   80.604486]
> LR [c00a4940] hot_add_scn_to_nid+0x60/0x3e0 [   80.604492]
> Call Trace: [   80.604498] [c0002dbff8493ac0] [c00a4940]
> hot_add_scn_to_nid+0x60/0x3e0 (unreliable) [   80.604509]
> [c0002dbff8493b20] [c0087c10]
> memory_add_physaddr_to_nid+0x20/0x60 [   80.604521]
> [c0002dbff8493b40] [c10d4880] drmem_init+0x25c/0x2f0 [
> 80.604530] [c0002dbff8493c10] [c0010154]
> do_one_initcall+0x64/0x2c0 [   80.604540] [c0002dbff8493ce0]
> [c10c4aa0] kernel_init_freeable+0x2d8/0x3a0 [   80.604550]
> [c0002dbff8493db0] [c0010824] kernel_init+0x2c/0x148 [
> 80.604560] [c0002dbff8493e20] [c000b648]
> ret_from_kernel_thread+0x5c/0x74 [   80.604567] Instruction dump: [
> 80.604574] 392918e8 e949 e90a000a e92a 80ea000c 1d080018
> 3908ffe8 7d094214 [   80.604586] 7fa94040 419d00dc e9490010 714a0088
> <2faa0008> 409e00ac e949 7fbe5040 [   89.047390] drmem: 249854
> LMB(s)
> 
> After:
> [   53.424702] drmem: initializing drmem v2
> [   53.898813] drmem: 249854 LMB(s)
> 
> lmb_set_nid() is called from dlpar_lmb_add() so this patch will also
> improve memory hot-add speeds on big machines.
> 
> Signed-off-by: Scott Cheloha 
> ---
>  arch/powerpc/include/asm/drmem.h |  1 +
>  arch/powerpc/mm/drmem.c  | 24 
>  arch/powerpc/mm/numa.c   | 29 ++---
>  3 files changed, 35 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
> index 3d76e1c388c2..90a5a9ad872b 100644
> --- a/arch/powerpc/include/asm/drmem.h
> +++ b/arch/powerpc/include/asm/drmem.h
> @@ -88,6 +88,7 @@ static inline bool drmem_lmb_reserved(struct
> drmem_lmb *lmb) return lmb->flags & DRMEM_LMB_RESERVED;
>  }
>  
> +struct drmem_lmb *drmem_find_lmb_by_base_addr(u64 base_addr);
>  u64 drmem_lmb_memory_max(void);
>  void __init walk_drmem_lmbs(struct device_node *dn,
> 			void (*func)(struct drmem_lmb *, const __be32 **));
> diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
> index 44bfbdae920c..62cbe79e3860 100644
> --- a/arch/powerpc/mm/drmem.c
> +++ b/arch/powerpc/mm/drmem.c
> @@ -11,12 +11,31 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/xarray.h>
>  #include 
>  #include 
>  
> +static DEFINE_XARRAY(drmem_lmb_xa_base_addr);
>  static struct drmem_lmb_info __drmem_info;
>  struct drmem_lmb_info *drmem_info = &__drmem_info;
>  
> +static int drmem_cache_lmb_for_lookup(struct drmem_lmb *lmb)
> +{
> + void *ret;
> +
> +	ret = xa_store(&drmem_lmb_xa_base_addr, lmb->base_addr, lmb,
> +GFP_KERNEL);
> + if (xa_is_err(ret))
> + return xa_err(ret);
> +
> + return 0;
> +}
> +
> +struct drmem_lmb *drmem_find_lmb_by_base_addr(u64 base_addr)
> +{
> +	return xa_load(&drmem_lmb_xa_base_addr, base_addr);
> +}
> +
>  u64 drmem_lmb_memory_max(void)
>  {
>   struct drmem_lmb *last_lmb;
> @@ -364,6 +383,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop)
>   for_each_drmem_lmb(lmb) {
> +		read_drconf_v1_cell(lmb, &prop);
> + if (drmem_cache_lmb_for_lookup(lmb) != 0)
> + 

[PATCH] powerpc/vdso: Fix vdso cpu truncation

2020-07-15 Thread Anton Blanchard
From: Milton Miller 

The code in vdso_cpu_init that exposes the cpu and numa node to
userspace via SPRG_VDSO incorrectly masks the cpu to 12 bits. This means
that any kernel running on a box with more than 4096 threads (NR_CPUS
advertises a limit of 8192 cpus) would expose userspace to two cpu
contexts running at the same time with the same cpu number.

Note: I'm not aware of any distro shipping a kernel with support for more
than 4096 threads today, nor of any system image that currently exceeds
4096 threads. Found via code browsing.
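
For context, the VDSO getcpu fast path splits this SPRG value back into
a cpu and a node. A rough C sketch of the decode with the fixed masks
(illustrative only; the real code is hand-written assembly, and
read_sprg_vdso() is a made-up accessor):

	unsigned long val = read_sprg_vdso();
	unsigned int cpu = val & 0xffff;		/* low 16 bits */
	unsigned int node = (val >> 16) & 0xffff;	/* next 16 bits */

With the old 0xfff mask, cpus 0 and 4096 would both be reported to
userspace as cpu 0.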

Fixes: 18ad51dd342a7eb09dbcd059d0b451b616d4dafc ("powerpc: Add VDSO version of getcpu")
Signed-off-by: Milton Miller 
Signed-off-by: Anton Blanchard 
---
 arch/powerpc/kernel/vdso.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index e0f4ba45b6cc..8dad44262e75 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -677,7 +677,7 @@ int vdso_getcpu_init(void)
node = cpu_to_node(cpu);
	WARN_ON_ONCE(node > 0xffff);
 
-   val = (cpu & 0xfff) | ((node & 0xffff) << 16);
+   val = (cpu & 0xffff) | ((node & 0xffff) << 16);
mtspr(SPRN_SPRG_VDSO_WRITE, val);
get_paca()->sprg_vdso = val;
 
-- 
2.26.2



Re: [PATCH] pseries: Fix 64 bit logical memory block panic

2020-07-15 Thread Anton Blanchard
Hi Aneesh,

> > Booting with a 4GB LMB size causes us to panic:
> >
> >   qemu-system-ppc64: OS terminated: OS panic:
> >   Memory block size not suitable: 0x0
> >
> > Fix pseries_memory_block_size() to handle 64 bit LMBs.

> We need similar changes at more places?

I agree. I wanted to get a minimal and tested fix (using QEMU) that
could make it into stable, so that the distros will at least boot.

Thanks,
Anton


[PATCH] pseries: Fix 64 bit logical memory block panic

2020-07-14 Thread Anton Blanchard
Booting with a 4GB LMB size causes us to panic:

  qemu-system-ppc64: OS terminated: OS panic:
  Memory block size not suitable: 0x0

Fix pseries_memory_block_size() to handle 64 bit LMBs.
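
To illustrate the truncation (a sketch, not part of the patch): a 4GB
LMB size is 0x100000000, which does not fit in 32 bits.

	unsigned int bad = 0x100000000ULL;	/* truncates to 0x0 */
	uint64_t good = 0x100000000ULL;		/* keeps the full 4GB */

The zero block size is what trips the "Memory block size not suitable:
0x0" panic above.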

Cc: sta...@vger.kernel.org
Signed-off-by: Anton Blanchard 
---
 arch/powerpc/platforms/pseries/hotplug-memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 5ace2f9a277e..6574ac33e887 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -27,7 +27,7 @@ static bool rtas_hp_event;
 unsigned long pseries_memory_block_size(void)
 {
struct device_node *np;
-   unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE;
+   uint64_t memblock_size = MIN_MEMORY_BLOCK_SIZE;
struct resource r;
 
np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-- 
2.26.2



[PATCH] powerpc: Add cputime_to_nsecs()

2020-07-13 Thread Anton Blanchard
Generic code has a wrapper to implement cputime_to_nsecs() on top of
cputime_to_usecs() but we can easily return the full nanosecond
resolution directly.
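
For reference, the generic wrapper this bypasses looks roughly like the
following (quoted from memory of the generic cputime header, so treat
it as a sketch):

	#ifndef cputime_to_nsecs
	#define cputime_to_nsecs(__ct)	\
		(cputime_to_usecs(__ct) * NSEC_PER_USEC)
	#endif

Defining cputime_to_nsecs() in the arch header makes that fallback drop
out, so the timebase is converted to nanoseconds in a single step.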

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/include/asm/cputime.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 0fccd5ea1e9a..9335b93924b4 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -36,6 +36,8 @@ static inline unsigned long cputime_to_usecs(const cputime_t ct)
return mulhdu((__force u64) ct, __cputime_usec_factor);
 }
 
+#define cputime_to_nsecs(cputime) tb_to_ns((__force u64)cputime)
+
 /*
  * PPC64 uses PACA which is task independent for storing accounting data while
  * PPC32 uses struct thread_info, therefore at task switch the accounting data
-- 
2.26.2



Re: [RFC][PATCH] avoid refcounting the lazy tlb mm struct

2020-07-09 Thread Anton Blanchard
Hi Nick,

> On big systems, the mm refcount can become highly contended when doing
> a lot of context switching with threaded applications (particularly
> switching between the idle thread and an application thread).
> 
> Not doing lazy tlb at all slows switching down quite a bit, so I
> wonder if we can avoid the refcount for the lazy tlb, but have
> __mmdrop() IPI all CPUs that might be using this mm lazily.
> 
> This patch has only had light testing so far, but seems to work okay.

I tested this patch on a large POWER8 system with 1536 hardware threads.
I can create a worst case situation for mm refcounting by using
the threaded context switch test in will-it-scale set to half the
number of available CPUs (768).

With that workload the patch improves the context switch rate by 118x!

Tested-by: Anton Blanchard 

Thanks,
Anton
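
To make the quoted patch (which is truncated below) easier to follow:
the core idea is that instead of lazy tlb CPUs pinning the mm with a
reference count, __mmdrop() IPIs every CPU that may still be using the
mm as its lazy tlb. A rough sketch of that shape, reusing the
do_shoot_lazy_tlb() helper from the patch (the exact call site here is
my assumption, not code from the patch):

	static void shoot_lazy_tlbs(struct mm_struct *mm)
	{
		if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
			on_each_cpu_mask(mm_cpumask(mm),
					 do_shoot_lazy_tlb, mm, 1);
	}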

> diff --git a/arch/Kconfig b/arch/Kconfig
> index 8cc35dc556c7..69ea7172db3d 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -411,6 +411,16 @@ config MMU_GATHER_NO_GATHER
>   bool
>   depends on MMU_GATHER_TABLE_FREE
>  
> +config MMU_LAZY_TLB_SHOOTDOWN
> + bool
> + help
> +	  Instead of refcounting the "lazy tlb" mm struct, which can cause
> +	  contention with multi-threaded apps on large multiprocessor systems,
> +	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
> +	  switch to init_mm if they were using the to-be-freed mm as the lazy
> +	  tlb. Architectures which do not track all possible lazy tlb CPUs in
> +	  mm_cpumask can not use this (without modification).
> +
>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>   bool
>  
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 920c4e3ca4ef..24ac85c868db 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -225,6 +225,7 @@ config PPC
>   select HAVE_PERF_USER_STACK_DUMP
>   select MMU_GATHER_RCU_TABLE_FREE
>   select MMU_GATHER_PAGE_SIZE
> + select MMU_LAZY_TLB_SHOOTDOWN
>   select HAVE_REGS_AND_STACK_ACCESS_API
>   select HAVE_RELIABLE_STACKTRACE if
> PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index b5cc9b23cf02..52730629b3eb 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -652,10 +652,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
>* Must be a kernel thread because sender is
> single-threaded. */
>   BUG_ON(current->mm);
> -	mmgrab(&init_mm);
> +	mmgrab_lazy_tlb(&init_mm);
>  	switch_mm(mm, &init_mm, current);
>  	current->active_mm = &init_mm;
> - mmdrop(mm);
> + mmdrop_lazy_tlb(mm);
>   }
>   _tlbiel_pid(pid, RIC_FLUSH_ALL);
>  }
> diff --git a/fs/exec.c b/fs/exec.c
> index e6e8a9a70327..6c96c8feba1f 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1119,7 +1119,7 @@ static int exec_mmap(struct mm_struct *mm)
>   mmput(old_mm);
>   return 0;
>   }
> - mmdrop(active_mm);
> + mmdrop_lazy_tlb(active_mm);
>   return 0;
>  }
>  
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 480a4d1b7dd8..ef28059086a1 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -51,6 +51,25 @@ static inline void mmdrop(struct mm_struct *mm)
>  
>  void mmdrop(struct mm_struct *mm);
>  
> +static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
> +{
> + if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> + mmgrab(mm);
> +}
> +
> +static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
> +{
> + if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> + mmdrop(mm);
> +}
> +
> +static inline void mmdrop_lazy_tlb_smp_mb(struct mm_struct *mm)
> +{
> + mmdrop_lazy_tlb(mm);
> + if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN))
> + smp_mb();
> +}
> +
>  /*
>   * This has to be called after a get_task_mm()/mmget_not_zero()
>   * followed by taking the mmap_lock for writing before modifying the
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 142b23645d82..e3f1039cee9f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -685,6 +685,34 @@ static void check_mm(struct mm_struct *mm)
>  #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
>  #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
> +static void do_shoot_lazy_tlb(void *arg)
> +{
> + struct mm_struct *mm = arg;
> +
> + if (current->active_mm == mm

[PATCH] xmon: Reset RCU and soft lockup watchdogs

2020-06-29 Thread Anton Blanchard
I'm seeing RCU warnings when exiting xmon. xmon resets the NMI watchdog,
but does nothing with the RCU stall or soft lockup watchdogs. Add a
helper function that handles all three.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/xmon/xmon.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 7efe4bc3ccf6..d27944e38b04 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -481,6 +481,13 @@ static inline int unrecoverable_excp(struct pt_regs *regs)
 #endif
 }
 
+static void xmon_touch_watchdogs(void)
+{
+   touch_softlockup_watchdog_sync();
+   rcu_cpu_stall_reset();
+   touch_nmi_watchdog();
+}
+
 static int xmon_core(struct pt_regs *regs, int fromipi)
 {
int cmd = 0;
@@ -718,7 +725,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
else
insert_cpu_bpts();
 
-   touch_nmi_watchdog();
+   xmon_touch_watchdogs();
local_irq_restore(flags);
 
return cmd != 'X' && cmd != EOF;
-- 
2.26.2



Re: [PATCH] powerpc/64s/radix: Fix !SMP build

2020-03-05 Thread Anton Blanchard
Thanks Nick,

> Signed-off-by: Nicholas Piggin 

Tested-by: Anton Blanchard 

> ---
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 1 +
>  arch/powerpc/mm/book3s64/radix_tlb.c | 7 ++-
>  2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index dd1bea45325c..2a9a0cd79490 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -26,6 +26,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index 03f43c924e00..758ade2c2b6e 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -587,6 +587,11 @@ void radix__local_flush_all_mm(struct mm_struct *mm)
>  	preempt_enable();
>  }
>  EXPORT_SYMBOL(radix__local_flush_all_mm);
> +
> +static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
> +{
> + radix__local_flush_all_mm(mm);
> +}
>  #endif /* CONFIG_SMP */
>  
>  void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
> @@ -777,7 +782,7 @@ void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
>  EXPORT_SYMBOL(radix__flush_tlb_page);
>  #else /* CONFIG_SMP */
> -#define radix__flush_all_mm radix__local_flush_all_mm
> +static inline void exit_flush_lazy_tlbs(struct mm_struct *mm) { }
>  #endif /* CONFIG_SMP */
>  
>  static void do_tlbiel_kernel(void *info)



[PATCH] powerpc/vdso: Fix multiple issues with sys_call_table

2020-03-05 Thread Anton Blanchard
The VDSO exports a bitmap of valid syscalls. vdso_setup_syscall_map()
sets this up, but there are both little and big endian bugs. The issue
is with:

   if (sys_call_table[i] != sys_ni_syscall)

On little endian, instead of comparing pointers to the two functions,
we compare the first two instructions of each function. If a function
happens to have the same first two instructions as sys_ni_syscall, then
we have a spurious match and mark the instruction as not implemented.
Fix this by removing the inline declarations.

On big endian we have a further issue where sys_ni_syscall is a function
descriptor and sys_call_table[] holds pointers to the instruction text.
Fix this by using dereference_kernel_function_descriptor().
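
For readers unfamiliar with the big endian (ELFv1) ABI: a function
symbol there refers to a descriptor in the .opd section rather than to
the code itself, roughly laid out as follows (an approximate sketch,
not code from the patch):

	struct func_desc {
		unsigned long addr;	/* entry point of the function text */
		unsigned long toc;	/* TOC pointer for the function */
		unsigned long env;	/* environment pointer, unused by C */
	};

dereference_kernel_function_descriptor() turns the descriptor address
into the entry point, which is comparable with the sys_call_table
entries.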

Cc: sta...@vger.kernel.org
Signed-off-by: Anton Blanchard 

---
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index b9a108411c0d..d186b729026e 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include <linux/syscalls.h>
 
 #include 
 #include 
@@ -30,6 +31,7 @@
 #include 
 #include 
 #include 
+#include <asm/syscall.h>
 
 #undef DEBUG
 
@@ -644,19 +646,16 @@ static __init int vdso_setup(void)
 static void __init vdso_setup_syscall_map(void)
 {
unsigned int i;
-   extern unsigned long *sys_call_table;
-#ifdef CONFIG_PPC64
-   extern unsigned long *compat_sys_call_table;
-#endif
-   extern unsigned long sys_ni_syscall;
+   unsigned long ni_syscall;
 
+	ni_syscall = (unsigned long)dereference_kernel_function_descriptor(sys_ni_syscall);
 
for (i = 0; i < NR_syscalls; i++) {
 #ifdef CONFIG_PPC64
-   if (sys_call_table[i] != sys_ni_syscall)
+   if (sys_call_table[i] != ni_syscall)
vdso_data->syscall_map_64[i >> 5] |=
				0x80000000UL >> (i & 0x1f);
-   if (compat_sys_call_table[i] != sys_ni_syscall)
+   if (compat_sys_call_table[i] != ni_syscall)
vdso_data->syscall_map_32[i >> 5] |=
				0x80000000UL >> (i & 0x1f);
 #else /* CONFIG_PPC64 */


[PATCH] powerpc/configs: Disable latencytop

2019-06-03 Thread Anton Blanchard
latencytop adds almost 4kB to each and every task struct and as such
it doesn't deserve to be in our defconfigs.
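
For the record, the overhead comes from per-task state along these
lines (sketched from memory of sched.h in this era; exact sizes depend
on the config):

	#ifdef CONFIG_LATENCYTOP
	int latency_record_count;
	/* 32 records, each with a saved backtrace: close to 4kB per task */
	struct latency_record latency_record[LT_SAVECOUNT];
	#endif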

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/configs/g5_defconfig   | 1 -
 arch/powerpc/configs/gamecube_defconfig | 1 -
 arch/powerpc/configs/maple_defconfig| 1 -
 arch/powerpc/configs/pmac32_defconfig   | 1 -
 arch/powerpc/configs/powernv_defconfig  | 1 -
 arch/powerpc/configs/ppc64_defconfig| 1 -
 arch/powerpc/configs/ppc64e_defconfig   | 1 -
 arch/powerpc/configs/ppc6xx_defconfig   | 1 -
 arch/powerpc/configs/pseries_defconfig  | 1 -
 arch/powerpc/configs/wii_defconfig  | 1 -
 10 files changed, 10 deletions(-)

diff --git a/arch/powerpc/configs/g5_defconfig b/arch/powerpc/configs/g5_defconfig
index ceb3c770786f..8e9389d6c8ef 100644
--- a/arch/powerpc/configs/g5_defconfig
+++ b/arch/powerpc/configs/g5_defconfig
@@ -244,7 +244,6 @@ CONFIG_CRC_T10DIF=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_MUTEXES=y
-CONFIG_LATENCYTOP=y
 CONFIG_BOOTX_TEXT=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig
index 805b0f87653c..bfffdc4f1b73 100644
--- a/arch/powerpc/configs/gamecube_defconfig
+++ b/arch/powerpc/configs/gamecube_defconfig
@@ -91,7 +91,6 @@ CONFIG_CRC_CCITT=y
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_MUTEXES=y
-CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_DMA_API_DEBUG=y
 CONFIG_PPC_EARLY_DEBUG=y
diff --git a/arch/powerpc/configs/maple_defconfig b/arch/powerpc/configs/maple_defconfig
index c5f2005005d3..1c436fafb397 100644
--- a/arch/powerpc/configs/maple_defconfig
+++ b/arch/powerpc/configs/maple_defconfig
@@ -104,7 +104,6 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LATENCYTOP=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig
index 50b610b48914..8d632ceaea48 100644
--- a/arch/powerpc/configs/pmac32_defconfig
+++ b/arch/powerpc/configs/pmac32_defconfig
@@ -293,7 +293,6 @@ CONFIG_CRC_T10DIF=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_HUNG_TASK=y
-CONFIG_LATENCYTOP=y
 CONFIG_XMON=y
 CONFIG_XMON_DEFAULT=y
 CONFIG_BOOTX_TEXT=y
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index ef2ef98d3f28..1cf8ce18b4ca 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -317,7 +317,6 @@ CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_HARDLOCKUP_DETECTOR=y
-CONFIG_LATENCYTOP=y
 CONFIG_FUNCTION_TRACER=y
 CONFIG_SCHED_TRACER=y
 CONFIG_FTRACE_SYSCALLS=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 91fdb619b484..96d695ffe074 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -367,7 +367,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_DEBUG_MUTEXES=y
-CONFIG_LATENCYTOP=y
 CONFIG_FUNCTION_TRACER=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/configs/ppc64e_defconfig b/arch/powerpc/configs/ppc64e_defconfig
index 41d85cb3c9a2..7e52d4658867 100644
--- a/arch/powerpc/configs/ppc64e_defconfig
+++ b/arch/powerpc/configs/ppc64e_defconfig
@@ -223,7 +223,6 @@ CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_DEBUG_MUTEXES=y
-CONFIG_LATENCYTOP=y
 CONFIG_IRQSOFF_TRACER=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig
index 7c6baf6df139..7bcdb4d1411c 100644
--- a/arch/powerpc/configs/ppc6xx_defconfig
+++ b/arch/powerpc/configs/ppc6xx_defconfig
@@ -1148,7 +1148,6 @@ CONFIG_FAIL_MAKE_REQUEST=y
 CONFIG_FAIL_IO_TIMEOUT=y
 CONFIG_FAULT_INJECTION_DEBUG_FS=y
 CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y
-CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_STACK_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 62e12f61a3b2..c8f5f281e367 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -290,7 +290,6 @@ CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_SOFTLOCKUP_DETECTOR=y
 CONFIG_HARDLOCKUP_DETECTOR=y
-CONFIG_LATENCYTOP=y
 CONFIG_FUNCTION_TRACER=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig
index f5c366b02828..d60c04a4708a 100644
--- a/arch/powerpc/configs/wii_defconfig
+++ b/arch/powerpc/configs/wii_defconfig
@@ -123,7 +123,6 @@ CONFIG_PRINTK_TIME=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_SPINLOCK=y
 CONFIG_DEBUG_MUTEXES=y
-CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y

Re: [PATCH] powerpc/time: Fix clockevent_decrementer initalisation for PR KVM

2018-10-20 Thread Anton Blanchard
On Fri, 19 Oct 2018 15:23:19 +1100
Michael Ellerman  wrote:

> In the recent commit 8b78fdb045de ("powerpc/time: Use
> clockevents_register_device(), fixing an issue with large
> decrementer") we changed the way we initialise the decrementer
> clockevent(s).
> 
> We no longer initialise the mult & shift values of
> decrementer_clockevent itself.
> 
> This has the effect of breaking PR KVM, because it uses those values
> in kvmppc_emulate_dec(). The symptom is guest kernels spin forever
> mid-way through boot.
> 
> For now fix it by assigning back to decrementer_clockevent the mult
> and shift values.

Thanks Michael, I missed that completely.

Acked-by: Anton Blanchard 

Anton

> 
> Fixes: 8b78fdb045de ("powerpc/time: Use clockevents_register_device(), fixing an issue with large decrementer")
> Signed-off-by: Michael Ellerman 
> ---
>  arch/powerpc/kernel/time.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 40868f3ee113..68e8f963d108 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -989,6 +989,10 @@ static void register_decrementer_clockevent(int cpu)
>  	printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
>  		    dec->name, dec->mult, dec->shift, cpu);
> +
> + /* Set values for KVM, see kvm_emulate_dec() */
> + decrementer_clockevent.mult = dec->mult;
> + decrementer_clockevent.shift = dec->shift;
>  }
>  
>  static void enable_large_decrementer(void)



[PATCH] powerpc: Add doorbell tracepoints

2018-10-04 Thread Anton Blanchard
When analysing sources of OS jitter, I noticed that doorbells cannot be
traced.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/include/asm/trace.h | 16 
 arch/powerpc/kernel/dbell.c  |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index d018e8602694..2f1f8cd7b6c8 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -54,6 +54,22 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit,
TP_ARGS(regs)
 );
 
+#ifdef CONFIG_PPC_DOORBELL
+DEFINE_EVENT(ppc64_interrupt_class, doorbell_entry,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs)
+);
+
+DEFINE_EVENT(ppc64_interrupt_class, doorbell_exit,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs)
+);
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 extern int hcall_tracepoint_regfunc(void);
 extern void hcall_tracepoint_unregfunc(void);
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index b6fe883b1016..5ec3b3835925 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include <asm/trace.h>
 
 #ifdef CONFIG_SMP
 
@@ -81,6 +82,7 @@ void doorbell_exception(struct pt_regs *regs)
struct pt_regs *old_regs = set_irq_regs(regs);
 
irq_enter();
+   trace_doorbell_entry(regs);
 
ppc_msgsync();
 
@@ -91,6 +93,7 @@ void doorbell_exception(struct pt_regs *regs)
 
smp_ipi_demux_relaxed(); /* already performed the barrier */
 
+   trace_doorbell_exit(regs);
irq_exit();
set_irq_regs(old_regs);
 }
-- 
2.17.1



Re: [PATCH] powerpc: Add doorbell tracepoints

2018-10-04 Thread Anton Blanchard
Hi Russell,

> snowpatch builds failed for this patch on all 64-bit configurations
> (ppc64e, ppc64 and ppc64le) with the following:

Thanks! Stupid bug on my part, need more quilt ref. Update to follow.

Anton

> arch/powerpc/kernel/dbell.c:85:9: error: undefined identifier 'trace_doorbell_entry'
> arch/powerpc/kernel/dbell.c:96:9: error: undefined identifier 'trace_doorbell_exit'
> ./arch/powerpc/include/asm/spinlock.h:171:9: warning: context imbalance in 'key_user_put' - unexpected unlock
> arch/powerpc/kernel/dbell.c: In function 'doorbell_exception':
> arch/powerpc/kernel/dbell.c:85:2: error: implicit declaration of function 'trace_doorbell_entry'; did you mean 'trace_irq_entry'? [-Werror=implicit-function-declaration]
>   trace_doorbell_entry(regs);
>   ^~~~
>   trace_irq_entry
> arch/powerpc/kernel/dbell.c:96:2: error: implicit declaration of function 'trace_doorbell_exit'; did you mean 'trace_irq_exit'? [-Werror=implicit-function-declaration]
>   trace_doorbell_exit(regs);
>   ^~~
>   trace_irq_exit
> cc1: all warnings being treated as errors
> scripts/Makefile.build:305: recipe for target 'arch/powerpc/kernel/dbell.o' failed
> make[1]: *** [arch/powerpc/kernel/dbell.o] Error 1
> Makefile:1060: recipe for target 'arch/powerpc/kernel' failed
> 
> So does something else need to check for CONFIG_PPC_DOORBELL maybe?
> 
> You can see the failures here:
> http://patchwork.ozlabs.org/patch/978088/ - output in build_new.log
> (I know it's not pretty in there yet, you can search for "Error 1" to
> find the build failure)
> 
> - Russell
> 
> 



[PATCH] powerpc: Add doorbell tracepoints

2018-10-02 Thread Anton Blanchard
When analysing sources of OS jitter, I noticed that doorbells cannot be
traced.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/include/asm/trace.h | 16 
 arch/powerpc/kernel/dbell.c  |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index d018e8602694..eb9aa0f1561e 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -54,6 +54,22 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit,
TP_ARGS(regs)
 );
 
+#ifdef CONFIG_PPC_DOORBELL
+DEFINE_EVENT(ppc64_interrupt_class, doorbell_exception_entry,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs)
+);
+
+DEFINE_EVENT(ppc64_interrupt_class, doorbell_exception_exit,
+
+   TP_PROTO(struct pt_regs *regs),
+
+   TP_ARGS(regs)
+);
+#endif
+
 #ifdef CONFIG_PPC_PSERIES
 extern int hcall_tracepoint_regfunc(void);
 extern void hcall_tracepoint_unregfunc(void);
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index b6fe883b1016..5ec3b3835925 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include <asm/trace.h>
 
 #ifdef CONFIG_SMP
 
@@ -81,6 +82,7 @@ void doorbell_exception(struct pt_regs *regs)
struct pt_regs *old_regs = set_irq_regs(regs);
 
irq_enter();
+   trace_doorbell_entry(regs);
 
ppc_msgsync();
 
@@ -91,6 +93,7 @@ void doorbell_exception(struct pt_regs *regs)
 
smp_ipi_demux_relaxed(); /* already performed the barrier */
 
+   trace_doorbell_exit(regs);
irq_exit();
set_irq_regs(old_regs);
 }
-- 
2.17.1



[PATCH 2/2] powerpc/time: Add set_state_oneshot_stopped decrementer callback

2018-10-01 Thread Anton Blanchard
If CONFIG_PPC_WATCHDOG is enabled we always cap the decrementer to
0x7fffffff:

	if (IS_ENABLED(CONFIG_PPC_WATCHDOG))
		set_dec(0x7fffffff);
	else
		set_dec(decrementer_max);

If there are no future events, we don't reprogram the decrementer
after this and we end up with 0x7fffffff even on a large decrementer
capable system.

As suggested by Nick, add a set_state_oneshot_stopped callback
so we program the decrementer with decrementer_max if there are
no future events.
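
For context, decrementer_shutdown() is not visible in the diff below;
from my reading of time.c it is roughly:

	static int decrementer_shutdown(struct clock_event_device *dev)
	{
		decrementer_set_next_event(decrementer_max, dev);
		return 0;
	}

so wiring it up to set_state_oneshot_stopped means the full
decrementer_max gets programmed whenever the core stops the tick
device.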

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/kernel/time.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 6a1f0a084ca3..40868f3ee113 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -111,6 +111,7 @@ struct clock_event_device decrementer_clockevent = {
.rating = 200,
.irq= 0,
.set_next_event = decrementer_set_next_event,
+   .set_state_oneshot_stopped = decrementer_shutdown,
.set_state_shutdown = decrementer_shutdown,
.tick_resume= decrementer_shutdown,
.features   = CLOCK_EVT_FEAT_ONESHOT |
-- 
2.17.1



[PATCH 1/2] powerpc/time: Use clockevents_register_device(), fixing an issue with large decrementer

2018-10-01 Thread Anton Blanchard
We currently cap the decrementer clockevent at 4 seconds, even on systems
with large decrementer support. Fix this by converting the code to use
clockevents_register_device() which calculates the upper bound based on
the max_delta passed in.
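
A note on what the core does with these arguments (my summary, not
part of the patch): the mult/shift pair is chosen from the tick
frequency such that

	/* ns -> ticks conversion used when programming an event */
	ticks = (ns * dev->mult) >> dev->shift;

and max_delta_ns is then derived from the max_delta ticks we pass in,
so a large decrementer automatically gets a correspondingly large
upper bound instead of the previous hard-coded ~4 seconds.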

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/kernel/time.c | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 70f145e02487..6a1f0a084ca3 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -984,10 +984,10 @@ static void register_decrementer_clockevent(int cpu)
*dec = decrementer_clockevent;
dec->cpumask = cpumask_of(cpu);
 
+   clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max);
+
printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
dec->name, dec->mult, dec->shift, cpu);
-
-   clockevents_register_device(dec);
 }
 
 static void enable_large_decrementer(void)
@@ -1035,18 +1035,7 @@ static void __init set_decrementer_max(void)
 
 static void __init init_decrementer_clockevent(void)
 {
-   int cpu = smp_processor_id();
-
-	clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4);
-
-	decrementer_clockevent.max_delta_ns =
-		clockevent_delta2ns(decrementer_max, &decrementer_clockevent);
-	decrementer_clockevent.max_delta_ticks = decrementer_max;
-	decrementer_clockevent.min_delta_ns =
-		clockevent_delta2ns(2, &decrementer_clockevent);
-   decrementer_clockevent.min_delta_ticks = 2;
-
-   register_decrementer_clockevent(cpu);
+   register_decrementer_clockevent(smp_processor_id());
 }
 
 void secondary_cpu_time_init(void)
-- 
2.17.1



Re: [PATCH 2/2] powerpc/time: Only cap decrementer when watchdog is enabled

2018-10-01 Thread Anton Blanchard
Hi Nick,

> Thanks for tracking this down. It's a fix for my breakage
> 
> a7cba02deced ("powerpc: allow soft-NMI watchdog to cover timer
> interrupts with large decrementers")
> 
> Taking another look... what I had expected here is the timer subsystem
> would have stopped the decrementer device after it processed the timer
> and found nothing left. And we should have set DEC to max at that
> time.
> 
> The above patch was really intended to only cover the timer interrupt
> itself locking up. I wonder if we need to add
> 
> .set_state_oneshot_stopped = decrementer_shutdown
> 
> In our decremementer clockevent device?

Thanks Nick, that looks much nicer, and passes my tests.

Anton


[PATCH 2/2] powerpc/time: Only cap decrementer when watchdog is enabled

2018-09-28 Thread Anton Blanchard
If CONFIG_PPC_WATCHDOG is enabled, we always cap the decrementer to
0x7fffffff. As suggested by Nick, add a run time check of the watchdog
cpumask, so if it is disabled we use the large decrementer.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/kernel/time.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 6a1f0a084ca3..3372019f52bd 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -60,6 +60,7 @@
 #include 
 #include 
 #include 
+#include <linux/nmi.h>
 #include 
 
 #include 
@@ -575,7 +576,8 @@ void timer_interrupt(struct pt_regs *regs)
 * 31 bits, which is about 4 seconds on most systems, which gives
 * the watchdog a chance of catching timer interrupt hard lockups.
 */
-   if (IS_ENABLED(CONFIG_PPC_WATCHDOG))
+   if (IS_ENABLED(CONFIG_PPC_WATCHDOG) &&
+   cpumask_test_cpu(smp_processor_id(), _cpumask))
set_dec(0x7fff);
else
set_dec(decrementer_max);
-- 
2.17.1



[PATCH 1/2] powerpc/time: Use clockevents_register_device(), fixing an issue with large decrementer

2018-09-28 Thread Anton Blanchard
We currently cap the decrementer clockevent at 4 seconds, even on systems
with large decrementer support. Fix this by converting the code to use
clockevents_register_device() which calculates the upper bound based on
the max_delta passed in.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/kernel/time.c | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 70f145e02487..6a1f0a084ca3 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -984,10 +984,10 @@ static void register_decrementer_clockevent(int cpu)
*dec = decrementer_clockevent;
dec->cpumask = cpumask_of(cpu);
 
+   clockevents_config_and_register(dec, ppc_tb_freq, 2, decrementer_max);
+
printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
dec->name, dec->mult, dec->shift, cpu);
-
-   clockevents_register_device(dec);
 }
 
 static void enable_large_decrementer(void)
@@ -1035,18 +1035,7 @@ static void __init set_decrementer_max(void)
 
 static void __init init_decrementer_clockevent(void)
 {
-   int cpu = smp_processor_id();
-
-   clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4);
-
-   decrementer_clockevent.max_delta_ns =
-   clockevent_delta2ns(decrementer_max, &decrementer_clockevent);
-   decrementer_clockevent.max_delta_ticks = decrementer_max;
-   decrementer_clockevent.min_delta_ns =
-   clockevent_delta2ns(2, &decrementer_clockevent);
-   decrementer_clockevent.min_delta_ticks = 2;
-
-   register_decrementer_clockevent(cpu);
+   register_decrementer_clockevent(smp_processor_id());
 }
 
 void secondary_cpu_time_init(void)
-- 
2.17.1



Re: [PATCH] powerpc/Makefiles: Fix clang/llvm build

2018-08-20 Thread Anton Blanchard
Hi Michael,

> This breaks GCC 4.6.3 at least, which we still support:
> 
>   Assembler messages:
>   Error: invalid switch -mpower8
>   Error: unrecognized option -mpower8
>   ../scripts/mod/empty.c:1:0: fatal error: error closing -: Broken
> pipe

Yuck. We have POWER8 instructions in our assembly code, and a toolchain
that doesn't understand the -mpower8 flag, but has support for
power8 instructions.

This is what I see on clang with -mpower7:

/tmp/sstep-2afa55.s:7584: Error: unrecognized opcode: `lbarx'
/tmp/sstep-2afa55.s:7626: Error: unrecognized opcode: `stbcx.'
/tmp/sstep-2afa55.s:8077: Error: unrecognized opcode: `lharx'
/tmp/sstep-2afa55.s:8140: Error: unrecognized opcode: `stbcx.'

Nick: any suggestions?

Thanks,
Anton


[PATCH] powerpc/64: Remove static branch hints from memset()

2018-08-20 Thread Anton Blanchard
From: Anton Blanchard 

Static branch hints override dynamic branch prediction on recent
POWER CPUs. We should only use them when we are overwhelmingly
sure of the direction.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/lib/mem_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S
index ec531de6..3c3be02f33b7 100644
--- a/arch/powerpc/lib/mem_64.S
+++ b/arch/powerpc/lib/mem_64.S
@@ -40,7 +40,7 @@ _GLOBAL(memset)
 .Lms:  PPC_MTOCRF(1,r0)
mr  r6,r3
blt cr1,8f
-   beq+    3f  /* if already 8-byte aligned */
+   beq     3f  /* if already 8-byte aligned */
    subf    r5,r0,r5
bf  31,1f
stb r4,0(r6)
@@ -85,7 +85,7 @@ _GLOBAL(memset)
addi    r6,r6,8
 8: cmpwi   r5,0
PPC_MTOCRF(1,r5)
-   beqlr+
+   beqlr
bf  29,9f
stw r4,0(r6)
addi    r6,r6,4
-- 
2.17.1



[PATCH] powerpc/Makefiles: Fix clang/llvm build

2018-08-20 Thread Anton Blanchard
From: Anton Blanchard 

Commit 15a3204d24a3 ("powerpc/64s: Set assembler machine type to POWER4")
passes -mpower4 to the assembler. We have more recent instructions in our
assembly files, but gas permits them. The clang/llvm integrated assembler
is more strict, and we get a build failure.

Fix this by calling the assembler with -mcpu=power8

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 8397c7bd5880..4d9c01df0dec 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -238,7 +238,7 @@ cpu-as-$(CONFIG_4xx)       += -Wa,-m405
 cpu-as-$(CONFIG_ALTIVEC)   += $(call as-option,-Wa$(comma)-maltivec)
 cpu-as-$(CONFIG_E200)  += -Wa,-me200
 cpu-as-$(CONFIG_E500)  += -Wa,-me500
-cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4
+cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower8
 cpu-as-$(CONFIG_PPC_E500MC)    += $(call as-option,-Wa$(comma)-me500mc)
 
 KBUILD_AFLAGS += $(cpu-as-y)
-- 
2.17.1



Re: [PATCH] powerpc/pseries: hcall_exit tracepoint retval should be signed

2018-05-08 Thread Anton Blanchard
Hi Michael,

> The hcall_exit() tracepoint has retval defined as unsigned long. That
> leads to humours results like:
> 
>   bash-3686  [009] d..2   854.134094: hcall_entry: opcode=24
>   bash-3686  [009] d..2   854.134095: hcall_exit: opcode=24
> retval=18446744073709551609
> 
> It's normal for some hcalls to return negative values, displaying them
> as unsigned isn't very helpful. So change it to signed.
> 
>   bash-3711  [001] d..2   471.691008: hcall_entry: opcode=24
>   bash-3711  [001] d..2   471.691008: hcall_exit: opcode=24 retval=-7
> 
> Which can be more easily compared to H_NOT_FOUND in hvcall.h

Much nicer.

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> Signed-off-by: Michael Ellerman <m...@ellerman.id.au>
> ---
>  arch/powerpc/include/asm/asm-prototypes.h| 3 +--
>  arch/powerpc/include/asm/trace.h | 7 +++----
>  arch/powerpc/platforms/pseries/hvCall_inst.c | 2 +-
>  arch/powerpc/platforms/pseries/lpar.c| 3 +--
>  4 files changed, 6 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
> index d9713ad62e3c..068760d61e7e 100644
> --- a/arch/powerpc/include/asm/asm-prototypes.h
> +++ b/arch/powerpc/include/asm/asm-prototypes.h
> @@ -36,8 +36,7 @@ void kexec_copy_flush(struct kimage *image);
>  /* pseries hcall tracing */
>  extern struct static_key hcall_tracepoint_key;
>  void __trace_hcall_entry(unsigned long opcode, unsigned long *args);
> -void __trace_hcall_exit(long opcode, unsigned long retval,
> -                        unsigned long *retbuf);
> +void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf);
>  /* OPAL tracing */
>  #ifdef HAVE_JUMP_LABEL
>  extern struct static_key opal_tracepoint_key;
> diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
> index 33f3b479138b..d018e8602694 100644
> --- a/arch/powerpc/include/asm/trace.h
> +++ b/arch/powerpc/include/asm/trace.h
> @@ -81,8 +81,7 @@ TRACE_EVENT_FN_COND(hcall_entry,
>  
>  TRACE_EVENT_FN_COND(hcall_exit,
>  
> -     TP_PROTO(unsigned long opcode, unsigned long retval,
> -              unsigned long *retbuf),
> +     TP_PROTO(unsigned long opcode, long retval, unsigned long *retbuf),
>   TP_ARGS(opcode, retval, retbuf),
>  
> @@ -90,7 +89,7 @@ TRACE_EVENT_FN_COND(hcall_exit,
>  
>   TP_STRUCT__entry(
>   __field(unsigned long, opcode)
> - __field(unsigned long, retval)
> + __field(long, retval)
>   ),
>  
>   TP_fast_assign(
> @@ -98,7 +97,7 @@ TRACE_EVENT_FN_COND(hcall_exit,
>   __entry->retval = retval;
>   ),
>  
> -     TP_printk("opcode=%lu retval=%lu", __entry->opcode, __entry->retval),
> +     TP_printk("opcode=%lu retval=%ld", __entry->opcode, __entry->retval),
>   hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc
>  );
> diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
> index 89b7ce807e70..6da320c786cd 100644
> --- a/arch/powerpc/platforms/pseries/hvCall_inst.c
> +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
> @@ -125,7 +125,7 @@ static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long
>       h->purr_start = mfspr(SPRN_PURR);
>  }
>
> -static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long retval,
> +static void probe_hcall_exit(void *ignored, unsigned long opcode, long retval,
>                              unsigned long *retbuf)
>  {
>   struct hcall_stats *h;
> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
> index adb996ed51e1..5a392e40f3d2 100644
> --- a/arch/powerpc/platforms/pseries/lpar.c
> +++ b/arch/powerpc/platforms/pseries/lpar.c
> @@ -902,8 +902,7 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
>       local_irq_restore(flags);
>  }
>  
> -void __trace_hcall_exit(long opcode, unsigned long retval,
> -                        unsigned long *retbuf)
> +void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
>  {
>   unsigned long flags;
>   unsigned int *depth;



[PATCH] powerpc/sstep: mullw should calculate a 64 bit signed result

2017-09-19 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

mullw should do a 32 bit signed multiply and create a 64 bit signed
result. It currently truncates the result to 32 bits.
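
To make the distinction concrete, here is a small userspace
demonstration of the two interpretations (a sketch, not the emulation
code itself; assumes an LP64 system):

#include <stdio.h>

int main(void)
{
        unsigned long ra = 0x80000000UL;        /* -2147483648 as a signed 32 bit value */
        unsigned long rb = 2;

        /* 32 x 32 -> 64 bit signed multiply, as mullw requires */
        long correct = (long)(int)ra * (int)rb;

        /* what the old code computed: result truncated to 32 bits */
        long truncated = (unsigned int)ra * (unsigned int)rb;

        printf("correct:   %ld\n", correct);    /* -4294967296 */
        printf("truncated: %ld\n", truncated);  /* 0 */
        return 0;
}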

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/lib/sstep.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index c4cda1afb49d..5e8418c28bd8 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1651,8 +1651,9 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
goto arith_done;
 
case 235:   /* mullw */
-   op->val = (unsigned int) regs->gpr[ra] *
-   (unsigned int) regs->gpr[rb];
+   op->val = (long)(int) regs->gpr[ra] *
+   (int) regs->gpr[rb];
+
goto arith_done;
 
case 266:   /* add */
-- 
2.11.0



[PATCH 2/2] powerpc/sstep: Fix issues with mcrf

2017-09-19 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

mcrf broke when we changed analyse_instr() to not modify the register
state. The instruction writes to the CR, so we need to store the result
in op->ccval, not op->val.

Fixes: 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs")
Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/lib/sstep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 9d72e5900320..c4cda1afb49d 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1513,10 +1513,10 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
op->type = COMPUTE + SETCC;
imm = 0xf0000000UL;
val = regs->gpr[rd];
-   op->val = regs->ccr;
+   op->ccval = regs->ccr;
for (sh = 0; sh < 8; ++sh) {
if (instr & (0x8 >> sh))
-   op->val = (op->val & ~imm) |
+   op->ccval = (op->ccval & ~imm) |
(val & imm);
imm >>= 4;
}
-- 
2.11.0



[PATCH 1/2] powerpc/sstep: Fix issues with set_cr0()

2017-09-19 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

set_cr0() broke when we changed analyse_instr() to not modify the
register state. Instead of looking at regs->gpr[x] which has not
been updated yet, we need to look at op->val.

Fixes: 3cdfcbfd32b9 ("powerpc: Change analyse_instr so it doesn't modify *regs")
Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/lib/sstep.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index fb9f58b868e7..9d72e5900320 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -944,9 +944,9 @@ NOKPROBE_SYMBOL(emulate_dcbz);
: "r" (addr), "i" (-EFAULT), "0" (err))
 
 static nokprobe_inline void set_cr0(const struct pt_regs *regs,
-   struct instruction_op *op, int rd)
+   struct instruction_op *op)
 {
-   long val = regs->gpr[rd];
+   long val = op->val;
 
op->type |= SETCC;
op->ccval = (regs->ccr & 0x0fff) | ((regs->xer >> 3) & 0x1000);
@@ -1326,7 +1326,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
case 13:        /* addic. */
imm = (short) instr;
add_with_carry(regs, op, rd, regs->gpr[ra], imm, 0);
-   set_cr0(regs, op, rd);
+   set_cr0(regs, op);
return 1;
 
case 14:        /* addi */
@@ -1397,13 +1397,13 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
case 28:        /* andi. */
op->val = regs->gpr[rd] & (unsigned short) instr;
-   set_cr0(regs, op, ra);
+   set_cr0(regs, op);
goto logical_done_nocc;
 
case 29:        /* andis. */
imm = (unsigned short) instr;
op->val = regs->gpr[rd] & (imm << 16);
-   set_cr0(regs, op, ra);
+   set_cr0(regs, op);
goto logical_done_nocc;
 
 #ifdef __powerpc64__
@@ -2526,7 +2526,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
  logical_done:
if (instr & 1)
-   set_cr0(regs, op, ra);
+   set_cr0(regs, op);
  logical_done_nocc:
op->reg = ra;
op->type |= SETREG;
@@ -2534,7 +2534,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 
  arith_done:
if (instr & 1)
-   set_cr0(regs, op, rd);
+   set_cr0(regs, op);
  compute_done:
op->reg = rd;
op->type |= SETREG;
-- 
2.11.0



Re: [PATCH] powerpc/powernv: Increase memory block size to 1GB on radix

2017-09-07 Thread Anton Blanchard
Hi Reza,

> I may be misunderstanding this, but what if we did something like x86 
> does? When trying to unplug a region smaller than the mapping, they
> fill that part of the pagetable with 0xFD instead of freeing the
> whole thing. Once the whole thing is 0xFD, free it.
> 
> See arch/x86/mm/init_64.c:remove_{pte,pmd,pud}_table()
> 
> ---%<---
>   memset((void *)addr, PAGE_INUSE, next - addr);
> 
>   page_addr = page_address(pte_page(*pte));
>   if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
>   ...
>   pte_clear(&init_mm, addr, pte);
>   ...
>   }
> ---%<---

But you only have 1GB ptes at this point, you'd need to start
instantiating a new level in the tree, and populate 2MB ptes.

That is what Ben is suggesting. I'm happy to go any way (fix hotplug
to handle this, or increase the memblock size on PowerNV to 1GB), I just
need a solution.

Anton


[PATCH] powerpc: Expose TSCR via sysfs

2017-09-07 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

The thread switch control register (TSCR) is a per core register
that configures how the CPU shares resources between SMT threads.

Exposing it via sysfs allows us to tune it at run time.
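
As a usage example, a minimal userspace sketch that reads the new
attribute (assuming it appears as /sys/devices/system/cpu/cpuN/tscr
and prints a hex value; both details are assumptions about this
patch's sysfs layout, not guarantees):

#include <stdio.h>

int main(void)
{
        unsigned long tscr;
        FILE *f = fopen("/sys/devices/system/cpu/cpu0/tscr", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fscanf(f, "%lx", &tscr) == 1)
                printf("TSCR: 0x%lx\n", tscr);
        fclose(f);
        return 0;
}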

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/kernel/sysfs.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 4437c70c7c2b..b60a441092b9 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -485,6 +485,7 @@ SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
 SYSFS_SPRSETUP(purr, SPRN_PURR);
 SYSFS_SPRSETUP(spurr, SPRN_SPURR);
 SYSFS_SPRSETUP(pir, SPRN_PIR);
+SYSFS_SPRSETUP(tscr, SPRN_TSCR);
 
 /*
   Lets only enable read for phyp resources and
@@ -495,6 +496,7 @@ static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
 static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
 static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
 static DEVICE_ATTR(pir, 0400, show_pir, NULL);
+static DEVICE_ATTR(tscr, 0600, show_tscr, store_tscr);
 
 /*
  * This is the system wide DSCR register default value. Any
@@ -774,6 +776,9 @@ static int register_cpu_online(unsigned int cpu)
 
if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
device_create_file(s, &dev_attr_pir);
+
+   if (cpu_has_feature(CPU_FTR_ARCH_206))
+   device_create_file(s, &dev_attr_tscr);
 #endif /* CONFIG_PPC64 */
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
@@ -856,6 +861,9 @@ static int unregister_cpu_online(unsigned int cpu)
 
if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
device_remove_file(s, &dev_attr_pir);
+
+   if (cpu_has_feature(CPU_FTR_ARCH_206))
+   device_remove_file(s, &dev_attr_tscr);
 #endif /* CONFIG_PPC64 */
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
-- 
2.11.0



Re: [PATCH] powerpc/powernv: Increase memory block size to 1GB on radix

2017-09-06 Thread Anton Blanchard
Hi,

> There is a similar issue being worked on w.r.t pseries.
> 
> https://lkml.kernel.org/r/1502357028-27465-1-git-send-email-bhar...@linux.vnet.ibm.com
> 
> The question is should we map these regions ? ie, we need to tell the 
> kernel memory region that we would like to hot unplug later so that
> we avoid doing kernel allocations from that. If we do that, then we
> can possibly map them via 2M size ?

But all of memory on PowerNV should be able to be hot unplugged, so
there are two options as I see it - either increase the memory block
size, or map everything with 2MB pages. 

Anton


[PATCH] powerpc/powernv: Increase memory block size to 1GB on radix

2017-09-06 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Memory hot unplug on PowerNV radix hosts is broken. Our memory block
size is 256MB but since we map the linear region with very large pages,
each pte we tear down maps 1GB.

A hot unplug of one 256MB memory block results in 768MB of memory
getting unintentionally unmapped. At this point we are likely to oops.

Fix this by increasing our memory block size to 1GB on PowerNV radix
hosts.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/platforms/powernv/setup.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..bbb73aa0eb8f 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -272,7 +272,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static unsigned long pnv_memory_block_size(void)
 {
-   return 256UL * 1024 * 1024;
+   /*
+* We map the kernel linear region with 1GB large pages on radix. For
+* memory hot unplug to work our memory block size must be at least
+* this size.
+*/
+   if (radix_enabled())
+   return 1UL * 1024 * 1024 * 1024;
+   else
+   return 256UL * 1024 * 1024;
 }
 #endif
 
-- 
2.11.0



Re: [PATCH] powerpc: Fix kernel crash in emulation of vector loads and stores

2017-09-03 Thread Anton Blanchard
Hi Paul,

> Commit 350779a29f11 ("powerpc: Handle most loads and stores in
> instruction emulation code", 2017-08-30) changed the register usage
> in get_vr and put_vr with the aim of leaving the register number in
> r3 untouched on return.  Unfortunately, r6 was not a good choice, as
> the callers as of 350779a29f11 store a MSR value in r6.  Then, in
> commit c22435a5f3d8 ("powerpc: Emulate FP/vector/VSX loads/stores
> correctly when regs not live", 2017-08-30), the saving and restoring
> of the MSR got moved into get_vr and put_vr.  Either way, the effect
> is that we put a value in MSR that only has the 0x3f8 bits non-zero,
> meaning that we are switching to 32-bit mode.  That leads to a crash
> like this:

Thanks! This fixed the issues I was seeing:

Tested-by: Anton Blanchard <an...@samba.org>

Anton


Re: [PATCH] POWER9 PMU stops after idle workaround

2017-08-01 Thread Anton Blanchard
Hi Nick,

> POWER9 DD2 PMU can stop after a state-loss idle in some conditions.
> 
> A solution is to set then clear MMCRA[60] after wake from state-loss
> idle.

Looks good.

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> Signed-off-by: Nicholas Piggin <npig...@gmail.com>
> ---
>  arch/powerpc/kernel/idle_book3s.S | 8 +++++++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
> index 516ebef905c0..e6252c5a57a4 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -460,11 +460,17 @@ pnv_restore_hyp_resource_arch300:
>   /*
>* Workaround for POWER9, if we lost resources, the ERAT
>* might have been mixed up and needs flushing. We also need
> -  * to reload MMCR0 (see comment above).
> +  * to reload MMCR0 (see comment above). We also need to set
> +  * then clear bit 60 in MMCRA to ensure the PMU starts
> running. */
>   blt cr3,1f
>   PPC_INVALIDATE_ERAT
>   ld  r1,PACAR1(r13)
> + mfspr   r4,SPRN_MMCRA
> + ori r4,r4,(1 << (63-60))
> + mtspr   SPRN_MMCRA,r4
> + xori    r4,r4,(1 << (63-60))
> + mtspr   SPRN_MMCRA,r4
>   ld  r4,_MMCR0(r1)
>   mtspr   SPRN_MMCR0,r4
>  1:



Re: [PATCH] powerpc/perf: Update default sdar_mode value for power9

2017-07-25 Thread Anton Blanchard
On Tue, 25 Jul 2017 11:05:51 +0530
Madhavan Srinivasan <ma...@linux.vnet.ibm.com> wrote:

> Commit 20dd4c624d251 ('powerpc/perf: Fix SDAR_MODE value for continous
> sampling on Power9') set the default sdar_mode value in
> MMCRA[SDAR_MODE] to be used as 0b01 (Update on TLB miss). And this
> value is set if sdar_mode from event is zero, or we are in continuous
> sampling mode in power9 dd1.
> 
> But it is preferred to have the sdar_mode value for power9 as
> 0b10 (Update on dcache miss) for better sampling updates instead
> of 0b01 (Update on TLB miss).

Acked-by: Anton Blanchard <an...@samba.org>

Using a bandwidth test case with a 1MB footprint, I profiled cycles and
chose TLB updates of the SDAR:

# perf record -d -e r0004001E:u ./bw2001 1M
  ^
  SDAR TLB

# perf report -D | grep PERF_RECORD_SAMPLE | sed 's/.*addr: //' | sort -u | wc -l
4

I get 4 unique addresses. If I ran with dcache misses:

# perf record -d -e r0008001E:u ./bw2001 1M
  ^
  SDAR dcache miss

# perf report -D|grep PERF_RECORD_SAMPLE| sed 's/.*addr: //'|sort -u | wc -l
5217

I get 5217 unique addresses. No surprises here, but it does show why
TLB misses is the wrong event to default to - we get very little useful
information out of it.

Anton

> Signed-off-by: Madhavan Srinivasan <ma...@linux.vnet.ibm.com>
> ---
>  arch/powerpc/perf/isa207-common.c | 2 +-
>  arch/powerpc/perf/isa207-common.h | 1 +
>  2 files changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
> index 3f3aa9a7063a..582ed2c9bc56 100644
> --- a/arch/powerpc/perf/isa207-common.c
> +++ b/arch/powerpc/perf/isa207-common.c
> @@ -99,7 +99,7 @@ static void mmcra_sdar_mode(u64 event, unsigned long *mmcra)
>       else if (!cpu_has_feature(CPU_FTR_POWER9_DD1) && p9_SDAR_MODE(event))
>               *mmcra |=  p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT;
>       else
> -             *mmcra |= MMCRA_SDAR_MODE_TLB;
> +             *mmcra |= MMCRA_SDAR_MODE_DCACHE;
>       } else
>               *mmcra |= MMCRA_SDAR_MODE_TLB;
>  }
> diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
> index 8acbe6e802c7..7a0228bf283c 100644
> --- a/arch/powerpc/perf/isa207-common.h
> +++ b/arch/powerpc/perf/isa207-common.h
> @@ -247,6 +247,7 @@
>  #define MMCRA_SDAR_MODE_SHIFT          42
>  #define MMCRA_SDAR_MODE_TLB            (1ull << MMCRA_SDAR_MODE_SHIFT)
>  #define MMCRA_SDAR_MODE_NO_UPDATES     ~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
> +#define MMCRA_SDAR_MODE_DCACHE         (2ull << MMCRA_SDAR_MODE_SHIFT)
>  #define MMCRA_IFM_SHIFT                30
>  #define MMCRA_THR_CTR_MANT_SHIFT       19
>  #define MMCRA_THR_CTR_MANT_MASK        0x7Ful



Re: [PATCH] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-20 Thread Anton Blanchard
Hi,

> > +static notrace int gettime_syscall_fallback(clockid_t clk_id,
> > +struct timespec *tp)
> > +{
> > +   register clockid_t id asm("r3") = clk_id;
> > +   register struct timespec *t asm("r4") = tp;
> > +   register int nr asm("r0") = __NR_clock_gettime;
> > +   register int ret asm("r3");  
> 
> I guess this works. I've always been a bit nervous about register
> variables TBH.

I don't think this works with clang unfortunately.

Anton


Re: [PATCH] POWER9 PMU interrupt after idle workaround

2017-07-14 Thread Anton Blanchard
Hi Nick,

> POWER9 DD2 can see spurious PMU interrupts after state-loss idle in
> some conditions.
> 
> A solution is to save and reload MMCR0 over state-loss idle.

Thanks, looks good.

Tested-by: Anton Blanchard <an...@samba.org>

Anton

> Signed-off-by: Nicholas Piggin <npig...@gmail.com>
> ---
>  arch/powerpc/kernel/idle_book3s.S | 15 ++++++++++++++-
>  1 file changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
> index 5adb390e773b..516ebef905c0 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -30,6 +30,7 @@
>   * Use unused space in the interrupt stack to save and restore
>   * registers for winkle support.
>   */
> +#define _MMCR0   GPR0
>  #define _SDR1    GPR3
>  #define _PTCR    GPR3
>  #define _RPR GPR4
> @@ -272,6 +273,14 @@ power_enter_stop:
>   b   pnv_wakeup_noloss
>  
>  .Lhandle_esl_ec_set:
> + /*
> +  * POWER9 DD2 can incorrectly set PMAO when waking up after a
> +  * state-loss idle. Saving and restoring MMCR0 over idle is a
> +  * workaround.
> +  */
> + mfspr   r4,SPRN_MMCR0
> + std r4,_MMCR0(r1)
> +
>  /*
>   * Check if the requested state is a deep idle state.
>   */
> @@ -450,10 +459,14 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
>  pnv_restore_hyp_resource_arch300:
>   /*
>* Workaround for POWER9, if we lost resources, the ERAT
> -  * might have been mixed up and needs flushing.
> +  * might have been mixed up and needs flushing. We also need
> +  * to reload MMCR0 (see comment above).
>*/
>   blt cr3,1f
>   PPC_INVALIDATE_ERAT
> + ld  r1,PACAR1(r13)
> + ld  r4,_MMCR0(r1)
> + mtspr   SPRN_MMCR0,r4
>  1:
>   /*
>* POWER ISA 3. Use PSSCR to determine if we



[PATCH] powerpc/perf: Fix POWER9 branch event

2017-06-18 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

The POWER9 branch event is wrong, and we always get a count of zero:

...
                 0      branches
              3844      branch-misses             # 0.00% of all branches

Replace it with the correct event.

Fixes: d89f473ff6f8 ("powerpc/perf: Fix PM_BRU_CMPL event code for power9")
Cc: sta...@vger.kernel.org
Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/perf/power9-events-list.h |  2 +-
 arch/powerpc/perf/power9-pmu.c | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h
index e9e417eefa59..ebec41aa7c7d 100644
--- a/arch/powerpc/perf/power9-events-list.h
+++ b/arch/powerpc/perf/power9-events-list.h
@@ -16,7 +16,7 @@ EVENT(PM_CYC, 0x0001e)
 EVENT(PM_ICT_NOSLOT_CYC,   0x100f8)
 EVENT(PM_CMPLU_STALL,  0x1e054)
 EVENT(PM_INST_CMPL,0x2)
-EVENT(PM_BRU_CMPL, 0x10012)
+EVENT(PM_BR_CMPL,  0x4d05e)
 EVENT(PM_BR_MPRED_CMPL,0x400f6)
 
 /* All L1 D cache load references counted at finish, gated by reject */
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index b9168163b2b2..90abbebf538f 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -125,7 +125,7 @@ GENERIC_EVENT_ATTR(cpu-cycles,  PM_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-frontend,PM_ICT_NOSLOT_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-backend, PM_CMPLU_STALL);
 GENERIC_EVENT_ATTR(instructions,   PM_INST_CMPL);
-GENERIC_EVENT_ATTR(branch-instructions,PM_BRU_CMPL);
+GENERIC_EVENT_ATTR(branch-instructions,PM_BR_CMPL);
 GENERIC_EVENT_ATTR(branch-misses,  PM_BR_MPRED_CMPL);
 GENERIC_EVENT_ATTR(cache-references,   PM_LD_REF_L1);
 GENERIC_EVENT_ATTR(cache-misses,   PM_LD_MISS_L1_FIN);
@@ -143,7 +143,7 @@ CACHE_EVENT_ATTR(LLC-prefetches,PM_L3_PREF_ALL);
 CACHE_EVENT_ATTR(LLC-store-misses, PM_L2_ST_MISS);
 CACHE_EVENT_ATTR(LLC-stores,   PM_L2_ST);
 CACHE_EVENT_ATTR(branch-load-misses,   PM_BR_MPRED_CMPL);
-CACHE_EVENT_ATTR(branch-loads, PM_BRU_CMPL);
+CACHE_EVENT_ATTR(branch-loads, PM_BR_CMPL);
 CACHE_EVENT_ATTR(dTLB-load-misses, PM_DTLB_MISS);
 CACHE_EVENT_ATTR(iTLB-load-misses, PM_ITLB_MISS);
 
@@ -152,7 +152,7 @@ static struct attribute *power9_events_attr[] = {
GENERIC_EVENT_PTR(PM_ICT_NOSLOT_CYC),
GENERIC_EVENT_PTR(PM_CMPLU_STALL),
GENERIC_EVENT_PTR(PM_INST_CMPL),
-   GENERIC_EVENT_PTR(PM_BRU_CMPL),
+   GENERIC_EVENT_PTR(PM_BR_CMPL),
GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
GENERIC_EVENT_PTR(PM_LD_REF_L1),
GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN),
@@ -169,7 +169,7 @@ static struct attribute *power9_events_attr[] = {
CACHE_EVENT_PTR(PM_L2_ST_MISS),
CACHE_EVENT_PTR(PM_L2_ST),
CACHE_EVENT_PTR(PM_BR_MPRED_CMPL),
-   CACHE_EVENT_PTR(PM_BRU_CMPL),
+   CACHE_EVENT_PTR(PM_BR_CMPL),
CACHE_EVENT_PTR(PM_DTLB_MISS),
CACHE_EVENT_PTR(PM_ITLB_MISS),
NULL
@@ -233,7 +233,7 @@ static int power9_generic_events_dd1[] = {
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =   PM_ICT_NOSLOT_CYC,
[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =PM_CMPLU_STALL,
[PERF_COUNT_HW_INSTRUCTIONS] =  PM_INST_DISP,
-   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =   PM_BRU_CMPL,
+   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =   PM_BR_CMPL,
[PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL,
[PERF_COUNT_HW_CACHE_REFERENCES] =  PM_LD_REF_L1,
[PERF_COUNT_HW_CACHE_MISSES] =  PM_LD_MISS_L1_FIN,
@@ -244,7 +244,7 @@ static int power9_generic_events[] = {
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =   PM_ICT_NOSLOT_CYC,
[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =PM_CMPLU_STALL,
[PERF_COUNT_HW_INSTRUCTIONS] =  PM_INST_CMPL,
-   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =   PM_BRU_CMPL,
+   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =   PM_BR_CMPL,
[PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL,
[PERF_COUNT_HW_CACHE_REFERENCES] =  PM_LD_REF_L1,
[PERF_COUNT_HW_CACHE_MISSES] =  PM_LD_MISS_L1_FIN,
@@ -370,7 +370,7 @@ static int power9_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
},
[ C(BPU) ] = {
[ C(OP_READ) ] = {
-   [ C(RESULT_ACCESS) ] = PM_BRU_CMPL,
+   [ C(RESULT_ACCESS) ] = PM_BR_CMPL,
 

[PATCH] powerpc/perf: Add POWER9 alternate PM_RUN_CYC and PM_RUN_INST_CMPL events

2017-06-18 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Similar to POWER8, POWER9 can count run cycles and run instructions
completed on more than one PMU.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/perf/power9-events-list.h | 4 ++++
 arch/powerpc/perf/power9-pmu.c | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h
index 71a6bfee5c02..e9e417eefa59 100644
--- a/arch/powerpc/perf/power9-events-list.h
+++ b/arch/powerpc/perf/power9-events-list.h
@@ -51,8 +51,12 @@ EVENT(PM_DTLB_MISS,  0x300fc)
 EVENT(PM_ITLB_MISS,0x400fc)
 /* Run_Instructions */
 EVENT(PM_RUN_INST_CMPL,0x500fa)
+/* Alternate event code for PM_RUN_INST_CMPL */
+EVENT(PM_RUN_INST_CMPL_ALT,0x400fa)
 /* Run_cycles */
 EVENT(PM_RUN_CYC,  0x600f4)
+/* Alternate event code for Run_cycles */
+EVENT(PM_RUN_CYC_ALT,  0x200f4)
 /* Instruction Dispatched */
 EVENT(PM_INST_DISP,0x200f2)
 EVENT(PM_INST_DISP_ALT,0x300f2)
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 018f8e90ac35..b9168163b2b2 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -107,6 +107,8 @@ extern struct attribute_group isa207_pmu_format_group;
 /* Table of alternatives, sorted by column 0 */
 static const unsigned int power9_event_alternatives[][MAX_ALT] = {
{ PM_INST_DISP, PM_INST_DISP_ALT },
+   { PM_RUN_CYC_ALT,   PM_RUN_CYC },
+   { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL },
 };
 
 static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[])
-- 
2.11.0



Re: [PATCH 1/2] powerpc: Fix emulation of mcrf in emulate_step()

2017-06-15 Thread Anton Blanchard
Hi Segher,

> On Thu, Jun 15, 2017 at 09:46:38AM +1000, Anton Blanchard wrote:
> > The mcrf emulation code was looking at the CR fields in the reverse
> > order. It also relied on reserved fields being zero which is
> > somewhat fragile, so fix that too.  
> 
> It masked out the reserved bits.  I find the new code to be less
> readable (but also more correct ;-) ). 

Thanks for that, not sure how I missed it :) I'll respin a simpler
patch.

> Maybe there should be (inline)
> helper function to insert/extract CR fields?

That would be nice, there are quite a few places that could use it.

Anton

> Segher
> 
> 
> > --- a/arch/powerpc/lib/sstep.c
> > +++ b/arch/powerpc/lib/sstep.c
> > @@ -683,8 +683,10 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
> >     case 19:
> >             switch ((instr >> 1) & 0x3ff) {
> >             case 0: /* mcrf */
> > -                   rd = (instr >> 21) & 0x1c;
> > -                   ra = (instr >> 16) & 0x1c;
> > +                   rd = 7 - ((instr >> 23) & 0x7);
> > +                   ra = 7 - ((instr >> 18) & 0x7);
> > +                   rd *= 4;
> > +                   ra *= 4;
> >                     val = (regs->ccr >> ra) & 0xf;
> >                     regs->ccr = (regs->ccr & ~(0xfUL << rd)) | (val << rd);
> >                     goto instr_done;
> 



[PATCH 2/2] powerpc: Fix emulation of mfocrf in emulate_step()

2017-06-14 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

From POWER4 onwards, mfocrf() only places the specified CR field into
the destination GPR, and the rest of it is set to 0. The PowerPC AS
from version 3.0 now requires this behaviour.

The emulation code currently puts the entire CR into the destination GPR.
Fix it.
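
As an illustration of the required semantics (a standalone userspace
sketch, not the emulation code itself): with one CR field selected,
only that 4 bit field survives in the destination.

#include <stdio.h>

/* CR field n occupies bits 4*(7-n) .. 4*(7-n)+3 of the 32 bit CR image */
static unsigned long mfocrf_result(unsigned int ccr, int field)
{
        unsigned long mask = 0xf0000000UL >> (4 * field);

        return ccr & mask;
}

int main(void)
{
        /* mfocrf rD,CR2 keeps only field 2: expect 0x00300000 */
        printf("0x%08lx\n", mfocrf_result(0x12345678, 2));
        return 0;
}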

Cc: sta...@vger.kernel.org
Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/lib/sstep.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index fb84f51b1f0b..ee33327686ae 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -966,6 +966,19 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 #endif
 
case 19:        /* mfcr */
+   if ((instr >> 20) & 1) {
+   imm = 0xf0000000UL;
+   for (sh = 0; sh < 8; ++sh) {
+   if (instr & (0x8 >> sh)) {
+   regs->gpr[rd] = regs->ccr & imm;
+   break;
+   }
+   imm >>= 4;
+   }
+
+   goto instr_done;
+   }
+
regs->gpr[rd] = regs->ccr;
regs->gpr[rd] &= 0xffffffffUL;
goto instr_done;
-- 
2.11.0



[PATCH 1/2] powerpc: Fix emulation of mcrf in emulate_step()

2017-06-14 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

The mcrf emulation code was looking at the CR fields in the reverse
order. It also relied on reserved fields being zero which is somewhat
fragile, so fix that too.
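
The numbering is the subtle part: CR field 0 is the most significant
nibble of the CR image. A standalone sketch (not the emulation code
itself) of extracting a field:

#include <stdio.h>

/* Extract CR field n: field 0 is the most significant nibble */
static unsigned int get_cr_field(unsigned int ccr, int n)
{
        return (ccr >> (4 * (7 - n))) & 0xf;
}

int main(void)
{
        unsigned int ccr = 0x12345678;

        /* expect CR0=1 (top nibble) and CR7=8 (bottom nibble) */
        printf("CR0=%x CR7=%x\n", get_cr_field(ccr, 0), get_cr_field(ccr, 7));
        return 0;
}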

Cc: sta...@vger.kernel.org
Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/lib/sstep.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 33117f8a0882..fb84f51b1f0b 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -683,8 +683,10 @@ int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
case 19:
switch ((instr >> 1) & 0x3ff) {
case 0: /* mcrf */
-   rd = (instr >> 21) & 0x1c;
-   ra = (instr >> 16) & 0x1c;
+   rd = 7 - ((instr >> 23) & 0x7);
+   ra = 7 - ((instr >> 18) & 0x7);
+   rd *= 4;
+   ra *= 4;
val = (regs->ccr >> ra) & 0xf;
regs->ccr = (regs->ccr & ~(0xfUL << rd)) | (val << rd);
goto instr_done;
-- 
2.11.0



Re: [PATCH] powerpc/kernel: improve FP and vector registers restoration

2017-06-03 Thread Anton Blanchard
On Sat, 3 Jun 2017 19:42:14 -0300
Breno Leitao <lei...@debian.org> wrote:

> Hi Anton,
> 
> On Sat, Jun 03, 2017 at 08:04:11AM +1000, Anton Blanchard wrote:
> > Hi Breno,
> >   
> > > Currently tsk->thread->load_vec and load_fp are not initialized
> > > during task creation, which leaves garbage in these variables
> > > (non-zero values).
> > 
> > Nice catch! It seems like we should zero load_tm too though?  
> 
> Yes, it seems we need to zero load_tm also, since it does not seem to
> be zeroed anywhere else.
> 
> But I did some tests, and load_tm is always zero after start_thread()
> is being called.
> 
> In fact, start_thread() is being called and pt_regs->load_tm is
> already zero since the function start.
> 
> I also wrote a SystemTap script[1] to investigate it better, and I've
> never seen a single load_tm != 0 in a my machine. I tested on both
> POWER8 bare metal and KVM guests. (load_vec and load_fp happened to
> have garbage all the time)
> 
> Any idea if this is just occasional event, or, if there is someone
> zeroing it in an obscure code?

Quite likely no one uses TM :) Try:

#include <unistd.h>

int main(void)
{
__builtin_tbegin(0);
execlp("/bin/true", "/bin/true", NULL);
}

Anton


Re: [PATCH] powerpc/kernel: improve FP and vector registers restoration

2017-06-02 Thread Anton Blanchard
Hi Breno,

> Currently tsk->thread->load_vec and load_fp are not initialized
> during task creation, which leaves garbage in these variables
> (non-zero values).

Nice catch! It seems like we should zero load_tm too though?

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> These variables will be checked later at restore_math() to validate
> if the FP and vectors are being utilized. Since these values might be
> non-zero, the restore_math() will continue to save the FP and vectors
> even if they were never utilized before the userspace application.
> load_fp and load_vec counters will then overflow and the FP and
> Altivec will be finally disabled, but before that condition is
> reached (counter overflow) several context switches restored FP and
> vector registers without need, causing a performance degradation.
> 
> Signed-off-by: Breno Leitao <lei...@debian.org>
> Signed-off-by: Gustavo Romero <gusbrom...@gmail.com>
> ---
>  arch/powerpc/kernel/process.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index baae104b16c7..a9435397eab8 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1666,6 +1666,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
>  #ifdef CONFIG_VSX
>       current->thread.used_vsr = 0;
>  #endif
> +     current->thread.load_fp = 0;
>       memset(&current->thread.fp_state, 0, sizeof(current->thread.fp_state));
>       current->thread.fp_save_area = NULL;
>  #ifdef CONFIG_ALTIVEC
> @@ -1674,6 +1675,7 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
>       current->thread.vr_save_area = NULL;
>       current->thread.vrsave = 0;
>       current->thread.used_vr = 0;
> +     current->thread.load_vec = 0;
>  #endif /* CONFIG_ALTIVEC */
>  #ifdef CONFIG_SPE
>       memset(current->thread.evr, 0, sizeof(current->thread.evr));



Re: [PATCH] powerpc/64: Use tick accounting by default

2017-05-21 Thread Anton Blanchard
Hi Michael,

> > ppc64 is the only architecture that turns on
> > VIRT_CPU_ACCOUNTING_NATIVE by default. The overhead of this option
> > is extremely high - a context switch microbenchmark using
> > sched_yield() is almost 20% slower.  
> 
> Running on what? It should all be nop'ed out unless you're on a
> platform that needs it (SPLPAR).

POWERNV native. We don't nop out all the vtime_account_* gunk do we? It
is all those functions that are a large part of the problem.

> > To get finer grained user/hardirq/softirq statistics, the
> > IRQ_TIME_ACCOUNTING option can be used instead, which has much lower
> > overhead.  
> 
> Can it? We don't select HAVE_IRQ_TIME_ACCOUNTING, so AFAICS it can't
> be enabled.

I have a separate patch to enable it.

> Doesn't dropping this mean we never count stolen time?

Perhaps. Do we have any applications left that care?

Anton


[PATCH] powerpc/64: Use tick accounting by default

2017-05-19 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

ppc64 is the only architecture that turns on VIRT_CPU_ACCOUNTING_NATIVE
by default. The overhead of this option is extremely high - a context
switch microbenchmark using sched_yield() is almost 20% slower.

To get finer grained user/hardirq/softirq statistics, the
IRQ_TIME_ACCOUNTING option can be used instead, which has much lower
overhead.

As such, disable this option by default. If a user really wants it,
they can still enable it manually.
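
For reference, a sketch of the kind of microbenchmark used for the
number above (the exact benchmark isn't included here; run two copies
pinned to the same CPU so each sched_yield() switches to the other):

#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
        unsigned long i, iterations;

        if (argc < 2) {
                fprintf(stderr, "usage: %s iterations\n", argv[0]);
                return 1;
        }

        iterations = atol(argv[1]);
        for (i = 0; i < iterations; i++)
                sched_yield();

        return 0;
}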

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 init/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 1d3475fc9496..a5c30acc1ede 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -342,8 +342,7 @@ config VIRT_CPU_ACCOUNTING
 
 choice
prompt "Cputime accounting"
-   default TICK_CPU_ACCOUNTING if !PPC64
-   default VIRT_CPU_ACCOUNTING_NATIVE if PPC64
+   default TICK_CPU_ACCOUNTING
 
 # Kind of a stub config for the pure tick based cputime accounting
 config TICK_CPU_ACCOUNTING
-- 
2.11.0



[PATCH] powerpc: Add HAVE_IRQ_TIME_ACCOUNTING

2017-05-18 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Allow us to enable IRQ_TIME_ACCOUNTING. Even though we currently
use VIRT_CPU_ACCOUNTING_NATIVE, that option is quite heavy
weight and IRQ_TIME_ACCOUNTING might be better in some cases.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f7c8f9972f61..d090275ace44 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -208,6 +208,7 @@ config PPC
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
+   select HAVE_IRQ_TIME_ACCOUNTING
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA
-- 
2.11.0



Re: [PATCH] powerpc: Tweak copy selection parameter in __copy_tofrom_user_power7()

2017-05-18 Thread Anton Blanchard
Hi Andrew,

> Experiments with the netperf benchmark indicated that the size
> selecting VMX-based copies in __copy_tofrom_user_power7() was
> suboptimal on POWER8. Measurements showed that parity was in the
> neighbourhood of 3328 bytes, rather than greater than 4096. The
> change gives a 1.5-2.0% improvement in performance for 4096-byte
> buffers, reducing the relative time spent in
> __copy_tofrom_user_power7() from approximately 7% to approximately 5%
> in the TCP_RR benchmark.

Nice work! All our context switch optimisations we've made over
the last year has likely moved the break even point for this.

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> Signed-off-by: Andrew Jeffery <and...@aj.id.au>
> ---
>  arch/powerpc/lib/copyuser_power7.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
> index a24b4039352c..706b7cc19846 100644
> --- a/arch/powerpc/lib/copyuser_power7.S
> +++ b/arch/powerpc/lib/copyuser_power7.S
> @@ -82,14 +82,14 @@
>  _GLOBAL(__copy_tofrom_user_power7)
>  #ifdef CONFIG_ALTIVEC
>   cmpldi  r5,16
> - cmpldi  cr1,r5,4096
> + cmpldi  cr1,r5,3328
>  
>   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
>   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
>   std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
>  
>   blt .Lshort_copy
> - bgt cr1,.Lvmx_copy
> + bge cr1,.Lvmx_copy
>  #else
>   cmpldi  r5,16
>  



Re: [PATCH v2] powerpc/xmon: Wait for secondaries before IPI'ing on system reset

2017-04-30 Thread Anton Blanchard
Hi Nick,

> An externally triggered system reset (e.g., via QEMU nmi command, or
> pseries reset button) can cause system reset interrupts on all CPUs.
> In case this causes xmon to be entered, it is undesirable for the
> primary (first) CPU into xmon to trigger an NMI IPI to others,
> because this may cause a nested system reset interrupt.
> 
> So spin for a time waiting for secondaries to join xmon before
> performing the NMI IPI, similarly to what the crash dump code does.

That reminds me of similar delays in our crash path:

/*
 * The primary CPU waits a while for all secondary CPUs to enter. This is to
 * avoid sending an IPI if the secondary CPUs are entering
 * crash_kexec_secondary on their own (eg via a system reset).
 *
 * The secondary timeout has to be longer than the primary. Both timeouts are
 * in milliseconds.
 */
#define PRIMARY_TIMEOUT 500
#define SECONDARY_TIMEOUT   1000

...

/*
 * If we came in via system reset, wait a while for the secondary
 * CPUs to enter.
 */
if (TRAP(regs) == 0x100)
mdelay(PRIMARY_TIMEOUT);

We might want to consolidate the juggling we do. Not sure if many people use
it, but kdb and kgdb may benefit if we make it common.

Anton


Re: [PATCH] Enabled pstore write for powerpc

2017-04-27 Thread Anton Blanchard
Hi Ankit,

> After commit c950fd6f201a the kernel registers pstore write based on
> the flags set. Pstore write for powerpc is broken as the flag
> (PSTORE_FLAGS_DMESG) is not set for the powerpc architecture. On panic,
> the kernel doesn't write the message to /fs/pstore/dmesg* (the entry
> doesn't get created at all).
> 
> This patch enables pstore write for powerpc architecture by setting
> PSTORE_FLAGS_DMESG flag.
> 
> Fixes: c950fd6f201a ("pstore: Split pstore fragile flags")

Ouch! We've used pstore to shoot customer bugs, so we should also mark
this for stable. Looks like 4.9 onwards?

Anton

> Signed-off-by: Ankit Kumar 
> ---
> 
>  arch/powerpc/kernel/nvram_64.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
> index d5e2b83..021db31 100644
> --- a/arch/powerpc/kernel/nvram_64.c
> +++ b/arch/powerpc/kernel/nvram_64.c
> @@ -561,6 +561,7 @@ static ssize_t nvram_pstore_read(u64 *id, enum
> pstore_type_id *type, static struct pstore_info nvram_pstore_info = {
>   .owner = THIS_MODULE,
>   .name = "nvram",
> + .flags = PSTORE_FLAGS_DMESG,
>   .open = nvram_pstore_open,
>   .read = nvram_pstore_read,
>   .write = nvram_pstore_write,



Re: [RFC PATCH] powerpc/mm/radix: Optimize tlbiel flush

2017-04-20 Thread Anton Blanchard
Hi Aneesh,

> For a page walk cache flush, we don't need to loop with set number.
> The set number is ignored with RIC=1 (pwc flush).
> 
> For RIC=2 (flush all), in order to flush implementation dependent
> caches, we can ignore the set number. Hence we do a RIC=2 flush with
> set number 0, which does both the tlb flush for set 0 and the
> implementation dependent cache flushes. This is then followed with a
> tlb flush for sets 1-127.

I've applied your two previous radix tlbiel optimisations as my
baseline, and using the simple exec microbenchmark in a7a9dcd882a6 I
see:

HPT:            100%
Radix baseline: 248%
Radix patched:   95%

So this patch fixes the large regression we see with radix, and is even
faster than our HPT number now. Nice work!

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> Signed-off-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
> ---
> Note: not yet tested.
> 
>  arch/powerpc/mm/tlb-radix.c | 28 +++++++++++++++++++-----
>  1 file changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index b68b5219cf45..b827aef38b90 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -43,12 +43,30 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
>   */
>  static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
>  {
> - int set;
> + int set = 0;
>  
>   asm volatile("ptesync": : :"memory");
> - for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
> - __tlbiel_pid(pid, set, ric);
> + if (ric == RIC_FLUSH_ALL) {
> + ric = RIC_FLUSH_TLB;
> + set = 1;
> + /* Use set 0 to flush all */
> + __tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
>   }
> +
> + for (; set < POWER9_TLB_SETS_RADIX ; set++)
> + __tlbiel_pid(pid, set, ric);
> +
> + asm volatile("ptesync": : :"memory");
> + asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
> +}
> +
> +static inline void _tlbiel_pwc(unsigned long pid)
> +{
> + asm volatile("ptesync": : :"memory");
> + /*
> +  * for PWC flush, we don't look at set number
> +  */
> + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
>   asm volatile("ptesync": : :"memory");
>   asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
>  }
> @@ -140,7 +158,7 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
>   pid = mm->context.id;
>   if (pid != MMU_NO_CONTEXT)
> - _tlbiel_pid(pid, RIC_FLUSH_PWC);
> + _tlbiel_pwc(pid);
>  
>   preempt_enable();
>  }
> @@ -222,7 +240,7 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
>               if (lock_tlbie)
>                       raw_spin_unlock(&native_tlbie_lock);
>   } else
> - _tlbiel_pid(pid, RIC_FLUSH_PWC);
> + _tlbiel_pwc(pid);
>  no_context:
>   preempt_enable();
>  }



Re: [PATCH] powerpc/configs: Enable function trace by default

2017-04-19 Thread Anton Blanchard
Hi Balbir,

> > FTRACE is quite CPU consuming, shouldn't it really be on by
> > default ?  
> 
> It does some work at boot to NOP out function entry points at _mcount
> locations. Is that what you are referring to? Or the overhead of the
> code in terms of size? Most distro kernels have tracing on by default.
> 
> The rest of the overhead is enablement based.

Unfortunately the overhead is somewhat high without
CONFIG_MPROFILE_KERNEL, and enabling that option will break old
toolchains. It would be great if we could automatically enable it based
on the toolchain.

Even with CONFIG_MPROFILE_KERNEL enabled, we aren't noping out the
redundant mflr at the start of each function.

Anton


Re: powerpc: Avoid taking a data miss on every userspace instruction miss

2017-04-12 Thread Anton Blanchard
Hi Balbir,

> FYI: The version you applied does not have checks for is_write

Yeah, we decided to do that in a follow up patch. I'm ok if someone
gets to it before me :)

Anton


Re: [PATCH 8/9] powerpc/mm: Wire up hpte_removebolted for powernv

2017-04-11 Thread Anton Blanchard
Hi Oliver,

> From: Rashmica Gupta <rashmic...@gmail.com>
> 
> Adds support for removing bolted (i.e kernel linear mapping) mappings
> on powernv. This is needed to support memory hot unplug operations
> which are required for the teardown of DAX/PMEM devices.
> 
> Cc: Rashmica Gupta <rashmic...@gmail.com>
> Cc: Anton Blanchard <an...@samba.org>
> Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
> ---
> Could the original author of this add their S-o-b? I pulled it out of
> Rashmica's memtrace patch, but I remember someone saying Anton wrote
> it originally.

I did.

Signed-off-by: Anton Blanchard <an...@samba.org>

Anton

> ---
>  arch/powerpc/mm/hash_native_64.c | 31 +++
>  1 file changed, 31 insertions(+)
> 
> diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
> index 65bb8f33b399..9ba91d4905a4 100644
> --- a/arch/powerpc/mm/hash_native_64.c
> +++ b/arch/powerpc/mm/hash_native_64.c
> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
>       tlbie(vpn, psize, psize, ssize, 0);
>  }
>  
> +/*
> + * Remove a bolted kernel entry. Memory hotplug uses this.
> + *
> + * No need to lock here because we should be the only user.
> + */
> +static int native_hpte_removebolted(unsigned long ea, int psize, int
> ssize) +{
> + unsigned long vpn;
> + unsigned long vsid;
> + long slot;
> + struct hash_pte *hptep;
> +
> + vsid = get_kernel_vsid(ea, ssize);
> + vpn = hpt_vpn(ea, vsid, ssize);
> +
> + slot = native_hpte_find(vpn, psize, ssize);
> + if (slot == -1)
> + return -ENOENT;
> +
> + hptep = htab_address + slot;
> +
> + /* Invalidate the hpte */
> + hptep->v = 0;
> +
> + /* Invalidate the TLB */
> + tlbie(vpn, psize, psize, ssize, 0);
> + return 0;
> +}
> +
> +
>  static void native_hpte_invalidate(unsigned long slot, unsigned long
> vpn, int bpsize, int apsize, int ssize, int local)
>  {
> @@ -725,6 +755,7 @@ void __init hpte_init_native(void)
>       mmu_hash_ops.hpte_invalidate    = native_hpte_invalidate;
>       mmu_hash_ops.hpte_updatepp      = native_hpte_updatepp;
>       mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
> +     mmu_hash_ops.hpte_removebolted  = native_hpte_removebolted;
>   mmu_hash_ops.hpte_insert= native_hpte_insert;
>   mmu_hash_ops.hpte_remove= native_hpte_remove;
>   mmu_hash_ops.hpte_clear_all = native_hpte_clear;



Re: [PATCH] ppc64/kprobe: Fix oops when kprobed on 'stdu' instruction

2017-04-10 Thread Anton Blanchard
Hi Ravi,

> If we set a kprobe on a 'stdu' instruction on powerpc64, we see a
> kernel OOPS:

Ouch! We should mark this for stable.

Anton


Re: [PATCH 1/2] powerpc/mm/radix: Don't do page walk cache flush when doing full mm flush

2017-04-09 Thread Anton Blanchard
On Sat,  1 Apr 2017 20:11:47 +0530
"Aneesh Kumar K.V" <aneesh.ku...@linux.vnet.ibm.com> wrote:

> For fullmm tlb flush, we do a flush with RIC_FLUSH_ALL which will
> invalidate all related caches (radix__tlb_flush()). Hence the pwc
> flush is not needed.

Thanks Aneesh. I see a 3x improvement in exec performance with these
2 patches.

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/tlb-radix.c | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
> 
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index 83dc1ccc2fa1..f3e58bd60d1a 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -129,6 +129,12 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
>  {
>   unsigned long pid;
>   struct mm_struct *mm = tlb->mm;
> + /*
> +  * If we are doing a full mm flush, we will do a tlb flush
> +  * with RIC_FLUSH_ALL later.
> +  */
> + if (tlb->fullmm)
> + return;
>  
>   preempt_disable();
>  
> @@ -195,6 +201,12 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
>       unsigned long pid;
>   struct mm_struct *mm = tlb->mm;
>  
> + /*
> +  * If we are doing a full mm flush, we will do a tlb flush
> +  * with RIC_FLUSH_ALL later.
> +  */
> + if (tlb->fullmm)
> + return;
>   preempt_disable();
>  
>   pid = mm->context.id;



Re: [PATCH 2/2] powerpc/mm/radix: Remove unnecessary ptesync

2017-04-09 Thread Anton Blanchard
On Sat,  1 Apr 2017 20:11:48 +0530
"Aneesh Kumar K.V" <aneesh.ku...@linux.vnet.ibm.com> wrote:

> For a tlbiel with pid, we need to issue tlbiel with set number
> encoded. We don't need to do ptesync for each of those. Instead we
> need one for the entire tlbiel pid operation.
> 
> Signed-off-by: Benjamin Herrenschmidt <b...@kernel.crashing.org>
> Signed-off-by: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>

Thanks Aneesh.

Acked-by: Anton Blanchard <an...@samba.org>

Anton

> ---
>  arch/powerpc/mm/tlb-radix.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> index f3e58bd60d1a..b68b5219cf45 100644
> --- a/arch/powerpc/mm/tlb-radix.c
> +++ b/arch/powerpc/mm/tlb-radix.c
> @@ -34,10 +34,8 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
>       prs = 1; /* process scoped */
>       r = 1;   /* radix format */
>  
> - asm volatile("ptesync": : :"memory");
>       asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
>                    : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
> - asm volatile("ptesync": : :"memory");
>  }
>  
>  /*
> @@ -47,9 +45,11 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
>  {
>   int set;
>  
> + asm volatile("ptesync": : :"memory");
>   for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
>   __tlbiel_pid(pid, set, ric);
>   }
> + asm volatile("ptesync": : :"memory");
>   asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
>  }
>  



Re: [PATCH] powerpc: Avoid taking a data miss on every userspace instruction miss

2017-04-03 Thread Anton Blanchard
Hi Christophe,

> > -   if (user_mode(regs))
> > +   if (!is_exec && user_mode(regs))  
> 
> Shouldn't it also check 'is_write' ?
> If it is a store, is_write should be set, shouldn't it ?

Thanks, Ben had the same suggestion. I'll add that further optimisation
in a subsequent patch.

Anton


[PATCH 3/3] cpuidle: powernv: Avoid a branch in the core snooze_loop() loop

2017-04-03 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

When in the snooze_loop() we want to take up the least amount of
resources. On my version of gcc (6.3), we end up with an extra
branch because it predicts snooze_timeout_en to be false, whereas it
is almost always true.

Use likely() to avoid the branch and be a little nicer to the
other non idle threads on the core.
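
For reference, likely() is just a compiler hint, defined in
include/linux/compiler.h as:

#define likely(x)       __builtin_expect(!!(x), 1)

gcc uses it to lay out the basic blocks so the expected-true path
falls through without a taken branch.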

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 drivers/cpuidle/cpuidle-powernv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 8c991c254b95..251a60bfa8ee 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -58,7 +58,7 @@ static int snooze_loop(struct cpuidle_device *dev,
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (snooze_timeout_en && get_tb() > snooze_exit_time)
+   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time)
break;
}
 
-- 
2.11.0



[PATCH 2/3] cpuidle: powernv: Don't continually set thread priority in snooze_loop()

2017-04-03 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

The powerpc64 kernel exception handlers have preserved thread priorities
for a long time now, so there is no need to continually set it.

Just set it once on entry and once exit.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 drivers/cpuidle/cpuidle-powernv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 9d9f164894eb..8c991c254b95 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -56,8 +56,8 @@ static int snooze_loop(struct cpuidle_device *dev,
 
snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
+   HMT_very_low();
while (!need_resched()) {
-   HMT_very_low();
if (snooze_timeout_en && get_tb() > snooze_exit_time)
break;
}
-- 
2.11.0



[PATCH 1/3] cpuidle: powernv: Don't bounce between low and very low thread priority

2017-04-03 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

The core of snooze_loop() continually bounces between low and very
low thread priority. Changing thread priorities is an expensive
operation that can negatively impact other threads on a core.

All CPUs that can run PowerNV support very low priority, so we can
avoid the change completely.
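
The priority changes themselves are just special nop forms; a sketch
of the definitions (the inline comments here are illustrative):

#define HMT_low()       asm volatile("or 1,1,1      # low priority")
#define HMT_very_low()  asm volatile("or 31,31,31   # very low priority")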

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 drivers/cpuidle/cpuidle-powernv.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index cda8f62d555b..9d9f164894eb 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -57,7 +57,6 @@ static int snooze_loop(struct cpuidle_device *dev,
snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
while (!need_resched()) {
-   HMT_low();
HMT_very_low();
if (snooze_timeout_en && get_tb() > snooze_exit_time)
break;
-- 
2.11.0



[PATCH] powerpc: Avoid taking a data miss on every userspace instruction miss

2017-04-03 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Early on in do_page_fault() we call store_updates_sp(), regardless of
the type of exception. For an instruction miss this doesn't make
sense, because we only use this information to detect if a data miss
is the result of a stack expansion instruction or not.

Worse still, it results in a data miss within every userspace
instruction miss handler, because we try and load the very instruction
we are about to install a pte for!

A simple exec microbenchmark runs 6% faster on POWER8 with this fix:

 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>

int main(int argc, char *argv[])
{
unsigned long left = atol(argv[1]);
char leftstr[16];

if (left-- == 0)
return 0;

sprintf(leftstr, "%ld", left);
execlp(argv[0], argv[0], leftstr, NULL);
perror("exec failed\n");

return 0;
}

Pass the number of iterations on the command line (eg 1) and time
how long it takes to execute.
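
For example (a hedged illustration: exec-bench.c is a stand-in filename
and the iteration count is arbitrary):

 $ gcc -O2 exec-bench.c -o exec-bench
 $ time ./exec-bench 10000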

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index fd6484fc2fa9..3a7d580fdc59 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -287,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
 * can result in fault, which will cause a deadlock when called with
 * mmap_sem held
 */
-   if (user_mode(regs))
+   if (!is_exec && user_mode(regs))
store_update_sp = store_updates_sp(regs);
 
if (user_mode(regs))
-- 
2.11.0



Re: [PATCH] powerpc: Add POWER9 copy_page() loop

2017-04-02 Thread Anton Blanchard
Hi Nick,

> > Good idea, I hadn't thought of embedding it all in a feature
> > section.  
> 
> It may not work currently because you get those ftr_alt_97 relocation
> errors with the "else" parts because relative branches to other code
> need to be direct and I think reachable from both places.

I thought about this a bit more. One potential issue will be
profiling - perf annotate will match the samples against the unpatched
code which could be very confusing.

Anton


[PATCH] powerpc: Avoid taking a data miss on every userspace instruction miss

2017-03-30 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Early on in do_page_fault() we call store_updates_sp(), regardless of
the type of exception. For an instruction miss this doesn't make
sense, because we only use this information to detect if a data miss
is the result of a stack expansion instruction or not.

Worse still, it results in a data miss within every userspace
instruction miss handler, because we try and load the very instruction
we are about to install a pte for!

A simple exec microbenchmark runs 6% faster on POWER8 with this fix:

int main(int argc, char *argv[])
{
unsigned long left = atol(argv[1]);
char leftstr[16];

if (left-- == 0)
return 0;

sprintf(leftstr, "%ld", left);
execlp(argv[0], argv[0], leftstr, NULL);
perror("exec failed\n");

return 0;
}

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index fd6484fc2fa9..3a7d580fdc59 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -287,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
address,
 * can result in fault, which will cause a deadlock when called with
 * mmap_sem held
 */
-   if (user_mode(regs))
+   if (!is_exec && user_mode(regs))
store_update_sp = store_updates_sp(regs);
 
if (user_mode(regs))
-- 
2.11.0



[PATCH 3/3] powerpc/configs: Re-enable POWER8 crc32c

2017-03-22 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

The config option for the POWER8 crc32c recently changed from
CONFIG_CRYPT_CRC32C_VPMSUM to CONFIG_CRYPTO_CRC32C_VPMSUM. Update
the configs.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/configs/powernv_defconfig | 2 +-
 arch/powerpc/configs/ppc64_defconfig   | 2 +-
 arch/powerpc/configs/pseries_defconfig | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/configs/powernv_defconfig 
b/arch/powerpc/configs/powernv_defconfig
index 4926d7f..0695ce0 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -306,7 +306,7 @@ CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index dfac33c..e353168f9 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -340,7 +340,7 @@ CONFIG_PPC_EARLY_DEBUG=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index 47f72c8..1a61aa2 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -303,7 +303,7 @@ CONFIG_XMON=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
-- 
2.9.3



[PATCH 2/3] powerpc/configs: Make oprofile a module

2017-03-22 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Most people use perf these days, so save about 31kB by making oprofile
a module.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/configs/powernv_defconfig | 2 +-
 arch/powerpc/configs/ppc64_defconfig   | 2 +-
 arch/powerpc/configs/pseries_defconfig | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/configs/powernv_defconfig 
b/arch/powerpc/configs/powernv_defconfig
index eb78c74..4926d7f 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -33,7 +33,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index bdca32e..dfac33c 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -19,7 +19,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index cd26091..47f72c8 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -34,7 +34,7 @@ CONFIG_BLK_DEV_INITRD=y
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
-- 
2.9.3



[PATCH 1/3] powerpc/configs: Re-enable ISO9660_FS as a built-in in 64 bit configs

2017-03-22 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

It turns out cloud-config uses ISO9660 filesystems to inject
configuration data into cloud images. The cloud-config failures when
ISO9660_FS is not enabled are cryptic, and building it in makes
mainline testing easier, so re-enable it.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/configs/powernv_defconfig | 2 +-
 arch/powerpc/configs/ppc64_defconfig   | 2 +-
 arch/powerpc/configs/pseries_defconfig | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/configs/powernv_defconfig 
b/arch/powerpc/configs/powernv_defconfig
index ac8b833..eb78c74 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -261,7 +261,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
diff --git a/arch/powerpc/configs/ppc64_defconfig 
b/arch/powerpc/configs/ppc64_defconfig
index 4f1288b..bdca32e 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -291,7 +291,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index 4ff68b7..cd26091 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -259,7 +259,7 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
-- 
2.9.3



Re: [PATCH] powerpc: Add POWER9 copy_page() loop

2017-03-20 Thread Anton Blanchard
Hi Nick,

> I've got a patch that makes alternate feature patching a bit
> more flexible and not hit relocation limits when using big "else"
> parts. I was thinking of doing something like
> 
> _GLOBAL_TOC(copy_page)
> BEGIN_FTR_SECTION_NESTED(50)
> #include "copypage_power9.S"
> FTR_SECTION_ELSE_NESTED(50)
> #include "copypage_power7.S"
> ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)

Good idea, I hadn't thought of embedding it all in a feature section.

> I guess POWER asm doesn't need this but it's good practice to prevent
> copy paste errors? It would be nice to have some macros to hide all
> these constants, but that's for another patch. The commenting is good.

The .machine X macros? Unfortunately the format of dcbt is different
for recent server chips. This wasn't a great idea in retrospect because
if you do get the instruction layout wrong, you won't get a fault to warn
you.

> I don't suppose the stream setup is costly enough to consider
> touching a cacheline or two ahead before starting it?

Starting up software streams is a bit of an art - if the demand loads
get ahead then a hardware stream gets started before the software one.
Note all the eieios to try and avoid this happening.

I've struggled with software prefetch on previous chips and sometimes I
wonder if it is worth the pain.

> (Also for another day) We might be able to avoid the stack and call
> for some common cases. Pretty small overcall cost I guess, but it
> could be beneficial for memcpy if not copy_page.

Definitely. Also the breakpoint for using vector should be much
lower if we have already saved the user state in a previous call.

Anton


[PATCH] powerpc: Add POWER9 copy_page() loop

2017-03-20 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Add a POWER9 optimised copy_page() loop. This loop uses the new D form
vector loads and stores, and uses dcbz to pre zero the destination.

A few questions:

- I'm using a nested feature section, but that is going to get unwieldy
  at some stage. It would be nice to update the call site for copy_page
  directly.

- I'm using CPU_FTR_ARCH_300, but as our functions grow perhaps we want
  the cputable entry to contain a pointer to optimised functions.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/lib/Makefile  |   2 +-
 arch/powerpc/lib/copypage_64.S |   4 +
 arch/powerpc/lib/copypage_power9.S | 224 +
 3 files changed, 229 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/copypage_power9.S

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 2b5e090..d3667b5 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_PPC32)   += div64.o copy_32.o
 
 obj64-y	+= copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \
	   copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
-	   memcpy_64.o memcmp_64.o
+	   memcpy_64.o memcmp_64.o copypage_power9.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 4bcc9e7..051423e 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -21,7 +21,11 @@ _GLOBAL_TOC(copy_page)
 BEGIN_FTR_SECTION
lis r5,PAGE_SIZE@h
 FTR_SECTION_ELSE
+  BEGIN_FTR_SECTION_NESTED(50)
+   b   copypage_power9
+  FTR_SECTION_ELSE_NESTED(50)
b   copypage_power7
+  ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50)
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
ori r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/lib/copypage_power9.S 
b/arch/powerpc/lib/copypage_power9.S
new file mode 100644
index 000..2493f94
--- /dev/null
+++ b/arch/powerpc/lib/copypage_power9.S
@@ -0,0 +1,224 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2017
+ *
+ * Author: Anton Blanchard <an...@au.ibm.com>
+ */
+#include 
+#include 
+
+_GLOBAL(copypage_power9)
+   /*
+* We prefetch the source using enhanced touch instructions. We use
+* a stream ID of 0 for this. Since the source is page aligned we
+* don't need to clear the bottom 7 bits of the address.
+*/
+#ifdef CONFIG_PPC_64K_PAGES
+   lis r7,0x0E01   /* depth=7
+* units/cachelines=512 */
+#else
+   lis r7,0x0E00   /* depth=7 */
+   ori r7,r7,0x1000/* units/cachelines=32 */
+#endif
+
+   lis r8,0x8000   /* GO=1 */
+   clrldi  r8,r8,32
+
+.machine push
+.machine "power4"
+   /* setup read stream 0 */
+   dcbtr0,r4,0b01000   /* addr from */
+   dcbtr0,r7,0b01010   /* length and depth from */
+   eieio
+   dcbtr0,r8,0b01010   /* all streams GO */
+   eieio
+.machine pop
+
+   /*
+* To reduce memory bandwidth on the store side we send dcbzs ahead.
+* Experimental testing shows 2 cachelines as good enough.
+*/
+   li  r6,128
+   dcbz0,r3
+   dcbzr6,r3
+
+#ifdef CONFIG_ALTIVEC
+   mflrr0
+   std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+   std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+   std r0,16(r1)
+   stdur1,-STACKFRAMESIZE(r1)
+   bl  enter_vmx_copy
+   cmpwi   r3,0
+   ld  r0,STACKFRAMESIZE+16(r1)
+   ld  r3,STK_REG(R31)(r1)
+   ld  r4,STK_REG(R30)(r1)
+   addir1,r1,STACKFRAMESIZE
+   mtlrr0
+
+   li  r0,((PAGE_SIZE/128)-2)
+   mtctr   r0
+
+   li  r8,256
+
+   beq .Lnonvmx_copy
+
+   .balign 16
+1: dcbzr8,r3
+   lxv vs32,0(r4)
+   lxv vs33,16(r4)
+   stxvvs32,0(r3)
+   stxvvs33,16(r3)
+
+   lxv vs34,32(r4)
+   lxv vs35,48(r4)
+   stxvvs34,32(r3)
+   stxvvs35,48(r3)
+
+   lxv vs36,64(r4)
+   lxv 

Re: [PATCH 1/4] crypto: powerpc - Factor out the core CRC vpmsum algorithm

2017-03-16 Thread Anton Blanchard
Hi David,

> While not part of this change, the unrolled loops look as though
> they just destroy the cpu cache.
> I'd like be convinced that anything does CRC over long enough buffers
> to make it a gain at all.

btrfs data checksumming is one area.

> With modern (not that modern now) superscalar cpus you can often
> get the loop instructions 'for free'.

A branch on POWER8 is a three cycle redirect. The vpmsum instructions
are 6 cycles.

> Sometimes pipelining the loop is needed to get full throughput.
> Unlike the IP checksum, you don't even have to 'loop carry' the
> cpu carry flag.

It went through quite a lot of simulation to reach peak performance.
The loop is quite delicate, we have to pace it just right to avoid
some pipeline reject conditions.

Note also that we already modulo schedule the loop across three
iterations, required to hide the latency of the vpmsum instructions.

Anton


Re: 5-level pagetable patches break ppc64le

2017-03-13 Thread Anton Blanchard
Hi Kirill,

> > My ppc64le boot tests stopped working as of commit c2febafc6773
> > ("mm: convert generic code to 5-level paging")
> > 
> > We hang part way during boot, just before bringing up the network. I
> > haven't had a chance to narrow it down yet.  
> 
> Please check if patch by this link helps:
> 
> http://lkml.kernel.org/r/20170313052213.11411-1-kirill.shute...@linux.intel.com

It does fix the ppc64le boot hangs, thanks.

Tested-by: Anton Blanchard <an...@samba.org>

Anton


5-level pagetable patches break ppc64le

2017-03-13 Thread Anton Blanchard
Hi,

My ppc64le boot tests stopped working as of commit c2febafc6773 ("mm:
convert generic code to 5-level paging")

We hang part way during boot, just before bringing up the network. I
haven't had a chance to narrow it down yet.

Anton


Re: [PATCH] crypto: powerpc - Fix initialisation of crc32c context

2017-03-05 Thread Anton Blanchard
Hi Daniel,

> Turning on crypto self-tests on a POWER8 shows:
> 
> alg: hash: Test 1 failed for crc32c-vpmsum
> : ff ff ff ff
> 
> Comparing the code with the Intel CRC32c implementation on which
> ours is based shows that we are doing an init with 0, not ~0
> as CRC32c requires.
> 
> This probably wasn't caught because btrfs does its own weird
> open-coded initialisation.
> 
> Initialise our internal context to ~0 on init.
> 
> This makes the self-tests pass, and btrfs continues to work.

Thanks! Not sure how I screwed that up.
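
For background, CRC32c follows the usual reflected-CRC convention: seed
the running value with all ones and invert the final result. A minimal
sketch, where crc32c_update is a hypothetical stand-in for whichever
update routine (table-driven or vpmsum) does the work:

u32 crc32c(const u8 *buf, size_t len)
{
	u32 crc = ~0u;				/* seed with all ones */

	crc = crc32c_update(crc, buf, len);	/* hypothetical helper */
	return ~crc;				/* final inversion */
}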

Acked-by: Anton Blanchard <an...@samba.org>

> Fixes: 6dd7a82cc54e ("crypto: powerpc - Add POWER8 optimised crc32c")
> Cc: Anton Blanchard <an...@samba.org>
> Cc: sta...@vger.kernel.org
> Signed-off-by: Daniel Axtens <d...@axtens.net>
> ---
>  arch/powerpc/crypto/crc32c-vpmsum_glue.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
> b/arch/powerpc/crypto/crc32c-vpmsum_glue.c index
> 9fa046d56eba..411994551afc 100644 ---
> a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++
> b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -52,7 +52,7 @@ static
> int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm) {
>   u32 *key = crypto_tfm_ctx(tfm);
>  
> - *key = 0;
> + *key = ~0;
>  
>   return 0;
>  }



[PATCH] powerpc: Avoid panic during boot due to divide by zero in init_cache_info()

2017-03-04 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

I see a panic in early boot when building with a recent gcc toolchain.
The issue is a divide by zero, which is undefined. Older toolchains
let us get away with it:

int foo(int a) { return a / 0; }

foo:
li 9,0
divw 3,3,9
extsw 3,3
blr

But newer ones catch it:

foo:
trap

Add a check to avoid the divide by zero.

Fixes: bd067f83b084 ("powerpc/64: Fix naming of cache block vs. cache line")
Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/kernel/setup_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index adf2084..afd1c26 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -408,7 +408,8 @@ static void init_cache_info(struct ppc_cache_info *info, 
u32 size, u32 lsize,
info->line_size = lsize;
info->block_size = bsize;
info->log_block_size = __ilog2(bsize);
-   info->blocks_per_page = PAGE_SIZE / bsize;
+   if (bsize)
+   info->blocks_per_page = PAGE_SIZE / bsize;
 
if (sets == 0)
info->assoc = 0x;
-- 
2.7.4



Re: [PATCH] powernv:idle: Fix bug due to labeling ambiguity in power_enter_stop

2017-02-26 Thread Anton Blanchard
Hi Gautham,

> +handle_esl_ec_set:

Unless we want to expose this to things like perf, we might want to
make it a local label (eg .Lxx)
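
That is, something like this (hedged sketch); the .L prefix keeps the
label out of the symbol table, so tools like perf can't mis-attribute
samples to it:

-handle_esl_ec_set:
+.Lhandle_esl_ec_set: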

Anton


[PATCH] powerpc: Fix confusing help text for DISABLE_MPROFILE_KERNEL

2017-02-09 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

The final paragraph of the help text is reversed - we want to
enable this option by default, and disable it if the toolchain
has a working -mprofile-kernel.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a46d1c0..d2916ff 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -390,8 +390,8 @@ config DISABLE_MPROFILE_KERNEL
  be disabled also.
 
  If you have a toolchain which supports mprofile-kernel, then you can
- enable this. Otherwise leave it disabled. If you're not sure, say
- "N".
+ disable this. Otherwise leave it enabled. If you're not sure, say
+ "Y".
 
 config MPROFILE_KERNEL
depends on PPC64 && CPU_LITTLE_ENDIAN
-- 
2.9.3



Re: [PATCH 2/5] powerpc/perf: Add PM_INST_DISP event to Power9 event list

2017-02-01 Thread Anton Blanchard
Hi Maddy,

> +EVENT(PM_INST_DISP,  0x200f0)
> +EVENT(PM_INST_DISP_ALT,  0x300f0)

Are you sure these are the right events? 0x200f2, 0x300f2 should be
instruction dispatch I think.

Anton


Re: [PATCH] powerpc/mm: Fix RECLAIM_DISTANCE

2017-01-30 Thread Anton Blanchard
Hi,


> Anton, I think the behaviour looks good. Actually, it's not very
> relevant to the issue addressed by the patch. I will reply to
> Michael's reply about the reason. There are two nodes in your system
> and the memory is expected to be allocated from node-0. If node-0
> doesn't have enough free memory, the allocator switches to node-1. It
> means we need more stress.

Did you try setting zone_reclaim_mode? Surely we should reclaim local
clean pagecache if enabled?

Anton
--

zone_reclaim_mode:

Zone_reclaim_mode allows someone to set more or less aggressive approaches to
reclaim memory when a zone runs out of memory. If it is set to zero then no
zone reclaim occurs. Allocations will be satisfied from other zones / nodes
in the system.

This is a bitmask of the following values, ORed together:

1   = Zone reclaim on
2   = Zone reclaim writes dirty pages out
4   = Zone reclaim swaps pages

zone_reclaim_mode is disabled by default.  For file servers or workloads
that benefit from having their data cached, zone_reclaim_mode should be
left disabled as the caching effect is likely to be more important than
data locality.

zone_reclaim may be enabled if it's known that the workload is partitioned
such that each partition fits within a NUMA node and that accessing remote
memory would cause a measurable performance reduction.  The page allocator
will then reclaim easily reusable pages (those page cache pages that are
currently not used) before allocating off node pages.

Allowing zone reclaim to write out pages stops processes that are
writing large amounts of data from dirtying pages on other nodes. Zone
reclaim will write out dirty pages if a zone fills up and so effectively
throttle the process. This may decrease the performance of a single process
since it cannot use all of system memory to buffer the outgoing writes
anymore, but it preserves the memory on other nodes so that the performance
of other processes running on other nodes will not be affected.

Allowing regular swap effectively restricts allocations to the local
node unless explicitly overridden by memory policies or cpuset
configurations.


> 
> In the experiment, 38GB is allocated: 16GB for pagecache and 24GB for
> heap. It's not exceeding the memory capacity (64GB). So page reclaim
> in the fast and slow path weren't triggered. It's why the pagecache
> wasn't dropped. I think __GFP_THISNODE isn't specified when
> page-fault handler tries to allocate page to accomodate the VMA for
> the heap.
> 
> *Without* the patch applied, I got something as below in the system
> where two NUMA nodes and each of them has 64GB memory. Also, I don't
> think the patch is going to change the behaviour:
> 
> # cat /proc/sys/vm/zone_reclaim_mode 
> 0
> 
> Drop pagecache
> Read 8GB file, for pagecache to consume 8GB memory.
> Node 0 FilePages:   8496960 kB
> taskset -c 0 ./alloc 137438953472   <- 128GB sized heap
> Node 0 FilePages:503424 kB
> 
> Eventually, some of swap clusters have been used as well:
> 
> # free -m
>                total        used        free      shared  buff/cache   available
> Mem:          130583      129203         861          10         518         297
> Swap:          10987        3145        7842
> 
> Thanks,
> Gavin
> 



Re: [PATCH] powerpc/mm: Fix RECLAIM_DISTANCE

2017-01-29 Thread Anton Blanchard
Hi,

> Anton suggested that NUMA distances in powerpc mattered and hurt
> performance without this setting. We need to validate to see if this
> is still true. A simple way to start would be benchmarking

The original issue was that we never reclaimed local clean pagecache.

I just tried all settings for /proc/sys/vm/zone_reclaim_mode and none
of them caused me to reclaim local clean pagecache! We are very broken.

I would think we have test cases for this, but here is a dumb one.
First something to consume memory:

# cat alloc.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
void *p;

unsigned long size;

size = strtoul(argv[1], NULL, 0);

p = malloc(size);
assert(p);
memset(p, 0, size);
printf("%p\n", p);

sleep(3600);

return 0;
}

Now create a file to consume pagecache. My nodes have 32GB each, so
I create 16GB, enough to consume half of the node:

dd if=/dev/zero of=/tmp/file bs=1G count=16

Clear out our pagecache:

sync
echo 3 > /proc/sys/vm/drop_caches

Bring it in on node 0:

taskset -c 0 cat /tmp/file > /dev/null

Consume 24GB of memory on node 0:

taskset -c 0 ./alloc 25769803776

In all zone reclaim modes, the pagecache never gets reclaimed:

# grep FilePages /sys/devices/system/node/node0/meminfo

Node 0 FilePages:  16757376 kB

And our alloc process shows lots of off node memory used:

3ff9a463 default anon=393217 dirty=393217 N0=112474 N1=220490 N16=60253 kernelpagesize_kB=64

Clearly nothing is working. Gavin, if your patch fixes this we should
get it into stable too.

Anton


gcc trunk fails to build kernel on PowerPC64 due to oprofile warnings

2017-01-25 Thread Anton Blanchard
Hi,

gcc trunk has failed to build PowerPC64 kernels for a month or so. The issue
is in oprofile, which is common code but ends up being sucked into
arch/powerpc and therefore subject to the -Werror applied to arch/powerpc:
 
linux/arch/powerpc/oprofile/../../../drivers/oprofile/oprofile_stats.c: In 
function ‘oprofile_create_stats_files’:
linux/arch/powerpc/oprofile/../../../drivers/oprofile/oprofile_stats.c:55:25: 
error: ‘%d’ directive output may be truncated writing between 1 and 11 bytes 
into a region of size 7 [-Werror=format-truncation=]
   snprintf(buf, 10, "cpu%d", i);
 ^~
linux/arch/powerpc/oprofile/../../../drivers/oprofile/oprofile_stats.c:55:21: 
note: using the range [1, -2147483648] for directive argument
   snprintf(buf, 10, "cpu%d", i);
 ^~~
linux/arch/powerpc/oprofile/../../../drivers/oprofile/oprofile_stats.c:55:3: 
note: format output between 5 and 15 bytes into a destination of size 10
   snprintf(buf, 10, "cpu%d", i);
   ^
  LD  crypto/async_tx/built-in.o
  CC  lib/random32.o
cc1: all warnings being treated as errors
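
The warning looks legitimate: "cpu" plus an int, which can print as up to
11 characters, needs more than 10 bytes. A hedged sketch of the obvious
fix:

	char buf[16];	/* "cpu" + up to 11 chars of %d + NUL */
	snprintf(buf, sizeof(buf), "cpu%d", i);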

Anton


Re: BUILD_BUG_ON(!__builtin_constant_p(feature)) breaks bcc trace tool

2017-01-21 Thread Anton Blanchard
Hi,

> We added:
> 
> BUILD_BUG_ON(!__builtin_constant_p(feature)) 
> 
> to cpu_has_feature() and mmu_has_feature() in order to catch usage
> issues (such as cpu_has_feature(cpu_has_feature(X))). Unfortunately
> LLVM isn't smart enough to resolve this, and it errors out.
> 
> I work around it in my clang/LLVM builds of the kernel, but I have
> just discovered that it causes a lot of issues for the bcc (eBPF)
> trace tool (which uses LLVM).
> 
> How should we work around this? Wrap the checks in !clang perhaps?

Looks like it's a weakness in LLVM with inlining:

#include <assert.h>

#if 1
static inline void foo(unsigned long x)
{
assert(__builtin_constant_p(x));
}
#else
#define foo(X) assert(__builtin_constant_p(X))
#endif

int main(void)
{
foo(1);

return 0;
}

And there is an old bug on it:

https://llvm.org/bugs/show_bug.cgi?id=4898
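
To see the divergence, save the test case above as constp.c (a hedged
example; the failure is with the clang versions of that era):

 $ gcc -O2 constp.c && ./a.out		# assert holds: gcc folds it after inlining
 $ clang -O2 constp.c && ./a.out	# assert fires: clang evaluates it too early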

Anton


BUILD_BUG_ON(!__builtin_constant_p(feature)) breaks bcc trace tool

2017-01-20 Thread Anton Blanchard
Hi,

We added:

BUILD_BUG_ON(!__builtin_constant_p(feature)) 

to cpu_has_feature() and mmu_has_feature() in order to catch usage
issues (such as cpu_has_feature(cpu_has_feature(X))). Unfortunately LLVM
isn't smart enough to resolve this, and it errors out.

I work around it in my clang/LLVM builds of the kernel, but I have just
discovered that it causes a lot of issues for the bcc (eBPF) trace tool
(which uses LLVM).

How should we work around this? Wrap the checks in !clang perhaps?

Anton


[PATCH] powerpc: Ignore reserved field in DCSR and PVR reads and writes

2017-01-18 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

IBM bit 31 (for the rest of us - bit 0) is a reserved field in the
instruction definition of mtspr and mfspr. Hardware is encouraged to
(and does) ignore it.

As a result, if userspace executes an mtspr DSCR with the reserved bit
set, we get a DSCR facility unavailable exception. The kernel fails to
match against the expected value/mask, and we silently return to
userspace to try and re-execute the same mtspr DSCR instruction. We
loop forever until the process is killed.

We should do something here, and it seems mirroring what hardware does
is the better option vs killing the process. While here, relax the
matching of mfspr PVR too.
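
For context, the facility unavailable handler matches the faulting
instruction against these value/mask pairs, roughly like this (a hedged
sketch; emulate_mtspr_dscr is a hypothetical helper):

	if ((instword & PPC_INST_MTSPR_DSCR_USER_MASK) ==
			PPC_INST_MTSPR_DSCR_USER)
		emulate_mtspr_dscr(regs, instword);

With the old masks, an mtspr with the reserved bit set never matched, so
we returned to userspace without emulating and took the same exception
again, forever.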

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/include/asm/ppc-opcode.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index c56ea8c..c4ced1d 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -157,7 +157,7 @@
 #define PPC_INST_MCRXR			0x7c000400
 #define PPC_INST_MCRXR_MASK		0xfc0007fe
 #define PPC_INST_MFSPR_PVR		0x7c1f42a6
-#define PPC_INST_MFSPR_PVR_MASK		0xfc1f
+#define PPC_INST_MFSPR_PVR_MASK		0xfc1e
 #define PPC_INST_MFTMR			0x7c0002dc
 #define PPC_INST_MSGSND			0x7c00019c
 #define PPC_INST_MSGCLR			0x7c0001dc
@@ -174,13 +174,13 @@
 #define PPC_INST_RFDI			0x4c4e
 #define PPC_INST_RFMCI			0x4c4c
 #define PPC_INST_MFSPR_DSCR		0x7c1102a6
-#define PPC_INST_MFSPR_DSCR_MASK	0xfc1f
+#define PPC_INST_MFSPR_DSCR_MASK	0xfc1e
 #define PPC_INST_MTSPR_DSCR		0x7c1103a6
-#define PPC_INST_MTSPR_DSCR_MASK	0xfc1f
+#define PPC_INST_MTSPR_DSCR_MASK	0xfc1e
 #define PPC_INST_MFSPR_DSCR_USER	0x7c0302a6
-#define PPC_INST_MFSPR_DSCR_USER_MASK	0xfc1f
+#define PPC_INST_MFSPR_DSCR_USER_MASK	0xfc1e
 #define PPC_INST_MTSPR_DSCR_USER	0x7c0303a6
-#define PPC_INST_MTSPR_DSCR_USER_MASK	0xfc1f
+#define PPC_INST_MTSPR_DSCR_USER_MASK	0xfc1e
 #define PPC_INST_MFVSRD			0x7c66
 #define PPC_INST_MTVSRD			0x7c000166
 #define PPC_INST_SLBFEE			0x7c0007a7
-- 
2.9.3



Re: llist code relies on undefined behaviour, upsets llvm/clang

2017-01-16 Thread Anton Blanchard
Hi Peter,

> Last I checked I couldn't build a x86_64 kernel with llvm. So no, not
> something I've ever ran into.
> 
> Also, I would argue that this is broken in llvm, the kernel very much
> relies on things like this all over the place. Sure, we're way outside
> of what the C language spec says, but who bloody cares ;-)

True, but is there anything preventing gcc from implementing this
optimisation in the future? If we are relying on undefined behaviour we
should have a -fno-strict-* option to cover it.

> If llvm wants to compile the kernel, it needs to learn the C dialect
> the kernel uses.

LLVM has done that before (eg adding -fno-strict-overflow). I don't
think that option covers this case however.

Anton


llist code relies on undefined behaviour, upsets llvm/clang

2017-01-15 Thread Anton Blanchard
Hi,

I was debugging a hang on a ppc64le kernel built with clang, and it
looks to be undefined behaviour with pointer wrapping in the llist code.

A test case is below. llist_for_each_entry() does container_of() on a
NULL pointer, which wraps our pointer negative, then adds the same
offset back in and expects to get back to NULL. Unfortunately clang
decides that this can never be NULL and optimises it into an infinite
loop.

Build with -DFIX, such that the llist_node has a zero offset from the
start of the struct, and things work.

Is anyone other than ppc64le building kernels with llvm/clang these
days? This should reproduce on ARM64 and x86-64.

Anton
--

#include <stddef.h>	/* NULL, size_t */

#define __compiler_offsetof(a, b)   \
__builtin_offsetof(a, b)

#undef offsetof
#ifdef __compiler_offsetof
#define offsetof(TYPE, MEMBER)  __compiler_offsetof(TYPE, MEMBER)
#else
#define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
#endif

struct llist_node {
struct llist_node *next;
};

#define container_of(ptr, type, member) ({  \
const typeof( ((type *)0)->member ) *__mptr = (ptr);\
(type *)( (char *)__mptr - offsetof(type,member) );})

#define llist_entry(ptr, type, member)  \
container_of(ptr, type, member)

#define llist_for_each_entry(pos, node, member) \
for ((pos) = llist_entry((node), typeof(*(pos)), member);   \
 &(pos)->member != NULL;\
 (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member))

struct foo {
#ifndef FIX
unsigned long a;
#endif
struct llist_node ll;
};

void working(void);

struct llist_node *ptr;

void bar(void)
{
struct foo *f;

llist_for_each_entry(f, ptr, ll) {
}

working();
}
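
To reproduce (a hedged example; working() just needs any out-of-line
definition so the loop isn't removed for unrelated reasons):

 $ clang -O2 -S llist.c -o broken.s	# NULL exit test optimised away
 $ clang -O2 -DFIX -S llist.c -o ok.s	# ll at offset 0, the test survives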


[PATCH 2/2] powerpc/64: Add BPF_JIT to powernv and pseries defconfigs

2017-01-12 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

Commit db9112173b18 ("powerpc: Turn on BPF_JIT in ppc64_defconfig")
only added BPF_JIT to the ppc64 defconfig. Add it to our powernv
and pseries defconfigs too.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/configs/powernv_defconfig | 1 +
 arch/powerpc/configs/pseries_defconfig | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/powerpc/configs/powernv_defconfig 
b/arch/powerpc/configs/powernv_defconfig
index e4d53fe..b793550 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -79,6 +79,7 @@ CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
 CONFIG_BRIDGE=m
 CONFIG_VLAN_8021Q=m
+CONFIG_BPF_JIT=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
diff --git a/arch/powerpc/configs/pseries_defconfig 
b/arch/powerpc/configs/pseries_defconfig
index 5a06bdd..d99734f 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -82,6 +82,7 @@ CONFIG_NETFILTER=y
 # CONFIG_NETFILTER_ADVANCED is not set
 CONFIG_BRIDGE=m
 CONFIG_VLAN_8021Q=m
+CONFIG_BPF_JIT=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
-- 
2.9.3



[PATCH 1/2] powerpc/64: Move HAVE_CONTEXT_TRACKING from pseries to common Kconfig

2017-01-12 Thread Anton Blanchard
From: Anton Blanchard <an...@samba.org>

We added support for HAVE_CONTEXT_TRACKING, but placed the option inside
PPC_PSERIES.

This has the undesirable effect that NO_HZ_FULL can be enabled on a
kernel with both powernv and pseries support, but cannot on a kernel
with powernv only support.

Signed-off-by: Anton Blanchard <an...@samba.org>
---
 arch/powerpc/Kconfig   | 1 +
 arch/powerpc/platforms/pseries/Kconfig | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 48001e7..f072f82 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -164,6 +164,7 @@ config PPC
select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_KERNEL_GZIP
select HAVE_CC_STACKPROTECTOR
+   select HAVE_CONTEXT_TRACKING if PPC64
 
 config GENERIC_CSUM
def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index e1c280a..30ec04f 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -17,7 +17,6 @@ config PPC_PSERIES
select PPC_UDBG_16550
select PPC_NATIVE
select PPC_DOORBELL
-   select HAVE_CONTEXT_TRACKING
select HOTPLUG_CPU if SMP
select ARCH_RANDOM
select PPC_DOORBELL
-- 
2.9.3



Re: ext4 filesystem corruption with 4.10-rc2 on ppc64le

2017-01-05 Thread Anton Blanchard
Hi Ted,

> Anton or Chandan, could you do me a favor and verify whether or not
> 64k block sizes are working for you on ppcle on ext4 by running
> xfstests?  Light duty testing works for me but when I stress ext4 with
> pagesize==blocksize on ppcle64 via xfstests, it blows up.  I suspect
> (but am not sure) it's due to (non-upstream) device driver issues, and
> a verification that you can run xfstests on your ppcle64 systems using
> standard upstream device drivers would be very helpful, since I don't
> have easy console access on the machines I have access to at
> $WORK.  :-(

I fired off an xfstests run, and it looks good. There are 3 failures,
but they seem to be setup issues on my part. I also double checked
those same three failed on 4.8.

Chandan has been running the test suite regularly, and plans to do a
run against mainline too.

Anton


ext4 filesystem corruption with 4.10-rc2 on ppc64le

2017-01-03 Thread Anton Blanchard
Hi,

I'm consistently seeing ext4 filesystem corruption using a mainline
kernel. It doesn't take much to trigger it - download a ppc64le Ubuntu
cloud image, boot it in KVM and run:

sudo apt-get update
sudo apt-get dist-upgrade
sudo reboot

And it never makes it back up, dying with rather severe filesystem
corruption.

I've narrowed it down to:

64e1c57fa474 ("ext4: Use clean_bdev_aliases() instead of iteration")
e64855c6cfaa ("fs: Add helper to clean bdev aliases under a bh and use it")
ce98321bf7d2 ("fs: Remove unmap_underlying_metadata")

Backing these patches out fixes the issue.

Anton


Re: [PATCH] perf TUI: Don't throw error for zero length symbols

2016-12-16 Thread Anton Blanchard
Hi Ravi,

> > perf report (with TUI) exits with error when it finds a sample of
> > zero length symbol(i.e. addr == sym->start == sym->end). Actually
> > these are valid samples. Don't exit TUI and show report with such
> > symbols.
> >
> > Link: https://lkml.org/lkml/2016/10/8/189

You can add:

Tested-by: Anton Blanchard <an...@samba.org>

Also, since this issue makes perf report pretty much useless on
ppc64, can we mark it for stable@, at least to get it into 4.9 where
the ppc64 kernel changes that triggered this appeared?

Anton

> > Reported-by: Anton Blanchard <an...@samba.org>
> > Signed-off-by: Ravi Bangoria <ravi.bango...@linux.vnet.ibm.com>
> > ---
> >  tools/perf/util/annotate.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> > index aeb5a44..430d039 100644
> > --- a/tools/perf/util/annotate.c
> > +++ b/tools/perf/util/annotate.c
> > @@ -593,7 +593,8 @@ static int __symbol__inc_addr_samples(struct
> > symbol *sym, struct map *map,
> >
> > pr_debug3("%s: addr=%#" PRIx64 "\n", __func__,
> > map->unmap_ip(map, addr));
> >
> > -   if (addr < sym->start || addr >= sym->end) {
> > +   if ((addr < sym->start || addr >= sym->end) &&
> > +   (addr != sym->end || sym->start != sym->end)) {
> > pr_debug("%s(%d): ERANGE! sym->name=%s, start=%#"
> > PRIx64 ", addr=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__,
> > __LINE__, sym->name, sym->start, addr, sym->end); return -ERANGE;  
> 



Kernel build issues with upstream binutils

2016-11-26 Thread Anton Blanchard
Hi,

A recent binutils commit:

https://sourceware.org/git/?p=binutils-gdb.git;a=commitdiff;h=1a9ccd70f9a75dc6b48d340059f28ef3550c107b

has broken kernel builds:

/home/anton/gcc.install/bin/ld: arch/powerpc/boot/zImage.pseries: Not enough 
room for program headers, try linking with -N
/home/anton/gcc.install/bin/ld: final link failed: Bad value

I guess we have an issue with our linker script.

Anton


Re: [PATCH 13/38] powerpc: Put exception configuration in a common place

2016-11-11 Thread Anton Blanchard
Hi Ben,

> The various calls to establish exception endianness and AIL are
> now done from a single point using already established CPU and FW
> feature bits to decide what to do.
> 
> Signed-off-by: Benjamin Herrenschmidt 

...

+static void configure_exceptions(void)
+{
+   /* Setup the trampolines from the lowmem exception vectors
+* to the kdump kernel when not using a relocatable kernel.
+*/
+   setup_kdump_trampoline();
+
+   /* Under a PAPR hypervisor, we need hypercalls */
+   if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+   long rc;
+
+   /* Enable AIL */
+   rc = pSeries_enable_reloc_on_exc();
+   if (rc == H_P2) {
+   pr_info("Relocation on exceptions not supported\n");
+   } else if (rc != H_SUCCESS) {
+   pr_warn("Unable to enable relocation on exceptions: "
+   "%ld\n", rc);
+   }
+
+   /*
+* Tell the hypervisor that we want our exceptions to
+* be taken in little endian mode. If this fails we don't
+* want to use BUG() because it will trigger an exception.
+*
+* We don't call this for big endian as our calling convention
+* makes us always enter in BE, and the call may fail under
+* some circumstances with kdump.
+*/
+#ifdef __LITTLE_ENDIAN__
+   rc = pseries_little_endian_exceptions();
+   if (rc) {
+   ppc_md.progress("H_SET_MODE LE exception fail", 0);
+   panic("Could not enable little endian exceptions");
+   }
+#endif
+   } else {
+   /* Set endian mode using OPAL */
+   if (firmware_has_feature(FW_FEATURE_OPAL))
+   opal_configure_cores();
+
+   /* Enable AIL if supported, and we are in hypervisor mode */
+   if (cpu_has_feature(CPU_FTR_HVMODE) &&
+   cpu_has_feature(CPU_FTR_ARCH_207S)) {
+   unsigned long lpcr = mfspr(SPRN_LPCR);
+   mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
+   }
+   }
+}

It looks like we only set LPCR_AIL_3 on the boot CPU after this change,
is that expected? Before that we did it in cpu_ready_for_interrupts()
which is called for the primary and all secondary CPUs.

Anton


Re: [PATCH 1/3] powerpc: Emulation support for load/store instructions on LE

2016-11-05 Thread Anton Blanchard
Hi,

> kprobe, uprobe, hw-breakpoint and xmon are the only user of
> emulate_step.
> 
> Kprobe / uprobe single-steps instruction if they can't emulate it, so
> there is no problem with them. As I mention, hw-breakpoint is broken.
> However I'm not sure about xmon, I need to check that.

I was mostly concerned that it would impact kprobes. Sounds like we are
ok there.

> So yes, there is no user-visible feature that depends on this.

Aren't hardware breakpoints exposed via perf? I'd call perf
user-visible.

Anton


Re: [PATCH 1/3] powerpc: Emulation support for load/store instructions on LE

2016-11-02 Thread Anton Blanchard
Hi Ravi,

> emulate_step() uses a number of underlying kernel functions that were
> initially not enabled for LE. This has been rectified since. So, fix
> emulate_step() for LE for the corresponding instructions.

Thanks. Should this be queued up for stable?

Anton

> Reported-by: Anton Blanchard <an...@samba.org>
> Signed-off-by: Ravi Bangoria <ravi.bango...@linux.vnet.ibm.com>
> ---
>  arch/powerpc/lib/sstep.c | 20 
>  1 file changed, 20 deletions(-)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 3362299..6ca3b90 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -1807,8 +1807,6 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) goto instr_done;
>  
>   case LARX:
> - if (regs->msr & MSR_LE)
> - return 0;
>   if (op.ea & (size - 1))
>   break;  /* can't handle
> misaligned */ err = -EFAULT;
> @@ -1832,8 +1830,6 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) goto ldst_done;
>  
>   case STCX:
> - if (regs->msr & MSR_LE)
> - return 0;
>   if (op.ea & (size - 1))
>   break;  /* can't handle
> misaligned */ err = -EFAULT;
> @@ -1859,8 +1855,6 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) goto ldst_done;
>  
>   case LOAD:
> - if (regs->msr & MSR_LE)
> - return 0;
>   err = read_mem(>gpr[op.reg], op.ea, size,
> regs); if (!err) {
>   if (op.type & SIGNEXT)
> @@ -1872,8 +1866,6 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) 
>  #ifdef CONFIG_PPC_FPU
>   case LOAD_FP:
> - if (regs->msr & MSR_LE)
> - return 0;
>   if (size == 4)
>   err = do_fp_load(op.reg, do_lfs, op.ea,
> size, regs); else
> @@ -1882,15 +1874,11 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) #endif
>  #ifdef CONFIG_ALTIVEC
>   case LOAD_VMX:
> - if (regs->msr & MSR_LE)
> - return 0;
>   err = do_vec_load(op.reg, do_lvx, op.ea & ~0xfUL,
> regs); goto ldst_done;
>  #endif
>  #ifdef CONFIG_VSX
>   case LOAD_VSX:
> - if (regs->msr & MSR_LE)
> - return 0;
>   err = do_vsx_load(op.reg, do_lxvd2x, op.ea, regs);
>   goto ldst_done;
>  #endif
> @@ -1913,8 +1901,6 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) goto instr_done;
>  
>   case STORE:
> - if (regs->msr & MSR_LE)
> - return 0;
>   if ((op.type & UPDATE) && size == sizeof(long) &&
>   op.reg == 1 && op.update_reg == 1 &&
>   !(regs->msr & MSR_PR) &&
> @@ -1927,8 +1913,6 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) 
>  #ifdef CONFIG_PPC_FPU
>   case STORE_FP:
> - if (regs->msr & MSR_LE)
> - return 0;
>   if (size == 4)
>   err = do_fp_store(op.reg, do_stfs, op.ea,
> size, regs); else
> @@ -1937,15 +1921,11 @@ int __kprobes emulate_step(struct pt_regs
> *regs, unsigned int instr) #endif
>  #ifdef CONFIG_ALTIVEC
>   case STORE_VMX:
> - if (regs->msr & MSR_LE)
> - return 0;
>   err = do_vec_store(op.reg, do_stvx, op.ea & ~0xfUL,
> regs); goto ldst_done;
>  #endif
>  #ifdef CONFIG_VSX
>   case STORE_VSX:
> - if (regs->msr & MSR_LE)
> - return 0;
>   err = do_vsx_store(op.reg, do_stxvd2x, op.ea, regs);
>   goto ldst_done;
>  #endif


