[PATCH net-next 5/5] ibmvnic: merge ibmvnic_reset_init and ibmvnic_init

2020-08-18 Thread Lijun Pan
These two functions share the majority of the code, hence merge
them together. In the meantime, add a reset pass-in parameter
to differentiate them. Thus, the code is easier to read and to tell
the difference between reset_init and regular init.

Signed-off-by: Lijun Pan 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 65 ++
 1 file changed, 13 insertions(+), 52 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 280358dce8ba..c92615b74833 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -104,8 +104,7 @@ static int send_login(struct ibmvnic_adapter *adapter);
 static void send_cap_queries(struct ibmvnic_adapter *adapter);
 static int init_sub_crqs(struct ibmvnic_adapter *);
 static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter);
-static int ibmvnic_init(struct ibmvnic_adapter *);
-static int ibmvnic_reset_init(struct ibmvnic_adapter *);
+static int ibmvnic_reset_init(struct ibmvnic_adapter *, bool reset);
 static void release_crq_queue(struct ibmvnic_adapter *);
 static int __ibmvnic_set_mac(struct net_device *, u8 *);
 static int init_crq_queue(struct ibmvnic_adapter *adapter);
@@ -1868,7 +1867,7 @@ static int do_change_param_reset(struct ibmvnic_adapter 
*adapter,
return rc;
}
 
-   rc = ibmvnic_reset_init(adapter);
+   rc = ibmvnic_reset_init(adapter, true);
if (rc)
return IBMVNIC_INIT_FAILED;
 
@@ -1986,7 +1985,7 @@ static int do_reset(struct ibmvnic_adapter *adapter,
goto out;
}
 
-   rc = ibmvnic_reset_init(adapter);
+   rc = ibmvnic_reset_init(adapter, true);
if (rc) {
rc = IBMVNIC_INIT_FAILED;
goto out;
@@ -2093,7 +2092,7 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter,
return rc;
}
 
-   rc = ibmvnic_init(adapter);
+   rc = ibmvnic_reset_init(adapter, false);
if (rc)
return rc;
 
@@ -4970,7 +4969,7 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter)
return retrc;
 }
 
-static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter)
+static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset)
 {
	struct device *dev = &adapter->vdev->dev;
unsigned long timeout = msecs_to_jiffies(3);
@@ -4979,10 +4978,12 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter 
*adapter)
 
adapter->from_passive_init = false;
 
-   old_num_rx_queues = adapter->req_rx_queues;
-   old_num_tx_queues = adapter->req_tx_queues;
+   if (reset) {
+   old_num_rx_queues = adapter->req_rx_queues;
+   old_num_tx_queues = adapter->req_tx_queues;
+   reinit_completion(&adapter->init_done);
+   }
 
-   reinit_completion(&adapter->init_done);
adapter->init_done_rc = 0;
rc = ibmvnic_send_crq_init(adapter);
if (rc) {
@@ -5000,7 +5001,8 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter 
*adapter)
return adapter->init_done_rc;
}
 
-   if (test_bit(0, &adapter->resetting) && !adapter->wait_for_reset &&
+   if (reset &&
+   test_bit(0, &adapter->resetting) && !adapter->wait_for_reset &&
adapter->reset_reason != VNIC_RESET_MOBILITY) {
if (adapter->req_rx_queues != old_num_rx_queues ||
adapter->req_tx_queues != old_num_tx_queues) {
@@ -5028,47 +5030,6 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter 
*adapter)
return rc;
 }
 
-static int ibmvnic_init(struct ibmvnic_adapter *adapter)
-{
-   struct device *dev = &adapter->vdev->dev;
-   unsigned long timeout = msecs_to_jiffies(3);
-   int rc;
-
-   adapter->from_passive_init = false;
-
-   adapter->init_done_rc = 0;
-   rc = ibmvnic_send_crq_init(adapter);
-   if (rc) {
-   dev_err(dev, "%s: Send crq init failed with error %d\n", 
__func__, rc);
-   return rc;
-   }
-
-   if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
-   dev_err(dev, "Initialization sequence timed out\n");
-   return -1;
-   }
-
-   if (adapter->init_done_rc) {
-   release_crq_queue(adapter);
-   return adapter->init_done_rc;
-   }
-
-   rc = init_sub_crqs(adapter);
-   if (rc) {
-   dev_err(dev, "Initialization of sub crqs failed\n");
-   release_crq_queue(adapter);
-   return rc;
-   }
-
-   rc = init_sub_crq_irqs(adapter);
-   if (rc) {
-   dev_err(dev, "Failed to initialize sub crq irqs\n");
-   release_crq_queue(adapter);
-   }
-
-   return rc;
-}
-
 static struct device_attribute dev_attr_failover;
 
 static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
@@ -5131,7 +5092,7 @@ static int ibmvnic_probe(struct vio_dev 

[PATCH net-next 4/5] ibmvnic: remove never executed if statement

2020-08-18 Thread Lijun Pan
At the beginning of the function, from_passive_init is set false by
"adapter->from_passive_init = false;",
hence the if statement will never run.

Signed-off-by: Lijun Pan 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index e366fd42a8c4..280358dce8ba 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -5000,12 +5000,6 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter 
*adapter)
return adapter->init_done_rc;
}
 
-   if (adapter->from_passive_init) {
-   adapter->state = VNIC_OPEN;
-   adapter->from_passive_init = false;
-   return -1;
-   }
-
	if (test_bit(0, &adapter->resetting) && !adapter->wait_for_reset &&
adapter->reset_reason != VNIC_RESET_MOBILITY) {
if (adapter->req_rx_queues != old_num_rx_queues ||
@@ -5059,12 +5053,6 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter)
return adapter->init_done_rc;
}
 
-   if (adapter->from_passive_init) {
-   adapter->state = VNIC_OPEN;
-   adapter->from_passive_init = false;
-   return -1;
-   }
-
rc = init_sub_crqs(adapter);
if (rc) {
dev_err(dev, "Initialization of sub crqs failed\n");
-- 
2.23.0



[PATCH net-next 3/5] ibmvnic: improve ibmvnic_init and ibmvnic_reset_init

2020-08-18 Thread Lijun Pan
When H_SEND_CRQ command returns with H_CLOSED, it means the
server's CRQ is not ready yet. Instead of resetting immediately,
we wait for the server to launch passive init.
ibmvnic_init() and ibmvnic_reset_init() should also return the
error code from ibmvnic_send_crq_init() call.

Signed-off-by: Lijun Pan 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 50e86e65961e..e366fd42a8c4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3568,8 +3568,7 @@ static int ibmvnic_send_crq(struct ibmvnic_adapter 
*adapter,
if (rc) {
if (rc == H_CLOSED) {
dev_warn(dev, "CRQ Queue closed\n");
-   if (test_bit(0, &adapter->resetting))
-   ibmvnic_reset(adapter, VNIC_RESET_FATAL);
+   /* do not reset, report the fail, wait for passive init 
from server */
}
 
dev_warn(dev, "Send error (rc=%d)\n", rc);
@@ -4985,7 +4984,12 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter 
*adapter)
 
	reinit_completion(&adapter->init_done);
adapter->init_done_rc = 0;
-   ibmvnic_send_crq_init(adapter);
+   rc = ibmvnic_send_crq_init(adapter);
+   if (rc) {
+   dev_err(dev, "%s: Send crq init failed with error %d\n", 
__func__, rc);
+   return rc;
+   }
+
	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
dev_err(dev, "Initialization sequence timed out\n");
return -1;
@@ -5039,7 +5043,12 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter)
adapter->from_passive_init = false;
 
adapter->init_done_rc = 0;
-   ibmvnic_send_crq_init(adapter);
+   rc = ibmvnic_send_crq_init(adapter);
+   if (rc) {
+   dev_err(dev, "%s: Send crq init failed with error %d\n", 
__func__, rc);
+   return rc;
+   }
+
	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
dev_err(dev, "Initialization sequence timed out\n");
return -1;
-- 
2.23.0



[PATCH net-next 1/5] ibmvnic: print caller in several error messages

2020-08-18 Thread Lijun Pan
The error messages in the changed functions are exactly the same.
In order to differentiate them and make debugging easier,
we print the function names in the error messages.

Signed-off-by: Lijun Pan 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 5afb3c9c52d2..aba1cd9862ac 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1864,7 +1864,7 @@ static int do_change_param_reset(struct ibmvnic_adapter 
*adapter,
 
if (rc) {
netdev_err(adapter->netdev,
-  "Couldn't initialize crq. rc=%d\n", rc);
+  "%s: Couldn't initialize crq. rc=%d\n", __func__, 
rc);
return rc;
}
 
@@ -2089,7 +2089,7 @@ static int do_hard_reset(struct ibmvnic_adapter *adapter,
rc = init_crq_queue(adapter);
if (rc) {
netdev_err(adapter->netdev,
-  "Couldn't initialize crq. rc=%d\n", rc);
+  "%s: Couldn't initialize crq. rc=%d\n", __func__, 
rc);
return rc;
}
 
@@ -2912,7 +2912,7 @@ static struct ibmvnic_sub_crq_queue 
*init_sub_crq_queue(struct ibmvnic_adapter
rc = ibmvnic_reset_crq(adapter);
 
if (rc == H_CLOSED) {
-   dev_warn(dev, "Partner adapter not ready, waiting.\n");
+   dev_warn(dev, "%s: Partner adapter not ready, waiting.\n", 
__func__);
} else if (rc) {
dev_warn(dev, "Error %d registering sub-crq\n", rc);
goto reg_failed;
@@ -4865,7 +4865,7 @@ static int ibmvnic_reset_crq(struct ibmvnic_adapter 
*adapter)
 
if (rc == H_CLOSED)
/* Adapter is good, but other end is not ready */
-   dev_warn(dev, "Partner adapter not ready\n");
+   dev_warn(dev, "%s: Partner adapter not ready\n", __func__);
else if (rc != 0)
dev_warn(dev, "Couldn't register crq (rc=%d)\n", rc);
 
@@ -4926,7 +4926,7 @@ static int init_crq_queue(struct ibmvnic_adapter *adapter)
retrc = rc;
 
if (rc == H_CLOSED) {
-   dev_warn(dev, "Partner adapter not ready\n");
+   dev_warn(dev, "%s: Partner adapter not ready\n", __func__);
} else if (rc) {
dev_warn(dev, "Error %d opening adapter\n", rc);
goto reg_crq_failed;
@@ -5129,8 +5129,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const 
struct vio_device_id *id)
do {
rc = init_crq_queue(adapter);
if (rc) {
-   dev_err(>dev, "Couldn't initialize crq. rc=%d\n",
-   rc);
+   dev_err(>dev, "%s: Couldn't initialize crq. 
rc=%d\n",
+   __func__, rc);
goto ibmvnic_init_fail;
}
 
-- 
2.23.0



[PATCH net-next 2/5] ibmvnic: compare adapter->init_done_rc with more readable ibmvnic_rc_codes

2020-08-18 Thread Lijun Pan
Instead of comparing (adapter->init_done_rc == 1), let it
be (adapter->init_done_rc == PARTIALSUCCESS).

Signed-off-by: Lijun Pan 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index aba1cd9862ac..50e86e65961e 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -974,7 +974,7 @@ static int set_link_state(struct ibmvnic_adapter *adapter, 
u8 link_state)
return -1;
}
 
-   if (adapter->init_done_rc == 1) {
+   if (adapter->init_done_rc == PARTIALSUCCESS) {
/* Partuial success, delay and re-send */
mdelay(1000);
resend = true;
-- 
2.23.0



[PATCH net-next 0/5] refactoring of ibmvnic code

2020-08-18 Thread Lijun Pan
This patch series refactor reset_init and init functions,
improve the debugging messages, and make some other cosmetic changes
to make the code easier to read and debug.

Lijun Pan (5):
  ibmvnic: print caller in several error messages
  ibmvnic: compare adapter->init_done_rc with more readable
ibmvnic_rc_codes
  ibmvnic: improve ibmvnic_init and ibmvnic_reset_init
  ibmvnic: remove never executed if statement
  ibmvnic: merge ibmvnic_reset_init and ibmvnic_init

 drivers/net/ethernet/ibm/ibmvnic.c | 98 +-
 1 file changed, 28 insertions(+), 70 deletions(-)

-- 
2.23.0



Re: [PATCH] powerpc/pseries: Do not initiate shutdown when system is running on UPS

2020-08-18 Thread Vasant Hegde

On 8/19/20 1:05 AM, Tyrel Datwyler wrote:

On 8/18/20 3:54 AM, Vasant Hegde wrote:

As per PAPR specification whenever system is running on UPS we have to
wait for predefined time (default 10mins) before initiating shutdown.


The wording in PAPR seems a little unclear. It states for an
EPOW_SYSTEM_SHUTDOWN action code that an EPOW error should be logged followed by
scheduling a shutdown to begin after an OS defined delay interval (with 10
minutes the suggested default).

However, the modifier code descriptions seems to imply that a normal shutdown is
the only one that should happen with no additional delay.

For EPOW sensor value = 3 (EPOW_SYSTEM_SHUTDOWN)
0x01 = Normal system shutdown with no additional delay
0x02 = Loss of utility power, system is running on UPS/Battery
0x03 = Loss of system critical functions, system should be shutdown
0x04 = Ambient temperature too high

For 0x03-0x04 we also do an orderly_poweroff().

Not sure if it really matters, but I was curious and this is just what I gleaned
from glancing at PAPR.


Correct. PAPR is a bit confusing. But we know for sure that when running on UPS we 
don't need to shutdown immediately.


For values 0x03 and 0x04 I think its ok to initiate shutdown (that's the same 
behaviour exists for long time). I can double check with firmware folks.


-Vasant



-Tyrel



We have user space tool (rtas_errd) to monitor for EPOW events and
initiate shutdown after predefined time. Hence do not initiate shutdown
whenever we get EPOW_SHUTDOWN_ON_UPS event.

Fixes: 79872e35 (powerpc/pseries: All events of EPOW_SYSTEM_SHUTDOWN must 
initiate shutdown)
Cc: sta...@vger.kernel.org # v4.0+
Cc: Michael Ellerman 
Signed-off-by: Vasant Hegde 
---
  arch/powerpc/platforms/pseries/ras.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..13c86a292c6d 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -184,7 +184,6 @@ static void handle_system_shutdown(char event_modifier)
case EPOW_SHUTDOWN_ON_UPS:
pr_emerg("Loss of system power detected. System is running on"
 " UPS/battery. Check RTAS error log for details\n");
-   orderly_poweroff(true);
break;

case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:







Re: [PATCH v3] powerpc/pseries/svm: Allocate SWIOTLB buffer anywhere in memory

2020-08-18 Thread Christoph Hellwig
On Tue, Aug 18, 2020 at 07:11:26PM -0300, Thiago Jung Bauermann wrote:
> POWER secure guests (i.e., guests which use the Protection Execution
> Facility) need to use SWIOTLB to be able to do I/O with the hypervisor, but
> they don't need the SWIOTLB memory to be in low addresses since the
> hypervisor doesn't have any addressing limitation.
> 
> This solves a SWIOTLB initialization problem we are seeing in secure guests
> with 128 GB of RAM: they are configured with 4 GB of crashkernel reserved
> memory, which leaves no space for SWIOTLB in low addresses.
> 
> To do this, we use mostly the same code as swiotlb_init(), but allocate the
> buffer using memblock_alloc() instead of memblock_alloc_low().
> 
> Signed-off-by: Thiago Jung Bauermann 

Looks fine to me (except for the pointlessly long comment lines, but I've
been told that's the powerpc way).


Re: [Virtual ppce500] virtio_gpu virtio0: swiotlb buffer is full

2020-08-18 Thread Gerd Hoffmann
On Tue, Aug 18, 2020 at 04:41:38PM +0200, Christian Zigotzky wrote:
> Hello Gerd,
> 
> I compiled a new kernel with the latest DRM misc updates today. The patch is
> included in these updates.
> 
> This kernel works with the VirtIO-GPU in a virtual e5500 QEMU/KVM HV machine
> on my X5000.
> 
> Unfortunately I can only use the VirtIO-GPU (Monitor: Red Hat, Inc. 8") with
> a resolution of 640x480. If I set a higher resolution then the guest
> disables the monitor.
> I can use higher resolutions with the stable kernel 5.8 and the VirtIO-GPU.
> 
> Please check the latest DRM updates.

https://patchwork.freedesktop.org/patch/385980/

(tests & reviews & acks are welcome)

HTH,
  Gerd



[PATCH v3] soc: fsl: enable acpi support

2020-08-18 Thread Ran Wang
From: Peng Ma 

This patch enables ACPI support in RCPM driver.

Signed-off-by: Peng Ma 
Signed-off-by: Ran Wang 
---
Change in v3:
 - Add #ifdef CONFIG_ACPI for acpi_device_id
 - Rename rcpm_acpi_imx_ids to rcpm_acpi_ids

Change in v2:
 - Update acpi_device_id to fix conflict with other driver

 drivers/soc/fsl/rcpm.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c
index a093dbe..55d1d73 100644
--- a/drivers/soc/fsl/rcpm.c
+++ b/drivers/soc/fsl/rcpm.c
@@ -2,7 +2,7 @@
 //
 // rcpm.c - Freescale QorIQ RCPM driver
 //
-// Copyright 2019 NXP
+// Copyright 2019-2020 NXP
 //
 // Author: Ran Wang 
 
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define RCPM_WAKEUP_CELL_MAX_SIZE  7
 
@@ -125,6 +126,7 @@ static int rcpm_probe(struct platform_device *pdev)
 
	ret = device_property_read_u32(&pdev->dev,
			"#fsl,rcpm-wakeup-cells", &rcpm->wakeup_cells);
+
if (ret)
return ret;
 
@@ -139,10 +141,19 @@ static const struct of_device_id rcpm_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, rcpm_of_match);
 
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id rcpm_acpi_ids[] = {
+   {"NXP0015",},
+   { }
+};
+MODULE_DEVICE_TABLE(acpi, rcpm_acpi_ids);
+#endif
+
 static struct platform_driver rcpm_driver = {
.driver = {
.name = "rcpm",
.of_match_table = rcpm_of_match,
+   .acpi_match_table = ACPI_PTR(rcpm_acpi_ids),
		.pm = &rcpm_pm_ops,
},
.probe = rcpm_probe,
-- 
2.7.4



[PATCH 9/9] selftests/powerpc: Properly handle failure in switch_endian_test

2020-08-18 Thread Michael Ellerman
On older CPUs the switch_endian() syscall doesn't work. Currently that
causes the switch_endian_test to just crash. Instead detect the
failure and properly exit with a failure message.

Signed-off-by: Michael Ellerman 
---
 .../switch_endian/switch_endian_test.S| 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S 
b/tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S
index cc4930467235..7887f78cf072 100644
--- a/tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S
+++ b/tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S
@@ -3,9 +3,13 @@
 
.data
.balign 8
-message:
+success_message:
.ascii "success: switch_endian_test\n\0"
 
+   .balign 8
+failure_message:
+   .ascii "failure: switch_endian_test\n\0"
+
.section ".toc"
.balign 8
 pattern:
@@ -64,6 +68,9 @@ FUNC_START(_start)
li r0, __NR_switch_endian
sc
 
+   tdi   0, 0, 0x48// b +8 if the endian was switched
+   b .Lfail// exit if endian didn't switch
+
 #include "check-reversed.S"
 
/* Flip back, r0 already has the switch syscall number */
@@ -71,12 +78,20 @@ FUNC_START(_start)
 
 #include "check.S"
 
+   ld  r4, success_message@got(%r2)
+   li  r5, 28  // strlen(success_message)
+   li  r14, 0  // exit status
+.Lout:
li  r0, __NR_write
li  r3, 1   /* stdout */
-   ld  r4, message@got(%r2)
-   li  r5, 28  /* strlen(message3) */
sc
li  r0, __NR_exit
-   li  r3, 0
+   mr  r3, r14
sc
b   .
+
+.Lfail:
+   ld  r4, failure_message@got(%r2)
+   li  r5, 28  // strlen(failure_message)
+   li  r14, 1
+   b   .Lout
-- 
2.25.1



[PATCH 8/9] selftests/powerpc: Don't touch VMX/VSX on older CPUs

2020-08-18 Thread Michael Ellerman
If we're running on a CPU without VMX/VSX then don't touch them. This
is fragile, the compiler could spill a VMX/VSX register and break the
test anyway. But in practice it seems to work, ie. the test runs to
completion on a system without VSX with this change.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/benchmarks/context_switch.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c 
b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
index d50cc05df495..96554e2794d1 100644
--- a/tools/testing/selftests/powerpc/benchmarks/context_switch.c
+++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
@@ -481,6 +481,12 @@ int main(int argc, char *argv[])
else
printf("futex");
 
+   if (!have_hwcap(PPC_FEATURE_HAS_ALTIVEC))
+   touch_altivec = 0;
+
+   if (!have_hwcap(PPC_FEATURE_HAS_VSX))
+   touch_vector = 0;
+
printf(" on cpus %d/%d touching FP:%s altivec:%s vector:%s vdso:%s\n",
   cpu1, cpu2, touch_fp ?  "yes" : "no", touch_altivec ? "yes" : 
"no",
   touch_vector ? "yes" : "no", touch_vdso ? "yes" : "no");
-- 
2.25.1



[PATCH 7/9] selftests/powerpc: Skip L3 bank test on older CPUs

2020-08-18 Thread Michael Ellerman
This is a test of specific piece of logic in isa207-common.c, which is
only used on Power8 or later. So skip it on older CPUs.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/pmu/l3_bank_test.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/powerpc/pmu/l3_bank_test.c 
b/tools/testing/selftests/powerpc/pmu/l3_bank_test.c
index a96d512a18c4..a5dfa9bf3b9f 100644
--- a/tools/testing/selftests/powerpc/pmu/l3_bank_test.c
+++ b/tools/testing/selftests/powerpc/pmu/l3_bank_test.c
@@ -20,6 +20,9 @@ static int l3_bank_test(void)
char *p;
int i;
 
+   // The L3 bank logic is only used on Power8 or later
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
p = malloc(MALLOC_SIZE);
FAIL_IF(!p);
 
-- 
2.25.1



[PATCH 6/9] selftests/powerpc: Skip security tests on older CPUs

2020-08-18 Thread Michael Ellerman
Both these tests use PMU events that only work on newer CPUs, so skip
them on older CPUs.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/security/rfi_flush.c  | 3 +++
 tools/testing/selftests/powerpc/security/spectre_v2.c | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c 
b/tools/testing/selftests/powerpc/security/rfi_flush.c
index fd37ff9b1c45..93a65bd1f231 100644
--- a/tools/testing/selftests/powerpc/security/rfi_flush.c
+++ b/tools/testing/selftests/powerpc/security/rfi_flush.c
@@ -89,6 +89,9 @@ int rfi_flush_test(void)
 
SKIP_IF(geteuid() != 0);
 
+   // The PMU event we use only works on Power7 or later
+   SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
+
	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_org)) {
perror("Unable to read powerpc/rfi_flush debugfs file");
SKIP_IF(1);
diff --git a/tools/testing/selftests/powerpc/security/spectre_v2.c 
b/tools/testing/selftests/powerpc/security/spectre_v2.c
index c8d82b784102..adc2b7294e5f 100644
--- a/tools/testing/selftests/powerpc/security/spectre_v2.c
+++ b/tools/testing/selftests/powerpc/security/spectre_v2.c
@@ -134,6 +134,9 @@ int spectre_v2_test(void)
s64 miss_percent;
bool is_p9;
 
+   // The PMU events we use only work on Power8 or later
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
state = get_sysfs_state();
if (state == UNKNOWN) {
printf("Error: couldn't determine spectre_v2 mitigation 
state?\n");
-- 
2.25.1



[PATCH 5/9] selftests/powerpc: Don't run DSCR tests on old systems

2020-08-18 Thread Michael Ellerman
The DSCR tests fail on systems that don't have DSCR, so check for the
DSCR in hwcap and skip if it's not present.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/dscr/Makefile | 2 +-
 tools/testing/selftests/powerpc/dscr/dscr_default_test.c  | 2 ++
 tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c | 2 ++
 tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c | 2 ++
 tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c  | 2 ++
 tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c| 2 ++
 tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c | 2 ++
 tools/testing/selftests/powerpc/dscr/dscr_user_test.c | 2 ++
 8 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/dscr/Makefile 
b/tools/testing/selftests/powerpc/dscr/Makefile
index cfa6eedcb66c..845db6273a1b 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -10,4 +10,4 @@ include ../../lib.mk
 
 $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread
 
-$(TEST_GEN_PROGS): ../harness.c
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_default_test.c 
b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c
index 288a4e2ad156..e76611e608af 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_default_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c
@@ -63,6 +63,8 @@ int dscr_default(void)
unsigned long i, *status[THREADS];
unsigned long orig_dscr_default;
 
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
orig_dscr_default = get_default_dscr();
 
/* Initial DSCR default */
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c 
b/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c
index aefcd8d8759b..32fcf2b324b1 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c
@@ -21,6 +21,8 @@ int dscr_explicit(void)
 {
unsigned long i, dscr = 0;
 
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
srand(getpid());
set_dscr(dscr);
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c 
b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
index 7c1cb46397c6..c6a81b2d6b91 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
@@ -44,6 +44,8 @@ int dscr_inherit_exec(void)
unsigned long i, dscr = 0;
pid_t pid;
 
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
for (i = 0; i < COUNT; i++) {
dscr++;
if (dscr > DSCR_MAX)
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c 
b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
index 04297a69ab59..f9dfd3d3c2d5 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
@@ -22,6 +22,8 @@ int dscr_inherit(void)
unsigned long i, dscr = 0;
pid_t pid;
 
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
srand(getpid());
set_dscr(dscr);
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c 
b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
index 02f6b4efde14..fbbdffdb2e5d 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
@@ -77,6 +77,8 @@ int dscr_sysfs(void)
unsigned long orig_dscr_default;
int i, j;
 
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
orig_dscr_default = get_default_dscr();
for (i = 0; i < COUNT; i++) {
for (j = 0; j < DSCR_MAX; j++) {
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c 
b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c
index 37be2c25f277..191ed126f118 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c
@@ -56,6 +56,8 @@ int dscr_sysfs_thread(void)
unsigned long orig_dscr_default;
int i, j;
 
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
orig_dscr_default = get_default_dscr();
for (i = 0; i < COUNT; i++) {
for (j = 0; j < DSCR_MAX; j++) {
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_user_test.c 
b/tools/testing/selftests/powerpc/dscr/dscr_user_test.c
index eaf785d11eed..e09072446dd3 100644
--- a/tools/testing/selftests/powerpc/dscr/dscr_user_test.c
+++ b/tools/testing/selftests/powerpc/dscr/dscr_user_test.c
@@ -36,6 +36,8 @@ int dscr_user(void)
 {
int i;
 
+   SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
check_dscr("");
 
for (i = 0; i < COUNT; i++) {
-- 
2.25.1



[PATCH 3/9] selftests/powerpc: Move set_dscr() into rfi_flush.c

2020-08-18 Thread Michael Ellerman
This version of set_dscr() was added for the RFI flush test, and is
fairly specific to it. It also clashes with the version of set_dscr()
in dscr/dscr.h. So move it into the RFI flush test where it's used.

Signed-off-by: Michael Ellerman 
---
 .../testing/selftests/powerpc/include/utils.h |  1 -
 .../selftests/powerpc/security/rfi_flush.c| 35 +++
 tools/testing/selftests/powerpc/utils.c   | 35 ---
 3 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/tools/testing/selftests/powerpc/include/utils.h 
b/tools/testing/selftests/powerpc/include/utils.h
index 71d2924f5b8b..bba400d1bb90 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -35,7 +35,6 @@ int pick_online_cpu(void);
 int read_debugfs_file(char *debugfs_file, int *result);
 int write_debugfs_file(char *debugfs_file, int result);
 int read_sysfs_file(char *debugfs_file, char *result, size_t result_size);
-void set_dscr(unsigned long val);
 int perf_event_open_counter(unsigned int type,
unsigned long config, int group_fd);
 int perf_event_enable(int fd);
diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c 
b/tools/testing/selftests/powerpc/security/rfi_flush.c
index 0a7d0afb26b8..fd37ff9b1c45 100644
--- a/tools/testing/selftests/powerpc/security/rfi_flush.c
+++ b/tools/testing/selftests/powerpc/security/rfi_flush.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -41,6 +42,40 @@ static void syscall_loop(char *p, unsigned long iterations,
}
 }
 
+static void sigill_handler(int signr, siginfo_t *info, void *unused)
+{
+   static int warned = 0;
+   ucontext_t *ctx = (ucontext_t *)unused;
+   unsigned long *pc = &UCONTEXT_NIA(ctx);
+
+   /* mtspr 3,RS to check for move to DSCR below */
+   if ((*((unsigned int *)*pc) & 0xfc1f) == 0x7c0303a6) {
+   if (!warned++)
+   printf("WARNING: Skipping over dscr setup. Consider 
running 'ppc64_cpu --dscr=1' manually.\n");
+   *pc += 4;
+   } else {
+   printf("SIGILL at %p\n", pc);
+   abort();
+   }
+}
+
+static void set_dscr(unsigned long val)
+{
+   static int init = 0;
+   struct sigaction sa;
+
+   if (!init) {
+   memset(&sa, 0, sizeof(sa));
+   sa.sa_sigaction = sigill_handler;
+   sa.sa_flags = SA_SIGINFO;
+   if (sigaction(SIGILL, &sa, NULL))
+   perror("sigill_handler");
+   init = 1;
+   }
+
+   asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR));
+}
+
 int rfi_flush_test(void)
 {
char *p;
diff --git a/tools/testing/selftests/powerpc/utils.c 
b/tools/testing/selftests/powerpc/utils.c
index 638ffacc90aa..1f36ee1a909a 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -10,7 +10,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -273,40 +272,6 @@ int perf_event_reset(int fd)
return 0;
 }
 
-static void sigill_handler(int signr, siginfo_t *info, void *unused)
-{
-   static int warned = 0;
-   ucontext_t *ctx = (ucontext_t *)unused;
-   unsigned long *pc = &UCONTEXT_NIA(ctx);
-
-   /* mtspr 3,RS to check for move to DSCR below */
-   if ((*((unsigned int *)*pc) & 0xfc1f) == 0x7c0303a6) {
-   if (!warned++)
-   printf("WARNING: Skipping over dscr setup. Consider 
running 'ppc64_cpu --dscr=1' manually.\n");
-   *pc += 4;
-   } else {
-   printf("SIGILL at %p\n", pc);
-   abort();
-   }
-}
-
-void set_dscr(unsigned long val)
-{
-   static int init = 0;
-   struct sigaction sa;
-
-   if (!init) {
-   memset(&sa, 0, sizeof(sa));
-   sa.sa_sigaction = sigill_handler;
-   sa.sa_flags = SA_SIGINFO;
-   if (sigaction(SIGILL, &sa, NULL))
-   perror("sigill_handler");
-   init = 1;
-   }
-
-   asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR));
-}
-
 int using_hash_mmu(bool *using_hash)
 {
char line[128];
-- 
2.25.1



[PATCH 4/9] selftests/powerpc: Include asm/cputable.h from utils.h

2020-08-18 Thread Michael Ellerman
utils.h provides have_hwcap() and have_hwcap2() which check for a
feature bit. Those bits are defined in asm/cputable.h, so include it
in utils.h so users of utils.h don't have to do it manually.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/alignment/alignment_handler.c | 2 --
 tools/testing/selftests/powerpc/include/utils.h   | 1 +
 tools/testing/selftests/powerpc/pmu/count_stcx_fail.c | 1 -
 tools/testing/selftests/powerpc/pmu/per_event_excludes.c  | 2 --
 tools/testing/selftests/powerpc/stringloops/memcmp.c  | 2 +-
 tools/testing/selftests/powerpc/tm/tm.h   | 3 +--
 6 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c 
b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
index 55ef15184057..e4063eba4a5b 100644
--- a/tools/testing/selftests/powerpc/alignment/alignment_handler.c
+++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
@@ -55,8 +55,6 @@
 #include 
 #include 
 
-#include 
-
 #include "utils.h"
 #include "instructions.h"
 
diff --git a/tools/testing/selftests/powerpc/include/utils.h 
b/tools/testing/selftests/powerpc/include/utils.h
index bba400d1bb90..052b5a775dc2 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "reg.h"
 
 /* Avoid headaches with PRI?64 - just use %ll? always */
diff --git a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c 
b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
index 2980abca31e0..2070a1e2b3a5 100644
--- a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
+++ b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
@@ -9,7 +9,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include "event.h"
 #include "utils.h"
diff --git a/tools/testing/selftests/powerpc/pmu/per_event_excludes.c 
b/tools/testing/selftests/powerpc/pmu/per_event_excludes.c
index 2d37942bf72b..ad32a09a6540 100644
--- a/tools/testing/selftests/powerpc/pmu/per_event_excludes.c
+++ b/tools/testing/selftests/powerpc/pmu/per_event_excludes.c
@@ -12,8 +12,6 @@
 #include 
 #include 
 
-#include 
-
 #include "event.h"
 #include "lib.h"
 #include "utils.h"
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c 
b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 979df3d98368..cb2f18855c8d 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -4,7 +4,7 @@
 #include 
 #include 
 #include 
-#include 
+
 #include "utils.h"
 
 #define SIZE 256
diff --git a/tools/testing/selftests/powerpc/tm/tm.h 
b/tools/testing/selftests/powerpc/tm/tm.h
index c402464b038f..c5a1e5c163fc 100644
--- a/tools/testing/selftests/powerpc/tm/tm.h
+++ b/tools/testing/selftests/powerpc/tm/tm.h
@@ -6,9 +6,8 @@
 #ifndef _SELFTESTS_POWERPC_TM_TM_H
 #define _SELFTESTS_POWERPC_TM_TM_H
 
-#include 
-#include 
 #include 
+#include 
 
 #include "utils.h"
 
-- 
2.25.1



[PATCH 2/9] selftests/powerpc: Give the bad_accesses test longer to run

2020-08-18 Thread Michael Ellerman
On older systems this test takes longer to run (duh), give it five
minutes which is long enough on a G5 970FX @ 1.6GHz.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/mm/bad_accesses.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/powerpc/mm/bad_accesses.c 
b/tools/testing/selftests/powerpc/mm/bad_accesses.c
index a864ed7e2008..fd747b2ffcfc 100644
--- a/tools/testing/selftests/powerpc/mm/bad_accesses.c
+++ b/tools/testing/selftests/powerpc/mm/bad_accesses.c
@@ -139,5 +139,6 @@ static int test(void)
 
 int main(void)
 {
+   test_harness_set_timeout(300);
return test_harness(test, "bad_accesses");
 }
-- 
2.25.1



[PATCH 1/9] selftests/powerpc: Make using_hash_mmu() work on Cell & PowerMac

2020-08-18 Thread Michael Ellerman
These platforms don't show the MMU in /proc/cpuinfo, but they always
use hash, so teach using_hash_mmu() that.

Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/utils.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/utils.c 
b/tools/testing/selftests/powerpc/utils.c
index 18b6a773d5c7..638ffacc90aa 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -318,7 +318,9 @@ int using_hash_mmu(bool *using_hash)
 
rc = 0;
while (fgets(line, sizeof(line), f) != NULL) {
-   if (strcmp(line, "MMU   : Hash\n") == 0) {
+   if (!strcmp(line, "MMU  : Hash\n") ||
+   !strcmp(line, "platform : Cell\n") ||
+   !strcmp(line, "platform : PowerMac\n")) {
*using_hash = true;
goto out;
}
-- 
2.25.1



[PATCH] powerpc/64: Remove unused generic_secondary_thread_init()

2020-08-18 Thread Michael Ellerman
The last caller was removed in 2014 in commit fb5a515704d7 ("powerpc:
Remove platforms/wsp and associated pieces").

As Jordan noticed even though there are no callers, the code above in
fsl_secondary_thread_init() falls through into
generic_secondary_thread_init(). So we can remove the _GLOBAL but not
the body of the function.

However because fsl_secondary_thread_init() is inside #ifdef
CONFIG_PPC_BOOK3E, we can never reach the body of
generic_secondary_thread_init() unless CONFIG_PPC_BOOK3E is enabled,
so we can wrap the whole thing in a single #ifdef.

Signed-off-by: Michael Ellerman 
---

v2: Remove the symbol but not the body.
---
 arch/powerpc/include/asm/smp.h | 1 -
 arch/powerpc/kernel/head_64.S  | 7 ++-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 0d00faf8f119..c074bff1ec30 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -245,7 +245,6 @@ extern void arch_send_call_function_ipi_mask(const struct 
cpumask *mask);
  * 64-bit but defining them all here doesn't harm
  */
 extern void generic_secondary_smp_init(void);
-extern void generic_secondary_thread_init(void);
 extern unsigned long __secondary_hold_spinloop;
 extern unsigned long __secondary_hold_acknowledge;
 extern char __secondary_hold;
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 0e05a9a47a4b..1510b2a56669 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -300,9 +300,6 @@ _GLOBAL(fsl_secondary_thread_init)
rlwimi  r3, r3, 30, 2, 30
mtspr   SPRN_PIR, r3
 1:
-#endif
-
-_GLOBAL(generic_secondary_thread_init)
mr  r24,r3
 
/* turn on 64-bit mode */
@@ -312,13 +309,13 @@ _GLOBAL(generic_secondary_thread_init)
bl  relative_toc
tovirt(r2,r2)
 
-#ifdef CONFIG_PPC_BOOK3E
/* Book3E initialization */
mr  r3,r24
bl  book3e_secondary_thread_init
-#endif
b   generic_secondary_common_init
 
+#endif /* CONFIG_PPC_BOOK3E */
+
 /*
  * On pSeries and most other platforms, secondary processors spin
  * in the following code.
-- 
2.25.1



[PATCH 1/3] powerpc: Move arch_cpu_idle_dead() into smp.c

2020-08-18 Thread Michael Ellerman
arch_cpu_idle_dead() is in idle.c, which makes sense, but it's inside
a CONFIG_HOTPLUG_CPU block.

It would be more at home in smp.c, inside the existing
CONFIG_HOTPLUG_CPU block. Note that CONFIG_HOTPLUG_CPU depends on
CONFIG_SMP so even though smp.c is not built for SMP=n builds, that's
fine.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/idle.c | 8 
 arch/powerpc/kernel/smp.c  | 6 ++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 422e31d2f5a2..ae0e2632393d 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -41,14 +41,6 @@ static int __init powersave_off(char *arg)
 }
 __setup("powersave=off", powersave_off);
 
-#ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
-{
-   sched_preempt_enable_no_resched();
-   cpu_die();
-}
-#endif
-
 void arch_cpu_idle(void)
 {
ppc64_runlatch_off();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8261999c7d52..b05d2db13d08 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1429,6 +1429,12 @@ void __cpu_die(unsigned int cpu)
smp_ops->cpu_die(cpu);
 }
 
+void arch_cpu_idle_dead(void)
+{
+   sched_preempt_enable_no_resched();
+   cpu_die();
+}
+
 void cpu_die(void)
 {
/*
-- 
2.25.1



[PATCH 2/3] powerpc/smp: Fold cpu_die() into its only caller

2020-08-18 Thread Michael Ellerman
Avoid the eternal confusion between cpu_die() and __cpu_die() by
removing the former, folding it into its only caller.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/smp.h | 1 -
 arch/powerpc/kernel/smp.c  | 4 
 2 files changed, 5 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 49a25e2400f2..a314d2d2d2be 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -29,7 +29,6 @@ extern int boot_cpuid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
 
-extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
 
 #ifdef CONFIG_SMP
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b05d2db13d08..c616d975bf95 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1432,11 +1432,7 @@ void __cpu_die(unsigned int cpu)
 void arch_cpu_idle_dead(void)
 {
sched_preempt_enable_no_resched();
-   cpu_die();
-}
 
-void cpu_die(void)
-{
/*
 * Disable on the down path. This will be re-enabled by
 * start_secondary() via start_secondary_resume() below
-- 
2.25.1



[PATCH 3/3] powerpc/smp: Move ppc_md.cpu_die() to smp_ops.cpu_offline_self()

2020-08-18 Thread Michael Ellerman
We have smp_ops->cpu_die() and ppc_md.cpu_die(). One of them offlines
the current CPU and one offlines another CPU, can you guess which is
which? Also one is in smp_ops and one is in ppc_md?

So rename ppc_md.cpu_die(), to cpu_offline_self(), because that's what
it does. And move it into smp_ops where it belongs.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/machdep.h   | 1 -
 arch/powerpc/include/asm/smp.h   | 3 +++
 arch/powerpc/kernel/smp.c| 4 ++--
 arch/powerpc/kernel/sysfs.c  | 4 +++-
 arch/powerpc/platforms/85xx/smp.c| 4 ++--
 arch/powerpc/platforms/powermac/pmac.h   | 2 +-
 arch/powerpc/platforms/powermac/sleep.S  | 6 +++---
 arch/powerpc/platforms/powermac/smp.c| 8 
 arch/powerpc/platforms/powernv/smp.c | 4 ++--
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 6 +++---
 10 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h 
b/arch/powerpc/include/asm/machdep.h
index a90b892f0bfe..cc2ec7101520 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -65,7 +65,6 @@ struct machdep_calls {
void __noreturn (*restart)(char *cmd);
void __noreturn (*halt)(void);
void(*panic)(char *str);
-   void(*cpu_die)(void);
 
long(*time_init)(void); /* Optional, may be NULL */
 
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index a314d2d2d2be..0d00faf8f119 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -49,6 +49,9 @@ struct smp_ops_t {
int   (*cpu_disable)(void);
void  (*cpu_die)(unsigned int nr);
int   (*cpu_bootable)(unsigned int nr);
+#ifdef CONFIG_HOTPLUG_CPU
+   void  (*cpu_offline_self)(void);
+#endif
 };
 
 extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 
delay_us);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index c616d975bf95..faba0fdee500 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1439,8 +1439,8 @@ void arch_cpu_idle_dead(void)
 */
this_cpu_disable_ftrace();
 
-   if (ppc_md.cpu_die)
-   ppc_md.cpu_die();
+   if (smp_ops->cpu_offline_self)
+   smp_ops->cpu_offline_self();
 
/* If we return, we re-enter start_secondary */
start_secondary_resume();
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 6bebc9a52444..7c4ccc03c2de 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -1161,6 +1161,7 @@ static int __init topology_init(void)
for_each_possible_cpu(cpu) {
struct cpu *c = _cpu(cpu_devices, cpu);
 
+#ifdef CONFIG_HOTPLUG_CPU
/*
 * For now, we just see if the system supports making
 * the RTAS calls for CPU hotplug.  But, there may be a
@@ -1168,8 +1169,9 @@ static int __init topology_init(void)
 * CPU.  For instance, the boot cpu might never be valid
 * for hotplugging.
 */
-   if (ppc_md.cpu_die)
+   if (smp_ops->cpu_offline_self)
c->hotpluggable = 1;
+#endif
 
if (cpu_online(cpu) || c->hotpluggable) {
register_cpu(c, cpu);
diff --git a/arch/powerpc/platforms/85xx/smp.c 
b/arch/powerpc/platforms/85xx/smp.c
index fda108bae95f..c6df294054fe 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -112,7 +112,7 @@ static void mpc85xx_take_timebase(void)
local_irq_restore(flags);
 }
 
-static void smp_85xx_mach_cpu_die(void)
+static void smp_85xx_cpu_offline_self(void)
 {
unsigned int cpu = smp_processor_id();
 
@@ -506,7 +506,7 @@ void __init mpc85xx_smp_init(void)
if (qoriq_pm_ops) {
smp_85xx_ops.give_timebase = mpc85xx_give_timebase;
smp_85xx_ops.take_timebase = mpc85xx_take_timebase;
-   ppc_md.cpu_die = smp_85xx_mach_cpu_die;
+   smp_85xx_ops.cpu_offline_self = smp_85xx_cpu_offline_self;
smp_85xx_ops.cpu_die = qoriq_cpu_kill;
}
 #endif
diff --git a/arch/powerpc/platforms/powermac/pmac.h 
b/arch/powerpc/platforms/powermac/pmac.h
index 16a52afdb76e..0d715db434dc 100644
--- a/arch/powerpc/platforms/powermac/pmac.h
+++ b/arch/powerpc/platforms/powermac/pmac.h
@@ -34,7 +34,7 @@ extern void pmac_check_ht_link(void);
 
 extern void pmac_setup_smp(void);
 extern int psurge_secondary_virq;
-extern void low_cpu_die(void) __attribute__((noreturn));
+extern void low_cpu_offline_self(void) __attribute__((noreturn));
 
 extern int pmac_nvram_init(void);
 extern void pmac_pic_init(void);
diff --git a/arch/powerpc/platforms/powermac/sleep.S 
b/arch/powerpc/platforms/powermac/sleep.S
index f9a680fdd9c4..c51bb63c9417 100644
--- 

Re: [PATCH v2 11/25] powerpc/signal: Refactor bad frame logging

2020-08-18 Thread Joe Perches
On Tue, 2020-08-18 at 17:19 +, Christophe Leroy wrote:
> The logging of bad frame appears half a dozen of times
> and is pretty similar.
[]
> diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
[]
> @@ -355,3 +355,14 @@ static unsigned long get_tm_stackpointer(struct 
> task_struct *tsk)
>  #endif
>   return ret;
>  }
> +
> +static const char fm32[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %08lx 
> lr %08lx\n";
> +static const char fm64[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %016lx 
> lr %016lx\n";

Why not remove this and use it in place with
%08lx/%016x used as %px with a cast to (void *)?

> +void signal_fault(struct task_struct *tsk, struct pt_regs *regs,
> +   const char *where, void __user *ptr)
> +{
> + if (show_unhandled_signals)
> + printk_ratelimited(regs->msr & MSR_64BIT ? fm64 : fm32, 
> tsk->comm,
> +task_pid_nr(tsk), where, ptr, regs->nip, 
> regs->link);

pr_info_ratelimited("%s[%d]: bad frame in %s: %p nip %016lx lr 
%016lx\n",
tsk->comm, task_pid_nr(tsk), where, ptr,
(void *)regs->nip, (void *)regs->link);




Re: [PATCH 1/2] powerpc/64s: remove PROT_SAO support

2020-08-18 Thread Shawn Anastasio

On 8/18/20 2:11 AM, Nicholas Piggin wrote:
> Very reasonable point.


The problem we're trying to get a handle on is live partition migration
where a running guest might be using SAO then get migrated to a P10. I
don't think we have a good way to handle this case. Potentially the
hypervisor could revoke the page tables if the guest is running in hash
mode and the guest kernel could be taught about that and sigbus the
process, but in radix the guest controls those page tables and the SAO
state and I don't think there's a way to cause it to take a fault.

I also don't know what the proprietary hypervisor does here.

We could add it back, default to n, or make it bare metal only, or
somehow try to block live migration to a later CPU without the facility.
I wouldn't be against that.



Admittedly I'm not too familiar with the specifics of live migration
or guest memory management, but restoring the functionality and adding
a way to prevent migration of SAO-using guests seems like a reasonable
choice to me. Would this be done with help from the guest using some
sort of infrastructure to signal to the hypervisor that SAO is in use,
or entirely on the hypervisor by e.g. scanning the through the process
table for SAO pages?


It would be very interesting to know how it performs in such a "real"
situation. I don't know how well POWER9 has optimised it -- it's
possible that it's not much better than putting lwsync after every load
or store.



This is definitely worth investigating in depth. That said, even if the
performance on P9 isn't super great, I think the feature could still be
useful, since it would offer more granularity than the sledgehammer
approach of emitting lwsync everywhere.

I'd be happy to put in some of the work required to get this to a point
where it can be reintroduced without breaking guest migration - I'd just
need some pointers on getting started with whatever approach is decided on.

Thanks,
Shawn


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-08-18 Thread Nicholas Piggin
Excerpts from pet...@infradead.org's message of August 19, 2020 1:41 am:
> On Tue, Aug 18, 2020 at 05:22:33PM +1000, Nicholas Piggin wrote:
>> Excerpts from pet...@infradead.org's message of August 12, 2020 8:35 pm:
>> > On Wed, Aug 12, 2020 at 06:18:28PM +1000, Nicholas Piggin wrote:
>> >> Excerpts from pet...@infradead.org's message of August 7, 2020 9:11 pm:
>> >> > 
>> >> > What's wrong with something like this?
>> >> > 
>> >> > AFAICT there's no reason to actually try and add IRQ tracing here, it's
>> >> > just a hand full of instructions at the most.
>> >> 
>> >> Because we may want to use that in other places as well, so it would
>> >> be nice to have tracing.
>> >> 
>> >> Hmm... also, I thought NMI context was free to call local_irq_save/restore
>> >> anyway so the bug would still be there in those cases?
>> > 
>> > NMI code has in_nmi() true, in which case the IRQ tracing is disabled
>> > (except for x86 which has CONFIG_TRACE_IRQFLAGS_NMI).
>> > 
>> 
>> That doesn't help. It doesn't fix the lockdep irq state going out of
>> synch with the actual irq state. The code which triggered this with the
>> special powerpc irq disable has in_nmi() true as well.
> 
> Urgh, you're talking about using lockdep_assert_irqs*() from NMI
> context?
> 
> If not, I'm afraid I might've lost the plot a little on what exact
> failure case we're talking about.
> 

Hm, I may have been a bit confused actually. Since your Fix 
TRACE_IRQFLAGS vs NMIs patch it might now work.

I'm worried powerpc disables trace irqs trace_hardirqs_off()
before nmi_enter() might still be a problem, but not sure
actually. Alexey did you end up re-testing with Peter's patch
or current upstream?

Thanks,
Nick


Re: [PATCH] powerpc/book3s64/radix: Fix boot failure with large amount of guest memory

2020-08-18 Thread kernel test robot
Hi "Aneesh,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on v5.9-rc1 next-20200818]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/powerpc-book3s64-radix-Fix-boot-failure-with-large-amount-of-guest-memory/20200814-002215
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allnoconfig (attached as .config)
compiler: powerpc-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   arch/powerpc/kernel/prom.c: In function 'early_init_devtree':
>> arch/powerpc/kernel/prom.c:818:3: error: implicit declaration of function 
>> 'radix__setup_initial_memory_limit'; did you mean 
>> 'setup_initial_memory_limit'? [-Werror=implicit-function-declaration]
 818 |   radix__setup_initial_memory_limit(memstart_addr, 
first_memblock_size);
 |   ^
 |   setup_initial_memory_limit
   cc1: all warnings being treated as errors

# 
https://github.com/0day-ci/linux/commit/082f192bfaabd1eeb28421d82574ce76ae0c4fba
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Aneesh-Kumar-K-V/powerpc-book3s64-radix-Fix-boot-failure-with-large-amount-of-guest-memory/20200814-002215
git checkout 082f192bfaabd1eeb28421d82574ce76ae0c4fba
vim +818 arch/powerpc/kernel/prom.c

   811  
   812  mmu_early_init_devtree();
   813  
   814  /*
   815   * Reset ppc64_rma_size and memblock memory limit
   816   */
   817  if (early_radix_enabled())
 > 818  radix__setup_initial_memory_limit(memstart_addr, 
 > first_memblock_size);
   819  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: fsl_espi errors on v5.7.15

2020-08-18 Thread Chris Packham
Hi Again,

On 17/08/20 9:09 am, Chris Packham wrote:

>
> On 14/08/20 6:19 pm, Heiner Kallweit wrote:
>> On 14.08.2020 04:48, Chris Packham wrote:
>>> Hi,
>>>
>>> I'm seeing a problem with accessing spi-nor after upgrading a T2081
>>> based system to linux v5.7.15
>>>
>>> For this board u-boot and the u-boot environment live on spi-nor.
>>>
>>> When I use fw_setenv from userspace I get the following kernel logs
>>>
>>> # fw_setenv foo=1
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
>>> fsl_espi ffe11.spi: Transfer done but rx/tx fifo's aren't empty!
>>> fsl_espi ffe11.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>> fsl_espi ffe11.spi: Transfer done but rx/tx fifo's aren't empty!
>>> fsl_espi ffe11.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>> fsl_espi ffe11.spi: Transfer done but rx/tx fifo's aren't empty!
>>> fsl_espi ffe11.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>> ...
>>>
>> This error reporting doesn't exist yet in 4.4. So you may have an issue
>> under 4.4 too, it's just not reported.
>> Did you verify that under 4.4 fw_setenv actually has an effect?
> Just double checked and yes under 4.4 the setting does get saved.
>>> If I run fw_printenv (before getting it into a bad state) it is able to
>>> display the content of the boards u-boot environment.
>>>
>> This might indicate an issue with spi being locked. I've seen related
>> questions, just use the search engine of your choice and check for
>> fw_setenv and locked.
> I'm running a version of fw_setenv which includes 
> https://gitlab.denx.de/u-boot/u-boot/-/commit/db820159 so it shouldn't 
> be locking things unnecessarily.
>>> If been unsuccessful in producing a setup for bisecting the issue. I do
>>> know the issue doesn't occur on the old 4.4.x based kernel but that's
>>> probably not much help.
>>>
>>> Any pointers on what the issue (and/or solution) might be.

I finally managed to get our board running with a vanilla kernel. With 
corenet64_smp_defconfig I occasionally see

   fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!

other than the message things seem to be working.

With a custom defconfig I see

   fsl_espi ffe11.spi: Transfer done but SPIE_DON isn't set!
   fsl_espi ffe11.spi: Transfer done but rx/tx fifo's aren't empty!
   fsl_espi ffe11.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
   ...

and access to the spi-nor does not work until the board is reset.

I'll try and pick apart the differences between the two defconfigs.


[Bug 208957] New: 5.9-rc1 fails to build for a PowerMac G5: .../book3s64/hash_utils.c:1119:21: error: ‘default_uamor’ undeclared (first use in this function) 1119 | mtspr(SPRN_UAMOR, default_uamor)

2020-08-18 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=208957

Bug ID: 208957
   Summary: 5.9-rc1 fails to build for a PowerMac G5:
.../book3s64/hash_utils.c:1119:21: error:
‘default_uamor’ undeclared (first use in this
function)  1119 |   mtspr(SPRN_UAMOR, default_uamor);
   Product: Platform Specific/Hardware
   Version: 2.5
Kernel Version: 5.9-rc1
  Hardware: PPC-64
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: PPC-64
  Assignee: platform_ppc...@kernel-bugs.osdl.org
  Reporter: erhar...@mailbox.org
Regression: No

Created attachment 292021
  --> https://bugzilla.kernel.org/attachment.cgi?id=292021&action=edit
kernel .config (kernel 5.9-rc1, PowerMac G5 11,2)

[...]
  CALLscripts/checksyscalls.sh
  CALLscripts/atomic/check-atomics.sh
  CHK include/generated/compile.h
  CC  arch/powerpc/mm/book3s64/hash_utils.o
In file included from ./arch/powerpc/include/asm/processor.h:9,
 from ./arch/powerpc/include/asm/thread_info.h:40,
 from ./include/linux/thread_info.h:38,
 from ./include/asm-generic/preempt.h:5,
 from ./arch/powerpc/include/generated/asm/preempt.h:1,
 from ./include/linux/preempt.h:78,
 from ./include/linux/spinlock.h:51,
 from arch/powerpc/mm/book3s64/hash_utils.c:21:
arch/powerpc/mm/book3s64/hash_utils.c: In function
‘hash__early_init_mmu_secondary’:
arch/powerpc/mm/book3s64/hash_utils.c:1119:21: error: ‘default_uamor’
undeclared (first use in this function)
 1119 |   mtspr(SPRN_UAMOR, default_uamor);
  | ^
./arch/powerpc/include/asm/reg.h:1396:33: note: in definition of macro ‘mtspr’
 1396 |  : "r" ((unsigned long)(v)) \
  | ^
arch/powerpc/mm/book3s64/hash_utils.c:1119:21: note: each undeclared identifier
is reported only once for each function it appears in
 1119 |   mtspr(SPRN_UAMOR, default_uamor);
  | ^
./arch/powerpc/include/asm/reg.h:1396:33: note: in definition of macro ‘mtspr’
 1396 |  : "r" ((unsigned long)(v)) \
  | ^
make[3]: *** [scripts/Makefile.build:283:
arch/powerpc/mm/book3s64/hash_utils.o] Error 1
make[2]: *** [scripts/Makefile.build:500: arch/powerpc/mm/book3s64] Error 2
make[1]: *** [scripts/Makefile.build:500: arch/powerpc/mm] Error 2
make: *** [Makefile:1789: arch/powerpc] Error 2

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: [PATCH v3 10/17] memblock: reduce number of parameters in for_each_mem_range()

2020-08-18 Thread Miguel Ojeda
On Tue, Aug 18, 2020 at 5:19 PM Mike Rapoport  wrote:
>
>  .clang-format  |  2 ++

For the .clang-format bit:

Acked-by: Miguel Ojeda 

Cheers,
Miguel


[PATCH v3] powerpc/pseries/svm: Allocate SWIOTLB buffer anywhere in memory

2020-08-18 Thread Thiago Jung Bauermann
POWER secure guests (i.e., guests which use the Protection Execution
Facility) need to use SWIOTLB to be able to do I/O with the hypervisor, but
they don't need the SWIOTLB memory to be in low addresses since the
hypervisor doesn't have any addressing limitation.

This solves a SWIOTLB initialization problem we are seeing in secure guests
with 128 GB of RAM: they are configured with 4 GB of crashkernel reserved
memory, which leaves no space for SWIOTLB in low addresses.

To do this, we use mostly the same code as swiotlb_init(), but allocate the
buffer using memblock_alloc() instead of memblock_alloc_low().

Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/svm.h   |  4 
 arch/powerpc/mm/mem.c|  6 +-
 arch/powerpc/platforms/pseries/svm.c | 26 ++
 3 files changed, 35 insertions(+), 1 deletion(-)

Changes from v2:
- Panic if unable to allocate buffer, as suggested by Christoph.

Changes from v1:
- Open-code swiotlb_init() in arch-specific code, as suggested by
  Christoph.

diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h
index 85580b30aba4..7546402d796a 100644
--- a/arch/powerpc/include/asm/svm.h
+++ b/arch/powerpc/include/asm/svm.h
@@ -15,6 +15,8 @@ static inline bool is_secure_guest(void)
return mfmsr() & MSR_S;
 }
 
+void __init svm_swiotlb_init(void);
+
 void dtl_cache_ctor(void *addr);
 #define get_dtl_cache_ctor()   (is_secure_guest() ? dtl_cache_ctor : NULL)
 
@@ -25,6 +27,8 @@ static inline bool is_secure_guest(void)
return false;
 }
 
+static inline void svm_swiotlb_init(void) {}
+
 #define get_dtl_cache_ctor() NULL
 
 #endif /* CONFIG_PPC_SVM */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c2c11eb8dcfc..0f21bcb16405 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -290,7 +291,10 @@ void __init mem_init(void)
 * back to to-down.
 */
memblock_set_bottom_up(true);
-   swiotlb_init(0);
+   if (is_secure_guest())
+   svm_swiotlb_init();
+   else
+   swiotlb_init(0);
 #endif
 
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/powerpc/platforms/pseries/svm.c 
b/arch/powerpc/platforms/pseries/svm.c
index 40c0637203d5..81085eb8f225 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -7,6 +7,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -34,6 +35,31 @@ static int __init init_svm(void)
 }
 machine_early_initcall(pseries, init_svm);
 
+/*
+ * Initialize SWIOTLB. Essentially the same as swiotlb_init(), except that it
+ * can allocate the buffer anywhere in memory. Since the hypervisor doesn't 
have
+ * any addressing limitation, we don't need to allocate it in low addresses.
+ */
+void __init svm_swiotlb_init(void)
+{
+   unsigned char *vstart;
+   unsigned long bytes, io_tlb_nslabs;
+
+   io_tlb_nslabs = (swiotlb_size_or_default() >> IO_TLB_SHIFT);
+   io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+   bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+
+   vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE);
+   if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
+   return;
+
+   if (io_tlb_start)
+   memblock_free_early(io_tlb_start,
+   PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+   panic("SVM: Cannot allocate SWIOTLB buffer");
+}
+
 int set_memory_encrypted(unsigned long addr, int numpages)
 {
if (!PAGE_ALIGNED(addr))


Re: [PATCH 08/11] x86: make TASK_SIZE_MAX usable from assembly code

2020-08-18 Thread Kees Cook
On Tue, Aug 18, 2020 at 10:00:16PM +0200, Christoph Hellwig wrote:
> On Tue, Aug 18, 2020 at 12:59:05PM -0700, Kees Cook wrote:
> > > I didn't see a problem bisecting, do you have something particular in
> > > mind?
> > 
> > Oh, I misunderstood this patch to be a fix for compilation. Is this just
> > a correctness fix?
> 
> It prepares for using the definition from assembly, which is done in
> the next patch.

Ah! Okay; thanks.

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [PATCH 03/11] fs: don't allow splice read/write without explicit ops

2020-08-18 Thread Christoph Hellwig
On Tue, Aug 18, 2020 at 12:58:07PM -0700, Kees Cook wrote:
> On Tue, Aug 18, 2020 at 09:54:46PM +0200, Christoph Hellwig wrote:
> > On Tue, Aug 18, 2020 at 12:39:34PM -0700, Kees Cook wrote:
> > > On Mon, Aug 17, 2020 at 09:32:04AM +0200, Christoph Hellwig wrote:
> > > > default_file_splice_write is the last piece of generic code that uses
> > > > set_fs to make the uaccess routines operate on kernel pointers.  It
> > > > implements a "fallback loop" for splicing from files that do not 
> > > > actually
> > > > provide a proper splice_read method.  The usual file systems and other
> > > > high bandwith instances all provide a ->splice_read, so this just 
> > > > removes
> > > > support for various device drivers and procfs/debugfs files.  If splice
> > > > support for any of those turns out to be important it can be added back
> > > > by switching them to the iter ops and using generic_file_splice_read.
> > > > 
> > > > Signed-off-by: Christoph Hellwig 
> > > 
> > > This seems a bit disruptive? I feel like this is going to make fuzzers
> > > really noisy (e.g. trinity likes to splice random stuff out of /sys and
> > > /proc).
> > 
> > Noisy in the sence of triggering the pr_debug or because they can't
> > handle -EINVAL?
> 
> Well, maybe both? I doubt much _expects_ to be using splice, so I'm fine
> with that, but it seems weird not to have a fall-back, especially if
> something would like to splice a file out of there. But, I'm not opposed
> to the change, it just seems like it might cause pain down the road.

The problem is that without pretending a buffer is in user space when
it actually isn't, we can't have a generic fallback.  So we'll have to
have specific support - I wrote generic support for seq_file, and
willy did for /proc/sys, but at least the first caused a few problems
and a fair amount of churn, so I'd rather see first if we can get
away without it.

> 
> -- 
> Kees Cook
---end quoted text---


Re: [PATCH 08/11] x86: make TASK_SIZE_MAX usable from assembly code

2020-08-18 Thread Christoph Hellwig
On Tue, Aug 18, 2020 at 12:59:05PM -0700, Kees Cook wrote:
> > I didn't see a problem bisecting, do you have something particular in
> > mind?
> 
> Oh, I misunderstood this patch to be a fix for compilation. Is this just
> a correctness fix?

It prepares for using the definition from assembly, which is done in
the next patch.


Re: [PATCH 08/11] x86: make TASK_SIZE_MAX usable from assembly code

2020-08-18 Thread Kees Cook
On Tue, Aug 18, 2020 at 09:55:39PM +0200, Christoph Hellwig wrote:
> On Tue, Aug 18, 2020 at 12:44:49PM -0700, Kees Cook wrote:
> > On Mon, Aug 17, 2020 at 09:32:09AM +0200, Christoph Hellwig wrote:
> > > For 64-bit the only hing missing was a strategic _AC, and for 32-bit we
> > 
> > typo: thing
> > 
> > > need to use __PAGE_OFFSET instead of PAGE_OFFSET in the TASK_SIZE
> > > definition to escape the explicit unsigned long cast.  This just works
> > > because __PAGE_OFFSET is defined using _AC itself and thus never needs
> > > the cast anyway.
> > 
> > Shouldn't this be folded into the prior patch so there's no bisection
> > problem?
> 
> I didn't see a problem bisecting, do you have something particular in
> mind?

Oh, I misunderstood this patch to be a fix for compilation. Is this just
a correctness fix?

-- 
Kees Cook


Re: [PATCH 03/11] fs: don't allow splice read/write without explicit ops

2020-08-18 Thread Kees Cook
On Tue, Aug 18, 2020 at 09:54:46PM +0200, Christoph Hellwig wrote:
> On Tue, Aug 18, 2020 at 12:39:34PM -0700, Kees Cook wrote:
> > On Mon, Aug 17, 2020 at 09:32:04AM +0200, Christoph Hellwig wrote:
> > > default_file_splice_write is the last piece of generic code that uses
> > > set_fs to make the uaccess routines operate on kernel pointers.  It
> > > implements a "fallback loop" for splicing from files that do not actually
> > > provide a proper splice_read method.  The usual file systems and other
> > > high bandwith instances all provide a ->splice_read, so this just removes
> > > support for various device drivers and procfs/debugfs files.  If splice
> > > support for any of those turns out to be important it can be added back
> > > by switching them to the iter ops and using generic_file_splice_read.
> > > 
> > > Signed-off-by: Christoph Hellwig 
> > 
> > This seems a bit disruptive? I feel like this is going to make fuzzers
> > really noisy (e.g. trinity likes to splice random stuff out of /sys and
> > /proc).
> 
> Noisy in the sence of triggering the pr_debug or because they can't
> handle -EINVAL?

Well, maybe both? I doubt much _expects_ to be using splice, so I'm fine
with that, but it seems weird not to have a fall-back, especially if
something would like to splice a file out of there. But, I'm not opposed
to the change, it just seems like it might cause pain down the road.

-- 
Kees Cook


Re: [PATCH 08/11] x86: make TASK_SIZE_MAX usable from assembly code

2020-08-18 Thread Christoph Hellwig
On Tue, Aug 18, 2020 at 12:44:49PM -0700, Kees Cook wrote:
> On Mon, Aug 17, 2020 at 09:32:09AM +0200, Christoph Hellwig wrote:
> > For 64-bit the only hing missing was a strategic _AC, and for 32-bit we
> 
> typo: thing
> 
> > need to use __PAGE_OFFSET instead of PAGE_OFFSET in the TASK_SIZE
> > definition to escape the explicit unsigned long cast.  This just works
> > because __PAGE_OFFSET is defined using _AC itself and thus never needs
> > the cast anyway.
> 
> Shouldn't this be folded into the prior patch so there's no bisection
> problem?

I didn't see a problem bisecting, do you have something particular in
mind?


Re: [PATCH 03/11] fs: don't allow splice read/write without explicit ops

2020-08-18 Thread Christoph Hellwig
On Tue, Aug 18, 2020 at 12:39:34PM -0700, Kees Cook wrote:
> On Mon, Aug 17, 2020 at 09:32:04AM +0200, Christoph Hellwig wrote:
> > default_file_splice_write is the last piece of generic code that uses
> > set_fs to make the uaccess routines operate on kernel pointers.  It
> > implements a "fallback loop" for splicing from files that do not actually
> > provide a proper splice_read method.  The usual file systems and other
> > high bandwith instances all provide a ->splice_read, so this just removes
> > support for various device drivers and procfs/debugfs files.  If splice
> > support for any of those turns out to be important it can be added back
> > by switching them to the iter ops and using generic_file_splice_read.
> > 
> > Signed-off-by: Christoph Hellwig 
> 
> This seems a bit disruptive? I feel like this is going to make fuzzers
> really noisy (e.g. trinity likes to splice random stuff out of /sys and
> /proc).

Noisy in the sense of triggering the pr_debug or because they can't
handle -EINVAL?


Re: [PATCH v2] powerpc/pseries/svm: Allocate SWIOTLB buffer anywhere in memory

2020-08-18 Thread Thiago Jung Bauermann


Christoph Hellwig  writes:

> On Mon, Aug 17, 2020 at 06:46:58PM -0300, Thiago Jung Bauermann wrote:
>> POWER secure guests (i.e., guests which use the Protection Execution
>> Facility) need to use SWIOTLB to be able to do I/O with the hypervisor, but
>> they don't need the SWIOTLB memory to be in low addresses since the
>> hypervisor doesn't have any addressing limitation.
>> 
>> This solves a SWIOTLB initialization problem we are seeing in secure guests
>> with 128 GB of RAM: they are configured with 4 GB of crashkernel reserved
>> memory, which leaves no space for SWIOTLB in low addresses.
>> 
>> To do this, we use mostly the same code as swiotlb_init(), but allocate the
>> buffer using memblock_alloc() instead of memblock_alloc_low().
>> 
>> We also need to add swiotlb_set_no_iotlb_memory() in order to set the
>> no_iotlb_memory flag if initialization fails.
>
> Do you really need the helper?  As far as I can tell the secure guests
> very much rely on swiotlb for all I/O, so you might as well panic if
> you fail to allocate it.

That is true. Ok, I will do that.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH 09/11] x86: remove address space overrides using set_fs()

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:10AM +0200, Christoph Hellwig wrote:
> Stop providing the possibility to override the address space using
> set_fs() now that there is no need for that any more.  To properly
> handle the TASK_SIZE_MAX checking for 4 vs 5-level page tables on
> x86 a new alternative is introduced, which just like the one in
> entry_64.S has to use the hardcoded virtual address bits to escape
> the fact that TASK_SIZE_MAX isn't actually a constant when 5-level
> page tables are enabled.
> 
> Signed-off-by: Christoph Hellwig 

Awesome. :)

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [PATCH 08/11] x86: make TASK_SIZE_MAX usable from assembly code

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:09AM +0200, Christoph Hellwig wrote:
> For 64-bit the only hing missing was a strategic _AC, and for 32-bit we

typo: thing

> need to use __PAGE_OFFSET instead of PAGE_OFFSET in the TASK_SIZE
> definition to escape the explicit unsigned long cast.  This just works
> because __PAGE_OFFSET is defined using _AC itself and thus never needs
> the cast anyway.

Shouldn't this be folded into the prior patch so there's no bisection
problem?

-Kees

> 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/x86/include/asm/page_32_types.h | 4 ++--
>  arch/x86/include/asm/page_64_types.h | 2 +-
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/page_32_types.h 
> b/arch/x86/include/asm/page_32_types.h
> index 26236925fb2c36..f462895a33e452 100644
> --- a/arch/x86/include/asm/page_32_types.h
> +++ b/arch/x86/include/asm/page_32_types.h
> @@ -44,8 +44,8 @@
>  /*
>   * User space process size: 3GB (default).
>   */
> -#define IA32_PAGE_OFFSET PAGE_OFFSET
> -#define TASK_SIZEPAGE_OFFSET
> +#define IA32_PAGE_OFFSET __PAGE_OFFSET
> +#define TASK_SIZE__PAGE_OFFSET
>  #define TASK_SIZE_LOWTASK_SIZE
>  #define TASK_SIZE_MAXTASK_SIZE
>  #define DEFAULT_MAP_WINDOW   TASK_SIZE
> diff --git a/arch/x86/include/asm/page_64_types.h 
> b/arch/x86/include/asm/page_64_types.h
> index 996595c9897e0a..838515daf87b36 100644
> --- a/arch/x86/include/asm/page_64_types.h
> +++ b/arch/x86/include/asm/page_64_types.h
> @@ -76,7 +76,7 @@
>   *
>   * With page table isolation enabled, we map the LDT in ... [stay tuned]
>   */
> -#define TASK_SIZE_MAX((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
> +#define TASK_SIZE_MAX((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 
> PAGE_SIZE)
>  
>  #define DEFAULT_MAP_WINDOW   ((1UL << 47) - PAGE_SIZE)
>  
> -- 
> 2.28.0
> 

-- 
Kees Cook


Re: [PATCH 05/11] test_bitmap: skip user bitmap tests for !CONFIG_SET_FS

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:06AM +0200, Christoph Hellwig wrote:
> We can't run the tests for userspace bitmap parsing if set_fs() doesn't
> exist.
> 
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [PATCH 04/11] uaccess: add infrastructure for kernel builds with set_fs()

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:05AM +0200, Christoph Hellwig wrote:
> Add a CONFIG_SET_FS option that is selected by architecturess that
> implement set_fs, which is all of them initially.  If the option is not
> set stubs for routines related to overriding the address space are
> provided so that architectures can start to opt out of providing set_fs.
> 
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [PATCH 03/11] fs: don't allow splice read/write without explicit ops

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:04AM +0200, Christoph Hellwig wrote:
> default_file_splice_write is the last piece of generic code that uses
> set_fs to make the uaccess routines operate on kernel pointers.  It
> implements a "fallback loop" for splicing from files that do not actually
> provide a proper splice_read method.  The usual file systems and other
> high bandwith instances all provide a ->splice_read, so this just removes
> support for various device drivers and procfs/debugfs files.  If splice
> support for any of those turns out to be important it can be added back
> by switching them to the iter ops and using generic_file_splice_read.
> 
> Signed-off-by: Christoph Hellwig 

This seems a bit disruptive? I feel like this is going to make fuzzers
really noisy (e.g. trinity likes to splice random stuff out of /sys and
/proc).

Conceptually, though:

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [PATCH] powerpc/pseries: Do not initiate shutdown when system is running on UPS

2020-08-18 Thread Tyrel Datwyler
On 8/18/20 3:54 AM, Vasant Hegde wrote:
> As per PAPR specification whenever system is running on UPS we have to
> wait for predefined time (default 10mins) before initiating shutdown.

The wording in PAPR seems a little unclear. It states for an
EPOW_SYSTEM_SHUTDOWN action code that an EPOW error should be logged followed by
scheduling a shutdown to begin after an OS defined delay interval (with 10
minutes the suggested default).

However, the modifier code descriptions seems to imply that a normal shutdown is
the only one that should happen with no additional delay.

For EPOW sensor value = 3 (EPOW_SYSTEM_SHUTDOWN)
0x01 = Normal system shutdown with no additional delay
0x02 = Loss of utility power, system is running on UPS/Battery
0x03 = Loss of system critical functions, system should be shutdown
0x04 = Ambient temperature too high

For 0x03-0x04 we also do an orderly_poweroff().

Not sure if it really matters, but I was curious and this is just what I gleaned
from glancing at PAPR.

-Tyrel

> 
> We have user space tool (rtas_errd) to monitor for EPOW events and
> initiate shutdown after predefined time. Hence do not initiate shutdown
> whenever we get EPOW_SHUTDOWN_ON_UPS event.
> 
> Fixes: 79872e35 (powerpc/pseries: All events of EPOW_SYSTEM_SHUTDOWN must 
> initiate shutdown)
> Cc: sta...@vger.kernel.org # v4.0+
> Cc: Michael Ellerman 
> Signed-off-by: Vasant Hegde 
> ---
>  arch/powerpc/platforms/pseries/ras.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/ras.c 
> b/arch/powerpc/platforms/pseries/ras.c
> index f3736fcd98fc..13c86a292c6d 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -184,7 +184,6 @@ static void handle_system_shutdown(char event_modifier)
>   case EPOW_SHUTDOWN_ON_UPS:
>   pr_emerg("Loss of system power detected. System is running on"
>" UPS/battery. Check RTAS error log for details\n");
> - orderly_poweroff(true);
>   break;
> 
>   case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
> 



Re: [PATCH 02/11] fs: don't allow kernel reads and writes without iter ops

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:03AM +0200, Christoph Hellwig wrote:
> Don't allow calling ->read or ->write with set_fs as a preparation for
> killing off set_fs.  All the instances that we use kernel_read/write on
> are using the iter ops already.
> 
> If a file has both the regular ->read/->write methods and the iter
> variants those could have different semantics for messed up enough
> drivers.  Also fails the kernel access to them in that case.
> 
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [PATCH 01/11] mem: remove duplicate ops for /dev/zero and /dev/null

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:02AM +0200, Christoph Hellwig wrote:
> There is no good reason to implement both the traditional ->read and
> ->write as well as the iter based ops.  So implement just the iter
> based ones.
> 
> Suggested-by: Al Viro 
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: [PATCH 06/11] lkdtm: disable set_fs-based tests for !CONFIG_SET_FS

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:07AM +0200, Christoph Hellwig wrote:
> Once we can't manipulate the address limit, we also can't test what
> happens when the manipulation is abused.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/misc/lkdtm/bugs.c | 2 ++
>  drivers/misc/lkdtm/core.c | 4 
>  drivers/misc/lkdtm/usercopy.c | 2 ++
>  3 files changed, 8 insertions(+)
> 
> diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
> index 4dfbfd51bdf774..66f1800b1cb82d 100644
> --- a/drivers/misc/lkdtm/bugs.c
> +++ b/drivers/misc/lkdtm/bugs.c
> @@ -312,6 +312,7 @@ void lkdtm_CORRUPT_LIST_DEL(void)
>   pr_err("list_del() corruption not detected!\n");
>  }
>  
> +#ifdef CONFIG_SET_FS
>  /* Test if unbalanced set_fs(KERNEL_DS)/set_fs(USER_DS) check exists. */
>  void lkdtm_CORRUPT_USER_DS(void)
>  {
> @@ -321,6 +322,7 @@ void lkdtm_CORRUPT_USER_DS(void)
>   /* Make sure we do not keep running with a KERNEL_DS! */
>   force_sig(SIGKILL);
>  }
> +#endif

Please let the test defined, but it should XFAIL with a message about
the CONFIG (see similar ifdefs in lkdtm).

>  /* Test that VMAP_STACK is actually allocating with a leading guard page */
>  void lkdtm_STACK_GUARD_PAGE_LEADING(void)
> diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
> index a5e344df916632..aae08b33a7ee2a 100644
> --- a/drivers/misc/lkdtm/core.c
> +++ b/drivers/misc/lkdtm/core.c
> @@ -112,7 +112,9 @@ static const struct crashtype crashtypes[] = {
>   CRASHTYPE(CORRUPT_STACK_STRONG),
>   CRASHTYPE(CORRUPT_LIST_ADD),
>   CRASHTYPE(CORRUPT_LIST_DEL),
> +#ifdef CONFIG_SET_FS
>   CRASHTYPE(CORRUPT_USER_DS),
> +#endif
>   CRASHTYPE(STACK_GUARD_PAGE_LEADING),
>   CRASHTYPE(STACK_GUARD_PAGE_TRAILING),
>   CRASHTYPE(UNSET_SMEP),
> @@ -172,7 +174,9 @@ static const struct crashtype crashtypes[] = {
>   CRASHTYPE(USERCOPY_STACK_FRAME_FROM),
>   CRASHTYPE(USERCOPY_STACK_BEYOND),
>   CRASHTYPE(USERCOPY_KERNEL),
> +#ifdef CONFIG_SET_FS
>   CRASHTYPE(USERCOPY_KERNEL_DS),
> +#endif
>   CRASHTYPE(STACKLEAK_ERASING),
>   CRASHTYPE(CFI_FORWARD_PROTO),

Then none of these are needed.

>  #ifdef CONFIG_X86_32

Hmpf, this ifdef was missed in ae56942c1474 ("lkdtm: Make arch-specific
tests always available"). I will fix that.

> diff --git a/drivers/misc/lkdtm/usercopy.c b/drivers/misc/lkdtm/usercopy.c
> index b833367a45d053..4b632fe79ab6bb 100644
> --- a/drivers/misc/lkdtm/usercopy.c
> +++ b/drivers/misc/lkdtm/usercopy.c
> @@ -325,6 +325,7 @@ void lkdtm_USERCOPY_KERNEL(void)
>   vm_munmap(user_addr, PAGE_SIZE);
>  }
>  
> +#ifdef CONFIG_SET_FS
>  void lkdtm_USERCOPY_KERNEL_DS(void)
>  {
>   char __user *user_ptr =
> @@ -339,6 +340,7 @@ void lkdtm_USERCOPY_KERNEL_DS(void)
>   pr_err("copy_to_user() to noncanonical address succeeded!?\n");
>   set_fs(old_fs);
>  }
> +#endif

(Same here, please.)

>  
>  void __init lkdtm_usercopy_init(void)
>  {
> -- 
> 2.28.0
> 

-- 
Kees Cook


Re: [PATCH 07/11] x86: move PAGE_OFFSET, TASK_SIZE & friends to page_{32,64}_types.h

2020-08-18 Thread Kees Cook
On Mon, Aug 17, 2020 at 09:32:08AM +0200, Christoph Hellwig wrote:
> At least for 64-bit this moves them closer to some of the defines
> they are based on, and it prepares for using the TASK_SIZE_MAX
> definition from assembly.
> 
> Signed-off-by: Christoph Hellwig 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc

2020-08-18 Thread Christophe Leroy




Le 18/08/2020 à 20:05, Christoph Hellwig a écrit :

On Tue, Aug 18, 2020 at 07:46:22PM +0200, Christophe Leroy wrote:

I gave it a go on my powerpc mpc832x. I tested it on top of my newest
series that reworks the 32 bits signal handlers (see
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=196278) with
the microbenchmark test used is that series.

With KUAP activated, on top of signal32 rework, performance is boosted as
system time for the microbenchmark goes from 1.73s down to 1.56s, that is
10% quicker

Surprisingly, with the kernel as is today without my signal's series, your
series degrades performance slightly (from 2.55s to 2.64s ie 3.5% slower).


I also observe, in both cases, a degradation on

dd if=/dev/zero of=/dev/null count=1M

Without your series, it runs in 5.29 seconds.
With your series, it runs in 5.82 seconds, that is 10% more time.


That's pretty strage, I wonder if some kernel text cache line
effects come into play here?

The kernel access side is only used in slow path code, so it should
not make a difference, and the uaccess code is simplified and should be
(marginally) faster.

Btw, was this with the __{get,put}_user_allowed cockup that you noticed
fixed?



Yes it is with the __get_user_size() replaced by __get_user_size_allowed().

Christophe


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc

2020-08-18 Thread Christoph Hellwig
On Tue, Aug 18, 2020 at 07:46:22PM +0200, Christophe Leroy wrote:
> I gave it a go on my powerpc mpc832x. I tested it on top of my newest 
> series that reworks the 32 bits signal handlers (see 
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=196278) with 
> the microbenchmark test used is that series.
>
> With KUAP activated, on top of signal32 rework, performance is boosted as 
> system time for the microbenchmark goes from 1.73s down to 1.56s, that is 
> 10% quicker
>
> Surprisingly, with the kernel as is today without my signal's series, your 
> series degrades performance slightly (from 2.55s to 2.64s ie 3.5% slower).
>
>
> I also observe, in both cases, a degradation on
>
>   dd if=/dev/zero of=/dev/null count=1M
>
> Without your series, it runs in 5.29 seconds.
> With your series, it runs in 5.82 seconds, that is 10% more time.

That's pretty strange, I wonder if some kernel text cache line
effects come into play here?

The kernel access side is only used in slow path code, so it should
not make a difference, and the uaccess code is simplified and should be
(marginally) faster.

Btw, was this with the __{get,put}_user_allowed cockup that you noticed
fixed?


Re: remove the last set_fs() in common code, and remove it for x86 and powerpc

2020-08-18 Thread Christophe Leroy




Le 17/08/2020 à 09:32, Christoph Hellwig a écrit :

Hi all,

this series removes the last set_fs() used to force a kernel address
space for the uaccess code in the kernel read/write/splice code, and then
stops implementing the address space overrides entirely for x86 and
powerpc.

The file system part has been posted a few times, and the read/write side
has been pretty much unchanced.  For splice this series drops the
conversion of the seq_file and sysctl code to the iter ops, and thus loses
the splice support for them.  The reasons for that is that it caused a lot
of churn for not much use - splice for these small files really isn't much
of a win, even if existing userspace uses it.  All callers I found do the
proper fallback, but if this turns out to be an issue the conversion can
be resurrected.


I like this series.

I gave it a go on my powerpc mpc832x. I tested it on top of my newest 
series that reworks the 32 bits signal handlers (see 
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=196278) 
with the microbenchmark test used is that series.


With KUAP activated, on top of signal32 rework, performance is boosted 
as system time for the microbenchmark goes from 1.73s down to 1.56s, 
that is 10% quicker


Surprisingly, with the kernel as is today without my signal's series, 
your series degrades performance slightly (from 2.55s to 2.64s ie 3.5% 
slower).



I also observe, in both cases, a degradation on

dd if=/dev/zero of=/dev/null count=1M

Without your series, it runs in 5.29 seconds.
With your series, it runs in 5.82 seconds, that is 10% more time.

Christophe




Besides x86 and powerpc I plan to eventually convert all other
architectures, although this will be a slow process, starting with the
easier ones once the infrastructure is merged.  The process to convert
architectures is roughtly:

  - ensure there is no set_fs(KERNEL_DS) left in arch specific code
  - implement __get_kernel_nofault and __put_kernel_nofault
  - remove the arch specific address limitation functionality

Diffstat:
  arch/Kconfig   |3
  arch/alpha/Kconfig |1
  arch/arc/Kconfig   |1
  arch/arm/Kconfig   |1
  arch/arm64/Kconfig |1
  arch/c6x/Kconfig   |1
  arch/csky/Kconfig  |1
  arch/h8300/Kconfig |1
  arch/hexagon/Kconfig   |1
  arch/ia64/Kconfig  |1
  arch/m68k/Kconfig  |1
  arch/microblaze/Kconfig|1
  arch/mips/Kconfig  |1
  arch/nds32/Kconfig |1
  arch/nios2/Kconfig |1
  arch/openrisc/Kconfig  |1
  arch/parisc/Kconfig|1
  arch/powerpc/include/asm/processor.h   |7 -
  arch/powerpc/include/asm/thread_info.h |5 -
  arch/powerpc/include/asm/uaccess.h |   78 ---
  arch/powerpc/kernel/signal.c   |3
  arch/powerpc/lib/sstep.c   |6 -
  arch/riscv/Kconfig |1
  arch/s390/Kconfig  |1
  arch/sh/Kconfig|1
  arch/sparc/Kconfig |1
  arch/um/Kconfig|1
  arch/x86/ia32/ia32_aout.c  |1
  arch/x86/include/asm/page_32_types.h   |   11 ++
  arch/x86/include/asm/page_64_types.h   |   38 +
  arch/x86/include/asm/processor.h   |   60 ---
  arch/x86/include/asm/thread_info.h |2
  arch/x86/include/asm/uaccess.h |   26 --
  arch/x86/kernel/asm-offsets.c  |3
  arch/x86/lib/getuser.S |   28 ---
  arch/x86/lib/putuser.S |   21 +++--
  arch/xtensa/Kconfig|1
  drivers/char/mem.c |   16 
  drivers/misc/lkdtm/bugs.c  |2
  drivers/misc/lkdtm/core.c  |4 +
  drivers/misc/lkdtm/usercopy.c  |2
  fs/read_write.c|   69 ++---
  fs/splice.c|  130 
+++--
  include/linux/fs.h |2
  include/linux/uaccess.h|   18 
  lib/test_bitmap.c  |   10 ++
  46 files changed, 235 insertions(+), 332 deletions(-)



[PATCH v2 25/25] powerpc/signal32: Transform save_user_regs() and save_tm_user_regs() in 'unsafe' version

2020-08-18 Thread Christophe Leroy
Change those two functions to be used within a user access block.

For that, change save_general_regs() to an unsafe_save_general_regs(),
then replace all user accesses by unsafe_ versions.

This series leads to a reduction from 2.55s to 1.73s of
the system CPU time with the following microbench app
on an mpc832x with KUAP (approx 32%)

Without KUAP, the difference is in the noise.

void sigusr1(int sig) { }

int main(int argc, char **argv)
{
int i = 10;

signal(SIGUSR1, sigusr1);
for (;i--;)
raise(SIGUSR1);
exit(0);
}

An additional 0.10s reduction is achieved by removing
CONFIG_PPC_FPU, as the mpc832x has no FPU.

A bit less spectacular on an 8xx as KUAP is less heavy, prior to
the series (with KUAP) it ran in 8.10 ms. Once applies the removal
of FPU regs handling, we get 7.05s. With the full series, we get 6.9s.
If artificially re-activating FPU regs handling with the full series,
we get 7.6s.

So for the 8xx, the removal of the FPU regs copy is what makes the
difference, but the rework of handle_signal also has a benefit.

Same as above, without KUAP the difference is in the noise.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 224 
 1 file changed, 111 insertions(+), 113 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 86539a4e0514..f795fe0240a1 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -93,8 +93,8 @@ static inline int get_sigset_t(sigset_t *set,
 #define to_user_ptr(p) ptr_to_compat(p)
 #define from_user_ptr(p)   compat_ptr(p)
 
-static inline int save_general_regs(struct pt_regs *regs,
-   struct mcontext __user *frame)
+static __always_inline int
+save_general_regs_unsafe(struct pt_regs *regs, struct mcontext __user *frame)
 {
elf_greg_t64 *gregs = (elf_greg_t64 *)regs;
int val, i;
@@ -108,10 +108,12 @@ static inline int save_general_regs(struct pt_regs *regs,
else
val = gregs[i];
 
-   if (__put_user(val, >mc_gregs[i]))
-   return -EFAULT;
+   unsafe_put_user(val, >mc_gregs[i], failed);
}
return 0;
+
+failed:
+   return 1;
 }
 
 static inline int restore_general_regs(struct pt_regs *regs,
@@ -148,11 +150,15 @@ static inline int get_sigset_t(sigset_t *set, const 
sigset_t __user *uset)
 #define to_user_ptr(p) ((unsigned long)(p))
 #define from_user_ptr(p)   ((void __user *)(p))
 
-static inline int save_general_regs(struct pt_regs *regs,
-   struct mcontext __user *frame)
+static __always_inline int
+save_general_regs_unsafe(struct pt_regs *regs, struct mcontext __user *frame)
 {
WARN_ON(!FULL_REGS(regs));
-   return __copy_to_user(>mc_gregs, regs, GP_REGS_SIZE);
+   unsafe_copy_to_user(>mc_gregs, regs, GP_REGS_SIZE, failed);
+   return 0;
+
+failed:
+   return 1;
 }
 
 static inline int restore_general_regs(struct pt_regs *regs,
@@ -170,6 +176,11 @@ static inline int restore_general_regs(struct pt_regs 
*regs,
 }
 #endif
 
+#define unsafe_save_general_regs(regs, frame, label) do {  \
+   if (save_general_regs_unsafe(regs, frame))  \
+   goto label; \
+} while (0)
+
 /*
  * When we have signals to deliver, we set up on the
  * user stack, going down from the original stack pointer:
@@ -249,21 +260,19 @@ static void prepare_save_user_regs(int ctx_has_vsx_region)
 #endif
 }
 
-static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
- struct mcontext __user *tm_frame, int 
ctx_has_vsx_region)
+static int save_user_regs_unsafe(struct pt_regs *regs, struct mcontext __user 
*frame,
+struct mcontext __user *tm_frame, int 
ctx_has_vsx_region)
 {
unsigned long msr = regs->msr;
 
/* save general registers */
-   if (save_general_regs(regs, frame))
-   return 1;
+   unsafe_save_general_regs(regs, frame, failed);
 
 #ifdef CONFIG_ALTIVEC
/* save altivec registers */
if (current->thread.used_vr) {
-   if (__copy_to_user(>mc_vregs, >thread.vr_state,
-  ELF_NVRREG * sizeof(vector128)))
-   return 1;
+   unsafe_copy_to_user(>mc_vregs, >thread.vr_state,
+   ELF_NVRREG * sizeof(vector128), failed);
/* set MSR_VEC in the saved MSR value to indicate that
   frame->mc_vregs contains valid data */
msr |= MSR_VEC;
@@ -276,11 +285,10 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 * most significant bits of that same vector. --BenH
 * Note that the current VRSAVE value is 

[PATCH v2 24/25] powerpc/signal32: Isolate non-copy actions in save_user_regs() and save_tm_user_regs()

2020-08-18 Thread Christophe Leroy
Reorder actions in save_user_regs() and save_tm_user_regs() to
regroup copies together in order to switch to user_access_begin()
logic in a later patch.

Move non-copy actions into new functions called
prepare_save_user_regs() and prepare_save_tm_user_regs().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 54 +
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 5b8a4ede142c..86539a4e0514 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -229,14 +229,31 @@ struct rt_sigframe {
  * We only save the altivec/spe registers if the process has used
  * altivec/spe instructions at some point.
  */
+static void prepare_save_user_regs(int ctx_has_vsx_region)
+{
+   /* Make sure floating point registers are stored in regs */
+   flush_fp_to_thread(current);
+#ifdef CONFIG_ALTIVEC
+   if (current->thread.used_vr)
+   flush_altivec_to_thread(current);
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   current->thread.vrsave = mfspr(SPRN_VRSAVE);
+#endif
+#ifdef CONFIG_VSX
+   if (current->thread.used_vsr && ctx_has_vsx_region)
+   flush_vsx_to_thread(current);
+#endif
+#ifdef CONFIG_SPE
+   if (current->thread.used_spe)
+   flush_spe_to_thread(current);
+#endif
+}
+
 static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
  struct mcontext __user *tm_frame, int 
ctx_has_vsx_region)
 {
unsigned long msr = regs->msr;
 
-   /* Make sure floating point registers are stored in regs */
-   flush_fp_to_thread(current);
-
/* save general registers */
if (save_general_regs(regs, frame))
return 1;
@@ -244,7 +261,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 #ifdef CONFIG_ALTIVEC
/* save altivec registers */
if (current->thread.used_vr) {
-   flush_altivec_to_thread(current);
if (__copy_to_user(>mc_vregs, >thread.vr_state,
   ELF_NVRREG * sizeof(vector128)))
return 1;
@@ -260,8 +276,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 * most significant bits of that same vector. --BenH
 * Note that the current VRSAVE value is in the SPR at this point.
 */
-   if (cpu_has_feature(CPU_FTR_ALTIVEC))
-   current->thread.vrsave = mfspr(SPRN_VRSAVE);
if (__put_user(current->thread.vrsave, (u32 __user 
*)>mc_vregs[32]))
return 1;
 #endif /* CONFIG_ALTIVEC */
@@ -281,7 +295,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 * contains valid data
 */
if (current->thread.used_vsr && ctx_has_vsx_region) {
-   flush_vsx_to_thread(current);
if (copy_vsx_to_user(>mc_vsregs, current))
return 1;
msr |= MSR_VSX;
@@ -290,7 +303,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
 #ifdef CONFIG_SPE
/* save spe registers */
if (current->thread.used_spe) {
-   flush_spe_to_thread(current);
if (__copy_to_user(>mc_vregs, current->thread.evr,
   ELF_NEVRREG * sizeof(u32)))
return 1;
@@ -326,11 +338,23 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
  *
  * See save_user_regs() and signal_64.c:setup_tm_sigcontexts().
  */
-static int save_tm_user_regs(struct pt_regs *regs, struct mcontext __user 
*frame,
-struct mcontext __user *tm_frame, unsigned long 
msr)
+static void prepare_save_tm_user_regs(void)
 {
WARN_ON(tm_suspend_disabled);
 
+#ifdef CONFIG_ALTIVEC
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   current->thread.ckvrsave = mfspr(SPRN_VRSAVE);
+#endif
+#ifdef CONFIG_SPE
+   if (current->thread.used_spe)
+   flush_spe_to_thread(current);
+#endif
+}
+
+static int save_tm_user_regs(struct pt_regs *regs, struct mcontext __user 
*frame,
+struct mcontext __user *tm_frame, unsigned long 
msr)
+{
/* Save both sets of general registers */
if (save_general_regs(>thread.ckpt_regs, frame)
|| save_general_regs(regs, tm_frame))
@@ -374,8 +398,6 @@ static int save_tm_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame
 * significant bits of a vector, we "cheat" and stuff VRSAVE in the
 * most significant bits of that same vector. --BenH
 */
-   if (cpu_has_feature(CPU_FTR_ALTIVEC))
-   current->thread.ckvrsave = mfspr(SPRN_VRSAVE);
if (__put_user(current->thread.ckvrsave,
   (u32 __user *)>mc_vregs[32]))
  

[PATCH v2 23/25] powerpc/signal: Create 'unsafe' versions of copy_[ck][fpr/vsx]_to_user()

2020-08-18 Thread Christophe Leroy
For the non-VSX version, that's trivial. Just use unsafe_copy_to_user()
instead of __copy_to_user().

For the VSX version, remove the intermediate step through a buffer and
use unsafe_put_user() directly. This generates far smaller code which
is acceptable to inline, see below:

Standard VSX version:

 <.copy_fpr_to_user>:
   0:   7c 08 02 a6 mflrr0
   4:   fb e1 ff f8 std r31,-8(r1)
   8:   39 00 00 20 li  r8,32
   c:   39 24 0b 80 addir9,r4,2944
  10:   7d 09 03 a6 mtctr   r8
  14:   f8 01 00 10 std r0,16(r1)
  18:   f8 21 fe 71 stdur1,-400(r1)
  1c:   39 41 00 68 addir10,r1,104
  20:   e9 09 00 00 ld  r8,0(r9)
  24:   39 4a 00 08 addir10,r10,8
  28:   39 29 00 10 addir9,r9,16
  2c:   f9 0a 00 00 std r8,0(r10)
  30:   42 00 ff f0 bdnz20 <.copy_fpr_to_user+0x20>
  34:   e9 24 0d 80 ld  r9,3456(r4)
  38:   3d 42 00 00 addis   r10,r2,0
3a: R_PPC64_TOC16_HA.toc
  3c:   eb ea 00 00 ld  r31,0(r10)
3e: R_PPC64_TOC16_LO_DS .toc
  40:   f9 21 01 70 std r9,368(r1)
  44:   e9 3f 00 00 ld  r9,0(r31)
  48:   81 29 00 20 lwz r9,32(r9)
  4c:   2f 89 00 00 cmpwi   cr7,r9,0
  50:   40 9c 00 18 bge cr7,68 <.copy_fpr_to_user+0x68>
  54:   4c 00 01 2c isync
  58:   3d 20 40 00 lis r9,16384
  5c:   79 29 07 c6 rldicr  r9,r9,32,31
  60:   7d 3d 03 a6 mtspr   29,r9
  64:   4c 00 01 2c isync
  68:   38 a0 01 08 li  r5,264
  6c:   38 81 00 70 addir4,r1,112
  70:   48 00 00 01 bl  70 <.copy_fpr_to_user+0x70>
70: R_PPC64_REL24   .__copy_tofrom_user
  74:   60 00 00 00 nop
  78:   e9 3f 00 00 ld  r9,0(r31)
  7c:   81 29 00 20 lwz r9,32(r9)
  80:   2f 89 00 00 cmpwi   cr7,r9,0
  84:   40 9c 00 18 bge cr7,9c <.copy_fpr_to_user+0x9c>
  88:   4c 00 01 2c isync
  8c:   39 20 ff ff li  r9,-1
  90:   79 29 00 44 rldicr  r9,r9,0,1
  94:   7d 3d 03 a6 mtspr   29,r9
  98:   4c 00 01 2c isync
  9c:   38 21 01 90 addir1,r1,400
  a0:   e8 01 00 10 ld  r0,16(r1)
  a4:   eb e1 ff f8 ld  r31,-8(r1)
  a8:   7c 08 03 a6 mtlrr0
  ac:   4e 80 00 20 blr

'unsafe' simulated VSX version (The ... are only nops) using
unsafe_copy_fpr_to_user() macro:

unsigned long copy_fpr_to_user(void __user *to,
   struct task_struct *task)
{
unsafe_copy_fpr_to_user(to, task, failed);
return 0;
failed:
return 1;
}

 <.copy_fpr_to_user>:
   0:   39 00 00 20 li  r8,32
   4:   39 44 0b 80 addir10,r4,2944
   8:   7d 09 03 a6 mtctr   r8
   c:   7c 69 1b 78 mr  r9,r3
...
  20:   e9 0a 00 00 ld  r8,0(r10)
  24:   f9 09 00 00 std r8,0(r9)
  28:   39 4a 00 10 addir10,r10,16
  2c:   39 29 00 08 addir9,r9,8
  30:   42 00 ff f0 bdnz20 <.copy_fpr_to_user+0x20>
  34:   e9 24 0d 80 ld  r9,3456(r4)
  38:   f9 23 01 00 std r9,256(r3)
  3c:   38 60 00 00 li  r3,0
  40:   4e 80 00 20 blr
...
  50:   38 60 00 01 li  r3,1
  54:   4e 80 00 20 blr

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.h | 53 
 1 file changed, 53 insertions(+)

diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index f610cfafa478..2559a681536e 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -32,7 +32,54 @@ unsigned long copy_fpr_to_user(void __user *to, struct 
task_struct *task);
 unsigned long copy_ckfpr_to_user(void __user *to, struct task_struct *task);
 unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from);
 unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user 
*from);
+
+#define unsafe_copy_fpr_to_user(to, task, label)   do {\
+   struct task_struct *__t = task; \
+   u64 __user *buf = (u64 __user *)to; \
+   int i;  \
+   \
+   for (i = 0; i < ELF_NFPREG - 1 ; i++)   \
+   unsafe_put_user(__t->thread.TS_FPR(i), [i], label); \
+   unsafe_put_user(__t->thread.fp_state.fpscr, [i], label);\
+} while (0)
+
+#define unsafe_copy_vsx_to_user(to, task, label)   do {\
+   struct task_struct *__t = task; \
+   u64 __user *buf = (u64 __user *)to; \
+   int i;  \
+   \
+   for (i = 0; i < ELF_NVSRHALFREG ; i++)  \
+ 

[PATCH v2 22/25] powerpc/signal32: Switch swap_context() to user_access_begin() logic

2020-08-18 Thread Christophe Leroy
As this was the last user of put_sigset_t(), remove it as well.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 24 ++--
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 3f9f315dd036..5b8a4ede142c 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -82,11 +82,6 @@
  * Functions for flipping sigsets (thanks to brain dead generic
  * implementation that makes things simple for little endian only)
  */
-static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set)
-{
-   return put_compat_sigset(uset, set, sizeof(*uset));
-}
-
 #define unsafe_put_sigset_tunsafe_put_compat_sigset
 
 static inline int get_sigset_t(sigset_t *set,
@@ -138,11 +133,6 @@ static inline int restore_general_regs(struct pt_regs 
*regs,
 
 #define GP_REGS_SIZE   min(sizeof(elf_gregset_t), sizeof(struct pt_regs))
 
-static inline int put_sigset_t(sigset_t __user *uset, sigset_t *set)
-{
-   return copy_to_user(uset, set, sizeof(*uset));
-}
-
 #define unsafe_put_sigset_t(uset, set, label) do { \
sigset_t __user *__us = uset;   \
const sigset_t *__s = set;  \
@@ -1048,11 +1038,13 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, 
old_ctx,
 */
mctx = (struct mcontext __user *)
((unsigned long) _ctx->uc_mcontext & ~0xfUL);
-   if (!access_ok(old_ctx, ctx_size)
-   || save_user_regs(regs, mctx, NULL, ctx_has_vsx_region)
-   || put_sigset_t(_ctx->uc_sigmask, >blocked)
-   || __put_user(to_user_ptr(mctx), _ctx->uc_regs))
+   if (save_user_regs(regs, mctx, NULL, ctx_has_vsx_region))
+   return -EFAULT;
+   if (!user_write_access_begin(old_ctx, ctx_size))
return -EFAULT;
+   unsafe_put_sigset_t(_ctx->uc_sigmask, >blocked, 
failed);
+   unsafe_put_user(to_user_ptr(mctx), _ctx->uc_regs, failed);
+   user_write_access_end();
}
if (new_ctx == NULL)
return 0;
@@ -1076,6 +1068,10 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, 
old_ctx,
 
set_thread_flag(TIF_RESTOREALL);
return 0;
+
+failed:
+   user_write_access_end();
+   return -EFAULT;
 }
 
 #ifdef CONFIG_PPC64
-- 
2.25.0



[PATCH v2 21/25] powerpc/signal32: Add and use unsafe_put_sigset_t()

2020-08-18 Thread Christophe Leroy
put_sigset_t() calls copy_to_user() for copying two words.

This is terribly inefficient for such a small amount of data.

By switching to unsafe_put_user(), we end up with something as
simple as:

 3cc:   81 3d 00 00 lwz r9,0(r29)
 3d0:   91 26 00 b4 stw r9,180(r6)
 3d4:   81 3d 00 04 lwz r9,4(r29)
 3d8:   91 26 00 b8 stw r9,184(r6)

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 310d3b8d9ad5..3f9f315dd036 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -87,6 +87,8 @@ static inline int put_sigset_t(compat_sigset_t __user *uset, 
sigset_t *set)
return put_compat_sigset(uset, set, sizeof(*uset));
 }
 
+#define unsafe_put_sigset_tunsafe_put_compat_sigset
+
 static inline int get_sigset_t(sigset_t *set,
   const compat_sigset_t __user *uset)
 {
@@ -141,6 +143,13 @@ static inline int put_sigset_t(sigset_t __user *uset, 
sigset_t *set)
return copy_to_user(uset, set, sizeof(*uset));
 }
 
+#define unsafe_put_sigset_t(uset, set, label) do { \
+   sigset_t __user *__us = uset;   \
+   const sigset_t *__s = set;  \
+   \
+   unsafe_copy_to_user(__us, __s, sizeof(*__us), label);   \
+} while (0)
+
 static inline int get_sigset_t(sigset_t *set, const sigset_t __user *uset)
 {
return copy_from_user(set, uset, sizeof(*uset));
@@ -780,10 +789,10 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
failed);
unsafe_put_user(PPC_INST_SC, >mc_pad[1], failed);
}
+   unsafe_put_sigset_t(>uc.uc_sigmask, oldset, failed);
+
user_write_access_end();
 
-   if (put_sigset_t(>uc.uc_sigmask, oldset))
-   goto badframe;
if (copy_siginfo_to_user(>info, >info))
goto badframe;
 
-- 
2.25.0



[PATCH v2 19/25] powerpc/signal32: Remove ifdefery in middle of if/else

2020-08-18 Thread Christophe Leroy
MSR_TM_ACTIVE() is always defined and always returns 0 when
CONFIG_PPC_TRANSACTIONAL_MEM is not selected, so the awful
ifdefery in the middle of an if/else can be removed.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 93c2d6304831..310d3b8d9ad5 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -447,6 +447,12 @@ static int save_tm_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame
 
return 0;
 }
+#else
+static int save_tm_user_regs(struct pt_regs *regs, struct mcontext __user 
*frame,
+struct mcontext __user *tm_frame, unsigned long 
msr)
+{
+   return 0;
+}
 #endif
 
 /*
@@ -732,10 +738,8 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
unsigned long newsp = 0;
unsigned long tramp;
struct pt_regs *regs = tsk->thread.regs;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* Save the thread's msr before get_tm_stackpointer() changes it */
unsigned long msr = regs->msr;
-#endif
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
@@ -786,14 +790,10 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
if (tramp == (unsigned long)mctx->mc_pad)
flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long));
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(msr)) {
if (save_tm_user_regs(regs, mctx, tm_mctx, msr))
goto badframe;
-   }
-   else
-#endif
-   {
+   } else {
if (save_user_regs(regs, mctx, tm_mctx, 1))
goto badframe;
}
@@ -842,10 +842,8 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
unsigned long newsp = 0;
unsigned long tramp;
struct pt_regs *regs = tsk->thread.regs;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* Save the thread's msr before get_tm_stackpointer() changes it */
unsigned long msr = regs->msr;
-#endif
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
@@ -883,14 +881,10 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
if (tramp == (unsigned long)mctx->mc_pad)
flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long));
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(msr)) {
if (save_tm_user_regs(regs, mctx, tm_mctx, msr))
goto badframe;
-   }
-   else
-#endif
-   {
+   } else {
if (save_user_regs(regs, mctx, tm_mctx, 1))
goto badframe;
}
-- 
2.25.0



[PATCH v2 20/25] signal: Add unsafe_put_compat_sigset()

2020-08-18 Thread Christophe Leroy
Implement 'unsafe' version of put_compat_sigset()

For big-endian, use unsafe_put_user() directly
to avoid an intermediate copy through the stack.

For little-endian, use a straight unsafe_copy_to_user().

Signed-off-by: Christophe Leroy 
---
 include/linux/compat.h | 32 
 1 file changed, 32 insertions(+)

diff --git a/include/linux/compat.h b/include/linux/compat.h
index d38c4d7e83bd..7ec6e44b093b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -442,6 +442,38 @@ put_compat_sigset(compat_sigset_t __user *compat, const 
sigset_t *set,
 #endif
 }
 
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define unsafe_put_compat_sigset(compat, set, label) do {  \
+   compat_sigset_t __user *__c = compat;   \
+   const sigset_t *__s = set;  \
+   \
+   switch (_NSIG_WORDS) {  \
+   case 4: \
+   unsafe_put_user(__s->sig[3] >> 32, &__c->sig[7], label);
\
+   unsafe_put_user(__s->sig[3], &__c->sig[6], label);  \
+   fallthrough;\
+   case 3: \
+   unsafe_put_user(__s->sig[2] >> 32, &__c->sig[5], label);
\
+   unsafe_put_user(__s->sig[2], &__c->sig[4], label);  \
+   fallthrough;\
+   case 2: \
+   unsafe_put_user(__s->sig[1] >> 32, &__c->sig[3], label);
\
+   unsafe_put_user(__s->sig[1], &__c->sig[2], label);  \
+   fallthrough;\
+   case 1: \
+   unsafe_put_user(__s->sig[0] >> 32, &__c->sig[1], label);
\
+   unsafe_put_user(__s->sig[0], &__c->sig[0], label);  \
+   }   \
+} while (0)
+#else
+#define unsafe_put_compat_sigset(compat, set, label) do {  \
+   compat_sigset_t __user *__c = compat;   \
+   const sigset_t *__s = set;  \
+   \
+   unsafe_copy_to_user(__c, __s, sizeof(*__c), label); \
+} while (0)
+#endif
+
 extern int compat_ptrace_request(struct task_struct *child,
 compat_long_t request,
 compat_ulong_t addr, compat_ulong_t data);
-- 
2.25.0



[PATCH v2 18/25] powerpc/signal32: Switch handle_rt_signal32() to user_access_begin() logic

2020-08-18 Thread Christophe Leroy
In the same way as handle_signal32(), replace all user
accesses with equivalent unsafe_ versions, and move the
trampoline code icache flush outside the user access block.

Functions that have no unsafe_ equivalent also remain outside
the access block.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 55 -
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index fc8ba4b29edf..93c2d6304831 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -58,8 +58,6 @@
 #define mcontext   mcontext32
 #define ucontext   ucontext32
 
-#define __save_altstack __compat_save_altstack
-
 /*
  * Userspace code may pass a ucontext which doesn't include VSX added
  * at the end.  We need to check for this case.
@@ -745,16 +743,28 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
tm_mctx = >uc_transact.uc_mcontext;
 #endif
-   if (!access_ok(frame, sizeof(*frame)))
+   if (!user_write_access_begin(frame, sizeof(*frame)))
goto badframe;
 
/* Put the siginfo & fill in most of the ucontext */
-   if (copy_siginfo_to_user(>info, >info) ||
-   __put_user(0, >uc.uc_flags) ||
-   __save_altstack(>uc.uc_stack, regs->gpr[1]) ||
-   __put_user(to_user_ptr(>uc.uc_mcontext), >uc.uc_regs) 
||
-   put_sigset_t(>uc.uc_sigmask, oldset))
-   goto badframe;
+   unsafe_put_user(0, >uc.uc_flags, failed);
+#ifdef CONFIG_PPC64
+   unsafe_compat_save_altstack(>uc.uc_stack, regs->gpr[1], failed);
+#else
+   unsafe_save_altstack(>uc.uc_stack, regs->gpr[1], failed);
+#endif
+   unsafe_put_user(to_user_ptr(>uc.uc_mcontext), 
>uc.uc_regs, failed);
+
+   if (MSR_TM_ACTIVE(msr)) {
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   unsafe_put_user((unsigned long)>uc_transact,
+   >uc.uc_link, failed);
+   unsafe_put_user((unsigned long)tm_mctx,
+   >uc_transact.uc_regs, failed);
+#endif
+   } else {
+   unsafe_put_user(0, >uc.uc_link, failed);
+   }
 
/* Save user registers on the stack */
if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
@@ -762,28 +772,28 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
} else {
tramp = (unsigned long)mctx->mc_pad;
/* Set up the sigreturn trampoline: li r0,sigret; sc */
-   if (__put_user(PPC_INST_ADDI + __NR_sigreturn, 
>mc_pad[0]))
-   goto badframe;
-   if (__put_user(PPC_INST_SC, >mc_pad[1]))
-   goto badframe;
-   flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long));
+   unsafe_put_user(PPC_INST_ADDI + __NR_rt_sigreturn, 
>mc_pad[0],
+   failed);
+   unsafe_put_user(PPC_INST_SC, >mc_pad[1], failed);
}
+   user_write_access_end();
+
+   if (put_sigset_t(>uc.uc_sigmask, oldset))
+   goto badframe;
+   if (copy_siginfo_to_user(>info, >info))
+   goto badframe;
+
+   if (tramp == (unsigned long)mctx->mc_pad)
+   flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long));
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(msr)) {
-   if (__put_user((unsigned long)>uc_transact,
-  >uc.uc_link) ||
-   __put_user((unsigned long)tm_mctx,
-  >uc_transact.uc_regs))
-   goto badframe;
if (save_tm_user_regs(regs, mctx, tm_mctx, msr))
goto badframe;
}
else
 #endif
{
-   if (__put_user(0, >uc.uc_link))
-   goto badframe;
if (save_user_regs(regs, mctx, tm_mctx, 1))
goto badframe;
}
@@ -810,6 +820,9 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
regs->msr |= (MSR_KERNEL & MSR_LE);
return 0;
 
+failed:
+   user_write_access_end();
+
 badframe:
signal_fault(tsk, regs, "handle_rt_signal32", frame);
 
-- 
2.25.0



[PATCH v2 17/25] powerpc/signal32: Switch handle_signal32() to user_access_begin() logic

2020-08-18 Thread Christophe Leroy
Replace the access_ok() by user_access_begin() and change all user
accesses to unsafe_ version.

Move flush_icache_range() outside the user access block.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 29 -
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index d8c3843102df..fc8ba4b29edf 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -840,35 +840,35 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
tm_mctx = >mctx_transact;
 #endif
-   if (!access_ok(frame, sizeof(*frame)))
+   if (!user_write_access_begin(frame, sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) >sctx;
 
 #if _NSIG != 64
 #error "Please adjust handle_signal()"
 #endif
-   if (__put_user(to_user_ptr(ksig->ka.sa.sa_handler), >handler)
-   || __put_user(oldset->sig[0], >oldmask)
+   unsafe_put_user(to_user_ptr(ksig->ka.sa.sa_handler), >handler, 
failed);
+   unsafe_put_user(oldset->sig[0], >oldmask, failed);
 #ifdef CONFIG_PPC64
-   || __put_user((oldset->sig[0] >> 32), >_unused[3])
+   unsafe_put_user((oldset->sig[0] >> 32), >_unused[3], failed);
 #else
-   || __put_user(oldset->sig[1], >_unused[3])
+   unsafe_put_user(oldset->sig[1], >_unused[3], failed);
 #endif
-   || __put_user(to_user_ptr(mctx), >regs)
-   || __put_user(ksig->sig, >signal))
-   goto badframe;
+   unsafe_put_user(to_user_ptr(mctx), >regs, failed);
+   unsafe_put_user(ksig->sig, >signal, failed);
 
if (vdso32_sigtramp && tsk->mm->context.vdso_base) {
tramp = tsk->mm->context.vdso_base + vdso32_sigtramp;
} else {
tramp = (unsigned long)mctx->mc_pad;
/* Set up the sigreturn trampoline: li r0,sigret; sc */
-   if (__put_user(PPC_INST_ADDI + __NR_sigreturn, 
>mc_pad[0]))
-   goto badframe;
-   if (__put_user(PPC_INST_SC, >mc_pad[1]))
-   goto badframe;
-   flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long));
+   unsafe_put_user(PPC_INST_ADDI + __NR_sigreturn, 
>mc_pad[0], failed);
+   unsafe_put_user(PPC_INST_SC, >mc_pad[1], failed);
}
+   user_write_access_end();
+
+   if (tramp == (unsigned long)mctx->mc_pad)
+   flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long));
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(msr)) {
@@ -901,6 +901,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
regs->msr &= ~MSR_LE;
return 0;
 
+failed:
+   user_write_access_end();
+
 badframe:
signal_fault(tsk, regs, "handle_signal32", frame);
 
-- 
2.25.0



[PATCH v2 16/25] powerpc/signal32: Move signal trampoline setup to handle_[rt_]signal32

2020-08-18 Thread Christophe Leroy
Move signal trampoline setup into handle_signal32()
and handle_rt_signal32().

At the same time, remove the define which hides the mc_pad field
used for trampoline.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 61 -
 1 file changed, 22 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index ab8c8cb98b15..d8c3843102df 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -199,9 +199,6 @@ struct sigframe {
int abigap[56];
 };
 
-/* We use the mc_pad field for the signal return trampoline. */
-#define tramp  mc_pad
-
 /*
  *  When we have rt signals to deliver, we set up on the
  *  user stack, going down from the original stack pointer:
@@ -236,8 +233,7 @@ struct rt_sigframe {
  * altivec/spe instructions at some point.
  */
 static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
- struct mcontext __user *tm_frame, int sigret,
- int ctx_has_vsx_region)
+ struct mcontext __user *tm_frame, int 
ctx_has_vsx_region)
 {
unsigned long msr = regs->msr;
 
@@ -320,15 +316,6 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
if (tm_frame && __put_user(0, _frame->mc_gregs[PT_MSR]))
return 1;
 
-   if (sigret) {
-   /* Set up the sigreturn trampoline: li 0,sigret; sc */
-   if (__put_user(PPC_INST_ADDI + sigret, >tramp[0])
-   || __put_user(PPC_INST_SC, >tramp[1]))
-   return 1;
-   flush_icache_range((unsigned long) >tramp[0],
-  (unsigned long) >tramp[2]);
-   }
-
return 0;
 }
 
@@ -342,10 +329,8 @@ static int save_user_regs(struct pt_regs *regs, struct 
mcontext __user *frame,
  *
  * See save_user_regs() and signal_64.c:setup_tm_sigcontexts().
  */
-static int save_tm_user_regs(struct pt_regs *regs,
-struct mcontext __user *frame,
-struct mcontext __user *tm_frame, int sigret,
-unsigned long msr)
+static int save_tm_user_regs(struct pt_regs *regs, struct mcontext __user 
*frame,
+struct mcontext __user *tm_frame, unsigned long 
msr)
 {
WARN_ON(tm_suspend_disabled);
 
@@ -461,14 +446,6 @@ static int save_tm_user_regs(struct pt_regs *regs,
 
if (__put_user(msr, >mc_gregs[PT_MSR]))
return 1;
-   if (sigret) {
-   /* Set up the sigreturn trampoline: li 0,sigret; sc */
-   if (__put_user(PPC_INST_ADDI + sigret, >tramp[0])
-   || __put_user(PPC_INST_SC, >tramp[1]))
-   return 1;
-   flush_icache_range((unsigned long) >tramp[0],
-  (unsigned long) >tramp[2]);
-   }
 
return 0;
 }
@@ -755,7 +732,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
struct mcontext __user *mctx;
struct mcontext __user *tm_mctx = NULL;
unsigned long newsp = 0;
-   int sigret;
unsigned long tramp;
struct pt_regs *regs = tsk->thread.regs;
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -782,11 +758,15 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Save user registers on the stack */
if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
-   sigret = 0;
tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp;
} else {
-   sigret = __NR_rt_sigreturn;
-   tramp = (unsigned long)mctx->tramp;
+   tramp = (unsigned long)mctx->mc_pad;
+   /* Set up the sigreturn trampoline: li r0,sigret; sc */
+   if (__put_user(PPC_INST_ADDI + __NR_sigreturn, 
>mc_pad[0]))
+   goto badframe;
+   if (__put_user(PPC_INST_SC, >mc_pad[1]))
+   goto badframe;
+   flush_icache_range(tramp, tramp + 2 * sizeof(unsigned long));
}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -796,7 +776,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
__put_user((unsigned long)tm_mctx,
   >uc_transact.uc_regs))
goto badframe;
-   if (save_tm_user_regs(regs, mctx, tm_mctx, sigret, msr))
+   if (save_tm_user_regs(regs, mctx, tm_mctx, msr))
goto badframe;
}
else
@@ -804,7 +784,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
{
if (__put_user(0, >uc.uc_link))
goto badframe;
-   if (save_user_regs(regs, mctx, tm_mctx, sigret, 1))
+   if (save_user_regs(regs, mctx, tm_mctx, 1))
   

[PATCH v2 15/25] powerpc/signal32: Misc changes to make handle_[rt_]_signal32() more similar

2020-08-18 Thread Christophe Leroy
Miscellaneous changes to clean up handle_signal32() and
handle_rt_signal32() and make them even more similar.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index d0fcb3de66aa..ab8c8cb98b15 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -764,8 +764,11 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 #endif
 
/* Set up Signal Frame */
-   /* Put a Real Time Context onto stack */
frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+   mctx = >uc.uc_mcontext;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   tm_mctx = >uc_transact.uc_mcontext;
+#endif
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
 
@@ -778,7 +781,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
goto badframe;
 
/* Save user registers on the stack */
-   mctx = >uc.uc_mcontext;
if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
sigret = 0;
tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp;
@@ -788,7 +790,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   tm_mctx = >uc_transact.uc_mcontext;
if (MSR_TM_ACTIVE(msr)) {
if (__put_user((unsigned long)>uc_transact,
   >uc.uc_link) ||
@@ -843,6 +844,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 {
struct sigcontext __user *sc;
struct sigframe __user *frame;
+   struct mcontext __user *mctx;
struct mcontext __user *tm_mctx = NULL;
unsigned long newsp = 0;
int sigret;
@@ -855,6 +857,10 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+   mctx = >mctx;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   tm_mctx = >mctx_transact;
+#endif
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) >sctx;
@@ -869,7 +875,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 #else
|| __put_user(oldset->sig[1], >_unused[3])
 #endif
-   || __put_user(to_user_ptr(>mctx), >regs)
+   || __put_user(to_user_ptr(mctx), >regs)
|| __put_user(ksig->sig, >signal))
goto badframe;
 
@@ -878,20 +884,18 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
tramp = tsk->mm->context.vdso_base + vdso32_sigtramp;
} else {
sigret = __NR_sigreturn;
-   tramp = (unsigned long) frame->mctx.tramp;
+   tramp = (unsigned long)mctx->tramp;
}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   tm_mctx = >mctx_transact;
if (MSR_TM_ACTIVE(msr)) {
-   if (save_tm_user_regs(regs, >mctx, >mctx_transact,
- sigret, msr))
+   if (save_tm_user_regs(regs, mctx, tm_mctx, sigret, msr))
goto badframe;
}
else
 #endif
{
-   if (save_user_regs(regs, >mctx, tm_mctx, sigret, 1))
+   if (save_user_regs(regs, mctx, tm_mctx, sigret, 1))
goto badframe;
}
 
@@ -909,7 +913,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
regs->gpr[1] = newsp;
regs->gpr[3] = ksig->sig;
regs->gpr[4] = (unsigned long) sc;
-   regs->nip = (unsigned long) (unsigned long)ksig->ka.sa.sa_handler;
+   regs->nip = (unsigned long)ksig->ka.sa.sa_handler;
/* enter the signal handler in big-endian mode */
regs->msr &= ~MSR_LE;
return 0;
-- 
2.25.0



[PATCH v2 14/25] powerpc/signal32: Rename local pointers in handle_rt_signal32()

2020-08-18 Thread Christophe Leroy
Rename pointers in handle_rt_signal32() to make it more similar to
handle_signal32()

tm_frame becomes tm_mctx
frame becomes mctx
rt_sf becomes frame

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 51 -
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 2cc686b9f566..d0fcb3de66aa 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -751,9 +751,9 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
   struct task_struct *tsk)
 {
-   struct rt_sigframe __user *rt_sf;
-   struct mcontext __user *frame;
-   struct mcontext __user *tm_frame = NULL;
+   struct rt_sigframe __user *frame;
+   struct mcontext __user *mctx;
+   struct mcontext __user *tm_mctx = NULL;
unsigned long newsp = 0;
int sigret;
unsigned long tramp;
@@ -765,46 +765,45 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
-   rt_sf = get_sigframe(ksig, tsk, sizeof(*rt_sf), 1);
-   if (!access_ok(rt_sf, sizeof(*rt_sf)))
+   frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+   if (!access_ok(frame, sizeof(*frame)))
goto badframe;
 
/* Put the siginfo & fill in most of the ucontext */
-   if (copy_siginfo_to_user(&rt_sf->info, &ksig->info)
-   || __put_user(0, &rt_sf->uc.uc_flags)
-   || __save_altstack(&rt_sf->uc.uc_stack, regs->gpr[1])
-   || __put_user(to_user_ptr(&rt_sf->uc.uc_mcontext),
-   &rt_sf->uc.uc_regs)
-   || put_sigset_t(&rt_sf->uc.uc_sigmask, oldset))
+   if (copy_siginfo_to_user(&frame->info, &ksig->info) ||
+   __put_user(0, &frame->uc.uc_flags) ||
+   __save_altstack(&frame->uc.uc_stack, regs->gpr[1]) ||
+   __put_user(to_user_ptr(&frame->uc.uc_mcontext), &frame->uc.uc_regs) ||
+   put_sigset_t(&frame->uc.uc_sigmask, oldset))
goto badframe;
 
/* Save user registers on the stack */
-   frame = &rt_sf->uc.uc_mcontext;
+   mctx = &frame->uc.uc_mcontext;
if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
sigret = 0;
tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp;
} else {
sigret = __NR_rt_sigreturn;
-   tramp = (unsigned long) frame->tramp;
+   tramp = (unsigned long)mctx->tramp;
}
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   tm_frame = &rt_sf->uc_transact.uc_mcontext;
+   tm_mctx = &frame->uc_transact.uc_mcontext;
if (MSR_TM_ACTIVE(msr)) {
-   if (__put_user((unsigned long)&rt_sf->uc_transact,
-  &rt_sf->uc.uc_link) ||
-   __put_user((unsigned long)tm_frame,
-  &rt_sf->uc_transact.uc_regs))
+   if (__put_user((unsigned long)&frame->uc_transact,
+  &frame->uc.uc_link) ||
+   __put_user((unsigned long)tm_mctx,
+  &frame->uc_transact.uc_regs))
goto badframe;
-   if (save_tm_user_regs(regs, frame, tm_frame, sigret, msr))
+   if (save_tm_user_regs(regs, mctx, tm_mctx, sigret, msr))
goto badframe;
}
else
 #endif
{
-   if (__put_user(0, &rt_sf->uc.uc_link))
+   if (__put_user(0, &frame->uc.uc_link))
goto badframe;
-   if (save_user_regs(regs, frame, tm_frame, sigret, 1))
+   if (save_user_regs(regs, mctx, tm_mctx, sigret, 1))
goto badframe;
}
regs->link = tramp;
@@ -814,16 +813,16 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 #endif
 
/* create a stack frame for the caller of the handler */
-   newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16);
+   newsp = ((unsigned long)frame) - (__SIGNAL_FRAMESIZE + 16);
if (put_user(regs->gpr[1], (u32 __user *)newsp))
goto badframe;
 
/* Fill registers for signal handler */
regs->gpr[1] = newsp;
regs->gpr[3] = ksig->sig;
-   regs->gpr[4] = (unsigned long) &rt_sf->info;
-   regs->gpr[5] = (unsigned long) &rt_sf->uc;
-   regs->gpr[6] = (unsigned long) rt_sf;
+   regs->gpr[4] = (unsigned long)&frame->info;
+   regs->gpr[5] = (unsigned long)&frame->uc;
+   regs->gpr[6] = (unsigned long)frame;
regs->nip = (unsigned long) ksig->ka.sa.sa_handler;
/* enter the signal handler in native-endian mode */
regs->msr &= ~MSR_LE;
@@ -831,7 +830,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 0;
 
 badframe:
-   signal_fault(tsk, regs, "handle_rt_signal32", rt_sf);
+   signal_fault(tsk, regs, "handle_rt_signal32", frame);
 
 

[PATCH v2 13/25] powerpc/signal32: Move handle_signal32() close to handle_rt_signal32()

2020-08-18 Thread Christophe Leroy
Those two functions are similar and serving the same purpose.
To ease refactorisation, move them close to each other.

This is pure move, no code change, no cosmetic. Yes, checkpatch is
not happy, most will clear later.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 170 
 1 file changed, 85 insertions(+), 85 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 44a46911ff98..2cc686b9f566 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -836,6 +836,91 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 1;
 }
 
+/*
+ * OK, we're invoking a handler
+ */
+int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
+   struct task_struct *tsk)
+{
+   struct sigcontext __user *sc;
+   struct sigframe __user *frame;
+   struct mcontext __user *tm_mctx = NULL;
+   unsigned long newsp = 0;
+   int sigret;
+   unsigned long tramp;
+   struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   /* Save the thread's msr before get_tm_stackpointer() changes it */
+   unsigned long msr = regs->msr;
+#endif
+
+   /* Set up Signal Frame */
+   frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
+   if (!access_ok(frame, sizeof(*frame)))
+   goto badframe;
+   sc = (struct sigcontext __user *) &frame->sctx;
+
+#if _NSIG != 64
+#error "Please adjust handle_signal()"
+#endif
+   if (__put_user(to_user_ptr(ksig->ka.sa.sa_handler), &sc->handler)
+   || __put_user(oldset->sig[0], &sc->oldmask)
+#ifdef CONFIG_PPC64
+   || __put_user((oldset->sig[0] >> 32), &sc->_unused[3])
+#else
+   || __put_user(oldset->sig[1], &sc->_unused[3])
+#endif
+   || __put_user(to_user_ptr(&frame->mctx), &sc->regs)
+   || __put_user(ksig->sig, &sc->signal))
+   goto badframe;
+
+   if (vdso32_sigtramp && tsk->mm->context.vdso_base) {
+   sigret = 0;
+   tramp = tsk->mm->context.vdso_base + vdso32_sigtramp;
+   } else {
+   sigret = __NR_sigreturn;
+   tramp = (unsigned long) frame->mctx.tramp;
+   }
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   tm_mctx = &frame->mctx_transact;
+   if (MSR_TM_ACTIVE(msr)) {
+   if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
+ sigret, msr))
+   goto badframe;
+   }
+   else
+#endif
+   {
+   if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1))
+   goto badframe;
+   }
+
+   regs->link = tramp;
+
+#ifdef CONFIG_PPC_FPU_REGS
+   tsk->thread.fp_state.fpscr = 0; /* turn off all fp exceptions */
+#endif
+
+   /* create a stack frame for the caller of the handler */
+   newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
+   if (put_user(regs->gpr[1], (u32 __user *)newsp))
+   goto badframe;
+
+   regs->gpr[1] = newsp;
+   regs->gpr[3] = ksig->sig;
+   regs->gpr[4] = (unsigned long) sc;
+   regs->nip = (unsigned long) (unsigned long)ksig->ka.sa.sa_handler;
+   /* enter the signal handler in big-endian mode */
+   regs->msr &= ~MSR_LE;
+   return 0;
+
+badframe:
+   signal_fault(tsk, regs, "handle_signal32", frame);
+
+   return 1;
+}
+
 static int do_setcontext(struct ucontext __user *ucp, struct pt_regs *regs, 
int sig)
 {
sigset_t set;
@@ -1188,91 +1273,6 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user 
*, ctx,
 }
 #endif
 
-/*
- * OK, we're invoking a handler
- */
-int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
-   struct task_struct *tsk)
-{
-   struct sigcontext __user *sc;
-   struct sigframe __user *frame;
-   struct mcontext __user *tm_mctx = NULL;
-   unsigned long newsp = 0;
-   int sigret;
-   unsigned long tramp;
-   struct pt_regs *regs = tsk->thread.regs;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-   /* Save the thread's msr before get_tm_stackpointer() changes it */
-   unsigned long msr = regs->msr;
-#endif
-
-   /* Set up Signal Frame */
-   frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
-   if (!access_ok(frame, sizeof(*frame)))
-   goto badframe;
-   sc = (struct sigcontext __user *) &frame->sctx;
-
-#if _NSIG != 64
-#error "Please adjust handle_signal()"
-#endif
-   if (__put_user(to_user_ptr(ksig->ka.sa.sa_handler), &sc->handler)
-   || __put_user(oldset->sig[0], &sc->oldmask)
-#ifdef CONFIG_PPC64
-   || __put_user((oldset->sig[0] >> 32), &sc->_unused[3])
-#else
-   || __put_user(oldset->sig[1], &sc->_unused[3])
-#endif
-   || __put_user(to_user_ptr(&frame->mctx), &sc->regs)
-   || __put_user(ksig->sig, &sc->signal))
-   goto badframe;
-
-   if (vdso32_sigtramp && tsk->mm->context.vdso_base) {
-   sigret = 0;
-   tramp = 

[PATCH v2 12/25] powerpc/signal32: Simplify logging in handle_rt_signal32()

2020-08-18 Thread Christophe Leroy
If something is bad in the frame, there is no point in
knowing which part of the frame exactly is wrong as it
got allocated as a single block.

Always print the root address of the frame in case of
failed user access, just like handle_signal32().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index deb729c8b79d..44a46911ff98 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -754,7 +754,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
struct rt_sigframe __user *rt_sf;
struct mcontext __user *frame;
struct mcontext __user *tm_frame = NULL;
-   void __user *addr;
unsigned long newsp = 0;
int sigret;
unsigned long tramp;
@@ -767,7 +766,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
rt_sf = get_sigframe(ksig, tsk, sizeof(*rt_sf), 1);
-   addr = rt_sf;
if (!access_ok(rt_sf, sizeof(*rt_sf)))
goto badframe;
 
@@ -782,7 +780,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Save user registers on the stack */
frame = &rt_sf->uc.uc_mcontext;
-   addr = frame;
if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
sigret = 0;
tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp;
@@ -818,7 +815,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* create a stack frame for the caller of the handler */
newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16);
-   addr = (void __user *)regs->gpr[1];
if (put_user(regs->gpr[1], (u32 __user *)newsp))
goto badframe;
 
@@ -835,7 +831,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 0;
 
 badframe:
-   signal_fault(tsk, regs, "handle_rt_signal32", addr);
+   signal_fault(tsk, regs, "handle_rt_signal32", rt_sf);
 
return 1;
 }
-- 
2.25.0



[PATCH v2 11/25] powerpc/signal: Refactor bad frame logging

2020-08-18 Thread Christophe Leroy
The logging of bad frame appears half a dozen of times
and is pretty similar.

Create signal_fault() fonction to perform that logging.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.c| 11 +++
 arch/powerpc/kernel/signal.h|  3 +++
 arch/powerpc/kernel/signal_32.c | 35 +
 arch/powerpc/kernel/signal_64.c | 15 ++
 4 files changed, 21 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 5edded5c5d20..a1d31d26dbd6 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -355,3 +355,14 @@ static unsigned long get_tm_stackpointer(struct 
task_struct *tsk)
 #endif
return ret;
 }
+
+static const char fm32[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %08lx lr %08lx\n";
+static const char fm64[] = KERN_INFO "%s[%d]: bad frame in %s: %p nip %016lx lr %016lx\n";
+
+void signal_fault(struct task_struct *tsk, struct pt_regs *regs,
+ const char *where, void __user *ptr)
+{
+   if (show_unhandled_signals)
+   printk_ratelimited(regs->msr & MSR_64BIT ? fm64 : fm32, tsk->comm,
+  task_pid_nr(tsk), where, ptr, regs->nip, regs->link);
+}
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index fb98731348c3..f610cfafa478 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -93,4 +93,7 @@ static inline int handle_rt_signal64(struct ksignal *ksig, 
sigset_t *set,
 
 #endif /* !defined(CONFIG_PPC64) */
 
+void signal_fault(struct task_struct *tsk, struct pt_regs *regs,
+ const char *where, void __user *ptr);
+
 #endif  /* _POWERPC_ARCH_SIGNAL_H */
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index e5b2801a94ac..deb729c8b79d 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -835,12 +835,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 0;
 
 badframe:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in handle_rt_signal32: "
-  "%p nip %08lx lr %08lx\n",
-  tsk->comm, tsk->pid,
-  addr, regs->nip, regs->link);
+   signal_fault(tsk, regs, "handle_rt_signal32", addr);
 
return 1;
 }
@@ -1092,12 +1087,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
return 0;
 
  bad:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in sys_rt_sigreturn: "
-  "%p nip %08lx lr %08lx\n",
-  current->comm, current->pid,
-  rt_sf, regs->nip, regs->link);
+   signal_fault(current, regs, "sys_rt_sigreturn", rt_sf);
 
force_sig(SIGSEGV);
return 0;
@@ -1181,12 +1171,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user 
*, ctx,
 * We kill the task with a SIGSEGV in this situation.
 */
if (do_setcontext(ctx, regs, 1)) {
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO "%s[%d]: bad frame in "
-  "sys_debug_setcontext: %p nip %08lx "
-  "lr %08lx\n",
-  current->comm, current->pid,
-  ctx, regs->nip, regs->link);
+   signal_fault(current, regs, "sys_debug_setcontext", ctx);
 
force_sig(SIGSEGV);
goto out;
@@ -1287,12 +1272,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
return 0;
 
 badframe:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in handle_signal32: "
-  "%p nip %08lx lr %08lx\n",
-  tsk->comm, tsk->pid,
-  frame, regs->nip, regs->link);
+   signal_fault(tsk, regs, "handle_signal32", frame);
 
return 1;
 }
@@ -1363,12 +1343,7 @@ SYSCALL_DEFINE0(sigreturn)
return 0;
 
 badframe:
-   if (show_unhandled_signals)
-   printk_ratelimited(KERN_INFO
-  "%s[%d]: bad frame in sys_sigreturn: "
-  "%p nip %08lx lr %08lx\n",
-  current->comm, current->pid,
-  addr, regs->nip, regs->link);
+   signal_fault(current, regs, "sys_sigreturn", addr);
 
force_sig(SIGSEGV);
return 0;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index fec27d599e87..7df088b9ad0f 100644
--- 

[PATCH v2 10/25] powerpc/signal: Call get_tm_stackpointer() from get_sigframe()

2020-08-18 Thread Christophe Leroy
Instead of calling get_tm_stackpointer() from the caller, call it
directly from get_sigframe(). This avoids a double call and
allows get_tm_stackpointer() to become static and be inlined
into get_sigframe() by GCC.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.c| 9 ++---
 arch/powerpc/kernel/signal.h| 6 ++
 arch/powerpc/kernel/signal_32.c | 4 ++--
 arch/powerpc/kernel/signal_64.c | 2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index a295d482adec..5edded5c5d20 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -144,10 +144,13 @@ int show_unhandled_signals = 1;
 /*
  * Allocate space for the signal frame
  */
-void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
-  size_t frame_size, int is_32)
+static unsigned long get_tm_stackpointer(struct task_struct *tsk);
+
+void __user *get_sigframe(struct ksignal *ksig, struct task_struct *tsk,
+ size_t frame_size, int is_32)
 {
 unsigned long oldsp, newsp;
+   unsigned long sp = get_tm_stackpointer(tsk);
 
 /* Default to using normal stack */
if (is_32)
@@ -304,7 +307,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long 
thread_info_flags)
user_enter();
 }
 
-unsigned long get_tm_stackpointer(struct task_struct *tsk)
+static unsigned long get_tm_stackpointer(struct task_struct *tsk)
 {
/* When in an active transaction that takes a signal, we need to be
 * careful with the stack.  It's possible that the stack has moved back
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index 6c2a33ab042c..fb98731348c3 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -10,8 +10,8 @@
 #ifndef _POWERPC_ARCH_SIGNAL_H
 #define _POWERPC_ARCH_SIGNAL_H
 
-extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
- size_t frame_size, int is_32);
+void __user *get_sigframe(struct ksignal *ksig, struct task_struct *tsk,
+ size_t frame_size, int is_32);
 
 extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
   struct task_struct *tsk);
@@ -19,8 +19,6 @@ extern int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
  struct task_struct *tsk);
 
-extern unsigned long get_tm_stackpointer(struct task_struct *tsk);
-
 #ifdef CONFIG_VSX
 extern unsigned long copy_vsx_to_user(void __user *to,
  struct task_struct *task);
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 61621acacc63..e5b2801a94ac 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -766,7 +766,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
-   rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1);
+   rt_sf = get_sigframe(ksig, tsk, sizeof(*rt_sf), 1);
addr = rt_sf;
if (!access_ok(rt_sf, sizeof(*rt_sf)))
goto badframe;
@@ -1226,7 +1226,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 #endif
 
/* Set up Signal Frame */
-   frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1);
+   frame = get_sigframe(ksig, tsk, sizeof(*frame), 1);
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) >sctx;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index d3db78732070..fec27d599e87 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -822,7 +822,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
unsigned long msr = regs->msr;
 #endif
 
-   frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
+   frame = get_sigframe(ksig, tsk, sizeof(*frame), 0);
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
 
-- 
2.25.0



[PATCH v2 05/25] powerpc/signal: Don't manage floating point regs when no FPU

2020-08-18 Thread Christophe Leroy
There is no point in copying floating point regs when there
is no FPU and MATH_EMULATION is not selected.

Create a new CONFIG_PPC_FPU_REGS bool that is selected by
CONFIG_MATH_EMULATION and CONFIG_PPC_FPU, and use it to
opt out everything related to fp_state in thread_struct.

The asm const used only by fpu.S are opted out with CONFIG_PPC_FPU
as fpu.S build is conditionnal to CONFIG_PPC_FPU.

The following app spends approx 8.1 seconds system time on an 8xx
without the patch, and 7.0 seconds with the patch (13.5% reduction).

On an 832x, it spends approx 2.6 seconds system time without
the patch and 2.1 seconds with the patch (19% reduction).

void sigusr1(int sig) { }

int main(int argc, char **argv)
{
int i = 10;

signal(SIGUSR1, sigusr1);
for (;i--;)
raise(SIGUSR1);
exit(0);
}

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/include/asm/processor.h |  2 ++
 arch/powerpc/kernel/asm-offsets.c|  2 ++
 arch/powerpc/kernel/process.c|  4 
 arch/powerpc/kernel/ptrace/Makefile  |  4 ++--
 arch/powerpc/kernel/ptrace/ptrace-decl.h | 14 ++
 arch/powerpc/kernel/ptrace/ptrace-view.c |  2 ++
 arch/powerpc/kernel/signal.h | 14 +-
 arch/powerpc/kernel/signal_32.c  |  4 
 arch/powerpc/kernel/traps.c  |  2 ++
 arch/powerpc/platforms/Kconfig.cputype   |  4 
 11 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f48bbfb3ce9..a2611880b904 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -416,6 +416,7 @@ config HUGETLB_PAGE_SIZE_VARIABLE
 config MATH_EMULATION
bool "Math emulation"
depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE
+   select PPC_FPU_REGS
help
  Some PowerPC chips designed for embedded applications do not have
  a floating-point unit and therefore do not implement the
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index ed0d633ab5aa..e20b0c5abe62 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -175,8 +175,10 @@ struct thread_struct {
 #endif
/* Debug Registers */
struct debug_reg debug;
+#ifdef CONFIG_PPC_FPU_REGS
struct thread_fp_state  fp_state;
struct thread_fp_state  *fp_save_area;
+#endif
int fpexc_mode; /* floating-point exception mode */
unsigned intalign_ctl;  /* alignment handling control */
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 8711c2164b45..6cb36c341c70 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -110,9 +110,11 @@ int main(void)
 #ifdef CONFIG_BOOKE
OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]);
 #endif
+#ifdef CONFIG_PPC_FPU
OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode);
OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr);
OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area);
+#endif
OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr);
OFFSET(THREAD_LOAD_FP, thread_struct, load_fp);
 #ifdef CONFIG_ALTIVEC
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 016bd831908e..7e0082ac0a39 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1694,7 +1694,9 @@ int copy_thread(unsigned long clone_flags, unsigned long 
usp,
p->thread.ptrace_bps[i] = NULL;
 #endif
 
+#ifdef CONFIG_PPC_FPU_REGS
p->thread.fp_save_area = NULL;
+#endif
 #ifdef CONFIG_ALTIVEC
p->thread.vr_save_area = NULL;
 #endif
@@ -1821,8 +1823,10 @@ void start_thread(struct pt_regs *regs, unsigned long 
start, unsigned long sp)
 #endif
current->thread.load_slb = 0;
current->thread.load_fp = 0;
+#ifdef CONFIG_PPC_FPU_REGS
+   memset(&current->thread.fp_state, 0, sizeof(current->thread.fp_state));
current->thread.fp_save_area = NULL;
+#endif
 #ifdef CONFIG_ALTIVEC
memset(&current->thread.vr_state, 0, sizeof(current->thread.vr_state));
current->thread.vr_state.vscr.u[3] = 0x0001; /* Java mode disabled 
*/
diff --git a/arch/powerpc/kernel/ptrace/Makefile 
b/arch/powerpc/kernel/ptrace/Makefile
index 77abd1a5a508..8ebc11d1168d 100644
--- a/arch/powerpc/kernel/ptrace/Makefile
+++ b/arch/powerpc/kernel/ptrace/Makefile
@@ -6,11 +6,11 @@
 CFLAGS_ptrace-view.o   += -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
 obj-y  += ptrace.o ptrace-view.o
-obj-y  += ptrace-fpu.o
+obj-$(CONFIG_PPC_FPU_REGS) += ptrace-fpu.o
 obj-$(CONFIG_COMPAT)   += ptrace32.o
 obj-$(CONFIG_VSX)  += ptrace-vsx.o
 ifneq ($(CONFIG_VSX),y)
-obj-y 

[PATCH v2 08/25] powerpc/signal: Move access_ok() out of get_sigframe()

2020-08-18 Thread Christophe Leroy
This access_ok() will soon be performed by user_access_begin().
So move it out of get_sigframe().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.c| 4 
 arch/powerpc/kernel/signal_32.c | 4 ++--
 arch/powerpc/kernel/signal_64.c | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 3b56db02b762..1be5fd01f866 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -154,10 +154,6 @@ void __user *get_sigframe(struct ksignal *ksig, unsigned 
long sp,
oldsp = sigsp(oldsp, ksig);
newsp = (oldsp - frame_size) & ~0xFUL;
 
-   /* Check access */
-   if (!access_ok((void __user *)newsp, oldsp - newsp))
-   return NULL;
-
 return (void __user *)newsp;
 }
 
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 8cbc9ac1343d..61621acacc63 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -768,7 +768,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
/* Put a Real Time Context onto stack */
rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1);
addr = rt_sf;
-   if (unlikely(rt_sf == NULL))
+   if (!access_ok(rt_sf, sizeof(*rt_sf)))
goto badframe;
 
/* Put the siginfo & fill in most of the ucontext */
@@ -1227,7 +1227,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 
/* Set up Signal Frame */
frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1);
-   if (unlikely(frame == NULL))
+   if (!access_ok(frame, sizeof(*frame)))
goto badframe;
sc = (struct sigcontext __user *) >sctx;
 
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index cae612bdde5f..d3db78732070 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -823,7 +823,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 #endif
 
frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
-   if (unlikely(frame == NULL))
+   if (!access_ok(frame, sizeof(*frame)))
goto badframe;
 
err |= __put_user(&frame->info, &frame->pinfo);
-- 
2.25.0



[PATCH v2 06/25] powerpc/32s: Allow deselecting CONFIG_PPC_FPU on mpc832x

2020-08-18 Thread Christophe Leroy
The e300c2 core which is embedded in mpc832x CPU doesn't have
an FPU.

Make it possible to not select CONFIG_PPC_FPU when building a
kernel dedicated to that target.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_32.S  |  4 
 arch/powerpc/platforms/Kconfig.cputype | 11 +--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index f3ab94d73936..588fe8644df6 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -413,6 +413,7 @@ Alignment:
. = 0x800
DO_KVM  0x800
 FPUnavailable:
+#ifdef CONFIG_PPC_FPU
 BEGIN_FTR_SECTION
 /*
  * Certain Freescale cores don't have a FPU and treat fp instructions
@@ -426,6 +427,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE)
b   fast_exception_return
 1: addir3,r1,STACK_FRAME_OVERHEAD
EXC_XFER_LITE(0x800, kernel_fp_unavailable_exception)
+#else
+   b   ProgramCheck
+#endif
 
 /* Decrementer */
EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE)
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 40ffcdba42b8..d4fd109f177e 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -32,7 +32,7 @@ choice
 config PPC_BOOK3S_6xx
bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601"
select PPC_BOOK3S_32
-   select PPC_FPU
+   imply PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
@@ -229,9 +229,16 @@ config PPC_FPU_REGS
bool
 
 config PPC_FPU
-   bool
+   bool "Support for Floating Point Unit (FPU)" if PPC_MPC832x
default y if PPC64
select PPC_FPU_REGS
+   help
+ This must be enabled to support the Floating Point Unit
+ Most 6xx have an FPU but e300c2 core (mpc832x) don't have
+ an FPU, so when building an embedded kernel for that target
+ you can disable FPU support.
+
+ If unsure say Y.
 
 config FSL_EMB_PERFMON
bool "Freescale Embedded Perfmon"
-- 
2.25.0



[PATCH v2 09/25] powerpc/signal: Remove get_clean_sp()

2020-08-18 Thread Christophe Leroy
get_clean_sp() is only used once in kernel/signal.c .

GCC is smart enough to see that x & 0xffffffff is a nop
calculation on PPC32, no need of a special PPC32 trivial version.

Include the logic from the PPC64 version of get_clean_sp() directly
in get_sigframe().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/processor.h | 14 --
 arch/powerpc/kernel/signal.c |  5 -
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index e20b0c5abe62..8320aedbdca3 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -406,20 +406,6 @@ static inline void prefetchw(const void *x)
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT
 
-#ifdef CONFIG_PPC64
-static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
-{
-   if (is_32)
-   return sp & 0x0ffffffffUL;
-   return sp;
-}
-#else
-static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
-{
-   return sp;
-}
-#endif
-
 /* asm stubs */
 extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val);
 extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 1be5fd01f866..a295d482adec 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -150,7 +150,10 @@ void __user *get_sigframe(struct ksignal *ksig, unsigned 
long sp,
 unsigned long oldsp, newsp;
 
 /* Default to using normal stack */
-oldsp = get_clean_sp(sp, is_32);
+   if (is_32)
+   oldsp = sp & 0x0ffffffffUL;
+   else
+   oldsp = sp;
oldsp = sigsp(oldsp, ksig);
newsp = (oldsp - frame_size) & ~0xFUL;
 
-- 
2.25.0



[PATCH v2 07/25] powerpc/signal: Remove BUG_ON() in handler_signal functions

2020-08-18 Thread Christophe Leroy
There is already the same BUG_ON() check in do_signal() which
is the only caller of handle_rt_signal64() handle_rt_signal32() and
handle_signal32().

Remove those three redundant BUG_ON().

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal_32.c | 4 
 arch/powerpc/kernel/signal_64.c | 2 --
 2 files changed, 6 deletions(-)

diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 7b291707eb31..8cbc9ac1343d 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -764,8 +764,6 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t 
*oldset,
unsigned long msr = regs->msr;
 #endif
 
-   BUG_ON(tsk != current);
-
/* Set up Signal Frame */
/* Put a Real Time Context onto stack */
rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1);
@@ -1227,8 +1225,6 @@ int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
unsigned long msr = regs->msr;
 #endif
 
-   BUG_ON(tsk != current);
-
/* Set up Signal Frame */
frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1);
if (unlikely(frame == NULL))
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index bfc939360bad..cae612bdde5f 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -822,8 +822,6 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
unsigned long msr = regs->msr;
 #endif
 
-   BUG_ON(tsk != current);
-
frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
if (unlikely(frame == NULL))
goto badframe;
-- 
2.25.0



[PATCH v2 03/25] powerpc/ptrace: Consolidate reg index calculation

2020-08-18 Thread Christophe Leroy
Today we have:

#ifdef CONFIG_PPC32
index = addr >> 2;
if ((addr & 3) || child->thread.regs == NULL)
#else
index = addr >> 3;
if ((addr & 7))
#endif

sizeof(long) has value 4 for PPC32 and value 8 for PPC64.

Dividing by 4 is equivalent to >> 2 and dividing by 8 is equivalent
to >> 3.

And 3 and 7 are respectively (sizeof(long) - 1).

Use sizeof(long) to get rid of the #ifdef CONFIG_PPC32 and consolidate
the calculation and checking.

thread.regs have to be not NULL on both PPC32 and PPC64 so adding
that test on PPC64 is harmless.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/ptrace/ptrace.c | 18 --
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
b/arch/powerpc/kernel/ptrace/ptrace.c
index f6e51be47c6e..51557a9c0765 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -55,14 +55,9 @@ long arch_ptrace(struct task_struct *child, long request,
 
ret = -EIO;
/* convert to index and check */
-#ifdef CONFIG_PPC32
-   index = addr >> 2;
-   if ((addr & 3) || (index > PT_FPSCR)
+   index = addr / sizeof(long);
+   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR)
|| (child->thread.regs == NULL))
-#else
-   index = addr >> 3;
-   if ((addr & 7) || (index > PT_FPSCR))
-#endif
break;
 
CHECK_FULL_REGS(child->thread.regs);
@@ -90,14 +85,9 @@ long arch_ptrace(struct task_struct *child, long request,
 
ret = -EIO;
/* convert to index and check */
-#ifdef CONFIG_PPC32
-   index = addr >> 2;
-   if ((addr & 3) || (index > PT_FPSCR)
+   index = addr / sizeof(long);
+   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR)
|| (child->thread.regs == NULL))
-#else
-   index = addr >> 3;
-   if ((addr & 7) || (index > PT_FPSCR))
-#endif
break;
 
CHECK_FULL_REGS(child->thread.regs);
-- 
2.25.0



[PATCH v2 01/25] powerpc/signal: Move inline functions in signal.h

2020-08-18 Thread Christophe Leroy
To really be inlined, the functions need to be defined in the
same C file as the caller, or in an included header.

Move functions defined inline from signal .c in signal.h

Fixes: 3dd4eb83a9c0 ("powerpc: move common register copy functions from 
signal_32.c to signal.c")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/signal.c | 30 --
 arch/powerpc/kernel/signal.h | 41 +---
 2 files changed, 33 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index d15a98c758b8..3b56db02b762 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -133,36 +133,6 @@ unsigned long copy_ckvsx_from_user(struct task_struct 
*task,
return 0;
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-#else
-inline unsigned long copy_fpr_to_user(void __user *to,
- struct task_struct *task)
-{
-   return __copy_to_user(to, task->thread.fp_state.fpr,
- ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_fpr_from_user(struct task_struct *task,
-   void __user *from)
-{
-   return __copy_from_user(task->thread.fp_state.fpr, from,
- ELF_NFPREG * sizeof(double));
-}
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-inline unsigned long copy_ckfpr_to_user(void __user *to,
-struct task_struct *task)
-{
-   return __copy_to_user(to, task->thread.ckfp_state.fpr,
- ELF_NFPREG * sizeof(double));
-}
-
-inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
-void __user *from)
-{
-   return __copy_from_user(task->thread.ckfp_state.fpr, from,
-   ELF_NFPREG * sizeof(double));
-}
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 #endif
 
 /* Log an error when sending an unhandled signal to a process. Controlled
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index d396efca4068..4626d39cc0f0 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -19,14 +19,6 @@ extern int handle_signal32(struct ksignal *ksig, sigset_t 
*oldset,
 extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
  struct task_struct *tsk);
 
-extern unsigned long copy_fpr_to_user(void __user *to,
- struct task_struct *task);
-extern unsigned long copy_ckfpr_to_user(void __user *to,
-  struct task_struct *task);
-extern unsigned long copy_fpr_from_user(struct task_struct *task,
-   void __user *from);
-extern unsigned long copy_ckfpr_from_user(struct task_struct *task,
-void __user *from);
 extern unsigned long get_tm_stackpointer(struct task_struct *tsk);
 
 #ifdef CONFIG_VSX
@@ -38,6 +30,39 @@ extern unsigned long copy_vsx_from_user(struct task_struct 
*task,
void __user *from);
 extern unsigned long copy_ckvsx_from_user(struct task_struct *task,
 void __user *from);
+unsigned long copy_fpr_to_user(void __user *to, struct task_struct *task);
+unsigned long copy_ckfpr_to_user(void __user *to, struct task_struct *task);
+unsigned long copy_fpr_from_user(struct task_struct *task, void __user *from);
+unsigned long copy_ckfpr_from_user(struct task_struct *task, void __user 
*from);
+#else
+static inline unsigned long
+copy_fpr_to_user(void __user *to, struct task_struct *task)
+{
+   return __copy_to_user(to, task->thread.fp_state.fpr,
+ ELF_NFPREG * sizeof(double));
+}
+
+static inline unsigned long
+copy_fpr_from_user(struct task_struct *task, void __user *from)
+{
+   return __copy_from_user(task->thread.fp_state.fpr, from,
+ ELF_NFPREG * sizeof(double));
+}
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+inline unsigned long copy_ckfpr_to_user(void __user *to, struct task_struct 
*task)
+{
+   return __copy_to_user(to, task->thread.ckfp_state.fpr,
+ ELF_NFPREG * sizeof(double));
+}
+
+static inline unsigned long
+copy_ckfpr_from_user(struct task_struct *task, void __user *from)
+{
+   return __copy_from_user(task->thread.ckfp_state.fpr, from,
+   ELF_NFPREG * sizeof(double));
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 #endif
 
 #ifdef CONFIG_PPC64
-- 
2.25.0



[PATCH v2 02/25] powerpc/ptrace: Move declaration of ptrace_get_reg() and ptrace_set_reg()

2020-08-18 Thread Christophe Leroy
ptrace_get_reg() and ptrace_set_reg() are only used internally by
ptrace.

Move them in arch/powerpc/kernel/ptrace/ptrace-decl.h

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/ptrace.h| 6 --
 arch/powerpc/kernel/ptrace/ptrace-decl.h | 3 +++
 arch/powerpc/kernel/ptrace/ptrace32.c| 2 ++
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/ptrace.h 
b/arch/powerpc/include/asm/ptrace.h
index 155a197c0aa1..3c3cf537c3bf 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -171,12 +171,6 @@ static inline void regs_set_return_value(struct pt_regs 
*regs, unsigned long rc)
set_thread_flag(TIF_NOERROR); \
} while(0)
 
-struct task_struct;
-extern int ptrace_get_reg(struct task_struct *task, int regno,
- unsigned long *data);
-extern int ptrace_put_reg(struct task_struct *task, int regno,
- unsigned long data);
-
 #define current_pt_regs() \
((struct pt_regs *)((unsigned long)task_stack_page(current) + 
THREAD_SIZE) - 1)
 
diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h 
b/arch/powerpc/kernel/ptrace/ptrace-decl.h
index 67447a6197eb..2ddc68412fa8 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-decl.h
+++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h
@@ -159,6 +159,9 @@ int tm_cgpr32_set(struct task_struct *target, const struct 
user_regset *regset,
 
 /* ptrace-view */
 
+int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data);
+int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data);
+
 extern const struct user_regset_view user_ppc_native_view;
 
 /* ptrace-(no)adv */
diff --git a/arch/powerpc/kernel/ptrace/ptrace32.c 
b/arch/powerpc/kernel/ptrace/ptrace32.c
index 7589a9665ffb..d30b9ad70edc 100644
--- a/arch/powerpc/kernel/ptrace/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace/ptrace32.c
@@ -23,6 +23,8 @@
 
#include <asm/switch_to.h>
 
+#include "ptrace-decl.h"
+
 /*
  * does not yet catch signals sent when the child dies.
  * in exit.c or in signal.c.
-- 
2.25.0



[PATCH v2 04/25] powerpc/ptrace: Create ptrace_get_fpr() and ptrace_put_fpr()

2020-08-18 Thread Christophe Leroy
On the same model as ptrace_get_reg() and ptrace_put_reg(),
create ptrace_get_fpr() and ptrace_put_fpr() to get/set
the floating points registers.

We move the boundary checkings in them.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/ptrace/Makefile  |  1 +
 arch/powerpc/kernel/ptrace/ptrace-decl.h |  4 +++
 arch/powerpc/kernel/ptrace/ptrace-fpu.c  | 40 
 arch/powerpc/kernel/ptrace/ptrace.c  | 40 +++-
 4 files changed, 56 insertions(+), 29 deletions(-)
 create mode 100644 arch/powerpc/kernel/ptrace/ptrace-fpu.c

diff --git a/arch/powerpc/kernel/ptrace/Makefile 
b/arch/powerpc/kernel/ptrace/Makefile
index c2f2402ebc8c..77abd1a5a508 100644
--- a/arch/powerpc/kernel/ptrace/Makefile
+++ b/arch/powerpc/kernel/ptrace/Makefile
@@ -6,6 +6,7 @@
 CFLAGS_ptrace-view.o   += -DUTS_MACHINE='"$(UTS_MACHINE)"'
 
 obj-y  += ptrace.o ptrace-view.o
+obj-y  += ptrace-fpu.o
 obj-$(CONFIG_COMPAT)   += ptrace32.o
 obj-$(CONFIG_VSX)  += ptrace-vsx.o
 ifneq ($(CONFIG_VSX),y)
diff --git a/arch/powerpc/kernel/ptrace/ptrace-decl.h 
b/arch/powerpc/kernel/ptrace/ptrace-decl.h
index 2ddc68412fa8..eafe5f0f6289 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-decl.h
+++ b/arch/powerpc/kernel/ptrace/ptrace-decl.h
@@ -164,6 +164,10 @@ int ptrace_put_reg(struct task_struct *task, int regno, 
unsigned long data);
 
 extern const struct user_regset_view user_ppc_native_view;
 
+/* ptrace-fpu */
+int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data);
+int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data);
+
 /* ptrace-(no)adv */
 void ppc_gethwdinfo(struct ppc_debug_info *dbginfo);
 int ptrace_get_debugreg(struct task_struct *child, unsigned long addr,
diff --git a/arch/powerpc/kernel/ptrace/ptrace-fpu.c 
b/arch/powerpc/kernel/ptrace/ptrace-fpu.c
new file mode 100644
index ..8301cb52dd99
--- /dev/null
+++ b/arch/powerpc/kernel/ptrace/ptrace-fpu.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/regset.h>
+
+#include <asm/switch_to.h>
+
+#include "ptrace-decl.h"
+
+int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data)
+{
+   unsigned int fpidx = index - PT_FPR0;
+
+   if (index > PT_FPSCR)
+   return -EIO;
+
+   flush_fp_to_thread(child);
+   if (fpidx < (PT_FPSCR - PT_FPR0))
+   memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long));
+   else
+   *data = child->thread.fp_state.fpscr;
+
+   return 0;
+}
+
+int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data)
+{
+   unsigned int fpidx = index - PT_FPR0;
+
+   if (index > PT_FPSCR)
+   return -EIO;
+
+   flush_fp_to_thread(child);
+   if (fpidx < (PT_FPSCR - PT_FPR0))
+   memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long));
+   else
+   child->thread.fp_state.fpscr = data;
+
+   return 0;
+}
+
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c 
b/arch/powerpc/kernel/ptrace/ptrace.c
index 51557a9c0765..3d44b73adb83 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -56,25 +56,17 @@ long arch_ptrace(struct task_struct *child, long request,
ret = -EIO;
/* convert to index and check */
index = addr / sizeof(long);
-   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR)
-   || (child->thread.regs == NULL))
+   if ((addr & (sizeof(long) - 1)) || !child->thread.regs)
break;
 
CHECK_FULL_REGS(child->thread.regs);
-   if (index < PT_FPR0) {
+   if (index < PT_FPR0)
ret = ptrace_get_reg(child, (int) index, &tmp);
-   if (ret)
-   break;
-   } else {
-   unsigned int fpidx = index - PT_FPR0;
-
-   flush_fp_to_thread(child);
-   if (fpidx < (PT_FPSCR - PT_FPR0))
-   memcpy(&tmp, &child->thread.TS_FPR(fpidx),
-  sizeof(long));
-   else
-   tmp = child->thread.fp_state.fpscr;
-   }
+   else
+   ret = ptrace_get_fpr(child, index, &tmp);
+
+   if (ret)
+   break;
ret = put_user(tmp, datalp);
break;
}
@@ -86,24 +78,14 @@ long arch_ptrace(struct task_struct *child, long request,
ret = -EIO;
/* convert to index and check */
index = addr / sizeof(long);
-   if ((addr & (sizeof(long) - 1)) || (index > PT_FPSCR)
-   || (child->thread.regs == NULL))
+   if ((addr & (sizeof(long) - 1)) || !child->thread.regs)
break;
 
   

[PATCH v2 00/25] powerpc: Switch signal 32 to using unsafe_put_user() and friends

2020-08-18 Thread Christophe Leroy
This series leads to a reduction from 2.55s to 1.73s of
the system CPU time with the following microbench app
on an mpc832x with KUAP (approx 32%)

This series replaces copies to users by unsafe_put_user() and friends
with user_write_access_begin() dance in signal32.

The advantages are:
- No KUAP unlock/lock at every copy
- More readable code.
- Better generated code.

Without KUAP, the difference is in the noise.

void sigusr1(int sig) { }

int main(int argc, char **argv)
{
int i = 10;

signal(SIGUSR1, sigusr1);
for (;i--;)
raise(SIGUSR1);
exit(0);
}

An additional 0.10s reduction is achieved by removing
CONFIG_PPC_FPU, as the mpc832x has no FPU.

A bit less spectacular on an 8xx as KUAP is less heavy, prior to
the series (with KUAP) it ran in 8.10 s. Once applied, the removal
of FPU regs handling, we get 7.05s. With the full series, we get 6.9s.
If artificially re-activating FPU regs handling with the full series,
we get 7.6s.

So for the 8xx, the removal of the FPU regs copy is what makes the
difference, but the rework of handle_signal also have a benefit.

Same as above, without KUAP the difference is in the noise.

Difference since v1(RFC):
- Almost all copies to user are now replaced by unsafe_ alternatives.
- Reworked a bit the FPU registers handling following feedback from Michael.
- Fixed a few build failures reported by Mr Robot on the RFC.

Christophe Leroy (25):
  powerpc/signal: Move inline functions in signal.h
  powerpc/ptrace: Move declaration of ptrace_get_reg() and
ptrace_set_reg()
  powerpc/ptrace: Consolidate reg index calculation
  powerpc/ptrace: Create ptrace_get_fpr() and ptrace_put_fpr()
  powerpc/signal: Don't manage floating point regs when no FPU
  powerpc/32s: Allow deselecting CONFIG_PPC_FPU on mpc832x
  powerpc/signal: Remove BUG_ON() in handler_signal functions
  powerpc/signal: Move access_ok() out of get_sigframe()
  powerpc/signal: Remove get_clean_sp()
  powerpc/signal: Call get_tm_stackpointer() from get_sigframe()
  powerpc/signal: Refactor bad frame logging
  powerpc/signal32: Simplify logging in handle_rt_signal32()
  powerpc/signal32: Move handle_signal32() close to handle_rt_signal32()
  powerpc/signal32: Rename local pointers in handle_rt_signal32()
  powerpc/signal32: Misc changes to make handle_[rt_]_signal32() more
similar
  powerpc/signal32: Move signal trampoline setup to handle_[rt_]signal32
  powerpc/signal32: Switch handle_signal32() to user_access_begin()
logic
  powerpc/signal32: Switch handle_rt_signal32() to user_access_begin()
logic
  powerpc/signal32: Remove ifdefery in middle of if/else
  signal: Add unsafe_put_compat_sigset()
  powerpc/signal32: Add and use unsafe_put_sigset_t()
  powerpc/signal32: Switch swap_context() to user_access_begin() logic
  powerpc/signal: Create 'unsafe' versions of
copy_[ck][fpr/vsx]_to_user()
  powerpc/signal32: Isolate non-copy actions in save_user_regs() and
save_tm_user_regs()
  powerpc/signal32: Transform save_user_regs() and save_tm_user_regs()
in 'unsafe' version

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/processor.h |  16 +-
 arch/powerpc/include/asm/ptrace.h|   6 -
 arch/powerpc/kernel/asm-offsets.c|   2 +
 arch/powerpc/kernel/head_32.S|   4 +
 arch/powerpc/kernel/process.c|   4 +
 arch/powerpc/kernel/ptrace/Makefile  |   3 +-
 arch/powerpc/kernel/ptrace/ptrace-decl.h |  21 +
 arch/powerpc/kernel/ptrace/ptrace-fpu.c  |  40 ++
 arch/powerpc/kernel/ptrace/ptrace-view.c |   2 +
 arch/powerpc/kernel/ptrace/ptrace.c  |  54 +-
 arch/powerpc/kernel/ptrace/ptrace32.c|   2 +
 arch/powerpc/kernel/signal.c |  59 +--
 arch/powerpc/kernel/signal.h | 115 -
 arch/powerpc/kernel/signal_32.c  | 598 +++
 arch/powerpc/kernel/signal_64.c  |  21 +-
 arch/powerpc/kernel/traps.c  |   2 +
 arch/powerpc/platforms/Kconfig.cputype   |  15 +-
 include/linux/compat.h   |  32 ++
 19 files changed, 566 insertions(+), 431 deletions(-)
 create mode 100644 arch/powerpc/kernel/ptrace/ptrace-fpu.c

-- 
2.25.0



Re: Flushing transparent hugepages

2020-08-18 Thread Will Deacon
On Tue, Aug 18, 2020 at 04:07:36PM +0100, Matthew Wilcox wrote:
> For example, arm64 seems confused in this scenario:
> 
> void flush_dcache_page(struct page *page)
> {
> if (test_bit(PG_dcache_clean, &page->flags))
> clear_bit(PG_dcache_clean, &page->flags);
> }
> 
> ...
> 
> void __sync_icache_dcache(pte_t pte)
> {
> struct page *page = pte_page(pte);
> 
> if (!test_and_set_bit(PG_dcache_clean, &page->flags))
> sync_icache_aliases(page_address(page), page_size(page));
> }
> 
> So arm64 keeps track on a per-page basis which ones have been flushed.
> page_size() will return PAGE_SIZE if called on a tail page or regular
> page, but will return PAGE_SIZE << compound_order if called on a head
> page.  So this will either over-flush, or it's missing the opportunity
> to clear the bits on all the subpages which have now been flushed.

Hmm, that seems to go all the way back to 2014 as the result of a bug fix
in 923b8f5044da ("arm64: mm: Make icache synchronisation logic huge page
aware") which has a Reported-by Mark and a CC stable, suggesting something
_was_ going wrong at the time :/ Was there a point where the tail pages
could end up with PG_arch_1 uncleared on allocation?

> What would you _like_ to see?  Would you rather flush_dcache_page()
> were called once for each subpage, or would you rather maintain
> the page-needs-flushing state once per compound page?  We could also
> introduce flush_dcache_thp() if some architectures would prefer it one
> way and one the other, although that brings into question what to do
> for hugetlbfs pages.

For arm64, we'd like to see PG_arch_1 preserved during huge page splitting
[1], but there was a worry that it might break x86 and s390. It's also not
clear to me that we can change __sync_icache_dcache() as it's called when
we're installing the entry in the page-table, so why would it be called
again for the tail pages?

Will

[1] 
https://lore.kernel.org/linux-arch/20200703153718.16973-8-catalin.mari...@arm.com/


Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state

2020-08-18 Thread peterz
On Tue, Aug 18, 2020 at 05:22:33PM +1000, Nicholas Piggin wrote:
> Excerpts from pet...@infradead.org's message of August 12, 2020 8:35 pm:
> > On Wed, Aug 12, 2020 at 06:18:28PM +1000, Nicholas Piggin wrote:
> >> Excerpts from pet...@infradead.org's message of August 7, 2020 9:11 pm:
> >> > 
> >> > What's wrong with something like this?
> >> > 
> >> > AFAICT there's no reason to actually try and add IRQ tracing here, it's
> >> > just a hand full of instructions at the most.
> >> 
> >> Because we may want to use that in other places as well, so it would
> >> be nice to have tracing.
> >> 
> >> Hmm... also, I thought NMI context was free to call local_irq_save/restore
> >> anyway so the bug would still be there in those cases?
> > 
> > NMI code has in_nmi() true, in which case the IRQ tracing is disabled
> > (except for x86 which has CONFIG_TRACE_IRQFLAGS_NMI).
> > 
> 
> That doesn't help. It doesn't fix the lockdep irq state going out of
> synch with the actual irq state. The code which triggered this with the
> special powerpc irq disable has in_nmi() true as well.

Urgh, you're talking about using lockdep_assert_irqs*() from NMI
context?

If not, I'm afraid I might've lost the plot a little on what exact
failure case we're talking about.


[PATCH v3 17/17] memblock: use separate iterators for memory and reserved regions

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

for_each_memblock() is used to iterate over memblock.memory in
a few places that use data from memblock_region rather than the memory
ranges.

Introduce separate for_each_mem_region() and for_each_reserved_mem_region()
to improve encapsulation of memblock internals from its users.

Signed-off-by: Mike Rapoport 
Reviewed-by: Baoquan He 
Acked-by: Ingo Molnar# x86
Acked-by: Thomas Bogendoerfer   # MIPS
Acked-by: Miguel Ojeda# .clang-format
---
 .clang-format  |  3 ++-
 arch/arm64/kernel/setup.c  |  2 +-
 arch/arm64/mm/numa.c   |  2 +-
 arch/mips/netlogic/xlp/setup.c |  2 +-
 arch/riscv/mm/init.c   |  2 +-
 arch/x86/mm/numa.c |  2 +-
 include/linux/memblock.h   | 19 ---
 mm/memblock.c  |  4 ++--
 mm/page_alloc.c|  8 
 9 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/.clang-format b/.clang-format
index 2b77cc419b97..a118fdde25c1 100644
--- a/.clang-format
+++ b/.clang-format
@@ -201,7 +201,7 @@ ForEachMacros:
   - 'for_each_matching_node'
   - 'for_each_matching_node_and_match'
   - 'for_each_member'
-  - 'for_each_memblock'
+  - 'for_each_mem_region'
   - 'for_each_memblock_type'
   - 'for_each_memcg_cache_index'
   - 'for_each_mem_pfn_range'
@@ -268,6 +268,7 @@ ForEachMacros:
   - 'for_each_property_of_node'
   - 'for_each_registered_fb'
   - 'for_each_reserved_mem_range'
+  - 'for_each_reserved_mem_region'
   - 'for_each_rtd_codec_dais'
   - 'for_each_rtd_codec_dais_rollback'
   - 'for_each_rtd_components'
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index a986c6f8ab42..dcce72ac072b 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -217,7 +217,7 @@ static void __init request_standard_resources(void)
if (!standard_resources)
panic("%s: Failed to allocate %zu bytes\n", __func__, res_size);
 
-   for_each_memblock(memory, region) {
+   for_each_mem_region(region) {
res = &standard_resources[i++];
if (memblock_is_nomap(region)) {
res->name  = "reserved";
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 8a97cd3d2dfe..5efdbd01a59c 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -350,7 +350,7 @@ static int __init numa_register_nodes(void)
struct memblock_region *mblk;
 
/* Check that valid nid is set to memblks */
-   for_each_memblock(memory, mblk) {
+   for_each_mem_region(mblk) {
int mblk_nid = memblock_get_region_node(mblk);
 
if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
diff --git a/arch/mips/netlogic/xlp/setup.c b/arch/mips/netlogic/xlp/setup.c
index 1a0fc5b62ba4..6e3102bcd2f1 100644
--- a/arch/mips/netlogic/xlp/setup.c
+++ b/arch/mips/netlogic/xlp/setup.c
@@ -70,7 +70,7 @@ static void nlm_fixup_mem(void)
const int pref_backup = 512;
struct memblock_region *mem;
 
-   for_each_memblock(memory, mem) {
+   for_each_mem_region(mem) {
memblock_remove(mem->base + mem->size - pref_backup,
pref_backup);
}
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 06355716d19a..1fb6a826c2fd 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -531,7 +531,7 @@ static void __init resource_init(void)
 {
struct memblock_region *region;
 
-   for_each_memblock(memory, region) {
+   for_each_mem_region(region) {
struct resource *res;
 
res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index aa76ec2d359b..b6246768479d 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -516,7 +516,7 @@ static void __init numa_clear_kernel_node_hotplug(void)
 *   memory ranges, because quirks such as trim_snb_memory()
 *   reserve specific pages for Sandy Bridge graphics. ]
 */
-   for_each_memblock(reserved, mb_region) {
+   for_each_reserved_mem_region(mb_region) {
int nid = memblock_get_region_node(mb_region);
 
if (nid != MAX_NUMNODES)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 354078713cd1..ef131255cedc 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -553,9 +553,22 @@ static inline unsigned long 
memblock_region_reserved_end_pfn(const struct memblo
return PFN_UP(reg->base + reg->size);
 }
 
-#define for_each_memblock(memblock_type, region)   
\
-   for (region = memblock.memblock_type.regions;   
\
-region < (memblock.memblock_type.regions + 
memblock.memblock_type.cnt);\
+/**
+ * for_each_mem_region - iterate over memory regions
+ * @region: loop variable
+ */
+#define for_each_mem_region(region)

[PATCH v3 16/17] memblock: implement for_each_reserved_mem_region() using __next_mem_region()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

Iteration over memblock.reserved with for_each_reserved_mem_region() used
__next_reserved_mem_region() that implemented a subset of
__next_mem_region().

Use __for_each_mem_range() and, essentially, __next_mem_region() with
appropriate parameters to reduce code duplication.

While on it, rename for_each_reserved_mem_region() to
for_each_reserved_mem_range() for consistency.

Signed-off-by: Mike Rapoport 
Acked-by: Miguel Ojeda  # .clang-format
---
 .clang-format|  2 +-
 arch/arm64/kernel/setup.c|  2 +-
 drivers/irqchip/irq-gic-v3-its.c |  2 +-
 include/linux/memblock.h | 12 +++
 mm/memblock.c| 56 
 5 files changed, 27 insertions(+), 47 deletions(-)

diff --git a/.clang-format b/.clang-format
index 3e42a8e4df73..2b77cc419b97 100644
--- a/.clang-format
+++ b/.clang-format
@@ -267,7 +267,7 @@ ForEachMacros:
   - 'for_each_process_thread'
   - 'for_each_property_of_node'
   - 'for_each_registered_fb'
-  - 'for_each_reserved_mem_region'
+  - 'for_each_reserved_mem_range'
   - 'for_each_rtd_codec_dais'
   - 'for_each_rtd_codec_dais_rollback'
   - 'for_each_rtd_components'
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c4c9bad1b8..a986c6f8ab42 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -257,7 +257,7 @@ static int __init reserve_memblock_reserved_regions(void)
if (!memblock_is_region_reserved(mem->start, mem_size))
continue;
 
-   for_each_reserved_mem_region(j, &r_start, &r_end) {
+   for_each_reserved_mem_range(j, &r_start, &r_end) {
resource_size_t start, end;
 
start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start);
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 95f097448f97..ca5c470ed0d0 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2192,7 +2192,7 @@ static bool gic_check_reserved_range(phys_addr_t addr, 
unsigned long size)
 
addr_end = addr + size - 1;
 
-   for_each_reserved_mem_region(i, &start, &end) {
+   for_each_reserved_mem_range(i, &start, &end) {
if (addr >= start && addr_end <= end)
return true;
}
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 15ed119701c1..354078713cd1 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -132,9 +132,6 @@ void __next_mem_range_rev(u64 *idx, int nid, enum 
memblock_flags flags,
  struct memblock_type *type_b, phys_addr_t *out_start,
  phys_addr_t *out_end, int *out_nid);
 
-void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
-   phys_addr_t *out_end);
-
 void __memblock_free_late(phys_addr_t base, phys_addr_t size);
 
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -224,7 +221,7 @@ static inline void __next_physmem_range(u64 *idx, struct 
memblock_type *type,
 MEMBLOCK_NONE, p_start, p_end, NULL)
 
 /**
- * for_each_reserved_mem_region - iterate over all reserved memblock areas
+ * for_each_reserved_mem_range - iterate over all reserved memblock areas
  * @i: u64 used as loop variable
  * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
@@ -232,10 +229,9 @@ static inline void __next_physmem_range(u64 *idx, struct 
memblock_type *type,
  * Walks over reserved areas of memblock. Available as soon as memblock
  * is initialized.
  */
-#define for_each_reserved_mem_region(i, p_start, p_end)
\
-   for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end);   \
-i != (u64)ULLONG_MAX;  \
-__next_reserved_mem_region(&i, p_start, p_end))
+#define for_each_reserved_mem_range(i, p_start, p_end) \
+   __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE, \
+MEMBLOCK_NONE, p_start, p_end, NULL)
 
 static inline bool memblock_is_hotpluggable(struct memblock_region *m)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index eb4f866bea34..d0be57acccf2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -132,6 +132,14 @@ struct memblock_type physmem = {
 };
 #endif
 
+/*
+ * keep a pointer to  in the text section to use it in
+ * __next_mem_range() and its helpers.
+ *  For architectures that do not keep memblock data after init, this
+ * pointer will be reset to NULL at memblock_discard()
+ */
+static __refdata struct memblock_type *memblock_memory = &memblock.memory;
+
 #define for_each_memblock_type(i, memblock_type, rgn)  \
for (i = 0, rgn = &memblock_type->regions[0];   \
 i < memblock_type->cnt;\
@@ -399,6 +407,8 @@ void __init memblock_discard(void)

[PATCH v3 15/17] memblock: remove unused memblock_mem_size()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

The only user of memblock_mem_size() was x86 setup code, it is gone now and
memblock_mem_size() function can be removed.

Signed-off-by: Mike Rapoport 
Reviewed-by: Baoquan He 
---
 include/linux/memblock.h |  1 -
 mm/memblock.c| 15 ---
 2 files changed, 16 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 27c3b84d1615..15ed119701c1 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -481,7 +481,6 @@ static inline bool memblock_bottom_up(void)
 
 phys_addr_t memblock_phys_mem_size(void);
 phys_addr_t memblock_reserved_size(void);
-phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
diff --git a/mm/memblock.c b/mm/memblock.c
index 567e454ce0a1..eb4f866bea34 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1657,21 +1657,6 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
 }
 
-phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
-{
-   unsigned long pages = 0;
-   unsigned long start_pfn, end_pfn;
-   int i;
-
-   for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
-   start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
-   end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
-   pages += end_pfn - start_pfn;
-   }
-
-   return PFN_PHYS(pages);
-}
-
 /* lowest address */
 phys_addr_t __init_memblock memblock_start_of_DRAM(void)
 {
-- 
2.26.2



[PATCH v3 14/17] x86/setup: simplify reserve_crashkernel()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

* Replace magic numbers with defines
* Replace memblock_find_in_range() + memblock_reserve() with
  memblock_phys_alloc_range()
* Stop checking for low memory size in reserve_crashkernel_low(). The
  allocation from limited range will anyway fail if there is no enough
  memory, so there is no need for extra traversal of memblock.memory

Signed-off-by: Mike Rapoport 
Acked-by: Ingo Molnar 
Reviewed-by: Baoquan He 
---
 arch/x86/kernel/setup.c | 40 ++--
 1 file changed, 14 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2cac39ade2e3..52e83ba607b3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -420,13 +420,13 @@ static int __init reserve_crashkernel_low(void)
 {
 #ifdef CONFIG_X86_64
unsigned long long base, low_base = 0, low_size = 0;
-   unsigned long total_low_mem;
+   unsigned long low_mem_limit;
int ret;
 
-   total_low_mem = memblock_mem_size(1UL << (32 - PAGE_SHIFT));
+   low_mem_limit = min(memblock_phys_mem_size(), CRASH_ADDR_LOW_MAX);
 
/* crashkernel=Y,low */
-   ret = parse_crashkernel_low(boot_command_line, total_low_mem, 
&low_size, &base);
+   ret = parse_crashkernel_low(boot_command_line, low_mem_limit, 
&low_size, &base);
if (ret) {
/*
 * two parts from kernel/dma/swiotlb.c:
@@ -444,23 +444,17 @@ static int __init reserve_crashkernel_low(void)
return 0;
}
 
-   low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
+   low_base = memblock_phys_alloc_range(low_size, CRASH_ALIGN, 0, 
CRASH_ADDR_LOW_MAX);
if (!low_base) {
pr_err("Cannot reserve %ldMB crashkernel low memory, please try 
smaller size.\n",
   (unsigned long)(low_size >> 20));
return -ENOMEM;
}
 
-   ret = memblock_reserve(low_base, low_size);
-   if (ret) {
-   pr_err("%s: Error reserving crashkernel low memblock.\n", 
__func__);
-   return ret;
-   }
-
-   pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System 
low RAM: %ldMB)\n",
+   pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (low 
RAM limit: %ldMB)\n",
(unsigned long)(low_size >> 20),
(unsigned long)(low_base >> 20),
-   (unsigned long)(total_low_mem >> 20));
+   (unsigned long)(low_mem_limit >> 20));
 
crashk_low_res.start = low_base;
crashk_low_res.end   = low_base + low_size - 1;
@@ -504,13 +498,13 @@ static void __init reserve_crashkernel(void)
 * unless "crashkernel=size[KMG],high" is specified.
 */
if (!high)
-   crash_base = memblock_find_in_range(CRASH_ALIGN,
-   CRASH_ADDR_LOW_MAX,
-   crash_size, CRASH_ALIGN);
+   crash_base = memblock_phys_alloc_range(crash_size,
+   CRASH_ALIGN, CRASH_ALIGN,
+   CRASH_ADDR_LOW_MAX);
if (!crash_base)
-   crash_base = memblock_find_in_range(CRASH_ALIGN,
-   CRASH_ADDR_HIGH_MAX,
-   crash_size, CRASH_ALIGN);
+   crash_base = memblock_phys_alloc_range(crash_size,
+   CRASH_ALIGN, CRASH_ALIGN,
+   CRASH_ADDR_HIGH_MAX);
if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable 
area found.\n");
return;
@@ -518,19 +512,13 @@ static void __init reserve_crashkernel(void)
} else {
unsigned long long start;
 
-   start = memblock_find_in_range(crash_base,
-  crash_base + crash_size,
-  crash_size, 1 << 20);
+   start = memblock_phys_alloc_range(crash_size, SZ_1M, crash_base,
+ crash_base + crash_size);
if (start != crash_base) {
pr_info("crashkernel reservation failed - memory is in 
use.\n");
return;
}
}
-   ret = memblock_reserve(crash_base, crash_size);
-   if (ret) {
-   pr_err("%s: Error reserving crashkernel memblock.\n", __func__);
-   return;
-   }
 
if (crash_base >= (1ULL << 32) && reserve_crashkernel_low()) {
memblock_free(crash_base, crash_size);
-- 
2.26.2



[PATCH v3 13/17] x86/setup: simplify initrd relocation and reservation

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

Currently, initrd image is reserved very early during setup and then it
might be relocated and re-reserved after the initial physical memory
mapping is created. The "late" reservation of memblock verifies that mapped
memory size exceeds the size of initrd, then checks whether the relocation
is required and, if so, relocates initrd to a new memory area allocated from
memblock and frees the old location.

The check for memory size is excessive as memblock allocation will anyway
fail if there is not enough memory. Besides, there is no point in allocating
memory from memblock using memblock_find_in_range() + memblock_reserve()
when there exists memblock_phys_alloc_range() with required functionality.

Remove the redundant check and simplify memblock allocation.

Signed-off-by: Mike Rapoport 
Acked-by: Ingo Molnar 
Reviewed-by: Baoquan He 
---
 arch/x86/kernel/setup.c | 16 +++-
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3511736fbc74..2cac39ade2e3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -263,16 +263,12 @@ static void __init relocate_initrd(void)
u64 area_size = PAGE_ALIGN(ramdisk_size);
 
/* We need to move the initrd down into directly mapped mem */
-   relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
-  area_size, PAGE_SIZE);
-
+   relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
  ramdisk_size);
 
-   /* Note: this includes all the mem currently occupied by
-  the initrd, we rely on that fact to keep the data intact. */
-   memblock_reserve(relocated_ramdisk, area_size);
initrd_start = relocated_ramdisk + PAGE_OFFSET;
initrd_end   = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
@@ -299,13 +295,13 @@ static void __init early_reserve_initrd(void)
 
memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
 }
+
 static void __init reserve_initrd(void)
 {
/* Assume only end is not page aligned */
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size  = get_ramdisk_size();
u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-   u64 mapped_size;
 
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -313,12 +309,6 @@ static void __init reserve_initrd(void)
 
initrd_start = 0;
 
-   mapped_size = memblock_mem_size(max_pfn_mapped);
-   if (ramdisk_size >= (mapped_size>>1))
-   panic("initrd too large to handle, "
-  "disabling initrd (%lld needed, %lld available)\n",
-  ramdisk_size, mapped_size>>1);
-
printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
ramdisk_end - 1);
 
-- 
2.26.2



[PATCH v3 12/17] arch, drivers: replace for_each_membock() with for_each_mem_range()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

There are several occurrences of the following pattern:

for_each_memblock(memory, reg) {
start = __pfn_to_phys(memblock_region_memory_base_pfn(reg);
end = __pfn_to_phys(memblock_region_memory_end_pfn(reg));

/* do something with start and end */
}

Using for_each_mem_range() iterator is more appropriate in such cases and
allows simpler and cleaner code.

Signed-off-by: Mike Rapoport 
---
 arch/arm/kernel/setup.c  | 18 ++---
 arch/arm/mm/mmu.c| 39 ++
 arch/arm/mm/pmsa-v7.c| 23 ++-
 arch/arm/mm/pmsa-v8.c| 17 
 arch/arm/xen/mm.c|  7 ++--
 arch/arm64/mm/kasan_init.c   | 10 ++---
 arch/arm64/mm/mmu.c  | 11 ++
 arch/c6x/kernel/setup.c  |  9 +++--
 arch/microblaze/mm/init.c|  9 +++--
 arch/mips/cavium-octeon/dma-octeon.c | 12 +++---
 arch/mips/kernel/setup.c | 31 +++
 arch/openrisc/mm/init.c  |  8 ++--
 arch/powerpc/kernel/fadump.c | 50 +++-
 arch/powerpc/kexec/file_load_64.c| 10 ++---
 arch/powerpc/mm/book3s64/hash_utils.c| 16 
 arch/powerpc/mm/book3s64/radix_pgtable.c | 10 ++---
 arch/powerpc/mm/kasan/kasan_init_32.c|  8 ++--
 arch/powerpc/mm/mem.c| 16 +---
 arch/powerpc/mm/pgtable_32.c |  8 ++--
 arch/riscv/mm/init.c | 25 +---
 arch/riscv/mm/kasan_init.c   | 10 ++---
 arch/s390/kernel/setup.c | 23 +++
 arch/s390/mm/vmem.c  |  7 ++--
 arch/sparc/mm/init_64.c  | 12 ++
 drivers/bus/mvebu-mbus.c | 12 +++---
 25 files changed, 194 insertions(+), 207 deletions(-)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index d8e18cdd96d3..3f65d0ac9f63 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -843,19 +843,25 @@ early_param("mem", early_mem);
 
 static void __init request_standard_resources(const struct machine_desc *mdesc)
 {
-   struct memblock_region *region;
+   phys_addr_t start, end, res_end;
struct resource *res;
+   u64 i;
 
kernel_code.start   = virt_to_phys(_text);
kernel_code.end = virt_to_phys(__init_begin - 1);
kernel_data.start   = virt_to_phys(_sdata);
kernel_data.end = virt_to_phys(_end - 1);
 
-   for_each_memblock(memory, region) {
-   phys_addr_t start = 
__pfn_to_phys(memblock_region_memory_base_pfn(region));
-   phys_addr_t end = 
__pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
+   for_each_mem_range(i, , ) {
unsigned long boot_alias_start;
 
+   /*
+* In memblock, end points to the first byte after the
+* range while in resourses, end points to the last byte in
+* the range.
+*/
+   res_end = end - 1;
+
/*
 * Some systems have a special memory alias which is only
 * used for booting.  We need to advertise this region to
@@ -869,7 +875,7 @@ static void __init request_standard_resources(const struct 
machine_desc *mdesc)
  __func__, sizeof(*res));
res->name = "System RAM (boot alias)";
res->start = boot_alias_start;
-   res->end = phys_to_idmap(end);
+   res->end = phys_to_idmap(res_end);
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
request_resource(_resource, res);
}
@@ -880,7 +886,7 @@ static void __init request_standard_resources(const struct 
machine_desc *mdesc)
  sizeof(*res));
res->name  = "System RAM";
res->start = start;
-   res->end = end;
+   res->end = res_end;
res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
request_resource(_resource, res);
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index c36f977b2ccb..698cc740c6b8 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1154,9 +1154,8 @@ phys_addr_t arm_lowmem_limit __initdata = 0;
 
 void __init adjust_lowmem_bounds(void)
 {
-   phys_addr_t memblock_limit = 0;
-   u64 vmalloc_limit;
-   struct memblock_region *reg;
+   phys_addr_t block_start, block_end, memblock_limit = 0;
+   u64 vmalloc_limit, i;
phys_addr_t lowmem_limit = 0;
 
/*
@@ -1172,26 +1171,18 @@ void __init adjust_lowmem_bounds(void)
 * The first usable region must be PMD aligned. Mark its start
 * as MEMBLOCK_NOMAP if it isn't
 */
-   for_each_memblock(memory, 

[PATCH v3 11/17] arch, mm: replace for_each_memblock() with for_each_mem_pfn_range()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

There are several occurrences of the following pattern:

for_each_memblock(memory, reg) {
start_pfn = memblock_region_memory_base_pfn(reg);
end_pfn = memblock_region_memory_end_pfn(reg);

/* do something with start_pfn and end_pfn */
}

Rather than iterate over all memblock.memory regions and each time query
for their start and end PFNs, use for_each_mem_pfn_range() iterator to get
simpler and clearer code.

Signed-off-by: Mike Rapoport 
Reviewed-by: Baoquan He 
---
 arch/arm/mm/init.c   | 11 ---
 arch/arm64/mm/init.c | 11 ---
 arch/powerpc/kernel/fadump.c | 11 ++-
 arch/powerpc/mm/mem.c| 15 ---
 arch/powerpc/mm/numa.c   |  7 ++-
 arch/s390/mm/page-states.c   |  6 ++
 arch/sh/mm/init.c|  9 +++--
 mm/memblock.c|  6 ++
 mm/sparse.c  | 10 --
 9 files changed, 35 insertions(+), 51 deletions(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 50a5a30a78ff..45f9d5ec2360 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -299,16 +299,14 @@ free_memmap(unsigned long start_pfn, unsigned long 
end_pfn)
  */
 static void __init free_unused_memmap(void)
 {
-   unsigned long start, prev_end = 0;
-   struct memblock_region *reg;
+   unsigned long start, end, prev_end = 0;
+   int i;
 
/*
 * This relies on each bank being in address order.
 * The banks are sorted previously in bootmem_init().
 */
-   for_each_memblock(memory, reg) {
-   start = memblock_region_memory_base_pfn(reg);
-
+   for_each_mem_pfn_range(i, MAX_NUMNODES, , , NULL) {
 #ifdef CONFIG_SPARSEMEM
/*
 * Take care not to free memmap entries that don't exist
@@ -336,8 +334,7 @@ static void __init free_unused_memmap(void)
 * memmap entries are valid from the bank end aligned to
 * MAX_ORDER_NR_PAGES.
 */
-   prev_end = ALIGN(memblock_region_memory_end_pfn(reg),
-MAX_ORDER_NR_PAGES);
+   prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
}
 
 #ifdef CONFIG_SPARSEMEM
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 481d22c32a2e..f0bf86d81622 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -471,12 +471,10 @@ static inline void free_memmap(unsigned long start_pfn, 
unsigned long end_pfn)
  */
 static void __init free_unused_memmap(void)
 {
-   unsigned long start, prev_end = 0;
-   struct memblock_region *reg;
-
-   for_each_memblock(memory, reg) {
-   start = __phys_to_pfn(reg->base);
+   unsigned long start, end, prev_end = 0;
+   int i;
 
+   for_each_mem_pfn_range(i, MAX_NUMNODES, , , NULL) {
 #ifdef CONFIG_SPARSEMEM
/*
 * Take care not to free memmap entries that don't exist due
@@ -496,8 +494,7 @@ static void __init free_unused_memmap(void)
 * memmap entries are valid from the bank end aligned to
 * MAX_ORDER_NR_PAGES.
 */
-   prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
-MAX_ORDER_NR_PAGES);
+   prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
}
 
 #ifdef CONFIG_SPARSEMEM
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 10ebb4bf71ad..e469b150be21 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1242,14 +1242,15 @@ static void fadump_free_reserved_memory(unsigned long 
start_pfn,
  */
 static void fadump_release_reserved_area(u64 start, u64 end)
 {
-   u64 tstart, tend, spfn, epfn;
-   struct memblock_region *reg;
+   u64 tstart, tend, spfn, epfn, reg_spfn, reg_epfn, i;
 
spfn = PHYS_PFN(start);
epfn = PHYS_PFN(end);
-   for_each_memblock(memory, reg) {
-   tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg));
-   tend   = min_t(u64, epfn, memblock_region_memory_end_pfn(reg));
+
+   for_each_mem_pfn_range(i, MAX_NUMNODES, _spfn, _epfn, NULL) {
+   tstart = max_t(u64, spfn, reg_spfn);
+   tend   = min_t(u64, epfn, reg_epfn);
+
if (tstart < tend) {
fadump_free_reserved_memory(tstart, tend);
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 42e25874f5a8..80df329f180e 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -184,15 +184,16 @@ void __init initmem_init(void)
 /* mark pages that don't exist as nosave */
 static int __init mark_nonram_nosave(void)
 {
-   struct memblock_region *reg, *prev = NULL;
+   unsigned long spfn, epfn, prev = 0;
+   int i;
 
-   for_each_memblock(memory, reg) {
-   if (prev &&
-   memblock_region_memory_end_pfn(prev) 

[PATCH v3 10/17] memblock: reduce number of parameters in for_each_mem_range()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

Currently for_each_mem_range() and for_each_mem_range_rev() iterators are
the most generic way to traverse memblock regions. As such, they have 8
parameters and they are hardly convenient to users. Most users choose to
utilize one of their wrappers and the only user that actually needs most of
the parameters is memblock itself.

To avoid yet another naming for memblock iterators, rename the existing
for_each_mem_range[_rev]() to __for_each_mem_range[_rev]() and add a new
for_each_mem_range[_rev]() wrappers with only index, start and end
parameters.

The new wrapper nicely fits into init_unavailable_mem() and will be used in
upcoming changes to simplify memblock traversals.

Signed-off-by: Mike Rapoport 
Acked-by: Thomas Bogendoerfer   # MIPS
---
 .clang-format  |  2 ++
 arch/arm64/kernel/machine_kexec_file.c |  6 ++--
 arch/powerpc/kexec/file_load_64.c  |  6 ++--
 include/linux/memblock.h   | 41 +++---
 mm/page_alloc.c|  3 +-
 5 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/.clang-format b/.clang-format
index a0a96088c74f..3e42a8e4df73 100644
--- a/.clang-format
+++ b/.clang-format
@@ -205,7 +205,9 @@ ForEachMacros:
   - 'for_each_memblock_type'
   - 'for_each_memcg_cache_index'
   - 'for_each_mem_pfn_range'
+  - '__for_each_mem_range'
   - 'for_each_mem_range'
+  - '__for_each_mem_range_rev'
   - 'for_each_mem_range_rev'
   - 'for_each_migratetype_order'
   - 'for_each_msi_entry'
diff --git a/arch/arm64/kernel/machine_kexec_file.c 
b/arch/arm64/kernel/machine_kexec_file.c
index 361a1143e09e..5b0e67b93cdc 100644
--- a/arch/arm64/kernel/machine_kexec_file.c
+++ b/arch/arm64/kernel/machine_kexec_file.c
@@ -215,8 +215,7 @@ static int prepare_elf_headers(void **addr, unsigned long 
*sz)
phys_addr_t start, end;
 
nr_ranges = 1; /* for exclusion of crashkernel region */
-   for_each_mem_range(i, , NULL, NUMA_NO_NODE,
-   MEMBLOCK_NONE, , , NULL)
+   for_each_mem_range(i, , )
nr_ranges++;
 
cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL);
@@ -225,8 +224,7 @@ static int prepare_elf_headers(void **addr, unsigned long 
*sz)
 
cmem->max_nr_ranges = nr_ranges;
cmem->nr_ranges = 0;
-   for_each_mem_range(i, , NULL, NUMA_NO_NODE,
-   MEMBLOCK_NONE, , , NULL) {
+   for_each_mem_range(i, , ) {
cmem->ranges[cmem->nr_ranges].start = start;
cmem->ranges[cmem->nr_ranges].end = end - 1;
cmem->nr_ranges++;
diff --git a/arch/powerpc/kexec/file_load_64.c 
b/arch/powerpc/kexec/file_load_64.c
index 53bb71e3a2e1..2c9d908eab96 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -250,8 +250,7 @@ static int __locate_mem_hole_top_down(struct kexec_buf 
*kbuf,
phys_addr_t start, end;
u64 i;
 
-   for_each_mem_range_rev(i, , NULL, NUMA_NO_NODE,
-  MEMBLOCK_NONE, , , NULL) {
+   for_each_mem_range_rev(i, , ) {
/*
 * memblock uses [start, end) convention while it is
 * [start, end] here. Fix the off-by-one to have the
@@ -350,8 +349,7 @@ static int __locate_mem_hole_bottom_up(struct kexec_buf 
*kbuf,
phys_addr_t start, end;
u64 i;
 
-   for_each_mem_range(i, , NULL, NUMA_NO_NODE,
-  MEMBLOCK_NONE, , , NULL) {
+   for_each_mem_range(i, , ) {
/*
 * memblock uses [start, end) convention while it is
 * [start, end] here. Fix the off-by-one to have the
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 47a76e237fca..27c3b84d1615 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -162,7 +162,7 @@ static inline void __next_physmem_range(u64 *idx, struct 
memblock_type *type,
 #endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */
 
 /**
- * for_each_mem_range - iterate through memblock areas from type_a and not
+ * __for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
  * @i: u64 used as loop variable
  * @type_a: ptr to memblock_type to iterate
@@ -173,7 +173,7 @@ static inline void __next_physmem_range(u64 *idx, struct 
memblock_type *type,
  * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @p_nid: ptr to int for nid of the range, can be %NULL
  */
-#define for_each_mem_range(i, type_a, type_b, nid, flags,  \
+#define __for_each_mem_range(i, type_a, type_b, nid, flags,\
   p_start, p_end, p_nid)   \
for (i = 0, __next_mem_range(, nid, flags, type_a, type_b,\
 p_start, p_end, p_nid);\
@@ -182,7 +182,7 @@ static inline void 

[PATCH v3 09/17] memblock: make memblock_debug and related functionality private

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

The only user of memblock_dbg() outside memblock was s390 setup code and it
is converted to use pr_debug() instead.
This allows to stop exposing memblock_debug and memblock_dbg() to the rest
of the kernel.

Signed-off-by: Mike Rapoport 
Reviewed-by: Baoquan He 
---
 arch/s390/kernel/setup.c |  4 ++--
 include/linux/memblock.h | 12 +---
 mm/memblock.c| 13 +++--
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index e600f6953d7c..68089eabae27 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -776,8 +776,8 @@ static void __init memblock_add_mem_detect_info(void)
unsigned long start, end;
int i;
 
-   memblock_dbg("physmem info source: %s (%hhd)\n",
-get_mem_info_source(), mem_detect.info_source);
+   pr_debug("physmem info source: %s (%hhd)\n",
+get_mem_info_source(), mem_detect.info_source);
/* keep memblock lists close to the kernel */
memblock_set_bottom_up(true);
for_each_mem_detect_block(i, , ) {
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 550faf69fc1c..47a76e237fca 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -86,7 +86,6 @@ struct memblock {
 };
 
 extern struct memblock memblock;
-extern int memblock_debug;
 
 #ifndef CONFIG_ARCH_KEEP_MEMBLOCK
 #define __init_memblock __meminit
@@ -98,9 +97,6 @@ void memblock_discard(void);
 static inline void memblock_discard(void) {}
 #endif
 
-#define memblock_dbg(fmt, ...) \
-   if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
-
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
   phys_addr_t size, phys_addr_t align);
 void memblock_allow_resize(void);
@@ -476,13 +472,7 @@ bool memblock_is_region_memory(phys_addr_t base, 
phys_addr_t size);
 bool memblock_is_reserved(phys_addr_t addr);
 bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
 
-extern void __memblock_dump_all(void);
-
-static inline void memblock_dump_all(void)
-{
-   if (memblock_debug)
-   __memblock_dump_all();
-}
+void memblock_dump_all(void);
 
 /**
  * memblock_set_current_limit - Set the current allocation limit to allow
diff --git a/mm/memblock.c b/mm/memblock.c
index 59f3998ae5db..799513f3d6a9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,7 +137,10 @@ struct memblock_type physmem = {
 i < memblock_type->cnt;\
 i++, rgn = _type->regions[i])
 
-int memblock_debug __initdata_memblock;
+#define memblock_dbg(fmt, ...) \
+   if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+
+static int memblock_debug __initdata_memblock;
 static bool system_has_some_mirror __initdata_memblock = false;
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
@@ -1920,7 +1923,7 @@ static void __init_memblock memblock_dump(struct 
memblock_type *type)
}
 }
 
-void __init_memblock __memblock_dump_all(void)
+static void __init_memblock __memblock_dump_all(void)
 {
pr_info("MEMBLOCK configuration:\n");
pr_info(" memory size = %pa reserved size = %pa\n",
@@ -1934,6 +1937,12 @@ void __init_memblock __memblock_dump_all(void)
 #endif
 }
 
+void __init_memblock memblock_dump_all(void)
+{
+   if (memblock_debug)
+   __memblock_dump_all();
+}
+
 void __init memblock_allow_resize(void)
 {
memblock_can_resize = 1;
-- 
2.26.2



[PATCH v3 08/17] memblock: make for_each_memblock_type() iterator private

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

for_each_memblock_type() is not used outside mm/memblock.c, move it there
from include/linux/memblock.h

Signed-off-by: Mike Rapoport 
Reviewed-by: Baoquan He 
---
 include/linux/memblock.h | 5 -
 mm/memblock.c| 5 +
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 9d925db0d355..550faf69fc1c 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -552,11 +552,6 @@ static inline unsigned long 
memblock_region_reserved_end_pfn(const struct memblo
 region < (memblock.memblock_type.regions + 
memblock.memblock_type.cnt);\
 region++)
 
-#define for_each_memblock_type(i, memblock_type, rgn)  \
-   for (i = 0, rgn = _type->regions[0];   \
-i < memblock_type->cnt;\
-i++, rgn = _type->regions[i])
-
 extern void *alloc_large_system_hash(const char *tablename,
 unsigned long bucketsize,
 unsigned long numentries,
diff --git a/mm/memblock.c b/mm/memblock.c
index 45f198750be9..59f3998ae5db 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -132,6 +132,11 @@ struct memblock_type physmem = {
 };
 #endif
 
+#define for_each_memblock_type(i, memblock_type, rgn)  \
+   for (i = 0, rgn = _type->regions[0];   \
+i < memblock_type->cnt;\
+i++, rgn = _type->regions[i])
+
 int memblock_debug __initdata_memblock;
 static bool system_has_some_mirror __initdata_memblock = false;
 static int memblock_can_resize __initdata_memblock;
-- 
2.26.2



[PATCH v3 07/17] microblaze: drop unneeded NUMA and sparsemem initializations

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

microblaze supports neither NUMA nor SPARSEMEM, so there is no point in
calling the memblock_set_node() and
sparse_memory_present_with_active_regions() functions during microblaze
memory initialization.

Remove these calls and the surrounding code.

Signed-off-by: Mike Rapoport 
---
 arch/microblaze/mm/init.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 0880a003573d..49e0c241f9b1 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -105,9 +105,8 @@ static void __init paging_init(void)
 
 void __init setup_memory(void)
 {
-   struct memblock_region *reg;
-
 #ifndef CONFIG_MMU
+   struct memblock_region *reg;
u32 kernel_align_start, kernel_align_size;
 
/* Find main memory where is the kernel */
@@ -161,17 +160,6 @@ void __init setup_memory(void)
pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn);
pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn);
 
-   /* Add active regions with valid PFNs */
-   for_each_memblock(memory, reg) {
-   unsigned long start_pfn, end_pfn;
-
-   start_pfn = memblock_region_memory_base_pfn(reg);
-   end_pfn = memblock_region_memory_end_pfn(reg);
-   memblock_set_node(start_pfn << PAGE_SHIFT,
- (end_pfn - start_pfn) << PAGE_SHIFT,
- , 0);
-   }
-
paging_init();
 }
 
-- 
2.26.2



[PATCH v3 06/17] riscv: drop unneeded node initialization

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

RISC-V does not (yet) support NUMA, and for UMA architectures node 0 is
used implicitly during early memory initialization.

There is no need to call memblock_set_node(), remove this call and the
surrounding code.

Signed-off-by: Mike Rapoport 
---
 arch/riscv/mm/init.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 787c75f751a5..0485cfaacc72 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -191,15 +191,6 @@ void __init setup_bootmem(void)
early_init_fdt_scan_reserved_mem();
memblock_allow_resize();
memblock_dump_all();
-
-   for_each_memblock(memory, reg) {
-   unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
-   unsigned long end_pfn = memblock_region_memory_end_pfn(reg);
-
-   memblock_set_node(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn),
- , 0);
-   }
 }
 
 #ifdef CONFIG_MMU
-- 
2.26.2



[PATCH v3 05/17] h8300, nds32, openrisc: simplify detection of memory extents

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

Instead of traversing memblock.memory regions to find memory_start and
memory_end, simply query memblock_{start,end}_of_DRAM().

Signed-off-by: Mike Rapoport 
Acked-by: Stafford Horne 
---
 arch/h8300/kernel/setup.c| 8 +++-
 arch/nds32/kernel/setup.c| 8 ++--
 arch/openrisc/kernel/setup.c | 9 ++---
 3 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c
index 28ac88358a89..0281f92eea3d 100644
--- a/arch/h8300/kernel/setup.c
+++ b/arch/h8300/kernel/setup.c
@@ -74,17 +74,15 @@ static void __init bootmem_init(void)
memory_end = memory_start = 0;
 
/* Find main memory where is the kernel */
-   for_each_memblock(memory, region) {
-   memory_start = region->base;
-   memory_end = region->base + region->size;
-   }
+   memory_start = memblock_start_of_DRAM();
+   memory_end = memblock_end_of_DRAM();
 
if (!memory_end)
panic("No memory!");
 
/* setup bootmem globals (we use no_bootmem, but mm still depends on 
this) */
min_low_pfn = PFN_UP(memory_start);
-   max_low_pfn = PFN_DOWN(memblock_end_of_DRAM());
+   max_low_pfn = PFN_DOWN(memory_end);
max_pfn = max_low_pfn;
 
memblock_reserve(__pa(_stext), _end - _stext);
diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c
index a066efbe53c0..c356e484dcab 100644
--- a/arch/nds32/kernel/setup.c
+++ b/arch/nds32/kernel/setup.c
@@ -249,12 +249,8 @@ static void __init setup_memory(void)
memory_end = memory_start = 0;
 
/* Find main memory where is the kernel */
-   for_each_memblock(memory, region) {
-   memory_start = region->base;
-   memory_end = region->base + region->size;
-   pr_info("%s: Memory: 0x%x-0x%x\n", __func__,
-   memory_start, memory_end);
-   }
+   memory_start = memblock_start_of_DRAM();
+   memory_end = memblock_end_of_DRAM();
 
if (!memory_end) {
panic("No memory!");
diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c
index b18e775f8be3..5a5940f7ebb1 100644
--- a/arch/openrisc/kernel/setup.c
+++ b/arch/openrisc/kernel/setup.c
@@ -48,17 +48,12 @@ static void __init setup_memory(void)
unsigned long ram_start_pfn;
unsigned long ram_end_pfn;
phys_addr_t memory_start, memory_end;
-   struct memblock_region *region;
 
memory_end = memory_start = 0;
 
/* Find main memory where is the kernel, we assume its the only one */
-   for_each_memblock(memory, region) {
-   memory_start = region->base;
-   memory_end = region->base + region->size;
-   printk(KERN_INFO "%s: Memory: 0x%x-0x%x\n", __func__,
-  memory_start, memory_end);
-   }
+   memory_start = memblock_start_of_DRAM();
+   memory_end = memblock_end_of_DRAM();
 
if (!memory_end) {
panic("No memory!");
-- 
2.26.2



[PATCH v3 04/17] arm64: numa: simplify dummy_numa_init()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

dummy_numa_init() loops over memblock.memory and passes nid=0 to
numa_add_memblk() which essentially wraps memblock_set_node(). However,
memblock_set_node() can cope with entire memory span itself, so the loop
over memblock.memory regions is redundant.

Using a single call to memblock_set_node() rather than a loop also fixes an
issue with a buggy ACPI firmware in which the SRAT table covers some but
not all of the memory in the EFI memory map.

Jonathan Cameron says:

  This issue can be easily triggered by having an SRAT table which fails
  to cover all elements of the EFI memory map.

  This firmware error is detected and a warning printed. e.g.
  "NUMA: Warning: invalid memblk node 64 [mem 0x24000-0x27fff]"
  At that point we fall back to dummy_numa_init().

  However, the failed ACPI init has left us with our memblocks all broken
  up as we split them when trying to assign them to NUMA nodes.

  We then iterate over the memblocks and add them to node 0.

  numa_add_memblk() calls memblock_set_node() which merges regions that
  were previously split up during the earlier attempt to add them to different
  nodes during parsing of SRAT.

  This means elements are moved in the memblock array and we can end up
  in a different memblock after the call to numa_add_memblk().
  Result is:

  Unable to handle kernel paging request at virtual address 3a40
  Mem abort info:
ESR = 0x9604
EC = 0x25: DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
  Data abort info:
ISV = 0, ISS = 0x0004
CM = 0, WnR = 0
  [3a40] user address but active_mm is swapper
  Internal error: Oops: 9604 [#1] PREEMPT SMP

  ...

  Call trace:
sparse_init_nid+0x5c/0x2b0
sparse_init+0x138/0x170
bootmem_init+0x80/0xe0
setup_arch+0x2a0/0x5fc
start_kernel+0x8c/0x648

Replace the loop with a single call to memblock_set_node() to the entire
memory.

Signed-off-by: Mike Rapoport 
Acked-by: Jonathan Cameron 
Acked-by: Catalin Marinas 
---
 arch/arm64/mm/numa.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 73f8b49d485c..8a97cd3d2dfe 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -423,19 +423,16 @@ static int __init numa_init(int (*init_func)(void))
  */
 static int __init dummy_numa_init(void)
 {
+   phys_addr_t start = memblock_start_of_DRAM();
+   phys_addr_t end = memblock_end_of_DRAM();
int ret;
-   struct memblock_region *mblk;
 
if (numa_off)
pr_info("NUMA disabled\n"); /* Forced off on command line. */
-   pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n",
-   memblock_start_of_DRAM(), memblock_end_of_DRAM() - 1);
-
-   for_each_memblock(memory, mblk) {
-   ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size);
-   if (!ret)
-   continue;
+   pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1);
 
+   ret = numa_add_memblk(0, start, end);
+   if (ret) {
pr_err("NUMA init failed\n");
return ret;
}
-- 
2.26.2



[PATCH v3 03/17] arm, xtensa: simplify initialization of high memory pages

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

The function free_highpages() in both arm and xtensa essentially open-code
for_each_free_mem_range() loop to detect high memory pages that were not
reserved and that should be initialized and passed to the buddy allocator.

Replace open-coded implementation of for_each_free_mem_range() with usage
of memblock API to simplify the code.

Signed-off-by: Mike Rapoport 
Reviewed-by: Max Filippov   # xtensa
Tested-by: Max Filippov # xtensa
---
 arch/arm/mm/init.c| 48 +++--
 arch/xtensa/mm/init.c | 55 ---
 2 files changed, 18 insertions(+), 85 deletions(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 000c1b48e973..50a5a30a78ff 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -347,61 +347,29 @@ static void __init free_unused_memmap(void)
 #endif
 }
 
-#ifdef CONFIG_HIGHMEM
-static inline void free_area_high(unsigned long pfn, unsigned long end)
-{
-   for (; pfn < end; pfn++)
-   free_highmem_page(pfn_to_page(pfn));
-}
-#endif
-
 static void __init free_highpages(void)
 {
 #ifdef CONFIG_HIGHMEM
unsigned long max_low = max_low_pfn;
-   struct memblock_region *mem, *res;
+   phys_addr_t range_start, range_end;
+   u64 i;
 
/* set highmem page free */
-   for_each_memblock(memory, mem) {
-   unsigned long start = memblock_region_memory_base_pfn(mem);
-   unsigned long end = memblock_region_memory_end_pfn(mem);
+   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+   _start, _end, NULL) {
+   unsigned long start = PHYS_PFN(range_start);
+   unsigned long end = PHYS_PFN(range_end);
 
/* Ignore complete lowmem entries */
if (end <= max_low)
continue;
 
-   if (memblock_is_nomap(mem))
-   continue;
-
/* Truncate partial highmem entries */
if (start < max_low)
start = max_low;
 
-   /* Find and exclude any reserved regions */
-   for_each_memblock(reserved, res) {
-   unsigned long res_start, res_end;
-
-   res_start = memblock_region_reserved_base_pfn(res);
-   res_end = memblock_region_reserved_end_pfn(res);
-
-   if (res_end < start)
-   continue;
-   if (res_start < start)
-   res_start = start;
-   if (res_start > end)
-   res_start = end;
-   if (res_end > end)
-   res_end = end;
-   if (res_start != start)
-   free_area_high(start, res_start);
-   start = res_end;
-   if (start == end)
-   break;
-   }
-
-   /* And now free anything which remains */
-   if (start < end)
-   free_area_high(start, end);
+   for (; start < end; start++)
+   free_highmem_page(pfn_to_page(start));
}
 #endif
 }
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index a05b306cf371..ad9d59d93f39 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -79,67 +79,32 @@ void __init zones_init(void)
free_area_init(max_zone_pfn);
 }
 
-#ifdef CONFIG_HIGHMEM
-static void __init free_area_high(unsigned long pfn, unsigned long end)
-{
-   for (; pfn < end; pfn++)
-   free_highmem_page(pfn_to_page(pfn));
-}
-
 static void __init free_highpages(void)
 {
+#ifdef CONFIG_HIGHMEM
unsigned long max_low = max_low_pfn;
-   struct memblock_region *mem, *res;
+   phys_addr_t range_start, range_end;
+   u64 i;
 
-   reset_all_zones_managed_pages();
/* set highmem page free */
-   for_each_memblock(memory, mem) {
-   unsigned long start = memblock_region_memory_base_pfn(mem);
-   unsigned long end = memblock_region_memory_end_pfn(mem);
+   for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
+   &range_start, &range_end, NULL) {
+   unsigned long start = PHYS_PFN(range_start);
+   unsigned long end = PHYS_PFN(range_end);
 
/* Ignore complete lowmem entries */
if (end <= max_low)
continue;
 
-   if (memblock_is_nomap(mem))
-   continue;
-
/* Truncate partial highmem entries */
if (start < max_low)
start = max_low;
 
-   /* Find and exclude any reserved regions */
-   for_each_memblock(reserved, res) {
-   unsigned long res_start, 

[PATCH v3 02/17] dma-contiguous: simplify cma_early_percent_memory()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

The memory size calculation in cma_early_percent_memory() traverses
memblock.memory rather than simply call memblock_phys_mem_size(). The
comment in that function suggests that at some point there should have been
call to memblock_analyze() before memblock_phys_mem_size() could be used.
As of now, there is no memblock_analyze() at all and
memblock_phys_mem_size() can be used as soon as cold-plug memory is
registered with memblock.

Replace loop over memblock.memory with a call to memblock_phys_mem_size().

Signed-off-by: Mike Rapoport 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Baoquan He 
---
 kernel/dma/contiguous.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index cff7e60968b9..0369fd5fda8f 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -73,16 +73,7 @@ early_param("cma", early_cma);
 
 static phys_addr_t __init __maybe_unused cma_early_percent_memory(void)
 {
-   struct memblock_region *reg;
-   unsigned long total_pages = 0;
-
-   /*
-* We cannot use memblock_phys_mem_size() here, because
-* memblock_analyze() has not been called yet.
-*/
-   for_each_memblock(memory, reg)
-   total_pages += memblock_region_memory_end_pfn(reg) -
-  memblock_region_memory_base_pfn(reg);
+   unsigned long total_pages = PHYS_PFN(memblock_phys_mem_size());
 
return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT;
 }
-- 
2.26.2



[PATCH v3 01/17] KVM: PPC: Book3S HV: simplify kvm_cma_reserve()

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

The memory size calculation in kvm_cma_reserve() traverses memblock.memory
rather than simply call memblock_phys_mem_size(). The comment in that
function suggests that at some point there should have been call to
memblock_analyze() before memblock_phys_mem_size() could be used.
As of now, there is no memblock_analyze() at all and
memblock_phys_mem_size() can be used as soon as cold-plug memory is
registered with memblock.

Replace loop over memblock.memory with a call to memblock_phys_mem_size().

Signed-off-by: Mike Rapoport 
---
 arch/powerpc/kvm/book3s_hv_builtin.c | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c 
b/arch/powerpc/kvm/book3s_hv_builtin.c
index 073617ce83e0..8f58dd20b362 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -95,23 +95,15 @@ EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
 void __init kvm_cma_reserve(void)
 {
unsigned long align_size;
-   struct memblock_region *reg;
-   phys_addr_t selected_size = 0;
+   phys_addr_t selected_size;
 
/*
 * We need CMA reservation only when we are in HV mode
 */
if (!cpu_has_feature(CPU_FTR_HVMODE))
return;
-   /*
-* We cannot use memblock_phys_mem_size() here, because
-* memblock_analyze() has not been called yet.
-*/
-   for_each_memblock(memory, reg)
-   selected_size += memblock_region_memory_end_pfn(reg) -
-memblock_region_memory_base_pfn(reg);
 
-   selected_size = (selected_size * kvm_cma_resv_ratio / 100) << 
PAGE_SHIFT;
+   selected_size = PAGE_ALIGN(memblock_phys_mem_size() * 
kvm_cma_resv_ratio / 100);
if (selected_size) {
pr_info("%s: reserving %ld MiB for global area\n", __func__,
 (unsigned long)selected_size / SZ_1M);
-- 
2.26.2



[PATCH v3 00/17] memblock: seasonal cleaning^w cleanup

2020-08-18 Thread Mike Rapoport
From: Mike Rapoport 

Hi,

These patches simplify several uses of memblock iterators and hide some of
the memblock implementation details from the rest of the system.

The patches are on top of v5.9-rc1

v3 changes:
* rebase on v5.9-rc1, as the result this required some non-trivial changes
  in patches 10 and 16. I didn't add Baoquan's Reviewed-by to these
  patches, but I kept Thomas and Miguel
* Add Acked-by from Thomas and Miguel as there were changes in MIPS and
  only trivial changes in .clang-format
* Added Reviewed-by from Baoquan except for the patches 10 and 16
* Fixed misc build errors and warnings reported by kbuild bot
* Updated PowerPC KVM reservation size (patch 2), as per Daniel's comment

v2 changes:
* replace for_each_memblock() with two versions, one for memblock.memory
  and another one for memblock.reserved
* fix overzealous cleanup of powerpc fadump: keep the traversal over the
  memblocks, but use better suited iterators
* don't remove traversal over memblock.reserved in x86 numa cleanup but
  replace for_each_memblock() with new for_each_reserved_mem_region()
* simplify ramdisk and crash kernel allocations on x86
* drop more redundant and unused code: __next_reserved_mem_region() and
  memblock_mem_size()
* add description of numa initialization fix on arm64 (thanks Jonathan)
* add Acked and Reviewed tags

Mike Rapoport (17):
  KVM: PPC: Book3S HV: simplify kvm_cma_reserve()
  dma-contiguous: simplify cma_early_percent_memory()
  arm, xtensa: simplify initialization of high memory pages
  arm64: numa: simplify dummy_numa_init()
  h8300, nds32, openrisc: simplify detection of memory extents
  riscv: drop unneeded node initialization
  microblaze: drop unneeded NUMA and sparsemem initializations
  memblock: make for_each_memblock_type() iterator private
  memblock: make memblock_debug and related functionality private
  memblock: reduce number of parameters in for_each_mem_range()
  arch, mm: replace for_each_memblock() with for_each_mem_pfn_range()
  arch, drivers: replace for_each_memblock() with for_each_mem_range()
  x86/setup: simplify initrd relocation and reservation
  x86/setup: simplify reserve_crashkernel()
  memblock: remove unused memblock_mem_size()
  memblock: implement for_each_reserved_mem_region() using
__next_mem_region()
  memblock: use separate iterators for memory and reserved regions

 .clang-format|  5 +-
 arch/arm/kernel/setup.c  | 18 +++--
 arch/arm/mm/init.c   | 59 +++
 arch/arm/mm/mmu.c| 39 --
 arch/arm/mm/pmsa-v7.c| 23 +++---
 arch/arm/mm/pmsa-v8.c| 17 ++---
 arch/arm/xen/mm.c|  7 +-
 arch/arm64/kernel/machine_kexec_file.c   |  6 +-
 arch/arm64/kernel/setup.c|  4 +-
 arch/arm64/mm/init.c | 11 +--
 arch/arm64/mm/kasan_init.c   | 10 +--
 arch/arm64/mm/mmu.c  | 11 +--
 arch/arm64/mm/numa.c | 15 ++--
 arch/c6x/kernel/setup.c  |  9 ++-
 arch/h8300/kernel/setup.c|  8 +-
 arch/microblaze/mm/init.c| 21 ++
 arch/mips/cavium-octeon/dma-octeon.c | 12 +--
 arch/mips/kernel/setup.c | 31 
 arch/mips/netlogic/xlp/setup.c   |  2 +-
 arch/nds32/kernel/setup.c|  8 +-
 arch/openrisc/kernel/setup.c |  9 +--
 arch/openrisc/mm/init.c  |  8 +-
 arch/powerpc/kernel/fadump.c | 57 +++---
 arch/powerpc/kexec/file_load_64.c| 16 ++--
 arch/powerpc/kvm/book3s_hv_builtin.c | 12 +--
 arch/powerpc/mm/book3s64/hash_utils.c| 16 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c | 10 +--
 arch/powerpc/mm/kasan/kasan_init_32.c|  8 +-
 arch/powerpc/mm/mem.c| 33 
 arch/powerpc/mm/numa.c   |  7 +-
 arch/powerpc/mm/pgtable_32.c |  8 +-
 arch/riscv/mm/init.c | 36 +++--
 arch/riscv/mm/kasan_init.c   | 10 +--
 arch/s390/kernel/setup.c | 27 ---
 arch/s390/mm/page-states.c   |  6 +-
 arch/s390/mm/vmem.c  |  7 +-
 arch/sh/mm/init.c|  9 +--
 arch/sparc/mm/init_64.c  | 12 +--
 arch/x86/kernel/setup.c  | 56 +-
 arch/x86/mm/numa.c   |  2 +-
 arch/xtensa/mm/init.c| 55 +++---
 drivers/bus/mvebu-mbus.c | 12 +--
 drivers/irqchip/irq-gic-v3-its.c |  2 +-
 include/linux/memblock.h | 88 +-
 kernel/dma/contiguous.c  | 11 +--
 mm/memblock.c| 95 ++--
 mm/page_alloc.c  | 11 ++-
 mm/sparse.c  | 10 +--
 48 files changed, 387 insertions(+), 562 deletions(-)

-- 
2.26.2

Flushing transparent hugepages

2020-08-18 Thread Matthew Wilcox
If your arch does not support HAVE_ARCH_TRANSPARENT_HUGEPAGE, you can
stop reading now.  Although maybe you're curious about adding support.

$ git grep -w HAVE_ARCH_TRANSPARENT_HUGEPAGE arch
arch/Kconfig:config HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/arc/Kconfig:config HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/arm/Kconfig:config HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/arm64/Kconfig: select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/mips/Kconfig:  select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 
CPU_SUPPORTS_HUGEPAGES
arch/powerpc/platforms/Kconfig.cputype: select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/s390/Kconfig:  select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/sparc/Kconfig: select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/x86/Kconfig:   select HAVE_ARCH_TRANSPARENT_HUGEPAGE

If your arch does not implement flush_dcache_page(), you can also
stop reading.

$ for i in arc arm arm64 mips powerpc s390 sparc x86; do git grep -l 
flush_dcache_page arch/$i/include; done
arch/arc/include/asm/cacheflush.h
arch/arm/include/asm/cacheflush.h
arch/arm64/include/asm/cacheflush.h
arch/mips/include/asm/cacheflush.h
arch/powerpc/include/asm/cacheflush.h
arch/sparc/include/asm/cacheflush_32.h
arch/sparc/include/asm/cacheflush_64.h
arch/sparc/include/asm/pgtable_64.h

OK, so we're down to arc, arm, arm64, mips, powerpc & sparc.  Hi!  ;-)

I'm working on adding THP support for filesystems with storage backing
and part of that is expanding the definition of THP to be any order
(ie any power of two of PAGE_SIZE).  Now, shmem already has some calls
to flush_dcache_page() for THPs, for example:

if (sgp != SGP_WRITE && !PageUptodate(page)) {
struct page *head = compound_head(page);
int i;

for (i = 0; i < compound_nr(head); i++) {
clear_highpage(head + i);
flush_dcache_page(head + i);
}
SetPageUptodate(head);
}

where you'll be called once for each subpage.  But ... these are error
paths, and I'm sure you all diligently test cache coherency scenarios
of error paths in shmem ... right?

For example, arm64 seems confused in this scenario:

void flush_dcache_page(struct page *page)
{
if (test_bit(PG_dcache_clean, &page->flags))
clear_bit(PG_dcache_clean, &page->flags);
}

...

void __sync_icache_dcache(pte_t pte)
{
struct page *page = pte_page(pte);

if (!test_and_set_bit(PG_dcache_clean, &page->flags))
sync_icache_aliases(page_address(page), page_size(page));
}

So arm64 keeps track on a per-page basis which ones have been flushed.
page_size() will return PAGE_SIZE if called on a tail page or regular
page, but will return PAGE_SIZE << compound_order if called on a head
page.  So this will either over-flush, or it's missing the opportunity
to clear the bits on all the subpages which have now been flushed.

PowerPC has special handling of hugetlbfs pages.  Well, that's what
the config option says, but actually it handles THP as well.  If
the config option is enabled.

#ifdef CONFIG_HUGETLB_PAGE
if (PageCompound(page)) {
flush_dcache_icache_hugepage(page);
return;
}
#endif

By the way, THPs can be mapped askew -- that is, at an offset which
means you can't use a PMD to map a PMD sized page.

Anyway, we don't really have consensus between the various architectures
on how to handle either THPs or hugetlb pages.  It's not contemplated
in Documentation/core-api/cachetlb.rst so there's no real surprise
we've diverged.

What would you _like_ to see?  Would you rather flush_dcache_page()
were called once for each subpage, or would you rather maintain
the page-needs-flushing state once per compound page?  We could also
introduce flush_dcache_thp() if some architectures would prefer it one
way and one the other, although that brings into question what to do
for hugetlbfs pages.

It might not be a bad idea to centralise the handling of all this stuff
somewhere.  Sounds like the kind of thing Arnd would like to do ;-) I'll
settle for getting enough clear feedback about what the various arch
maintainers want that I can write a documentation update for cachetlb.rst.


Re: [Virtual ppce500] virtio_gpu virtio0: swiotlb buffer is full

2020-08-18 Thread Christian Zigotzky

On 18 August 2020 at 10:18 am, Gerd Hoffmann wrote:

On Mon, Aug 17, 2020 at 11:19:58AM +0200, Christian Zigotzky wrote:

Hello

I compiled the RC1 of kernel 5.9 today. Unfortunately the issue with the
VirtIO-GPU (see below) still exists. Therefore we still need the patch (see
below) for using the VirtIO-GPU in a virtual e5500 PPC64 QEMU machine.

It is fixed in drm-misc-next (commit 51c3b0cc32d2e17581fce5b487ee95bbe9e8270a).

Will cherry-pick into drm-misc-fixes once the branch is 5.9-based, which
in turn should bring it to 5.9-rc2 or -rc3.

take care,
   Gerd


Hello Gerd,

I compiled a new kernel with the latest DRM misc updates today. The 
patch is included in these updates.


This kernel works with the VirtIO-GPU in a virtual e5500 QEMU/KVM HV 
machine on my X5000.


Unfortunately I can only use the VirtIO-GPU (Monitor: Red Hat, Inc. 8") 
with a resolution of 640x480. If I set a higher resolution then the 
guest disables the monitor.

I can use higher resolutions with the stable kernel 5.8 and the VirtIO-GPU.

Please check the latest DRM updates.

Thanks,
Christian


  1   2   >