[RFC] hwmon: (ibmpowernv) Add support for reset-history sensors

2017-07-25 Thread Shilpasri G Bhat
On P9, the OCC allows clearing the sensor min/max history. This patch
exports an attribute to reset the history which, when set, clears the
history of all the sensors owned by CSM and belonging to the chip.

Signed-off-by: Shilpasri G Bhat 
---
This patch is on top of this patchset:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1453891.html

This patch creates a non-standard attribute called reset_historyX
which clears the lowest and highest values of all the sensors (power,
temperature, voltage) belonging to the chip.

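For illustration, userspace triggers the clear by writing "1" to the new
attribute. A minimal sketch (the hwmon index and chip number in the path are
assumptions for illustration only; they vary per system):

#include <stdio.h>

int main(void)
{
        /* Path is illustrative only; the hwmon index is system-dependent */
        FILE *f = fopen("/sys/class/hwmon/hwmon0/reset_history1", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fputs("1\n", f);        /* store_reset_history() only acts on "1" */
        fclose(f);
        return 0;
}
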
 drivers/hwmon/ibmpowernv.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/drivers/hwmon/ibmpowernv.c b/drivers/hwmon/ibmpowernv.c
index 5ccdd0b..611e472 100644
--- a/drivers/hwmon/ibmpowernv.c
+++ b/drivers/hwmon/ibmpowernv.c
@@ -51,6 +51,7 @@ enum sensors {
POWER_SUPPLY,
POWER_INPUT,
CURRENT,
+   RESET_HISTORY,
MAX_SENSOR_TYPE,
 };
 
@@ -78,6 +79,7 @@ enum sensors {
{ "in"},
{ "power" },
{ "curr"  },
+   { "reset_history" },
 };
 
 struct sensor_data {
@@ -126,6 +128,25 @@ static ssize_t show_label(struct device *dev, struct 
device_attribute *devattr,
return sprintf(buf, "%s\n", sdata->label);
 }
 
+static ssize_t store_reset_history(struct device *dev,
+  struct device_attribute *devattr,
+  const char *buf, size_t count)
+{
+   struct sensor_data *sdata = container_of(devattr, struct sensor_data,
+dev_attr);
+   int rc;
+   int reset;
+
+   rc = kstrtoint(buf, 0, &reset);
+   if (rc)
+   return rc;
+
+   if (reset == 1)
+   rc = opal_sensor_groups_clear_history(sdata->id);
+
+   return rc ? rc : count;
+}
+
 static int __init get_logical_cpu(int hwcpu)
 {
int cpu;
@@ -458,6 +479,16 @@ static int create_device_attrs(struct platform_device 
*pdev)
 
create_hwmon_attr(&sdata[count], attr_name, show_sensor);
 
+   if (type == RESET_HISTORY) {
+   snprintf(sdata[count].name, MAX_ATTR_LEN, "%s%d",
+sensor_groups[type].name,
+sdata[count].hwmon_index);
+
+   sdata[count].dev_attr.attr.mode = 0220;
+   sdata[count].dev_attr.store = store_reset_history;
+   sdata[count].dev_attr.show = NULL;
+   }
+
pgroups[type]->attrs[sensor_groups[type].attr_count++] =
&sdata[count++].dev_attr.attr;
 
-- 
1.8.3.1



[PATCH] powerpc/powernv/pci: Return failure for some uses of dma_set_mask()

2017-07-25 Thread Alistair Popple
Commit 8e3f1b1d8255 ("powerpc/powernv/pci: Enable 64-bit devices to access
>4GB DMA space") introduced the ability for PCI device drivers to request a
DMA mask between 64 and 32 bits and actually get a mask greater than
32-bits. However, currently, if certain machine-configuration-dependent
conditions are not met, the code silently falls back to a 32-bit mask.

This makes it hard for device drivers to detect which mask they actually
got. Instead we should return an error when the request cannot be
fulfilled, which allows drivers to either fall back or implement other
workarounds as documented in DMA-API-HOWTO.txt.

Signed-off-by: Alistair Popple 
---

Ideally we should do the same thing for 64-bit masks as well; however,
there are a lot more drivers requesting a DMA mask of 64 bits, so it is a
much larger task to audit them all to see if they behave correctly when
dma_set_mask() fails. Such an audit may be required, as previously these
calls would not have failed on PPC64 (although they may have on other
architectures).

A quick bit of grepping didn't turn up many drivers requesting 33-63 bit
DMA masks. Most of the ones that do are specific to other hardware, although
there were a couple of more generic drivers requesting e.g. 48 bits. However,
those tested the return value of dma_set_mask() and took appropriate
action (falling back to 32 bits) in the case of failure.

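For reference, the driver-side pattern described above looks roughly like
this (a hedged sketch; the function name and the 48-bit mask are made up for
illustration, not taken from any particular driver):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

static int example_setup_dma(struct pci_dev *pdev)
{
        /*
         * Prefer a 48-bit mask; if the platform rejects it (as the hunk
         * below now does for 33-63 bit requests), fall back to 32 bits.
         */
        if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(48))) {
                if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
                        return -EIO;
                dev_info(&pdev->dev, "falling back to 32-bit DMA\n");
        }
        return 0;
}
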
- Alistair

arch/powerpc/platforms/powernv/pci-ioda.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 4376135..b900eb1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1852,6 +1852,14 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev 
*pdev, u64 dma_mask)
/* 4GB offset bypasses 32-bit space */
set_dma_offset(&pdev->dev, (1ULL << 32));
set_dma_ops(&pdev->dev, &dma_direct_ops);
+   } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
+   /*
+* Fail the request if a DMA mask between 32 and 64 bits
+* was requested but couldn't be fulfilled. Ideally we
+* would do this for 64-bits but historically we have
+* always fallen back to 32-bits.
+*/
+   return -ENOMEM;
} else {
dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
set_dma_ops(&pdev->dev, &dma_iommu_ops);
-- 
2.1.4



[PATCH V8 3/3] powernv: Add support to clear sensor groups data

2017-07-25 Thread Shilpasri G Bhat
Adds support for clearing different sensor groups. OCC in-band sensor
groups such as CSM, Profiler and Job Scheduler can be cleared using this
driver. The min/max values of all sensors belonging to these sensor
groups will be cleared.

Signed-off-by: Shilpasri G Bhat 
---
Changes from V7:
- s/send_occ_command/opal_sensor_groups_clear_history

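For illustration, userspace could drive the new ioctl roughly as below. This
is only a sketch: the device node path is an assumption (it is not spelled
out in the truncated opal-occ.c hunk), and the ioctl number is re-declared
locally because the posted uapi header uses the kernel-internal u32 type:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

/* Mirrors OPAL_OCC_IOCTL_CLEAR_SENSOR_GROUPS from asm/opal-occ.h */
#define EXAMPLE_CLEAR_SENSOR_GROUPS     _IOR('o', 1, __u32)

int main(void)
{
        int fd = open("/dev/occ0", O_RDWR);     /* hypothetical device node */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ioctl(fd, EXAMPLE_CLEAR_SENSOR_GROUPS) < 0)
                perror("ioctl");
        close(fd);
        return 0;
}
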
 arch/powerpc/include/asm/opal-api.h|   3 +-
 arch/powerpc/include/asm/opal.h|   2 +
 arch/powerpc/include/uapi/asm/opal-occ.h   |  23 ++
 arch/powerpc/platforms/powernv/Makefile|   2 +-
 arch/powerpc/platforms/powernv/opal-occ.c  | 109 +
 arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
 arch/powerpc/platforms/powernv/opal.c  |   3 +
 7 files changed, 141 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/include/uapi/asm/opal-occ.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-occ.c

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 0d37315..342738a 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -195,7 +195,8 @@
 #define OPAL_SET_POWERCAP  153
 #define OPAL_GET_PSR   154
 #define OPAL_SET_PSR   155
-#define OPAL_LAST  155
+#define OPAL_SENSOR_GROUPS_CLEAR   156
+#define OPAL_LAST  156
 
 /* Device tree flags */
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 58b30a4..92db6af 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -271,6 +271,7 @@ int64_t opal_xive_set_vp_info(uint64_t vp,
 int opal_set_powercap(u32 handle, int token, u32 pcap);
 int opal_get_power_shifting_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shifting_ratio(u32 handle, int token, u32 psr);
+int opal_sensor_groups_clear(u32 group_hndl, int token);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
@@ -351,6 +352,7 @@ static inline int opal_get_async_rc(struct opal_msg msg)
 
 void opal_powercap_init(void);
 void opal_psr_init(void);
+int opal_sensor_groups_clear_history(u32 handle);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/uapi/asm/opal-occ.h 
b/arch/powerpc/include/uapi/asm/opal-occ.h
new file mode 100644
index 000..97c45e2
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/opal-occ.h
@@ -0,0 +1,23 @@
+/*
+ * OPAL OCC command interface
+ * Supported on POWERNV platform
+ *
+ * (C) Copyright IBM 2017
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_ASM_POWERPC_OPAL_OCC_H_
+#define _UAPI_ASM_POWERPC_OPAL_OCC_H_
+
+#define OPAL_OCC_IOCTL_CLEAR_SENSOR_GROUPS _IOR('o', 1, u32)
+
+#endif /* _UAPI_ASM_POWERPC_OPAL_OCC_H */
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 9ed7d33..f193b33 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,7 +2,7 @@ obj-y   += setup.o opal-wrappers.o opal.o 
opal-async.o idle.o
 obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
opal-sensor.o
 obj-y  += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
-obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o
+obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o opal-occ.o
 
 obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o
diff --git a/arch/powerpc/platforms/powernv/opal-occ.c 
b/arch/powerpc/platforms/powernv/opal-occ.c
new file mode 100644
index 000..d1d4b28
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-occ.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright IBM Corporation 2017
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "opal-occ: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 

[PATCH V8 2/3] powernv: Add support to set power-shifting-ratio

2017-07-25 Thread Shilpasri G Bhat
This patch adds support to set the power-shifting ratio between CPU and
GPU, which is used by the OCC power-capping algorithm.

Signed-off-by: Shilpasri G Bhat 
---
Changes from V7:
- Replaced sscanf with kstrtoint

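The sscanf()-to-kstrtoint() change boils down to the following pattern in
the store handlers (a minimal sketch of the pattern, not the exact hunk):
kstrtoint() rejects malformed input and returns an errno, whereas sscanf()
silently ignores trailing garbage.

#include <linux/kernel.h>
#include <linux/kobject.h>

static ssize_t psr_store_example(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count)
{
        int psr, ret;

        ret = kstrtoint(buf, 0, &psr);  /* replaces sscanf(buf, "%d", &psr) */
        if (ret)
                return ret;

        /* ... issue the async OPAL call with 'psr' ... */
        return count;
}
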
 arch/powerpc/include/asm/opal-api.h|   4 +-
 arch/powerpc/include/asm/opal.h|   3 +
 arch/powerpc/platforms/powernv/Makefile|   2 +-
 arch/powerpc/platforms/powernv/opal-psr.c  | 169 +
 arch/powerpc/platforms/powernv/opal-wrappers.S |   2 +
 arch/powerpc/platforms/powernv/opal.c  |   3 +
 6 files changed, 181 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-psr.c

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index c3e0c4a..0d37315 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -193,7 +193,9 @@
 #define OPAL_NPU_MAP_LPAR  148
 #define OPAL_GET_POWERCAP  152
 #define OPAL_SET_POWERCAP  153
-#define OPAL_LAST  153
+#define OPAL_GET_PSR   154
+#define OPAL_SET_PSR   155
+#define OPAL_LAST  155
 
 /* Device tree flags */
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ec2087c..58b30a4 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -269,6 +269,8 @@ int64_t opal_xive_set_vp_info(uint64_t vp,
 int64_t opal_xive_dump(uint32_t type, uint32_t id);
 int opal_get_powercap(u32 handle, int token, u32 *pcap);
 int opal_set_powercap(u32 handle, int token, u32 pcap);
+int opal_get_power_shifting_ratio(u32 handle, int token, u32 *psr);
+int opal_set_power_shifting_ratio(u32 handle, int token, u32 psr);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
@@ -348,6 +350,7 @@ static inline int opal_get_async_rc(struct opal_msg msg)
 void opal_wake_poller(void);
 
 void opal_powercap_init(void);
+void opal_psr_init(void);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index e79f806..9ed7d33 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,7 +2,7 @@ obj-y   += setup.o opal-wrappers.o opal.o 
opal-async.o idle.o
 obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
opal-sensor.o
 obj-y  += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
-obj-y  += opal-kmsg.o opal-powercap.o
+obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o
 
 obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o
diff --git a/arch/powerpc/platforms/powernv/opal-psr.c 
b/arch/powerpc/platforms/powernv/opal-psr.c
new file mode 100644
index 000..07e3f78
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-psr.c
@@ -0,0 +1,169 @@
+/*
+ * PowerNV OPAL Power-Shifting-Ratio interface
+ *
+ * Copyright 2017 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "opal-psr: " fmt
+
+#include 
+#include 
+#include 
+
+#include 
+
+DEFINE_MUTEX(psr_mutex);
+
+static struct kobject *psr_kobj;
+
+struct psr_attr {
+   u32 handle;
+   struct kobj_attribute attr;
+};
+
+static struct psr_attr *psr_attrs;
+
+static ssize_t psr_show(struct kobject *kobj, struct kobj_attribute *attr,
+   char *buf)
+{
+   struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr);
+   struct opal_msg msg;
+   int psr, ret, token;
+
+   token = opal_async_get_token_interruptible();
+   if (token < 0) {
+   pr_devel("Failed to get token\n");
+   return token;
+   }
+
+   mutex_lock(&psr_mutex);
+   ret = opal_get_power_shifting_ratio(psr_attr->handle, token, &psr);
+   switch (ret) {
+   case OPAL_ASYNC_COMPLETION:
+   ret = opal_async_wait_response(token, &msg);
+   if (ret) {
+   pr_devel("Failed to wait for the async response %d\n",
+ret);
+   goto out;
+   }
+   ret = opal_error_code(opal_get_async_rc(msg));
+   if (!ret)
+   ret = sprintf(buf, "%u\n", be32_to_cpu(psr));
+   break;
+   case OPAL_SUCCESS:
+   ret = sprintf(buf, "%u\n", be32_to_cpu(psr));
+   break;
+   

[PATCH V8 1/3] powernv: powercap: Add support for powercap framework

2017-07-25 Thread Shilpasri G Bhat
Adds a generic powercap framework to change the system powercap
in-band through the OPAL-OCC command/response interface.

Signed-off-by: Shilpasri G Bhat 
---
Changes from V7:
- Replaced sscanf with kstrtoint

 arch/powerpc/include/asm/opal-api.h|   5 +-
 arch/powerpc/include/asm/opal.h|   4 +
 arch/powerpc/platforms/powernv/Makefile|   2 +-
 arch/powerpc/platforms/powernv/opal-powercap.c | 237 +
 arch/powerpc/platforms/powernv/opal-wrappers.S |   2 +
 arch/powerpc/platforms/powernv/opal.c  |   4 +
 6 files changed, 252 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-powercap.c

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 3130a73..c3e0c4a 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -42,6 +42,7 @@
 #define OPAL_I2C_STOP_ERR  -24
 #define OPAL_XIVE_PROVISIONING -31
 #define OPAL_XIVE_FREE_ACTIVE  -32
+#define OPAL_TIMEOUT   -33
 
 /* API Tokens (in r0) */
 #define OPAL_INVALID_CALL -1
@@ -190,7 +191,9 @@
 #define OPAL_NPU_INIT_CONTEXT  146
 #define OPAL_NPU_DESTROY_CONTEXT   147
 #define OPAL_NPU_MAP_LPAR  148
-#define OPAL_LAST  148
+#define OPAL_GET_POWERCAP  152
+#define OPAL_SET_POWERCAP  153
+#define OPAL_LAST  153
 
 /* Device tree flags */
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 588fb1c..ec2087c 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -267,6 +267,8 @@ int64_t opal_xive_set_vp_info(uint64_t vp,
 int64_t opal_xive_free_irq(uint32_t girq);
 int64_t opal_xive_sync(uint32_t type, uint32_t id);
 int64_t opal_xive_dump(uint32_t type, uint32_t id);
+int opal_get_powercap(u32 handle, int token, u32 *pcap);
+int opal_set_powercap(u32 handle, int token, u32 pcap);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
@@ -345,6 +347,8 @@ static inline int opal_get_async_rc(struct opal_msg msg)
 
 void opal_wake_poller(void);
 
+void opal_powercap_init(void);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index b5d98cb..e79f806 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -2,7 +2,7 @@ obj-y   += setup.o opal-wrappers.o opal.o 
opal-async.o idle.o
 obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
opal-sensor.o
 obj-y  += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
-obj-y  += opal-kmsg.o
+obj-y  += opal-kmsg.o opal-powercap.o
 
 obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o
diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c 
b/arch/powerpc/platforms/powernv/opal-powercap.c
new file mode 100644
index 000..7c57f4b
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-powercap.c
@@ -0,0 +1,237 @@
+/*
+ * PowerNV OPAL Powercap interface
+ *
+ * Copyright 2017 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "opal-powercap: " fmt
+
+#include 
+#include 
+#include 
+
+#include 
+
+DEFINE_MUTEX(powercap_mutex);
+
+static struct kobject *powercap_kobj;
+
+struct powercap_attr {
+   u32 handle;
+   struct kobj_attribute attr;
+};
+
+static struct attribute_group *pattr_groups;
+static struct powercap_attr *pcap_attrs;
+
+static ssize_t powercap_show(struct kobject *kobj, struct kobj_attribute *attr,
+char *buf)
+{
+   struct powercap_attr *pcap_attr = container_of(attr,
+   struct powercap_attr, attr);
+   struct opal_msg msg;
+   u32 pcap;
+   int ret, token;
+
+   token = opal_async_get_token_interruptible();
+   if (token < 0) {
+   pr_devel("Failed to get token\n");
+   return token;
+   }
+
+   mutex_lock(&powercap_mutex);
+   ret = opal_get_powercap(pcap_attr->handle, token, &pcap);
+   switch (ret) {
+   case OPAL_ASYNC_COMPLETION:
+   ret = opal_async_wait_response(token, &msg);
+   if (ret) {
+   pr_devel("Failed to wait for the async response %d\n",
+ret);
+   goto out;
+   }
+  

[PATCH V8 0/3] powernv : Add support for OPAL-OCC command/response interface

2017-07-25 Thread Shilpasri G Bhat
In P9, the OCC (On-Chip Controller) supports a shared-memory-based
command-response interface. Within the shared memory there is an OPAL
command buffer and an OCC response buffer that can be used to send
in-band commands to the OCC. The following commands are supported:

1) Set system powercap
2) Set CPU-GPU power shifting ratio
3) Clear min/max for OCC sensor groups

The skiboot patch for this interface is posted here:
https://lists.ozlabs.org/pipermail/skiboot/2017-July/008352.html
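
All three patches drive the interface through the existing OPAL async token
machinery. Condensed, the call pattern looks like this (a sketch with the
locking and error paths trimmed, using opal_set_powercap() as a stand-in for
any of the new calls):

#include <asm/opal.h>
#include <asm/opal-api.h>

static int opal_occ_call_example(u32 handle, u32 pcap)
{
        struct opal_msg msg;
        int token, ret;

        token = opal_async_get_token_interruptible();
        if (token < 0)
                return token;

        ret = opal_set_powercap(handle, token, pcap);
        if (ret == OPAL_ASYNC_COMPLETION) {
                /* Wait for the OCC to answer through the response buffer */
                ret = opal_async_wait_response(token, &msg);
                if (!ret)
                        ret = opal_error_code(opal_get_async_rc(msg));
        } else {
                ret = opal_error_code(ret);
        }

        opal_async_release_token(token);
        return ret;
}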

Shilpasri G Bhat (3):
  powernv: powercap: Add support for powercap framework
  powernv: Add support to set power-shifting-ratio
  powernv: Add support to clear sensor groups data

 arch/powerpc/include/asm/opal-api.h|   8 +-
 arch/powerpc/include/asm/opal.h|   9 +
 arch/powerpc/include/uapi/asm/opal-occ.h   |  23 +++
 arch/powerpc/platforms/powernv/Makefile|   2 +-
 arch/powerpc/platforms/powernv/opal-occ.c  | 109 
 arch/powerpc/platforms/powernv/opal-powercap.c | 237 +
 arch/powerpc/platforms/powernv/opal-psr.c  | 169 ++
 arch/powerpc/platforms/powernv/opal-wrappers.S |   5 +
 arch/powerpc/platforms/powernv/opal.c  |  10 ++
 9 files changed, 570 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/include/uapi/asm/opal-occ.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-occ.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-powercap.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-psr.c

-- 
1.8.3.1



Re: [ltc-interlock] [PATCH] hugetlb: Fix hot remove for PowerVM

2017-07-25 Thread Aneesh Kumar K.V
Diego Domingos  writes:

> PowerVM has support for 16G huge pages, so the function
> gigantic_page_supported needs to check whether the running system
> is a pseries machine and whether any gigantic pages are
> registered. If so, we must return true - avoiding a segmentation
> fault when hot removing memory sections within huge pages.

That is not correct. Those pages are not in the zone/buddy lists. What
gigantic_page_supported checks is whether we can allocate pages at runtime.
We scan the zones, check whether the range we are looking for is free
for allocation, and if so we do a runtime allocation of those pages. This
may include page migration too. But these pages come from buddy. We
don't use the buddy allocator directly here, because the order of
allocation we are looking for is more than the max order.

>
> Signed-off-by: Diego Domingos 
> ---
>  arch/powerpc/include/asm/book3s/64/hugetlb.h | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
> b/arch/powerpc/include/asm/book3s/64/hugetlb.h
> index 5c28bd6..49e43dd 100644
> --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
> +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
> @@ -1,3 +1,5 @@
> +#include 
> +
>  #ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_H
>  #define _ASM_POWERPC_BOOK3S_64_HUGETLB_H
>  /*
> @@ -54,9 +56,19 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct 
> vm_area_struct *vma,
>  #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
>  static inline bool gigantic_page_supported(void)
>  {
> +struct hstate *h;
> +
>   if (radix_enabled())
>   return true;
> - return false;
> +
> +/* PowerVM can support 16GB hugepages (requested at boot time) */
> +if(machine_is(pseries))
> +for_each_hstate(h) {
> +if (hstate_get_psize(h) == MMU_PAGE_16G)
> +return true;
> +}
> +
> +return false;
>  }
>  #endif
>
> -- 
> 1.8.3.1
>
> ___
> ltc-interlock mailing list 
> To unsubscribe from the list, change your list options
> or if you have forgotten your list password visit:
> https://w3-01.ibm.com/stg/linux/ltc/mailinglists/listinfo/ltc-interlock



Re: [RFC Part1 PATCH v3 03/17] x86/mm: Secure Encrypted Virtualization (SEV) support

2017-07-25 Thread Borislav Petkov
On Mon, Jul 24, 2017 at 02:07:43PM -0500, Brijesh Singh wrote:
> From: Tom Lendacky 
> 
> Provide support for Secure Encrypted Virtualization (SEV). This initial

Your subject misses a verb and patch subjects should have an active verb
denoting what the patch does. The sentence above is a good example.

> support defines a flag that is used by the kernel to determine if it is
> running with SEV active.
> 
> Signed-off-by: Tom Lendacky 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/include/asm/mem_encrypt.h | 2 ++
>  arch/x86/mm/mem_encrypt.c  | 3 +++
>  include/linux/mem_encrypt.h| 8 +++-
>  3 files changed, 12 insertions(+), 1 deletion(-)

...

> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 0fbd092..1e4643e 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -40,6 +40,9 @@ static char sme_cmdline_off[] __initdata = "off";
>  unsigned long sme_me_mask __section(.data) = 0;
>  EXPORT_SYMBOL_GPL(sme_me_mask);
>  
> +unsigned int sev_enabled __section(.data) = 0;
> +EXPORT_SYMBOL_GPL(sev_enabled);

So sev_enabled is a pure bool used only in bool context, not like
sme_me_mask whose value is read too. Which means, you can make the
former static and query it only through accessor functions.
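
Something along these lines, i.e. a sketch of that suggestion rather than a
tested change:

/* arch/x86/mm/mem_encrypt.c: no longer visible or exported directly */
static bool sev_enabled __section(.data);

bool sev_active(void)
{
        return sme_me_mask && sev_enabled;
}
EXPORT_SYMBOL_GPL(sev_active);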

>  /* Buffer used for early in-place encryption by BSP, no locking needed */
>  static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
>  
> diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
> index 1255f09..ea0831a 100644
> --- a/include/linux/mem_encrypt.h
> +++ b/include/linux/mem_encrypt.h
> @@ -22,12 +22,18 @@
>  #else/* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
>  
>  #define sme_me_mask  0UL
> +#define sev_enabled  0
>  
>  #endif   /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
>  
>  static inline bool sme_active(void)
>  {
> - return !!sme_me_mask;
> + return (sme_me_mask && !sev_enabled);

You don't need the brackets. Below too.

> +}
> +
> +static inline bool sev_active(void)
> +{
> + return (sme_me_mask && sev_enabled);
>  }

So this is confusing, TBH. SME and SEV are not mutually exclusive and
yet the logic here says so. Why?

I mean, in the hypervisor context, sme_active() is still true.

/me is confused.

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
-- 


Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Paul E. McKenney
On Tue, Jul 25, 2017 at 09:02:33PM -0700, David Miller wrote:
> From: "Paul E. McKenney" 
> Date: Tue, 25 Jul 2017 20:55:45 -0700
> 
> > On Tue, Jul 25, 2017 at 02:10:29PM -0700, David Miller wrote:
> >> Just to report, turning softlockup back on fixes things for me on
> >> sparc64 too.
> > 
> > Very good!
> > 
> >> The thing about softlockup is it runs an hrtimer, which seems to run
> >> about every 4 seconds.
> > 
> > I could see where that could shake things loose, but I am surprised that
> > it would be needed.  I ran a short run with CONFIG_SOFTLOCKUP_DETECTOR=y
> > with no trouble, but I will be running a longer test later on.
> > 
> >> So I wonder if this is a NO_HZ problem.
> > 
> > Might be.  My tests run with NO_HZ_FULL=n and NO_HZ_IDLE=y.  What are
> > you running?  (Again, my symptoms are slightly different, so I might
> > be seeing a different bug.)
> 
> I run with NO_HZ_FULL=n and NO_HZ_IDLE=y, just like you.
> 
> To clarify, the symptoms show up with SOFTLOCKUP_DETECTOR disabled.

Same here -- but my failure case happens fairly rarely, so it will take
some time to gain reasonable confidence that enabling SOFTLOCKUP_DETECTOR
had effect.

But you are right, might be interesting to try NO_HZ_PERIODIC=y
or NO_HZ_FULL=y.  So many possible tests, and so little time.  ;-)

Thanx, Paul



Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread David Miller
From: "Paul E. McKenney" 
Date: Tue, 25 Jul 2017 20:55:45 -0700

> On Tue, Jul 25, 2017 at 02:10:29PM -0700, David Miller wrote:
>> Just to report, turning softlockup back on fixes things for me on
>> sparc64 too.
> 
> Very good!
> 
>> The thing about softlockup is it runs an hrtimer, which seems to run
>> about every 4 seconds.
> 
> I could see where that could shake things loose, but I am surprised that
> it would be needed.  I ran a short run with CONFIG_SOFTLOCKUP_DETECTOR=y
> with no trouble, but I will be running a longer test later on.
> 
>> So I wonder if this is a NO_HZ problem.
> 
> Might be.  My tests run with NO_HZ_FULL=n and NO_HZ_IDLE=y.  What are
> you running?  (Again, my symptoms are slightly different, so I might
> be seeing a different bug.)

I run with NO_HZ_FULL=n and NO_HZ_IDLE=y, just like you.

To clarify, the symptoms show up with SOFTLOCKUP_DETECTOR disabled.


Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Paul E. McKenney
On Tue, Jul 25, 2017 at 02:10:29PM -0700, David Miller wrote:
> From: Jonathan Cameron 
> Date: Wed, 26 Jul 2017 00:52:07 +0800
> 
> > On Tue, 25 Jul 2017 08:12:45 -0700
> > "Paul E. McKenney"  wrote:
> > 
> >> On Tue, Jul 25, 2017 at 10:42:45PM +0800, Jonathan Cameron wrote:
> >> > On Tue, 25 Jul 2017 06:46:26 -0700
> >> > "Paul E. McKenney"  wrote:
> >> >   
> >> > > On Tue, Jul 25, 2017 at 10:26:54PM +1000, Nicholas Piggin wrote:  
> >> > > > On Tue, 25 Jul 2017 19:32:10 +0800
> >> > > > Jonathan Cameron  wrote:
> >> > > > 
> >> > > > > Hi All,
> >> > > > > 
> >> > > > > We observed a regression on our d05 boards (but curiously not
> >> > > > > the fairly similar but single socket / smaller core count
> >> > > > > d03), initially seen with linux-next prior to the merge window
> >> > > > > and still present in v4.13-rc2.
> >> > > > > 
> >> > > > > The symptom is:
> >> > > 
> >> > > Adding Dave Miller and the sparcli...@vger.kernel.org email on CC, as
> >> > > they have been seeing something similar, and you might well have saved
> >> > > them the trouble of bisecting.
> >> > > 
> >> > > [ . . . ]
> >> > >   
> >> > > > > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 
> >> > > > > c1565 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1
> >> > > 
> >> > > This is the cause from an RCU perspective.  You had a lot of idle CPUs,
> >> > > and RCU is not permitted to disturb them -- the battery-powered 
> >> > > embedded
> >> > > guys get very annoyed by that sort of thing.  What happens instead is
> >> > > that each CPU updates a per-CPU state variable when entering or exiting
> >> > > idle, and the grace-period kthread ("rcu_preempt kthread" in the above
> >> > > message) checks these state variables, and if when sees an idle CPU,
> >> > > it reports a quiescent state on that CPU's behalf.
> >> > > 
> >> > > But the grace-period kthread can only do this work if it gets a chance
> >> > > to run.  And the message above says that this kthread hasn't had a 
> >> > > chance
> >> > > to run for a full 5,663 jiffies.  For completeness, the "g1566 c1565"
> >> > > says that grace period #1566 is in progress, the "f0x0" says that no 
> >> > > one
> >> > > is needing another grace period #1567.  The "RCU_GP_WAIT_FQS(3)" says
> >> > > that the grace-period kthread has fully initialized the current grace
> >> > > period and is sleeping for a few jiffies waiting to scan for idle 
> >> > > tasks.
> >> > > Finally, the "->state=0x1" says that the grace-period kthread is in
> >> > > TASK_INTERRUPTIBLE state, in other words, still sleeping.  
> >> > 
> >> > Thanks for the explanation!  
> >> > > 
> >> > > So my first question is "What did commit 05a4a9527 (kernel/watchdog:
> >> > > split up config options) do to prevent the grace-period kthread from
> >> > > getting a chance to run?"   
> >> > 
> >> > As far as we can tell it was a side effect of that patch.
> >> > 
> >> > The real cause is that patch changed the result of defconfigs to stop 
> >> > running
> >> > the softlockup detector - now CONFIG_SOFTLOCKUP_DETECTOR
> >> > 
> >> > Enabling that on 4.13-rc2 (and presumably everything in between)
> >> > means we don't see the problem any more.
> >> >   
> >> > > I must confess that I don't see anything
> >> > > obvious in that commit, so my second question is "Are we sure that
> >> > > reverting this commit makes the problem go away?"  
> >> > 
> >> > Simply enabling CONFIG_SOFTLOCKUP_DETECTOR seems to make it go away.
> >> > That detector fires up a thread on every cpu, which may be relevant.  
> >> 
> >> Interesting...  Why should it be necessary to fire up a thread on every
> >> CPU in order to make sure that RCU's grace-period kthreads get some
> >> CPU time?  Especially give how many idle CPUs you had on your system.
> >> 
> >> So I have to ask if there is some other bug that the softlockup detector
> >> is masking.
> > I am thinking the same.  We can try going back further than 4.12 tomorrow
> > (we think we can realistically go back to 4.8 and possibly 4.6
> > with this board)
> 
> Just to report, turning softlockup back on fixes things for me on
> sparc64 too.

Very good!

> The thing about softlockup is it runs an hrtimer, which seems to run
> about every 4 seconds.

I could see where that could shake things loose, but I am surprised that
it would be needed.  I ran a short run with CONFIG_SOFTLOCKUP_DETECTOR=y
with no trouble, but I will be running a longer test later on.

> So I wonder if this is a NO_HZ problem.

Might be.  My tests run with NO_HZ_FULL=n and NO_HZ_IDLE=y.  What are
you running?  (Again, my symptoms are slightly different, so I might
be seeing a different bug.)

Thanx, Paul



Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Paul E. McKenney
On Wed, Jul 26, 2017 at 12:52:07AM +0800, Jonathan Cameron wrote:
> On Tue, 25 Jul 2017 08:12:45 -0700
> "Paul E. McKenney"  wrote:
> 
> > On Tue, Jul 25, 2017 at 10:42:45PM +0800, Jonathan Cameron wrote:
> > > On Tue, 25 Jul 2017 06:46:26 -0700
> > > "Paul E. McKenney"  wrote:
> > >   
> > > > On Tue, Jul 25, 2017 at 10:26:54PM +1000, Nicholas Piggin wrote:  
> > > > > On Tue, 25 Jul 2017 19:32:10 +0800
> > > > > Jonathan Cameron  wrote:
> > > > > 
> > > > > > Hi All,
> > > > > > 
> > > > > > We observed a regression on our d05 boards (but curiously not
> > > > > > the fairly similar but single socket / smaller core count
> > > > > > d03), initially seen with linux-next prior to the merge window
> > > > > > and still present in v4.13-rc2.
> > > > > > 
> > > > > > The symptom is:
> > > > 
> > > > Adding Dave Miller and the sparcli...@vger.kernel.org email on CC, as
> > > > they have been seeing something similar, and you might well have saved
> > > > them the trouble of bisecting.
> > > > 
> > > > [ . . . ]
> > > >   
> > > > > > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 
> > > > > > c1565 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1
> > > > 
> > > > This is the cause from an RCU perspective.  You had a lot of idle CPUs,
> > > > and RCU is not permitted to disturb them -- the battery-powered embedded
> > > > guys get very annoyed by that sort of thing.  What happens instead is
> > > > that each CPU updates a per-CPU state variable when entering or exiting
> > > > idle, and the grace-period kthread ("rcu_preempt kthread" in the above
> > > > message) checks these state variables, and if when sees an idle CPU,
> > > > it reports a quiescent state on that CPU's behalf.
> > > > 
> > > > But the grace-period kthread can only do this work if it gets a chance
> > > > to run.  And the message above says that this kthread hasn't had a 
> > > > chance
> > > > to run for a full 5,663 jiffies.  For completeness, the "g1566 c1565"
> > > > says that grace period #1566 is in progress, the "f0x0" says that no one
> > > > is needing another grace period #1567.  The "RCU_GP_WAIT_FQS(3)" says
> > > > that the grace-period kthread has fully initialized the current grace
> > > > period and is sleeping for a few jiffies waiting to scan for idle tasks.
> > > > Finally, the "->state=0x1" says that the grace-period kthread is in
> > > > TASK_INTERRUPTIBLE state, in other words, still sleeping.  
> > > 
> > > Thanks for the explanation!  
> > > > 
> > > > So my first question is "What did commit 05a4a9527 (kernel/watchdog:
> > > > split up config options) do to prevent the grace-period kthread from
> > > > getting a chance to run?"   
> > > 
> > > As far as we can tell it was a side effect of that patch.
> > > 
> > > The real cause is that patch changed the result of defconfigs to stop 
> > > running
> > > the softlockup detector - now CONFIG_SOFTLOCKUP_DETECTOR
> > > 
> > > Enabling that on 4.13-rc2 (and presumably everything in between)
> > > means we don't see the problem any more.
> > >   
> > > > I must confess that I don't see anything
> > > > obvious in that commit, so my second question is "Are we sure that
> > > > reverting this commit makes the problem go away?"  
> > > 
> > > Simply enabling CONFIG_SOFTLOCKUP_DETECTOR seems to make it go away.
> > > That detector fires up a thread on every cpu, which may be relevant.  
> > 
> > Interesting...  Why should it be necessary to fire up a thread on every
> > CPU in order to make sure that RCU's grace-period kthreads get some
> > CPU time?  Especially give how many idle CPUs you had on your system.
> > 
> > So I have to ask if there is some other bug that the softlockup detector
> > is masking.
> I am thinking the same.  We can try going back further than 4.12 tomorrow
> (we think we can realistically go back to 4.8 and possibly 4.6
> with this board)

Looking forward to seeing results!

> > > > and my third is "Is
> > > > this an intermittent problem that led to a false bisection?"  
> > > 
> > > Whilst it is a bit slow to occur, we verified with long runs on either
> > > side of that patch and since with the option enabled on latest mainline.
> > > 
> > > Also can cause the issue before that patch by disabling the previous
> > > relevant option on 4.12.  
> > 
> > OK, thank you -- hard to argue with that!  ;-)
> 
> We thought it was a pretty unlikely a bisection result
> hence hammered it thoroughly ;)

Glad that I am not the only paranoid one out here.  ;-)

> > Except that I am still puzzled as to why per-CPU softlockup threads
> > are needed for RCU's kthreads to get their wakeups.  We really should
> > be able to disable softlockup and still have kthreads get wakeups and
> > access to CPU, after all.
> > 
> > > > [ . . . ]
> > > >   
> > > > > > Reducing the RCU CPU stall timeout makes it happen more often,
> > > > > > but we are seeing even with the default 

Re: [PATCH v2] include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH

2017-07-25 Thread David Gibson
On Tue, Jul 18, 2017 at 02:22:20PM -0300, Murilo Opsfelder Araujo wrote:
> When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, build fails with the
> following:
> 
> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release':
> vfio_pci.c:(.text+0xa98): undefined reference to 
> `.vfio_spapr_pci_eeh_release'
> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open':
> vfio_pci.c:(.text+0x1420): undefined reference to 
> `.vfio_spapr_pci_eeh_open'
> 
> In this case, vfio_pci.c should use the empty definitions of
> vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions.
> 
> This patch fixes it by guarding these function definitions with
> CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is
> built, which is where the non-empty versions of these functions are. We need 
> to
> make use of IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate
> option.
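
(For illustration, assuming standard Kconfig semantics: when the tristate is
=m the preprocessor only sees CONFIG_VFIO_SPAPR_EEH_MODULE, so a plain #ifdef
would wrongly pick the empty stubs, while IS_ENABLED() is true for both =y
and =m.)

#ifdef CONFIG_VFIO_SPAPR_EEH            /* not defined for =m builds */
/* ... skipped when vfio_spapr_eeh.c is built as a module ... */
#endif

#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)   /* true for both =y and =m */
/* real vfio_spapr_pci_eeh_*() declarations */
#endif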
> 
> This issue was found during a randconfig build. Logs are here:
> 
> http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/
> 
> Signed-off-by: Murilo Opsfelder Araujo 

Reviewed-by: David Gibson 

> ---
> 
> Changes from v1:
> - Rebased on top of next-20170718.
> 
>  include/linux/vfio.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 586809a..a47b985 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -152,7 +152,7 @@ extern int vfio_set_irqs_validate_and_prepare(struct 
> vfio_irq_set *hdr,
> size_t *data_size);
> 
>  struct pci_dev;
> -#ifdef CONFIG_EEH
> +#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
>  extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
>  extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
>  extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
> @@ -173,7 +173,7 @@ static inline long vfio_spapr_iommu_eeh_ioctl(struct 
> iommu_group *group,
>  {
>   return -ENOTTY;
>  }
> -#endif /* CONFIG_EEH */
> +#endif /* CONFIG_VFIO_SPAPR_EEH */
> 
>  /*
>   * IRQfd - generic

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [PATCH v2] include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH

2017-07-25 Thread Alexey Kardashevskiy
On 26/07/17 01:44, Alex Williamson wrote:
> [cc +Alexey, David]
> 
> Any comments from the usual suspects for vfio/spapr?  Thanks,

[trying once again as the mailing lists rejected the previous response due to
broken outgoing mail encoding config in my thunderbird]

Reviewed-by: Alexey Kardashevskiy 


Having EEH + VFIO_PCI but not VFIO_SPAPR_EEH does not make practical
sense and won't be seen in the wild, so EEH + VFIO should enforce
VFIO_SPAPR_EEH, but that is for another patch.


> 
> Alex
> 
> On Tue, 25 Jul 2017 10:56:38 -0300
> Murilo Opsfelder Araújo  wrote:
> 
>> On 07/18/2017 02:22 PM, Murilo Opsfelder Araujo wrote:
>>> When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, build fails with the
>>> following:
>>>
>>> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release':
>>> vfio_pci.c:(.text+0xa98): undefined reference to 
>>> `.vfio_spapr_pci_eeh_release'
>>> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open':
>>> vfio_pci.c:(.text+0x1420): undefined reference to 
>>> `.vfio_spapr_pci_eeh_open'
>>>
>>> In this case, vfio_pci.c should use the empty definitions of
>>> vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions.
>>>
>>> This patch fixes it by guarding these function definitions with
>>> CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is
>>> built, which is where the non-empty versions of these functions are. We 
>>> need to
>>> make use of IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate
>>> option.
>>>
>>> This issue was found during a randconfig build. Logs are here:
>>>
>>> http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/
>>>
>>> Signed-off-by: Murilo Opsfelder Araujo 
>>> ---
>>>
>>> Changes from v1:
>>> - Rebased on top of next-20170718.  
>>
>> Hi, Alex.
>>
>> Are you applying this?
>>
>> Thanks!
>>
> 


-- 
Alexey


Re: [PATCH v3] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-25 Thread Benjamin Herrenschmidt
On Tue, 2017-07-25 at 19:17 +0530, Santosh Sivaraj wrote:
> I get the point. I looked at the generated assembly a bit closer, the update
> count is optimized out. Will send the alternative asm only patch.

We could do it in C the way x86 does it, using some helpers for
begin/end and either an lwsync (but that would be slower than
the data dependency, I think) or carefully crafted helpers that
create one (make them return the pointer).

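Roughly the shape of that idea, as a sketch only: the struct and field names
below are made up, and the barrier choice is exactly the open question above.

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/barrier.h>
#include <asm/processor.h>

struct vdso_data_example {
        u32 seq_count;                  /* odd while the kernel updates it */
        /* ... timekeeping fields ... */
};

static __always_inline u32 vdso_read_begin(const struct vdso_data_example *vd)
{
        u32 seq;

        while ((seq = READ_ONCE(vd->seq_count)) & 1)
                cpu_relax();
        /* lwsync here, or make the helper return a pointer derived from
         * the count so a data dependency orders the subsequent reads */
        smp_rmb();
        return seq;
}

static __always_inline bool vdso_read_retry(const struct vdso_data_example *vd,
                                            u32 start)
{
        smp_rmb();
        return READ_ONCE(vd->seq_count) != start;
}
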
If you go down that path though, you need to make sure we do not
generate any TOC reference as the vDSO doesn't have a TOC.

Cheers,
Ben.


Re: [PATCH v2] include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH

2017-07-25 Thread Alexey Kardashevskiy
On 26/07/17 01:44, Alex Williamson wrote:
> [cc +Alexey, David]
> 
> Any comments from the usual suspects for vfio/spapr?  Thanks,

[trying again as the mailing lists rejected the previous response]

Reviewed-by: Alexey Kardashevskiy 


Having EEH + VFIO_PCI but not VFIO_SPAPR_EEH does not make practical
sense and won't be seen in the wild, so EEH + VFIO should enforce
VFIO_SPAPR_EEH, but that is for another patch.


> 
> Alex
> 
> On Tue, 25 Jul 2017 10:56:38 -0300
> Murilo Opsfelder Araújo  wrote:
> 
>> On 07/18/2017 02:22 PM, Murilo Opsfelder Araujo wrote:
>>> When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, build fails with the
>>> following:
>>>
>>> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release':
>>> vfio_pci.c:(.text+0xa98): undefined reference to 
>>> `.vfio_spapr_pci_eeh_release'
>>> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open':
>>> vfio_pci.c:(.text+0x1420): undefined reference to 
>>> `.vfio_spapr_pci_eeh_open'
>>>
>>> In this case, vfio_pci.c should use the empty definitions of
>>> vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions.
>>>
>>> This patch fixes it by guarding these function definitions with
>>> CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is
>>> built, which is where the non-empty versions of these functions are. We 
>>> need to
>>> make use of IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate
>>> option.
>>>
>>> This issue was found during a randconfig build. Logs are here:
>>>
>>> http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/
>>>
>>> Signed-off-by: Murilo Opsfelder Araujo 
>>> ---
>>>
>>> Changes from v1:
>>> - Rebased on top of next-20170718.  
>>
>> Hi, Alex.
>>
>> Are you applying this?
>>
>> Thanks!
>>
> 


-- 
Alexey


Re: [PATCH v2] include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH

2017-07-25 Thread Alexey Kardashevskiy
On 26/07/17 01:44, Alex Williamson wrote:
> [cc +Alexey, David]
> 
> Any comments from the usual suspects for vfio/spapr?  Thanks,


Reviewed-by: Alexey Kardashevskiy 


Having EEH + VFIO_PCI but not VFIO_SPAPR_EEH does not make practical
sense and won't be seen in the wild, so EEH + VFIO should enforce
VFIO_SPAPR_EEH, but that is for another patch.


> 
> Alex
> 
> On Tue, 25 Jul 2017 10:56:38 -0300
> Murilo Opsfelder Araújo  wrote:
> 
>> On 07/18/2017 02:22 PM, Murilo Opsfelder Araujo wrote:
>>> When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, build fails with the
>>> following:
>>>
>>> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release':
>>> vfio_pci.c:(.text+0xa98): undefined reference to 
>>> `.vfio_spapr_pci_eeh_release'
>>> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open':
>>> vfio_pci.c:(.text+0x1420): undefined reference to 
>>> `.vfio_spapr_pci_eeh_open'
>>>
>>> In this case, vfio_pci.c should use the empty definitions of
>>> vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions.
>>>
>>> This patch fixes it by guarding these function definitions with
>>> CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is
>>> built, which is where the non-empty versions of these functions are. We 
>>> need to
>>> make use of IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate
>>> option.
>>>
>>> This issue was found during a randconfig build. Logs are here:
>>>
>>> http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/
>>>
>>> Signed-off-by: Murilo Opsfelder Araujo 
>>> ---
>>>
>>> Changes from v1:
>>> - Rebased on top of next-20170718.  
>>
>> Hi, Alex.
>>
>> Are you applying this?
>>
>> Thanks!
>>
> 


-- 
Alexey


Re: [PATCH 0/4] Allow non-legacy cards to be vgaarb default

2017-07-25 Thread Daniel Axtens
Hi Laszlo,

Thanks for your input!

>> Are there other graphical applications we care about (other than Xorg)
>> that would need to be patched? I'm happy to do the Xorg patch, but I
>> don't know if anything other than Xorg keys off the boot_vga file.
>> 
>> I'm not fundamentally opposed to this approach if the Xorg community is
>> happy with it, the kernel community is happy with it, and no-one expects
>> me to provide patches to any other user-space applications that depend
>> on boot_vga.
>
> Ard both identified the Xorg commit that I would have, and CC'd Hans
> which I would have recommended as well.
>
> I assume the symptom is that now there's a class of platform GPU devices
> that is neither PCI nor legacy VGA, so neither the kernel's boot_vga
> logic matches it, nor Xorg's commit in question.
>
> I agree that it should be possible to add more logic to Xorg to detect
> this kind device as primary. However, I share Daniel's worry that it
> wouldn't cover all user space apps -- I see "Wayland this, Wayland that"
> on reddit every week.
>
> Having practically zero background in gfx development (either kernel or
> Xorg), I think the problem is that vga_default_device() /
> vga_set_default_device(), which -- apparently -- "boot_vga" is based
> upon, come from "drivers/gpu/vga/vgaarb.c". Namely, the concept of
> "primary / boot display device" is tied to the VGA arbiter, plus only a
> PCI device can currently be marked as primary/boot display device.

You're right, the issue is that the primary/boot device is tied to the
VGA arbiter.

>
> Can these concepts be split from each other? (I can fully imagine that
> this would result in a userspace visible interface change (or addition),
> so that e.g. "/sys/devices/**/boot_gpu" would have to be consulted by
> display servers.)

Yes, they can be split or a way of marking the default vga device that
doesn't depend on the arbiter can be added.

(But there is some question about what it actually means to be a boot
vga card - it's better defined on an x86 system, but on a ppc or arm64
system we're reduced to guessing based on the first driver loaded.)

> (Sorry if I'm totally wrong.)
>
> ... Hm, reading the thread starter at
> ,
> and the references within... It looks like this work is motivated by
> hardware that is supposed to be PCI, but actually breaks the specs. Is
> that correct? If so, then I don't think I can suggest anything useful.
> Specs exist so that hardware vendors and software authors follow them.
> If hardware does not conform, then software should either refuse to work
> with it, or handle it with quirks, on a case-by-case basis. I guess this
> means that I don't agree with the
>
>   broad[] suggest[ion] that a more generic solution would be better
>
> which seems to disqualify me from the discussion, as it must have been
> suggested by people with incomparably more experience than what I have :)

Originally this was brought to the fore by a PCI bridge that wasn't
spec-compliant, and I initially proposed a simple quirk: [0]. However,
that highlighted the related issue that platforms that don't use legacy
resources still go through the VGA arbiter process which is built around
legacy resource arbitration. Changing that behaviour also fixes the
issue with the non-spec-compliant bridge because the new model doesn't
rely upon the particular part of the spec that the bridge violates.

I'm not fussy about how we solve this problem, so long as we solve this
problem somehow.

Regards,
Daniel

[0]: https://patchwork.ozlabs.org/patch/787003/

>
> Thanks
> Laszlo


Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Jonathan Cameron
On Tue, 25 Jul 2017 08:12:45 -0700
"Paul E. McKenney"  wrote:

> On Tue, Jul 25, 2017 at 10:42:45PM +0800, Jonathan Cameron wrote:
> > On Tue, 25 Jul 2017 06:46:26 -0700
> > "Paul E. McKenney"  wrote:
> >   
> > > On Tue, Jul 25, 2017 at 10:26:54PM +1000, Nicholas Piggin wrote:  
> > > > On Tue, 25 Jul 2017 19:32:10 +0800
> > > > Jonathan Cameron  wrote:
> > > > 
> > > > > Hi All,
> > > > > 
> > > > > We observed a regression on our d05 boards (but curiously not
> > > > > the fairly similar but single socket / smaller core count
> > > > > d03), initially seen with linux-next prior to the merge window
> > > > > and still present in v4.13-rc2.
> > > > > 
> > > > > The symptom is:
> > > 
> > > Adding Dave Miller and the sparcli...@vger.kernel.org email on CC, as
> > > they have been seeing something similar, and you might well have saved
> > > them the trouble of bisecting.
> > > 
> > > [ . . . ]
> > >   
> > > > > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 
> > > > > c1565 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1
> > > 
> > > This is the cause from an RCU perspective.  You had a lot of idle CPUs,
> > > and RCU is not permitted to disturb them -- the battery-powered embedded
> > > guys get very annoyed by that sort of thing.  What happens instead is
> > > that each CPU updates a per-CPU state variable when entering or exiting
> > > idle, and the grace-period kthread ("rcu_preempt kthread" in the above
> > > message) checks these state variables, and if when sees an idle CPU,
> > > it reports a quiescent state on that CPU's behalf.
> > > 
> > > But the grace-period kthread can only do this work if it gets a chance
> > > to run.  And the message above says that this kthread hasn't had a chance
> > > to run for a full 5,663 jiffies.  For completeness, the "g1566 c1565"
> > > says that grace period #1566 is in progress, the "f0x0" says that no one
> > > is needing another grace period #1567.  The "RCU_GP_WAIT_FQS(3)" says
> > > that the grace-period kthread has fully initialized the current grace
> > > period and is sleeping for a few jiffies waiting to scan for idle tasks.
> > > Finally, the "->state=0x1" says that the grace-period kthread is in
> > > TASK_INTERRUPTIBLE state, in other words, still sleeping.  
> > 
> > Thanks for the explanation!  
> > > 
> > > So my first question is "What did commit 05a4a9527 (kernel/watchdog:
> > > split up config options) do to prevent the grace-period kthread from
> > > getting a chance to run?"   
> > 
> > As far as we can tell it was a side effect of that patch.
> > 
> > The real cause is that patch changed the result of defconfigs to stop 
> > running
> > the softlockup detector - now CONFIG_SOFTLOCKUP_DETECTOR
> > 
> > Enabling that on 4.13-rc2 (and presumably everything in between)
> > means we don't see the problem any more.
> >   
> > > I must confess that I don't see anything
> > > obvious in that commit, so my second question is "Are we sure that
> > > reverting this commit makes the problem go away?"  
> > 
> > Simply enabling CONFIG_SOFTLOCKUP_DETECTOR seems to make it go away.
> > That detector fires up a thread on every cpu, which may be relevant.  
> 
> Interesting...  Why should it be necessary to fire up a thread on every
> CPU in order to make sure that RCU's grace-period kthreads get some
> CPU time?  Especially give how many idle CPUs you had on your system.
> 
> So I have to ask if there is some other bug that the softlockup detector
> is masking.
I am thinking the same.  We can try going back further than 4.12 tomorrow
(we think we can realistically go back to 4.8 and possibly 4.6
with this board)
> 
> > > and my third is "Is
> > > this an intermittent problem that led to a false bisection?"  
> > 
> > Whilst it is a bit slow to occur, we verified with long runs on either
> > side of that patch and since with the option enabled on latest mainline.
> > 
> > Also can cause the issue before that patch by disabling the previous
> > relevant option on 4.12.  
> 
> OK, thank you -- hard to argue with that!  ;-)
We thought it was a pretty unlikely a bisection result
hence hammered it thoroughly ;)
> 
> Except that I am still puzzled as to why per-CPU softlockup threads
> are needed for RCU's kthreads to get their wakeups.  We really should
> be able to disable softlockup and still have kthreads get wakeups and
> access to CPU, after all.
> 
> > > [ . . . ]
> > >   
> > > > > Reducing the RCU CPU stall timeout makes it happen more often,
> > > > > but we are seeing even with the default value of 24 seconds.
> > > > > 
> > > > > Tends to occur after a period or relatively low usage, but has
> > > > > also been seen mid way through performance tests.
> > > > > 
> > > > > This was not seen with v4.12 so a bisection run later lead to
> > > > > commit 05a4a9527 (kernel/watchdog: split up config options).
> > > > > 
> > > > > Which was odd until we 

Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Jonathan Cameron
On Tue, 25 Jul 2017 06:46:26 -0700
"Paul E. McKenney"  wrote:

> On Tue, Jul 25, 2017 at 10:26:54PM +1000, Nicholas Piggin wrote:
> > On Tue, 25 Jul 2017 19:32:10 +0800
> > Jonathan Cameron  wrote:
> >   
> > > Hi All,
> > > 
> > > We observed a regression on our d05 boards (but curiously not
> > > the fairly similar but single socket / smaller core count
> > > d03), initially seen with linux-next prior to the merge window
> > > and still present in v4.13-rc2.
> > > 
> > > The symptom is:  
> 
> Adding Dave Miller and the sparcli...@vger.kernel.org email on CC, as
> they have been seeing something similar, and you might well have saved
> them the trouble of bisecting.
> 
> [ . . . ]
> 
> > > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 c1565 
> > > f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1  
> 
> This is the cause from an RCU perspective.  You had a lot of idle CPUs,
> and RCU is not permitted to disturb them -- the battery-powered embedded
> guys get very annoyed by that sort of thing.  What happens instead is
> that each CPU updates a per-CPU state variable when entering or exiting
> idle, and the grace-period kthread ("rcu_preempt kthread" in the above
> message) checks these state variables, and if when sees an idle CPU,
> it reports a quiescent state on that CPU's behalf.
> 
> But the grace-period kthread can only do this work if it gets a chance
> to run.  And the message above says that this kthread hasn't had a chance
> to run for a full 5,663 jiffies.  For completeness, the "g1566 c1565"
> says that grace period #1566 is in progress, the "f0x0" says that no one
> is needing another grace period #1567.  The "RCU_GP_WAIT_FQS(3)" says
> that the grace-period kthread has fully initialized the current grace
> period and is sleeping for a few jiffies waiting to scan for idle tasks.
> Finally, the "->state=0x1" says that the grace-period kthread is in
> TASK_INTERRUPTIBLE state, in other words, still sleeping.
Thanks for the explanation!
> 
> So my first question is "What did commit 05a4a9527 (kernel/watchdog:
> split up config options) do to prevent the grace-period kthread from
> getting a chance to run?" 

As far as we can tell it was a side effect of that patch.

The real cause is that patch changed the result of defconfigs to stop running
the softlockup detector - now CONFIG_SOFTLOCKUP_DETECTOR

Enabling that on 4.13-rc2 (and presumably everything in between)
means we don't see the problem any more.

> I must confess that I don't see anything
> obvious in that commit, so my second question is "Are we sure that
> reverting this commit makes the problem go away?"
Simply enabling CONFIG_SOFTLOCKUP_DETECTOR seems to make it go away.
That detector fires up a thread on every cpu, which may be relevant.

> and my third is "Is
> this an intermittent problem that led to a false bisection?"
Whilst it is a bit slow to occur, we verified with long runs on either
side of that patch, and since then with the option enabled on latest mainline.

We can also cause the issue before that patch by disabling the previously
relevant option on 4.12.

> 
> [ . . . ]
> 
> > > Reducing the RCU CPU stall timeout makes it happen more often,
> > > but we are seeing it even with the default value of 24 seconds.
> > > 
> > > Tends to occur after a period of relatively low usage, but has
> > > also been seen midway through performance tests.
> > > 
> > > This was not seen with v4.12, so a bisection run later led to
> > > commit 05a4a9527 (kernel/watchdog: split up config options).
> > > 
> > > Which was odd until we discovered that a side effect of this patch
> > > was to change whether the softlockup detector was enabled or not in
> > > the arm64 defconfig.
> > > 
> > > On 4.13-rc2 enabling the softlockup detector indeed stopped us
> > > seeing the rcu issue. Disabling the equivalent on 4.12 made the
> > > issue occur there as well.
> > > 
> > > Clearly the softlockup detector results in a thread on every cpu,
> > > which might be related but beyond that we are still looking into
> > > the issue.
> > > 
> > > So the obvious question is whether anyone else is seeing this as
> > > it might help us to focus in on where to look!  
> > 
> > Huh. Something similar has been seen very intermittently on powerpc
> > as well. We couldn't reproduce it reliably to bisect it already, so
> > this is a good help.
> > 
> > http://marc.info/?l=linuxppc-embedded&m=149872815523646&w=2
> > 
> > It looks like the watchdog patch has a similar effect on powerpc in
> > that it stops enabling the softlockup detector by default. Haven't
> > confirmed, but it looks like the same thing.
> > 
> > A bug in RCU stall detection?  
> 
> Well, if I am expected to make grace periods complete when my grace-period
> kthreads aren't getting any CPU time, I will have to make some substantial
> changes.  ;-)
> 
> One possibility is that the timer isn't firing and another is that the
> timer's wakeup is being lost 

[PATCH 4/4] of/fdt: only store the device node basename in full_name

2017-07-25 Thread Rob Herring
With dependencies on a statically allocated full path name converted to
use the %pOF format specifier, we can store just the basename of the node, and
the unflattening of the FDT can be simplified.

This commit will affect the remaining users of full_name. After
analyzing these users, the remaining cases should only change some print
messages. The main users of full_name are providing a name for struct
resource. The resource names shouldn't be important other than providing
/proc/iomem names.

We no longer distinguish between pre and post 0x10 dtb formats as either
a full path or basename will work. However, less than 0x10 formats have
been broken since the conversion to use libfdt (and no one has cared).
The conversion of the unflattening code to be non-recursive also broke
pre 0x10 formats as the populate_node function would return 0 in that
case.

Signed-off-by: Rob Herring 
---
 drivers/of/fdt.c | 69 +---
 1 file changed, 11 insertions(+), 58 deletions(-)

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index ce30c9a588a4..27c535af0be8 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -266,74 +266,32 @@ static void populate_properties(const void *blob,
*pprev = NULL;
 }
 
-static unsigned int populate_node(const void *blob,
- int offset,
- void **mem,
- struct device_node *dad,
- unsigned int fpsize,
- struct device_node **pnp,
- bool dryrun)
+static bool populate_node(const void *blob,
+ int offset,
+ void **mem,
+ struct device_node *dad,
+ struct device_node **pnp,
+ bool dryrun)
 {
struct device_node *np;
const char *pathp;
unsigned int l, allocl;
-   int new_format = 0;
 
pathp = fdt_get_name(blob, offset, );
if (!pathp) {
*pnp = NULL;
-   return 0;
+   return false;
}
 
allocl = ++l;
 
-   /* version 0x10 has a more compact unit name here instead of the full
-* path. we accumulate the full path size using "fpsize", we'll rebuild
-* it later. We detect this because the first character of the name is
-* not '/'.
-*/
-   if ((*pathp) != '/') {
-   new_format = 1;
-   if (fpsize == 0) {
-   /* root node: special case. fpsize accounts for path
-* plus terminating zero. root node only has '/', so
-* fpsize should be 2, but we want to avoid the first
-* level nodes to have two '/' so we use fpsize 1 here
-*/
-   fpsize = 1;
-   allocl = 2;
-   l = 1;
-   pathp = "";
-   } else {
-   /* account for '/' and path size minus terminal 0
-* already in 'l'
-*/
-   fpsize += l;
-   allocl = fpsize;
-   }
-   }
-
np = unflatten_dt_alloc(mem, sizeof(struct device_node) + allocl,
__alignof__(struct device_node));
if (!dryrun) {
char *fn;
of_node_init(np);
np->full_name = fn = ((char *)np) + sizeof(*np);
-   if (new_format) {
-   /* rebuild full path for new format */
-   if (dad && dad->parent) {
-   strcpy(fn, dad->full_name);
-#ifdef DEBUG
-   if ((strlen(fn) + l + 1) != allocl) {
-   pr_debug("%s: p: %d, l: %d, a: %d\n",
-   pathp, (int)strlen(fn),
-   l, allocl);
-   }
-#endif
-   fn += strlen(fn);
-   }
-   *(fn++) = '/';
-   }
+
memcpy(fn, pathp, l);
 
if (dad != NULL) {
@@ -355,7 +313,7 @@ static unsigned int populate_node(const void *blob,
}
 
*pnp = np;
-   return fpsize;
+   return true;
 }
 
 static void reverse_nodes(struct device_node *parent)
@@ -399,7 +357,6 @@ static int unflatten_dt_nodes(const void *blob,
struct device_node *root;
int offset = 0, depth = 0, initial_depth = 0;
 #define FDT_MAX_DEPTH  64
-   unsigned int fpsizes[FDT_MAX_DEPTH];
struct device_node *nps[FDT_MAX_DEPTH];
void *base = mem;
bool dryrun = !base;
@@ -418,7 +375,6 @@ static int unflatten_dt_nodes(const void *blob,
 

[PATCH 3/4] powerpc: pseries: only store the device node basename in full_name

2017-07-25 Thread Rob Herring
With dependencies on full_name containing the entire device node path
removed, stop storing the full_name in nodes created by
dlpar_configure_connector() and pSeries_reconfig_add_node().

Signed-off-by: Rob Herring 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/platforms/pseries/dlpar.c| 20 
 arch/powerpc/platforms/pseries/reconfig.c |  2 +-
 2 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
b/arch/powerpc/platforms/pseries/dlpar.c
index 783f36364690..8ab0be0706fd 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -75,24 +75,17 @@ static struct property *dlpar_parse_cc_property(struct 
cc_workarea *ccwa)
return prop;
 }
 
-static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa,
-  const char *path)
+static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa)
 {
struct device_node *dn;
char *name;
 
-   /* If parent node path is "/" advance path to NULL terminator to
-* prevent double leading slashs in full_name.
-*/
-   if (!path[1])
-   path++;
-
dn = kzalloc(sizeof(*dn), GFP_KERNEL);
if (!dn)
return NULL;
 
name = (char *)ccwa + be32_to_cpu(ccwa->name_offset);
-   dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name);
+   dn->full_name = kasprintf(GFP_KERNEL, "%s", name);
if (!dn->full_name) {
kfree(dn);
return NULL;
@@ -148,7 +141,6 @@ struct device_node *dlpar_configure_connector(__be32 
drc_index,
struct property *last_property = NULL;
struct cc_workarea *ccwa;
char *data_buf;
-   const char *parent_path = parent->full_name;
int cc_token;
int rc = -1;
 
@@ -182,7 +174,7 @@ struct device_node *dlpar_configure_connector(__be32 
drc_index,
break;
 
case NEXT_SIBLING:
-   dn = dlpar_parse_cc_node(ccwa, parent_path);
+   dn = dlpar_parse_cc_node(ccwa);
if (!dn)
goto cc_error;
 
@@ -192,10 +184,7 @@ struct device_node *dlpar_configure_connector(__be32 
drc_index,
break;
 
case NEXT_CHILD:
-   if (first_dn)
-   parent_path = last_dn->full_name;
-
-   dn = dlpar_parse_cc_node(ccwa, parent_path);
+   dn = dlpar_parse_cc_node(ccwa);
if (!dn)
goto cc_error;
 
@@ -226,7 +215,6 @@ struct device_node *dlpar_configure_connector(__be32 
drc_index,
 
case PREV_PARENT:
last_dn = last_dn->parent;
-   parent_path = last_dn->parent->full_name;
break;
 
case CALL_AGAIN:
diff --git a/arch/powerpc/platforms/pseries/reconfig.c 
b/arch/powerpc/platforms/pseries/reconfig.c
index e5bf1e84047f..73e4063ad997 100644
--- a/arch/powerpc/platforms/pseries/reconfig.c
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -33,7 +33,7 @@ static int pSeries_reconfig_add_node(const char *path, struct 
property *proplist
if (!np)
goto out_err;
 
-   np->full_name = kstrdup(path, GFP_KERNEL);
+   np->full_name = kstrdup(kbasename(path), GFP_KERNEL);
if (!np->full_name)
goto out_err;
 
-- 
2.11.0



[PATCH 2/4] powerpc: pseries: remove dlpar_attach_node dependency on full path

2017-07-25 Thread Rob Herring
In preparation to stop storing the full node path in full_name, remove the
dependency on full_name from dlpar_attach_node(). Callers of
dlpar_attach_node() already have the parent device_node, so just pass the
parent node into dlpar_attach_node instead of the path. This avoids doing
a lookup of the parent node by the path.

Signed-off-by: Rob Herring 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/platforms/pseries/dlpar.c   | 6 ++
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 +-
 arch/powerpc/platforms/pseries/mobility.c| 2 +-
 arch/powerpc/platforms/pseries/pseries.h | 2 +-
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
b/arch/powerpc/platforms/pseries/dlpar.c
index 80b84c9c8509..783f36364690 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -254,13 +254,11 @@ struct device_node *dlpar_configure_connector(__be32 
drc_index,
return first_dn;
 }
 
-int dlpar_attach_node(struct device_node *dn)
+int dlpar_attach_node(struct device_node *dn, struct device_node *parent)
 {
int rc;
 
-   dn->parent = pseries_of_derive_parent(dn->full_name);
-   if (IS_ERR(dn->parent))
-   return PTR_ERR(dn->parent);
+   dn->parent = parent;
 
rc = of_attach_node(dn);
if (rc) {
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 0a93093fbcef..b357f1ae0b0a 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -463,7 +463,7 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
return -EINVAL;
}
 
-   rc = dlpar_attach_node(dn);
+   rc = dlpar_attach_node(dn, parent);
if (rc) {
saved_rc = rc;
pr_warn("Failed to attach node %s, rc: %d, drc index: %x\n",
diff --git a/arch/powerpc/platforms/pseries/mobility.c 
b/arch/powerpc/platforms/pseries/mobility.c
index 2da4851eff99..210ce632d63e 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -229,7 +229,7 @@ static int add_dt_node(__be32 parent_phandle, __be32 
drc_index)
if (!dn)
return -ENOENT;
 
-   rc = dlpar_attach_node(dn);
+   rc = dlpar_attach_node(dn, parent_dn);
if (rc)
dlpar_free_cc_nodes(dn);
 
diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 1361a9db534b..4470a3194311 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -46,7 +46,7 @@ extern void dlpar_free_cc_nodes(struct device_node *);
 extern void dlpar_free_cc_property(struct property *);
 extern struct device_node *dlpar_configure_connector(__be32,
struct device_node *);
-extern int dlpar_attach_node(struct device_node *);
+extern int dlpar_attach_node(struct device_node *, struct device_node *);
 extern int dlpar_detach_node(struct device_node *);
 extern int dlpar_acquire_drc(u32 drc_index);
 extern int dlpar_release_drc(u32 drc_index);
-- 
2.11.0



[PATCH 1/4] powerpc: pseries: vio: match parent nodes with of_find_node_by_path

2017-07-25 Thread Rob Herring
In preparation to remove the full path from device_node.full_name, use
of_find_node_by_path instead of open coding with strcmp.

Signed-off-by: Rob Herring 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/platforms/pseries/vio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 3201feb6d32b..28a9505e4919 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1357,9 +1357,9 @@ struct vio_dev *vio_register_device_node(struct 
device_node *of_node)
 */
parent_node = of_get_parent(of_node);
if (parent_node) {
-   if (!strcmp(parent_node->full_name, "ibm,platform-facilities"))
+   if (parent_node == 
of_find_node_by_path("/ibm,platform-facilities"))
family = PFO;
-   else if (!strcmp(parent_node->full_name, "vdevice"))
+   else if (parent_node == of_find_node_by_path("/vdevice"))
family = VDEVICE;
else {
pr_warn("%s: parent(%pOF) of %s not recognized.\n",
-- 
2.11.0



[PATCH 0/4] Removing full paths from DT full_name

2017-07-25 Thread Rob Herring
This series contains the last steps to remove storing the full path for every 
DT node. Instead, we can create full path strings dynamically as needed 
with printf %pOF specifiers (commit ce4fecf1fe15). There are a number of 
remaining direct users of full_name after this series. I don't believe 
there should be any functional impact for those users with the change to 
only the node name (+unit-address). The majority are for struct 
resource.name. This should only affect /proc/iomem display.
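
To illustrate what the conversion looks like from a driver's point of view
(a minimal sketch, not code from this series -- the report_node() helper is
made up), %pOF generates the full path on demand while full_name keeps only
the node name plus unit address:

#include <linux/of.h>
#include <linux/printk.h>

static void report_node(struct device_node *np)
{
        /* %pOF prints the node's full path; full_name is now just the basename */
        pr_info("using node %pOF (basename: %s)\n", np, np->full_name);
}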

Patches 1 and 2 can be applied now for 4.14. For patches 3 and 4, my 
target is 4.15 after all the dependencies have been merged.

PPC folks, please test! The PPC parts are untested. A git branch with 
all the dependencies is here[1].

Rob

[1] git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git dt-printf

Rob Herring (4):
  powerpc: pseries: vio: match parent nodes with of_find_node_by_path
  powerpc: pseries: remove dlpar_attach_node dependency on full path
  powerpc: pseries: only store the device node basename in full_name
  of/fdt: only store the device node basename in full_name

 arch/powerpc/platforms/pseries/dlpar.c   | 26 +++
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  2 +-
 arch/powerpc/platforms/pseries/mobility.c|  2 +-
 arch/powerpc/platforms/pseries/pseries.h |  2 +-
 arch/powerpc/platforms/pseries/reconfig.c|  2 +-
 arch/powerpc/platforms/pseries/vio.c |  4 +-
 drivers/of/fdt.c | 69 +---
 7 files changed, 23 insertions(+), 84 deletions(-)

-- 
2.11.0



Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread David Miller
From: Jonathan Cameron 
Date: Wed, 26 Jul 2017 00:52:07 +0800

> On Tue, 25 Jul 2017 08:12:45 -0700
> "Paul E. McKenney"  wrote:
> 
>> On Tue, Jul 25, 2017 at 10:42:45PM +0800, Jonathan Cameron wrote:
>> > On Tue, 25 Jul 2017 06:46:26 -0700
>> > "Paul E. McKenney"  wrote:
>> >   
>> > > On Tue, Jul 25, 2017 at 10:26:54PM +1000, Nicholas Piggin wrote:  
>> > > > On Tue, 25 Jul 2017 19:32:10 +0800
>> > > > Jonathan Cameron  wrote:
>> > > > 
>> > > > > Hi All,
>> > > > > 
>> > > > > We observed a regression on our d05 boards (but curiously not
>> > > > > the fairly similar but single socket / smaller core count
>> > > > > d03), initially seen with linux-next prior to the merge window
>> > > > > and still present in v4.13-rc2.
>> > > > > 
>> > > > > The symptom is:
>> > > 
>> > > Adding Dave Miller and the sparcli...@vger.kernel.org email on CC, as
>> > > they have been seeing something similar, and you might well have saved
>> > > them the trouble of bisecting.
>> > > 
>> > > [ . . . ]
>> > >   
>> > > > > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 
>> > > > > c1565 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1
>> > > 
>> > > This is the cause from an RCU perspective.  You had a lot of idle CPUs,
>> > > and RCU is not permitted to disturb them -- the battery-powered embedded
>> > > guys get very annoyed by that sort of thing.  What happens instead is
>> > > that each CPU updates a per-CPU state variable when entering or exiting
>> > > idle, and the grace-period kthread ("rcu_preempt kthread" in the above
>> > > message) checks these state variables, and when it sees an idle CPU,
>> > > it reports a quiescent state on that CPU's behalf.
>> > > 
>> > > But the grace-period kthread can only do this work if it gets a chance
>> > > to run.  And the message above says that this kthread hasn't had a chance
>> > > to run for a full 5,663 jiffies.  For completeness, the "g1566 c1565"
>> > > says that grace period #1566 is in progress, the "f0x0" says that no one
>> > > is needing another grace period #1567.  The "RCU_GP_WAIT_FQS(3)" says
>> > > that the grace-period kthread has fully initialized the current grace
>> > > period and is sleeping for a few jiffies waiting to scan for idle tasks.
>> > > Finally, the "->state=0x1" says that the grace-period kthread is in
>> > > TASK_INTERRUPTIBLE state, in other words, still sleeping.  
>> > 
>> > Thanks for the explanation!  
>> > > 
>> > > So my first question is "What did commit 05a4a9527 (kernel/watchdog:
>> > > split up config options) do to prevent the grace-period kthread from
>> > > getting a chance to run?"   
>> > 
>> > As far as we can tell it was a side effect of that patch.
>> > 
>> > The real cause is that the patch changed the resulting defconfigs to stop
>> > running the softlockup detector - the relevant option is now
>> > CONFIG_SOFTLOCKUP_DETECTOR.
>> > 
>> > Enabling that on 4.13-rc2 (and presumably everything in between)
>> > means we don't see the problem any more.
>> >   
>> > > I must confess that I don't see anything
>> > > obvious in that commit, so my second question is "Are we sure that
>> > > reverting this commit makes the problem go away?"  
>> > 
>> > Simply enabling CONFIG_SOFTLOCKUP_DETECTOR seems to make it go away.
>> > That detector fires up a thread on every cpu, which may be relevant.  
>> 
>> Interesting...  Why should it be necessary to fire up a thread on every
>> CPU in order to make sure that RCU's grace-period kthreads get some
>> CPU time?  Especially given how many idle CPUs you had on your system.
>> 
>> So I have to ask if there is some other bug that the softlockup detector
>> is masking.
> I am thinking the same.  We can try going back further than 4.12 tomorrow
> (we think we can realistically go back to 4.8 and possibly 4.6
> with this board)

Just to report, turning softlockup back on fixes things for me on
sparc64 too.

The thing about softlockup is it runs an hrtimer, which seems to run
about every 4 seconds.

So I wonder if this is a NO_HZ problem.


Re: [PATCH] powerpc/pseries: Fix of_node_put() underflow during pseries remove

2017-07-25 Thread Tyrel Datwyler
On 07/24/2017 09:47 PM, Michael Ellerman wrote:
> Tyrel Datwyler  writes:
> 
>> On 07/24/2017 03:42 AM, Michael Ellerman wrote:
>>> Laurent Vivier  writes:
>>>
 As for commit 68baf692c435 ("powerpc/pseries: Fix of_node_put()
 underflow during DLPAR remove"), the call to of_node_put()
 must be removed from pSeries_reconfig_remove_node().

 dlpar_detach_node() and pSeries_reconfig_remove_node() call
 of_detach_node(), and thus the node should not be released
 in this case too.

 Signed-off-by: Laurent Vivier 
 ---
  arch/powerpc/platforms/pseries/reconfig.c | 1 -
  1 file changed, 1 deletion(-)
>>>
>>> Thanks. I'll spare you the swearing about why we have the same bug in
>>> two places.
>>
>> That's probably my bad. I must have failed to test with older powerpc-util 
>> tooling where
>> drmgr uses the /proc/ofdt interface for device tree modification.
> 
> OK. Really we should have automated tests of the various cases, I've
> just never had time to write any.

Agreed, some better CI is warranted.

> 
> Mainly the thing that bugs me is that we still have the two separate
> paths. Or if we must maintain both they could at least share more code,
> the two functions do basically the same thing AFAICS.

Yeah, I think that is where I dropped the ball. I wrongly assumed, without
looking closely enough, that the code was shared in those two paths. There is
definitely some code de-duplication work that can be done.

-Tyrel

> 
> cheers
> 



RE: [PATCH 0/4] Allow non-legacy cards to be vgaarb default

2017-07-25 Thread Gabriele Paoloni
Hi Laszlo

[...]

> 
> Having practically zero background in gfx development (either kernel or
> Xorg), I think the problem is that vga_default_device() /
> vga_set_default_device(), which -- apparently -- "boot_vga" is based
> upon, come from "drivers/gpu/vga/vgaarb.c". Namely, the concept of
> "primary / boot display device" is tied to the VGA arbiter, plus only a
> PCI device can currently be marked as primary/boot display device.
> 
> Can these concepts be split from each other? (I can fully imagine that
> this would result in a userspace visible interface change (or
> addition),
> so that e.g. "/sys/devices/**/boot_gpu" would have to be consulted by
> display servers.)
> 
> (Sorry if I'm totally wrong.)
> 
> ... Hm, reading the thread starter at
> <https://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg120851.html>,
> and the references within... It looks like this work is motivated by
> hardware that is supposed to be PCI, but actually breaks the specs. Is
> that correct? If so, then I don't think I can suggest anything useful.

My understanding is that the current PCIe HW is spec-compliant, but the
vgaarb, in order to make a VGA device the default one, requires all the
bridges above such a device to have the "VGA Enable" bit set (an optional
bit in the PCI Express™ to PCI/PCI-X Bridge Spec). I.e. all the bridges
above have to support legacy VGA devices, and this is not mandatory
per the specs...right?
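
For illustration, what I mean amounts to a walk up the bridge chain,
something like the sketch below (not the actual vgaarb code; it only assumes
the standard config accessors and register definitions from <linux/pci.h>):

#include <linux/pci.h>

/* Returns true only if every bridge above pdev forwards legacy VGA cycles. */
static bool bridges_forward_vga(struct pci_dev *pdev)
{
        struct pci_bus *bus;
        u16 ctl;

        for (bus = pdev->bus; bus; bus = bus->parent) {
                if (!bus->self)         /* reached the root bus, no more bridges */
                        break;
                pci_read_config_word(bus->self, PCI_BRIDGE_CONTROL, &ctl);
                if (!(ctl & PCI_BRIDGE_CTL_VGA))
                        return false;   /* this bridge does not decode legacy VGA */
        }
        return true;
}

A device behind a bridge that leaves that optional bit clear can therefore
never become the arbiter's legacy default, which is the case I'm describing.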

BTW my VGA experience is limited too...this is just my understanding...

Gab

> Specs exist so that hardware vendors and software authors follow them.
> If hardware does not conform, then software should either refuse to
> work
> with it, or handle it with quirks, on a case-by-case basis. I guess
> this
> means that I don't agree with the
> 
>   broad[] suggest[ion] that a more generic solution would be better
> 
> which seems to disqualify me from the discussion, as it must have been
> suggested by people with incomparably more experience than what I have
> :)
> 
> Thanks
> Laszlo


Re: [PATCH v2] include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH

2017-07-25 Thread Alex Williamson
[cc +Alexey, David]

Any comments from the usual suspects for vfio/spapr?  Thanks,

Alex

On Tue, 25 Jul 2017 10:56:38 -0300
Murilo Opsfelder Araújo  wrote:

> On 07/18/2017 02:22 PM, Murilo Opsfelder Araujo wrote:
> > When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, build fails with the
> > following:
> > 
> > drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release':
> > vfio_pci.c:(.text+0xa98): undefined reference to 
> > `.vfio_spapr_pci_eeh_release'
> > drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open':
> > vfio_pci.c:(.text+0x1420): undefined reference to 
> > `.vfio_spapr_pci_eeh_open'
> > 
> > In this case, vfio_pci.c should use the empty definitions of
> > vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions.
> > 
> > This patch fixes it by guarding these function definitions with
> > CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is
> > built, which is where the non-empty versions of these functions are. We 
> > need to
> > make use of IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate
> > option.
> > 
> > This issue was found during a randconfig build. Logs are here:
> > 
> > http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/
> > 
> > Signed-off-by: Murilo Opsfelder Araujo 
> > ---
> > 
> > Changes from v1:
> > - Rebased on top of next-20170718.  
> 
> Hi, Alex.
> 
> Are you applying this?
> 
> Thanks!
> 



Re: [RFC Part1 PATCH v3 02/17] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature

2017-07-25 Thread Borislav Petkov
On Tue, Jul 25, 2017 at 10:29:40AM -0500, Tom Lendacky wrote:
> But early_identify_cpu() calls get_cpu_cap() which will check for cpuid
> leaf 0x80000008 support and set x86_phys_bits.

Right, but it can't be less than 32, can it? And if it is more than 32
bits, then it probably doesn't really matter on 32-bit. Unless it is
less than 36 bits and you do PAE...

> I'll try to build and run a 32-bit kernel and see how this all flows.

Yeah, that would be good.

Thanks.

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
-- 


Re: [PATCH v3 4/5] powerpc/lib/sstep: Add prty instruction emulation

2017-07-25 Thread Segher Boessenkool
On Tue, Jul 25, 2017 at 01:33:19PM +1000, Matt Brown wrote:
> +static nokprobe_inline void do_prty(struct pt_regs *regs, unsigned long v,
> + int size, int ra)
> +{
> + unsigned long long res = v;
> +
> + res = (0x0001000100010001 & res) + (0x0001000100010001 & (res >> 8));
> + res = (0x0000000700000007 & res) + (0x0000000700000007 & (res >> 16));
> + if (size == 32) {   /* prtyw */
> + regs->gpr[ra] = (0x0000000100000001 & res);
> + return;
> + }
> +
> + res = (0x000000000000000f & res) + (0x000000000000000f & (res >> 32));
> + regs->gpr[ra] = res & 1;/*prtyd */
> +}

Those 7's and 0xf look strange (since the top bit in the values there
is always 0).  You always "and" the values with 1 later; you could do
that immediately (and change + to ^).
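
Roughly, what I have in mind (just a sketch, untested, keeping your function
signature; it relies only on the prtyw/prtyd definition of XORing the
least-significant bit of each byte) is:

static nokprobe_inline void do_prty(struct pt_regs *regs, unsigned long v,
                                    int size, int ra)
{
        unsigned long long res = v ^ (v >> 8);

        res ^= res >> 16;
        if (size == 32) {                       /* prtyw */
                regs->gpr[ra] = res & 0x0000000100000001ULL;
                return;
        }

        res ^= res >> 32;
        regs->gpr[ra] = res & 1;                /* prtyd */
}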

A general question about these patches: some things are inside #ifdef
__powerpc64__, some are not.  It seems it is the wrong macro, and it
should be used (or not used) consistently?


Segher


Re: [RFC Part1 PATCH v3 02/17] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature

2017-07-25 Thread Tom Lendacky

On 7/25/2017 10:13 AM, Borislav Petkov wrote:

On Tue, Jul 25, 2017 at 09:58:54AM -0500, Tom Lendacky wrote:

True, but it is more about being accurate and making sure the value is
correct wherever it may be used.


So early_identify_cpu() initializes phys_bits to 32 on 32-bit.
Subtracting it there would actually make it actively wrong, AFAICT.


But early_identify_cpu() calls get_cpu_cap() which will check for cpuid
leaf 0x8008 support and set x86_phys_bits.  I'll try to build and
run a 32-bit kernel and see how this all flows.

Thanks,
Tom





Re: [RFC Part1 PATCH v3 02/17] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature

2017-07-25 Thread Borislav Petkov
On Tue, Jul 25, 2017 at 09:58:54AM -0500, Tom Lendacky wrote:
> True, but it is more about being accurate and making sure the value is
> correct where ever it may be used.

So early_identify_cpu() initializes phys_bits to 32 on 32-bit.
Subtracting it there would actually make it actively wrong, AFAICT.

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
-- 


Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Paul E. McKenney
On Tue, Jul 25, 2017 at 10:42:45PM +0800, Jonathan Cameron wrote:
> On Tue, 25 Jul 2017 06:46:26 -0700
> "Paul E. McKenney"  wrote:
> 
> > On Tue, Jul 25, 2017 at 10:26:54PM +1000, Nicholas Piggin wrote:
> > > On Tue, 25 Jul 2017 19:32:10 +0800
> > > Jonathan Cameron  wrote:
> > >   
> > > > Hi All,
> > > > 
> > > > We observed a regression on our d05 boards (but curiously not
> > > > the fairly similar but single socket / smaller core count
> > > > d03), initially seen with linux-next prior to the merge window
> > > > and still present in v4.13-rc2.
> > > > 
> > > > The symptom is:  
> > 
> > Adding Dave Miller and the sparcli...@vger.kernel.org email on CC, as
> > they have been seeing something similar, and you might well have saved
> > them the trouble of bisecting.
> > 
> > [ . . . ]
> > 
> > > > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 
> > > > c1565 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1  
> > 
> > This is the cause from an RCU perspective.  You had a lot of idle CPUs,
> > and RCU is not permitted to disturb them -- the battery-powered embedded
> > guys get very annoyed by that sort of thing.  What happens instead is
> > that each CPU updates a per-CPU state variable when entering or exiting
> > idle, and the grace-period kthread ("rcu_preempt kthread" in the above
> > message) checks these state variables, and when it sees an idle CPU,
> > it reports a quiescent state on that CPU's behalf.
> > 
> > But the grace-period kthread can only do this work if it gets a chance
> > to run.  And the message above says that this kthread hasn't had a chance
> > to run for a full 5,663 jiffies.  For completeness, the "g1566 c1565"
> > says that grace period #1566 is in progress, the "f0x0" says that no one
> > is needing another grace period #1567.  The "RCU_GP_WAIT_FQS(3)" says
> > that the grace-period kthread has fully initialized the current grace
> > period and is sleeping for a few jiffies waiting to scan for idle tasks.
> > Finally, the "->state=0x1" says that the grace-period kthread is in
> > TASK_INTERRUPTIBLE state, in other words, still sleeping.
> 
> Thanks for the explanation!
> > 
> > So my first question is "What did commit 05a4a9527 (kernel/watchdog:
> > split up config options) do to prevent the grace-period kthread from
> > getting a chance to run?" 
> 
> As far as we can tell it was a side effect of that patch.
> 
> The real cause is that the patch changed the resulting defconfigs to stop running
> the softlockup detector - the relevant option is now CONFIG_SOFTLOCKUP_DETECTOR.
> 
> Enabling that on 4.13-rc2 (and presumably everything in between)
> means we don't see the problem any more.
> 
> > I must confess that I don't see anything
> > obvious in that commit, so my second question is "Are we sure that
> > reverting this commit makes the problem go away?"
> 
> Simply enabling CONFIG_SOFTLOCKUP_DETECTOR seems to make it go away.
> That detector fires up a thread on every cpu, which may be relevant.

Interesting...  Why should it be necessary to fire up a thread on every
CPU in order to make sure that RCU's grace-period kthreads get some
CPU time?  Especially given how many idle CPUs you had on your system.

So I have to ask if there is some other bug that the softlockup detector
is masking.

> > and my third is "Is
> > this an intermittent problem that led to a false bisection?"
> 
> Whilst it is a bit slow to occur, we verified with long runs on either
> side of that patch, and since then with the option enabled on latest mainline.
> 
> We can also cause the issue before that patch by disabling the previously
> relevant option on 4.12.

OK, thank you -- hard to argue with that!  ;-)

Except that I am still puzzled as to why per-CPU softlockup threads
are needed for RCU's kthreads to get their wakeups.  We really should
be able to disable softlockup and still have kthreads get wakeups and
access to CPU, after all.

> > [ . . . ]
> > 
> > > > Reducing the RCU CPU stall timeout makes it happen more often,
> > > > but we are seeing it even with the default value of 24 seconds.
> > > > 
> > > > Tends to occur after a period of relatively low usage, but has
> > > > also been seen midway through performance tests.
> > > > 
> > > > This was not seen with v4.12, so a bisection run later led to
> > > > commit 05a4a9527 (kernel/watchdog: split up config options).
> > > > 
> > > > Which was odd until we discovered that a side effect of this patch
> > > > was to change whether the softlockup detector was enabled or not in
> > > > the arm64 defconfig.
> > > > 
> > > > On 4.13-rc2 enabling the softlockup detector indeed stopped us
> > > > seeing the rcu issue. Disabling the equivalent on 4.12 made the
> > > > issue occur there as well.
> > > > 
> > > > Clearly the softlockup detector results in a thread on every cpu,
> > > > which might be related but beyond that we are still looking into
> > > > the issue.
> > > > 
> > > > So the obvious question is whether anyone 

Re: [RFC Part1 PATCH v3 01/17] Documentation/x86: Add AMD Secure Encrypted Virtualization (SEV) descrption

2017-07-25 Thread Brijesh Singh



On 07/25/2017 12:45 AM, Borislav Petkov wrote:

On Mon, Jul 24, 2017 at 02:07:41PM -0500, Brijesh Singh wrote:

Subject: Re: [RFC Part1 PATCH v3 01/17] Documentation/x86: Add AMD Secure 
Encrypted Virtualization (SEV) descrption

 ^^

Please introduce a spellchecker into your workflow.


Update amd-memory-encryption document describing the AMD Secure Encrypted


"Update the AMD memory encryption document...

The patch has the proper URL already.


Virtualization (SEV) feature.

Signed-off-by: Brijesh Singh 
---
  Documentation/x86/amd-memory-encryption.txt | 29 ++---
  1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/Documentation/x86/amd-memory-encryption.txt 
b/Documentation/x86/amd-memory-encryption.txt
index f512ab7..747df07 100644
--- a/Documentation/x86/amd-memory-encryption.txt
+++ b/Documentation/x86/amd-memory-encryption.txt
@@ -1,4 +1,5 @@
-Secure Memory Encryption (SME) is a feature found on AMD processors.
+Secure Memory Encryption (SME) and Secure Encrypted Virtualization (SEV) are
+features found on AMD processors.
  
  SME provides the ability to mark individual pages of memory as encrypted using

  the standard x86 page tables.  A page that is marked encrypted will be
@@ -6,6 +7,12 @@ automatically decrypted when read from DRAM and encrypted when 
written to
  DRAM.  SME can therefore be used to protect the contents of DRAM from physical
  attacks on the system.
  
+SEV enables running encrypted virtual machine (VMs) in which the code and data


 machines


+of the virtual machine are secured so that decrypted version is available only


... of the guest VM ...   ... so that a decrypted ...


+within the VM itself. SEV guest VMs have concept of private and shared memory.


have *the* concept - you need to use
definite and indefinite articles in your
text.


+Private memory is encrypted with the guest-specific key, while shared memory
+may be encrypted with hypervisor key.


And here you explain that the hypervisor key is the same key which we
use in SME. So that people can make the connection.


+
  A page is encrypted when a page table entry has the encryption bit set (see
  below on how to determine its position).  The encryption bit can also be
  specified in the cr3 register, allowing the PGD table to be encrypted. Each
@@ -19,11 +26,20 @@ so that the PGD is encrypted, but not set the encryption 
bit in the PGD entry
  for a PUD which results in the PUD pointed to by that entry to not be
  encrypted.
  
-Support for SME can be determined through the CPUID instruction. The CPUID

-function 0x801f reports information related to SME:
+When SEV is enabled, certain type of memory (namely insruction pages and guest


When SEV is enabled, instruction pages and guest page tables are ...


+page tables) are always treated as private. Due to security reasons all DMA


security reasons??


+operations inside the guest must be performed on shared memory. Since the
+memory encryption bit is only controllable by the guest OS when it is operating


 ... is controlled ...


+in 64-bit or 32-bit PAE mode, in all other modes the SEV hardware forces memory


... forces the 
memory ...


+encryption bit to 1.
+
+Support for SME and SEV can be determined through the CPUID instruction. The
+CPUID function 0x801f reports information related to SME:
  
  	0x801f[eax]:

Bit[0] indicates support for SME
+   0x81f[eax]:


There's a 0 missing and you don't really need it as it is already above.


+   Bit[1] indicates support for SEV
0x801f[ebx]:
Bits[5:0]  pagetable bit number used to activate memory
   encryption
@@ -39,6 +55,13 @@ determine if SME is enabled and/or to enable memory 
encryption:
Bit[23]   0 = memory encryption features are disabled
  1 = memory encryption features are enabled
  
+If SEV is supported, MSR 0xc0010131 (MSR_F17H_SEV) can be used to determine if


If this MSR is going to be part of the architecture - and I really think
it is - then call it MSR_AMD64_SEV.



Thanks Boris, I'll update the doc per your feedback, and will rename the MSR to
MSR_AMD64_SEV.
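
For illustration, a guest-side check against the renamed MSR could then look
like the sketch below (hypothetical until the respin: it assumes the MSR keeps
the 0xc0010131 address from the doc above and that bit 0 reports whether SEV
is active for the current guest):

#define MSR_AMD64_SEV           0xc0010131
#define MSR_AMD64_SEV_ENABLED   BIT_ULL(0)

static bool sev_active_guest(void)
{
        u64 status;

        /* Read the SEV status MSR and test the (assumed) enable bit. */
        rdmsrl(MSR_AMD64_SEV, status);
        return !!(status & MSR_AMD64_SEV_ENABLED);
}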

-Brijesh


Re: [RFC Part1 PATCH v3 02/17] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature

2017-07-25 Thread Tom Lendacky

On 7/25/2017 9:36 AM, Borislav Petkov wrote:

On Tue, Jul 25, 2017 at 09:29:40AM -0500, Tom Lendacky wrote:

Yup, we can do something like that.  I believe the only change that
would be needed to your patch would be to move the IS_ENABLED() check
to after the physical address space reduction check.


Yeah, I wasn't sure about that. The logic is that if BIOS has enabled
SME and thus reduction is in place, we need to update x86_phys_bits on
32-bit regardless, right?

But, come to think of it, that reduction won't have any effect since we
have 32-bit addresses and the reduction is above 32-bits, right? And
thus it is moot.



True, but it is more about being accurate and making sure the value is
correct wherever it may be used.

Thanks,
Tom



Or?



Re: [RFC Part1 PATCH v3 02/17] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature

2017-07-25 Thread Tom Lendacky

On 7/25/2017 5:26 AM, Borislav Petkov wrote:

On Mon, Jul 24, 2017 at 02:07:42PM -0500, Brijesh Singh wrote:

From: Tom Lendacky 

Update the CPU features to include identifying and reporting on the
Secure Encrypted Virtualization (SEV) feature.  SME is identified by
CPUID 0x801f, but requires BIOS support to enable it (set bit 23 of
MSR_K8_SYSCFG and set bit 0 of MSR_K7_HWCR).  Only show the SEV feature
as available if reported by CPUID and enabled by BIOS.

Signed-off-by: Tom Lendacky 
Signed-off-by: Brijesh Singh 
---
  arch/x86/include/asm/cpufeatures.h |  1 +
  arch/x86/include/asm/msr-index.h   |  2 ++
  arch/x86/kernel/cpu/amd.c  | 30 +-
  arch/x86/kernel/cpu/scattered.c|  1 +
  4 files changed, 29 insertions(+), 5 deletions(-)


...


@@ -637,6 +642,21 @@ static void early_init_amd(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_SME);
}
}
+
+   if (cpu_has(c, X86_FEATURE_SEV)) {
+   if (IS_ENABLED(CONFIG_X86_32)) {
+   clear_cpu_cap(c, X86_FEATURE_SEV);
+   } else {
+   u64 syscfg, hwcr;
+
+   /* Check if SEV is enabled */
+   rdmsrl(MSR_K8_SYSCFG, syscfg);
+   rdmsrl(MSR_K7_HWCR, hwcr);
+   if (!(syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT) ||
+   !(hwcr & MSR_K7_HWCR_SMMLOCK))
+   clear_cpu_cap(c, X86_FEATURE_SEV);
+   }
+   }


Let's simplify this and read the MSRs only once. Diff ontop. Please
check if I'm missing a case:


Yup, we can do something like that.  I believe the only change that
would be needed to your patch would be to move the IS_ENABLED() check
to after the physical address space reduction check.

Thanks,
Tom
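
For what it's worth, one possible shape of that reordering (just a sketch of
the idea, not a tested patch; it reuses the helpers and MSR bits from the diff
quoted below and assumes the SME/SEV CPUID leaf is 0x8000001f) would be to
adjust x86_phys_bits before the 32-bit bail-out:

static void early_detect_mem_enc(struct cpuinfo_x86 *c)
{
        u64 syscfg, hwcr;

        if (!cpu_has(c, X86_FEATURE_SME) && !cpu_has(c, X86_FEATURE_SEV))
                return;

        /* If BIOS has not enabled SME, neither feature is usable. */
        rdmsrl(MSR_K8_SYSCFG, syscfg);
        if (!(syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT))
                goto clear_all;

        /* The reduction applies whenever BIOS enabled SME, even on 32-bit. */
        c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;

        /* Long mode is required to actually use SME/SEV. */
        if (IS_ENABLED(CONFIG_X86_32))
                goto clear_all;

        /* SEV additionally requires SMMLOCK to be set. */
        rdmsrl(MSR_K7_HWCR, hwcr);
        if (!(hwcr & MSR_K7_HWCR_SMMLOCK))
                goto clear_sev;

        return;

clear_all:
        clear_cpu_cap(c, X86_FEATURE_SME);
clear_sev:
        clear_cpu_cap(c, X86_FEATURE_SEV);
}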



---
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c413f04bdd41..79af07731ab1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -546,6 +546,48 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
  }
  
+static void early_detect_mem_enc(struct cpuinfo_x86 *c)

+{
+   u64 syscfg, hwcr;
+
+   /*
+* BIOS support is required for SME and SEV.
+*   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+*the SME physical address space reduction value.
+*If BIOS has not enabled SME then don't advertise the
+*SME feature (set in scattered.c).
+*   For SEV: If BIOS has not enabled SEV then don't advertise the
+*SEV feature (set in scattered.c).
+*
+*   In all cases, since support for SME and SEV requires long mode,
+*   don't advertise the feature under CONFIG_X86_32.
+*/
+   if (cpu_has(c, X86_FEATURE_SME) ||
+   cpu_has(c, X86_FEATURE_SEV)) {
+
+   if (IS_ENABLED(CONFIG_X86_32))
+   goto clear;
+
+   /* Check if SME is enabled */
+   rdmsrl(MSR_K8_SYSCFG, syscfg);
+   if (!(syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT))
+   goto clear;
+
+   c->x86_phys_bits -= (cpuid_ebx(0x801f) >> 6) & 0x3f;
+
+   /* Check if SEV is enabled */
+   rdmsrl(MSR_K7_HWCR, hwcr);
+   if (!(hwcr & MSR_K7_HWCR_SMMLOCK))
+   goto clear_sev;
+
+   return;
+clear:
+   clear_cpu_cap(c, X86_FEATURE_SME);
+clear_sev:
+   clear_cpu_cap(c, X86_FEATURE_SEV);
+   }
+}
+
  static void early_init_amd(struct cpuinfo_x86 *c)
  {
u32 dummy;
@@ -617,46 +659,8 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_E400);
  
-	/*

-* BIOS support is required for SME and SEV.
-*   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
-*the SME physical address space reduction value.
-*If BIOS has not enabled SME then don't advertise the
-*SME feature (set in scattered.c).
-*   For SEV: If BIOS has not enabled SEV then don't advertise the
-*SEV feature (set in scattered.c).
-*
-*   In all cases, since support for SME and SEV requires long mode,
-*   don't advertise the feature under CONFIG_X86_32.
-*/
-   if (cpu_has(c, X86_FEATURE_SME)) {
-   u64 msr;
-
-   /* Check if SME is enabled */
-   rdmsrl(MSR_K8_SYSCFG, msr);
-   if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
-   c->x86_phys_bits -= (cpuid_ebx(0x801f) >> 6) & 0x3f;
-   if (IS_ENABLED(CONFIG_X86_32))
-   clear_cpu_cap(c, 

Re: [RFC Part1 PATCH v3 02/17] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature

2017-07-25 Thread Borislav Petkov
On Tue, Jul 25, 2017 at 09:29:40AM -0500, Tom Lendacky wrote:
> Yup, we can do something like that.  I believe the only change that
> would be needed to your patch would be to move the IS_ENABLED() check
> to after the physical address space reduction check.

Yeah, I wasn't sure about that. The logic is that if BIOS has enabled
SME and thus reduction is in place, we need to update x86_phys_bits on
32-bit regardless, right?

But, come to think of it, that reduction won't have any effect since we
have 32-bit addresses and the reduction is above 32-bits, right? And
thus it is moot.

Or?

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
-- 


Re: [PATCH v2] include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH

2017-07-25 Thread Murilo Opsfelder Araújo
On 07/18/2017 02:22 PM, Murilo Opsfelder Araujo wrote:
> When CONFIG_EEH=y and CONFIG_VFIO_SPAPR_EEH=n, build fails with the
> following:
> 
> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_release':
> vfio_pci.c:(.text+0xa98): undefined reference to 
> `.vfio_spapr_pci_eeh_release'
> drivers/vfio/pci/vfio_pci.o: In function `.vfio_pci_open':
> vfio_pci.c:(.text+0x1420): undefined reference to 
> `.vfio_spapr_pci_eeh_open'
> 
> In this case, vfio_pci.c should use the empty definitions of
> vfio_spapr_pci_eeh_open and vfio_spapr_pci_eeh_release functions.
> 
> This patch fixes it by guarding these function definitions with
> CONFIG_VFIO_SPAPR_EEH, the symbol that controls whether vfio_spapr_eeh.c is
> built, which is where the non-empty versions of these functions are. We need 
> to
> make use of IS_ENABLED() macro because CONFIG_VFIO_SPAPR_EEH is a tristate
> option.
> 
> This issue was found during a randconfig build. Logs are here:
> 
> http://kisskb.ellerman.id.au/kisskb/buildresult/12982362/
> 
> Signed-off-by: Murilo Opsfelder Araujo 
> ---
> 
> Changes from v1:
> - Rebased on top of next-20170718.
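
For reference, the guard described above would take roughly the following
shape in include/linux/vfio.h (a sketch, assuming the two declarations keep
their current pci_dev-based signatures; the real change is the patch quoted
above, this is only to make the IS_ENABLED() point concrete):

#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
#else
static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
{
}

static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
{
}
#endif /* CONFIG_VFIO_SPAPR_EEH */

Because the option is tristate, a plain #ifdef would miss the =m case, which
is why IS_ENABLED() is used.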

Hi, Alex.

Are you applying this?

Thanks!

-- 
Murilo


Re: [PATCH v3] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-25 Thread Santosh Sivaraj
* Benjamin Herrenschmidt  wrote (on 2017-07-25 
20:07:28 +1000):

> On Tue, 2017-07-25 at 12:26 +0530, Santosh Sivaraj wrote:
> 
> > +static notrace void kernel_get_tspec(struct timespec *tp,
> > +struct vdso_data *vdata, u32 *wtom_sec,
> > +u32 *wtom_nsec)
> > +{
> > +   u64 tb;
> > +   u32 update_count;
> 
> This is broken:
> 
> > +   do {
> > +   /* check for update count & load values */
> > +   update_count = vdata->tb_update_count;
> > +
> > +   /* Get TB, offset it and scale result */
> > +   tb = mulhdu((get_tb() - vdata->tb_orig_stamp) << 12,
> > +   vdata->tb_to_xs) + vdata->stamp_sec_fraction;
> > +   tp->tv_sec = vdata->stamp_xtime.tv_sec;
> > +   if (wtom_sec)
> > +   *wtom_sec = vdata->wtom_clock_sec;
> > +   if (wtom_nsec)
> > +   *wtom_nsec = vdata->wtom_clock_nsec;
> > +   } while (update_count != vdata->tb_update_count);
> 
> The assembly code is carefuly crafted to create a chain of data
> dependencies in order to enforce the ordering in this function,
> you completely broke it.
> 
> IE. the pointer used to access tb_orig_stamp etc... depends on the
> initial update count, and the final read of the update count depends
> on all the previously read values (or should), thus ordering those
> loads. Without that you'll need more expensive lwsync's.
> 
> Additionally, you broke another semantic of the seqlock which is
> to spin on the first update count if it has an odd value.
> 
> The same problem exist in all your other implementations.
> 
> I am really NOT a fan of that attempt at converting to C. The code is
> hand crafted assembly for a number of reasons, including the above ones
> and maximum performance.
> 
> As it is, it's deeply broken.

I get the point. I looked at the generated assembly a bit closer; the update
count is optimized out. Will send an alternative asm-only patch.
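
For readers following along, the retry discipline Ben describes is the usual
seqcount pattern; in C it would look roughly like the sketch below (conceptual
only -- the real vdso code enforces the ordering with hand-crafted assembly
and data dependencies rather than explicit barriers; the field names are
simply the ones used in the patch above):

static notrace void read_stamp(const struct vdso_data *vdata, u64 *sec, u64 *nsec)
{
        u32 seq;

        do {
                seq = READ_ONCE(vdata->tb_update_count);
                if (seq & 1)            /* writer mid-update: spin, don't use the data */
                        continue;
                smp_rmb();              /* order the count read before the data reads */
                *sec  = vdata->stamp_xtime.tv_sec;
                *nsec = vdata->stamp_xtime.tv_nsec;
                smp_rmb();              /* order the data reads before the re-check */
        } while (seq != READ_ONCE(vdata->tb_update_count));
}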

Thanks,
Santosh
> 
> > +
> > +   tp->tv_nsec = ((u64)mulhwu(tb, NSEC_PER_SEC) << 32) >> 32;
> > +   tp->tv_sec += (tb >> 32);
> > +}
> > +
> > +static notrace int clock_get_realtime(struct timespec *tp,
> > + struct vdso_data *vdata)
> > +{
> > +   kernel_get_tspec(tp, vdata, NULL, NULL);
> > +
> > +   return 0;
> > +}
> > +
> > +static notrace int clock_get_monotonic(struct timespec *tp,
> > +  struct vdso_data *vdata)
> > +{
> > +   __s32 wtom_sec, wtom_nsec;
> > +   u64 nsec;
> > +
> > +   kernel_get_tspec(tp, vdata, _sec, _nsec);
> > +
> > +   tp->tv_sec += wtom_sec;
> > +
> > +   nsec = tp->tv_nsec;
> > +   tp->tv_nsec = 0;
> > +   timespec_add_ns(tp, nsec + wtom_nsec);
> > +
> > +   return 0;
> > +}
> > +
> > +static notrace int clock_realtime_coarse(struct timespec *tp,
> > +struct vdso_data *vdata)
> > +{
> > +   u32 update_count;
> > +
> > +   do {
> > +   /* check for update count & load values */
> > +   update_count = vdata->tb_update_count;
> > +
> > +   tp->tv_sec = vdata->stamp_xtime.tv_sec;
> > +   tp->tv_nsec = vdata->stamp_xtime.tv_nsec;
> > +   } while (update_count != vdata->tb_update_count);
> > +
> > +   return 0;
> > +}
> > +
> > +static notrace int clock_monotonic_coarse(struct timespec *tp,
> > + struct vdso_data *vdata)
> > +{
> > +   __s32 wtom_sec, wtom_nsec;
> > +   u64 nsec;
> > +   u32 update_count;
> > +
> > +   do {
> > +   /* check for update count & load values */
> > +   update_count = vdata->tb_update_count;
> > +
> > +   tp->tv_sec = vdata->stamp_xtime.tv_sec;
> > +   tp->tv_nsec = vdata->stamp_xtime.tv_nsec;
> > +   wtom_sec = vdata->wtom_clock_sec;
> > +   wtom_nsec = vdata->wtom_clock_nsec;
> > +   } while (update_count != vdata->tb_update_count);
> > +
> > +   tp->tv_sec += wtom_sec;
> > +   nsec = tp->tv_nsec;
> > +   tp->tv_nsec = 0;
> > +   timespec_add_ns(tp, nsec + wtom_nsec);
> > +
> > +   return 0;
> > +}
> > +
> > +notrace int kernel_clock_gettime(clockid_t clk_id, struct timespec *tp)
> > +{
> > +   int ret;
> > +   struct vdso_data *vdata = __get_datapage();
> > +
> > +   if (!tp || !vdata)
> > +   return -EBADR;
> > +
> > +   switch (clk_id) {
> > +   case CLOCK_REALTIME:
> > +   ret = clock_get_realtime(tp, vdata);
> > +   break;
> > +   case CLOCK_MONOTONIC:
> > +   ret = clock_get_monotonic(tp, vdata);
> > +   break;
> > +   case CLOCK_REALTIME_COARSE:
> > +   ret = clock_realtime_coarse(tp, vdata);
> > +   break;
> > +   case CLOCK_MONOTONIC_COARSE:
> > +   ret = clock_monotonic_coarse(tp, vdata);
> > +   break;
> > +   default:
> > +   /* fallback to syscall */
> > +   ret = -1;
> > +   break;
> > +   }
> > +
> > +   return ret;
> > +}
> > diff --git 

Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Paul E. McKenney
On Tue, Jul 25, 2017 at 10:26:54PM +1000, Nicholas Piggin wrote:
> On Tue, 25 Jul 2017 19:32:10 +0800
> Jonathan Cameron  wrote:
> 
> > Hi All,
> > 
> > We observed a regression on our d05 boards (but curiously not
> > the fairly similar but single socket / smaller core count
> > d03), initially seen with linux-next prior to the merge window
> > and still present in v4.13-rc2.
> > 
> > The symptom is:

Adding Dave Miller and the sparcli...@vger.kernel.org email on CC, as
they have been seeing something similar, and you might well have saved
them the trouble of bisecting.

[ . . . ]

> > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 c1565 
> > f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1

This is the cause from an RCU perspective.  You had a lot of idle CPUs,
and RCU is not permitted to disturb them -- the battery-powered embedded
guys get very annoyed by that sort of thing.  What happens instead is
that each CPU updates a per-CPU state variable when entering or exiting
idle, and the grace-period kthread ("rcu_preempt kthread" in the above
message) checks these state variables, and when it sees an idle CPU,
it reports a quiescent state on that CPU's behalf.

But the grace-period kthread can only do this work if it gets a chance
to run.  And the message above says that this kthread hasn't had a chance
to run for a full 5,663 jiffies.  For completeness, the "g1566 c1565"
says that grace period #1566 is in progress, the "f0x0" says that no one
is needing another grace period #1567.  The "RCU_GP_WAIT_FQS(3)" says
that the grace-period kthread has fully initialized the current grace
period and is sleeping for a few jiffies waiting to scan for idle tasks.
Finally, the "->state=0x1" says that the grace-period kthread is in
TASK_INTERRUPTIBLE state, in other words, still sleeping.
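
(As a purely conceptual sketch -- this is not the actual RCU code, and the two
helpers named below are made up for illustration -- the scan described above
amounts to something like:

        static void scan_idle_cpus_for_quiescent_states(void)
        {
                int cpu;

                for_each_possible_cpu(cpu) {
                        /* per-CPU state updated on every idle entry/exit */
                        if (cpu_was_idle_per_snapshot(cpu))     /* hypothetical */
                                report_qs_on_behalf_of(cpu);    /* hypothetical */
                }
        }

and the point of the stall message is that the kthread which would run this
scan never got a chance to be scheduled.)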

So my first question is "What did commit 05a4a9527 (kernel/watchdog:
split up config options) do to prevent the grace-period kthread from
getting a chance to run?"  I must confess that I don't see anything
obvious in that commit, so my second question is "Are we sure that
reverting this commit makes the problem go away?" and my third is "Is
this an intermittent problem that led to a false bisection?"

[ . . . ]

> > Reducing the RCU CPU stall timeout makes it happen more often,
> > but we are seeing it even with the default value of 24 seconds.
> > 
> > Tends to occur after a period of relatively low usage, but has
> > also been seen midway through performance tests.
> > 
> > This was not seen with v4.12, so a bisection run later led to
> > commit 05a4a9527 (kernel/watchdog: split up config options).
> > 
> > Which was odd until we discovered that a side effect of this patch
> > was to change whether the softlockup detector was enabled or not in
> > the arm64 defconfig.
> > 
> > On 4.13-rc2 enabling the softlockup detector indeed stopped us
> > seeing the rcu issue. Disabling the equivalent on 4.12 made the
> > issue occur there as well.
> > 
> > Clearly the softlockup detector results in a thread on every cpu,
> > which might be related but beyond that we are still looking into
> > the issue.
> > 
> > So the obvious question is whether anyone else is seeing this as
> > it might help us to focus in on where to look!
> 
> Huh. Something similar has been seen very intermittently on powerpc
> as well. We couldn't reproduce it reliably to bisect it already, so
> this is a good help.
> 
> http://marc.info/?l=linuxppc-embedded&m=149872815523646&w=2
> 
> It looks like the watchdog patch has a similar effect on powerpc in
> that it stops enabling the softlockup detector by default. Haven't
> confirmed, but it looks like the same thing.
> 
> A bug in RCU stall detection?

Well, if I am expected to make grace periods complete when my grace-period
kthreads aren't getting any CPU time, I will have to make some substantial
changes.  ;-)

One possibility is that the timer isn't firing and another is that the
timer's wakeup is being lost somehow.

So another thing to try is to boot with rcutree.rcu_kick_kthreads=1.
This will cause RCU to do redundant wakeups on the grace-period kthread
if the grace period is moving slowly.  This is of course a crude hack,
which is why this boot parameter will also cause a splat if it ever has
to do anything.

Does this help at all?

Thanx, Paul

> > In the meantime we'll carry on digging.
> > 
> > Thanks,
> > 
> > Jonathan
> > 
> > p.s. As a more general question, do we want to have the
> > soft lockup detector enabled on arm64 by default?
> 
> I've cc'ed Don. My patch should not have changed defconfigs, I
> should have been more careful with that.
> 
> Thanks,
> Nick
> 



Re: [PATCH v3 2/5] powerpc/lib/sstep: Add popcnt instruction emulation

2017-07-25 Thread Balbir Singh
On Tue, Jul 25, 2017 at 8:24 PM, David Laight  wrote:
> From: Linuxppc-dev 
> [mailto:linuxppc-dev-bounces+david.laight=aculab@lists.ozlabs.org] On 
> Behalf Of
>> Matt Brown
>> Sent: 25 July 2017 04:33
>> To: linuxppc-dev@lists.ozlabs.org
>> Subject: [PATCH v3 2/5] powerpc/lib/sstep: Add popcnt instruction emulation
>>
>> This adds emulations for the popcntb, popcntw, and popcntd instructions.
>> Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.
>>
>> Signed-off-by: Matt Brown 
>> ---
>> v3:
>>   - optimised using the Giles-Miller method of side-ways addition
>> v2:
>>   - fixed opcodes
>>   - fixed typecasting
>>   - fixed bitshifting error for both 32 and 64bit arch
>> ---
>>  arch/powerpc/lib/sstep.c | 40 +++-
>>  1 file changed, 39 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
>> index 87d277f..c1f9cdb 100644
>> --- a/arch/powerpc/lib/sstep.c
>> +++ b/arch/powerpc/lib/sstep.c
>> @@ -612,6 +612,32 @@ static nokprobe_inline void do_cmpb(struct pt_regs 
>> *regs, unsigned long v1,
>>   regs->gpr[rd] = out_val;
>>  }
>>
>> +/*
>> + * The size parameter is used to adjust the equivalent popcnt instruction.
>> + * popcntb = 8, popcntw = 32, popcntd = 64
>> + */
>> +static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long 
>> v1,
>> + int size, int ra)
>> +{
>> + unsigned long long out = v1;
>> +
>> + out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));
>> + out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));
>> + out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));
>> + if (size == 8) {/* popcntb */
>> + regs->gpr[ra] = out;
>
> I'm pretty sure you need to mask the result with 7.
>
Absolutely! Good catch!

Balbir Singh.


Re: [PATCH 0/4] Allow non-legacy cards to be vgaarb default

2017-07-25 Thread Laszlo Ersek
On 07/24/17 01:15, Daniel Axtens wrote:
> Hi Ard,
> 
>> But the fact remains that we are going about this the wrong way.
>> Whether a graphics card decodes legacy VGA ranges or not has *nothing*
>> to do with whether or not it is in fact the primary device on a
>> non-x86 system, and so I still think the VGA arbiter should be omitted
>> entirely for such platforms, and Xorg should be fixed instead.
> 
> OK, I see where you're coming from. I've been trying to keep my changes
> small as I don't want to end up on the hook for the almost limitless
> range of problems that changing this sort of code can have, but I do
> take your point that it's a bit of an ugly hack of a solution.
> 
> Say we were to change Xorg instead. What would correct Xorg behaviour
> look like? Xorg would need to honour the boot_vga file if it existed so
> as not to break x86, etc. So your proposed Xorg - if it couldn't find a
> default card that way, and if there was no helpful config file info,
> would arbitrarily pick a card that has an Xorg driver? In other words,
> much like the proposed kernel approach but at a different level of the
> stack?
> 
> Are there other graphical applications we care about (other than Xorg)
> that would need to be patched? I'm happy to do the Xorg patch, but I
> don't know if anything other than Xorg keys off the boot_vga file.
> 
> I'm not fundamentally opposed to this approach if the Xorg community is
> happy with it, the kernel community is happy with it, and no-one expects
> me to provide patches to any other user-space applications that depend
> on boot_vga.

Ard both identified the Xorg commit that I would have, and CC'd Hans,
whom I would have recommended as well.

I assume the symptom is that now there's a class of platform GPU devices
that is neither PCI nor legacy VGA, so neither the kernel's boot_vga
logic nor Xorg's commit in question matches it.

I agree that it should be possible to add more logic to Xorg to detect
this kind of device as primary. However, I share Daniel's worry that it
wouldn't cover all user space apps -- I see "Wayland this, Wayland that"
on reddit every week.

Having practically zero background in gfx development (either kernel or
Xorg), I think the problem is that vga_default_device() /
vga_set_default_device(), which -- apparently -- "boot_vga" is based
upon, come from "drivers/gpu/vga/vgaarb.c". Namely, the concept of
"primary / boot display device" is tied to the VGA arbiter, plus only a
PCI device can currently be marked as primary/boot display device.

Can these concepts be split from each other? (I can fully imagine that
this would result in a userspace visible interface change (or addition),
so that e.g. "/sys/devices/**/boot_gpu" would have to be consulted by
display servers.)

(Sorry if I'm totally wrong.)

... Hm, reading the thread starter at
,
and the references within... It looks like this work is motivated by
hardware that is supposed to be PCI, but actually breaks the specs. Is
that correct? If so, then I don't think I can suggest anything useful.
Specs exist so that hardware vendors and software authors follow them.
If hardware does not conform, then software should either refuse to work
with it, or handle it with quirks, on a case-by-case basis. I guess this
means that I don't agree with the

  broad[] suggest[ion] that a more generic solution would be better

which seems to disqualify me from the discussion, as it must have been
suggested by people with incomparably more experience than what I have :)

Thanks
Laszlo
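
For illustration, a minimal user-space sketch of what "keying off the boot_vga
file" amounts to: scan the PCI devices in sysfs and pick the one whose boot_vga
attribute reads 1. This is not Xorg's actual code, just the idea.

#include <glob.h>
#include <stdio.h>
#include <string.h>

/* Print the sysfs path of the device the kernel marked as boot VGA, if any. */
int main(void)
{
	glob_t g;
	size_t i;

	if (glob("/sys/bus/pci/devices/*/boot_vga", 0, NULL, &g) != 0)
		return 1;

	for (i = 0; i < g.gl_pathc; i++) {
		FILE *f = fopen(g.gl_pathv[i], "r");
		int val = 0;

		if (!f)
			continue;
		if (fscanf(f, "%d", &val) == 1 && val == 1)
			printf("primary GPU: %.*s\n",
			       (int)(strrchr(g.gl_pathv[i], '/') - g.gl_pathv[i]),
			       g.gl_pathv[i]);
		fclose(f);
	}

	globfree(&g);
	return 0;
}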


Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-25 Thread Nicholas Piggin
On Tue, 25 Jul 2017 19:32:10 +0800
Jonathan Cameron  wrote:

> Hi All,
> 
> We observed a regression on our d05 boards (but curiously not
> the fairly similar but single socket / smaller core count
> d03), initially seen with linux-next prior to the merge window
> and still present in v4.13-rc2.
> 
> The symptom is:
> 
> [ 1982.959365] INFO: rcu_preempt detected stalls on CPUs/tasks:
> [ 1982.965021]2-...: (10 GPs behind) idle=1d4/0/0 softirq=306/306 
> fqs=0 
> [ 1982.971624]3-...: (2 GPs behind) idle=700/0/0 softirq=307/307 
> fqs=0 
> [ 1982.978139]4-...: (20 GPs behind) idle=9f4/0/0 softirq=651/652 
> fqs=0 
> [ 1982.984740]5-...: (18 GPs behind) idle=a78/0/0 softirq=369/371 
> fqs=0 
> [ 1982.991342]6-...: (26 GPs behind) idle=e5c/0/0 softirq=217/219 
> fqs=0 
> [ 1982.997944]7-...: (1438 GPs behind) idle=eb4/0/0 softirq=260/260 
> fqs=0 
> [ 1983.004719]8-...: (18 GPs behind) idle=830/0/0 softirq=1609/1609 
> fqs=0 
> [ 1983.011494]9-...: (18 GPs behind) idle=e9c/0/0 softirq=242/242 
> fqs=0 
> [ 1983.018095]10-...: (1434 GPs behind) idle=ca0/0/0 softirq=210/212 
> fqs=0 
> [ 1983.024957]11-...: (1106 GPs behind) idle=ee0/0/0 softirq=188/191 
> fqs=0 
> [ 1983.031819]12-...: (1636 GPs behind) idle=c58/0/0 softirq=215/216 
> fqs=0 
> [ 1983.038680]13-...: (1114 GPs behind) idle=c20/0/0 softirq=170/170 
> fqs=0 
> [ 1983.045542]14-...: (1106 GPs behind) idle=d90/0/0 softirq=176/178 
> fqs=0 
> [ 1983.052403]15-...: (1858 GPs behind) idle=900/0/0 softirq=184/185 
> fqs=0 
> [ 1983.059266]16-...: (1621 GPs behind) idle=f04/0/0 softirq=204/206 
> fqs=0 
> [ 1983.066127]17-...: (1433 GPs behind) idle=d30/0/0 softirq=202/202 
> fqs=0 
> [ 1983.072988]18-...: (18 GPs behind) idle=2d4/0/0 softirq=218/220 
> fqs=0 
> [ 1983.079676]19-...: (19 GPs behind) idle=bbc/0/0 softirq=178/180 
> fqs=0 
> [ 1983.086364]20-...: (0 ticks this GP) idle=ee0/0/0 softirq=231/231 
> fqs=0 
> [ 1983.093226]21-...: (4 GPs behind) idle=140/0/0 softirq=208/208 
> fqs=0 
> [ 1983.099827]22-...: (5 GPs behind) idle=100/0/0 softirq=186/188 
> fqs=0 
> [ 1983.106428]23-...: (1635 GPs behind) idle=fd4/0/0 
> softirq=1220/1221 fqs=0 
> [ 1983.113463]24-...: (1112 GPs behind) idle=ca8/0/0 softirq=231/233 
> fqs=0 
> [ 1983.120325]25-...: (1637 GPs behind) idle=9c4/0/0 softirq=164/166 
> fqs=0 
> [ 1983.127187]27-...: (0 ticks this GP) idle=b08/0/0 softirq=182/182 
> fqs=0 
> [ 1983.134048]28-...: (1110 GPs behind) idle=d28/0/0 softirq=179/181 
> fqs=0 
> [ 1983.140909]29-...: (8 GPs behind) idle=1dc/0/0 softirq=196/198 
> fqs=0 
> [ 1983.147511]31-...: (1434 GPs behind) idle=74c/0/0 softirq=160/161 
> fqs=0 
> [ 1983.154373]32-...: (1432 GPs behind) idle=7d4/0/0 softirq=164/164 
> fqs=0 
> [ 1983.161234]33-...: (1632 GPs behind) idle=4dc/0/0 softirq=130/132 
> fqs=0 
> [ 1983.168096]34-...: (57 GPs behind) idle=3b0/0/0 softirq=411/411 
> fqs=0 
> [ 1983.174784]35-...: (1599 GPs behind) idle=8a0/0/0 softirq=177/179 
> fqs=0 
> [ 1983.181646]36-...: (1603 GPs behind) idle=520/0/0 softirq=132/134 
> fqs=0 
> [ 1983.188507]37-...: (8 GPs behind) idle=02c/0/0 softirq=176/178 
> fqs=0 
> [ 1983.195108]38-...: (1442 GPs behind) idle=d8c/0/0 
> softirq=3189/3190 fqs=0 
> [ 1983.202144]39-...: (1431 GPs behind) idle=444/0/0 softirq=117/117 
> fqs=0 
> [ 1983.209005]40-...: (4 GPs behind) idle=688/0/0 softirq=134/136 
> fqs=0 
> [ 1983.215606]41-...: (1599 GPs behind) idle=554/0/0 
> softirq=2707/2711 fqs=0 
> [ 1983.222642]42-...: (1430 GPs behind) idle=15c/0/0 softirq=110/111 
> fqs=0 
> [ 1983.229503]43-...: (4 GPs behind) idle=054/0/0 softirq=101/103 
> fqs=0 
> [ 1983.236104]46-...: (1117 GPs behind) idle=558/0/0 softirq=251/253 
> fqs=0 
> [ 1983.242966]47-...: (1118 GPs behind) idle=5f0/0/0 softirq=110/112 
> fqs=0 
> [ 1983.249827]48-...: (1621 GPs behind) idle=ef4/0/0 softirq=241/242 
> fqs=0 
> [ 1983.256689]49-...: (1648 GPs behind) idle=92c/0/0 softirq=207/208 
> fqs=0 
> [ 1983.263550]52-...: (1439 GPs behind) idle=e40/0/0 softirq=261/263 
> fqs=0 
> [ 1983.270412]54-...: (1434 GPs behind) idle=650/0/0 softirq=258/260 
> fqs=0 
> [ 1983.277273]55-...: (1646 GPs behind) idle=5e0/0/0 softirq=178/178 
> fqs=0 
> [ 1983.284135]56-...: (1646 GPs behind) idle=800/0/0 softirq=249/249 
> fqs=0 
> [ 1983.290996]57-...: (1599 GPs behind) idle=c48/0/0 softirq=222/224 
> fqs=0 
> [ 1983.297858]58-...: (1648 GPs behind) idle=3e8/0/0 softirq=235/235 
> fqs=0 
> [ 1983.304719]59-...: (1434 GPs behind) idle=720/0/0 softirq=201/203 
> fqs=0 
> [ 1983.311581]60-...: (1647 GPs behind) idle=c80/0/0 softirq=250/250 
> 

Re: [PATCH] powerpc/perf: Update default sdar_mode value for power9

2017-07-25 Thread Anton Blanchard
On Tue, 25 Jul 2017 11:05:51 +0530
Madhavan Srinivasan  wrote:

> Commit 20dd4c624d251 ('powerpc/perf: Fix SDAR_MODE value for continous
> sampling on Power9') set the default sdar_mode value in
> MMCRA[SDAR_MODE] to be used as 0b01 (Update on TLB miss). And this
> value is set if sdar_mode from event is zero, or we are in continous
> sampling mode in power9 dd1.
> 
> But it is preferred to have the sdar_mode value for power9 as
> 0b10 (Update on dcache miss) for better sampling updates instead
> of 0b01 (Update on TLB miss).

Acked-by: Anton Blanchard 

Using a bandwidth test case with a 1MB footprint, I profiled cycles and
chose TLB updates of the SDAR:

# perf record -d -e r0004001E:u ./bw2001 1M
  ^
  SDAR TLB

# perf report -D | grep PERF_RECORD_SAMPLE | sed 's/.*addr: //' | sort -u | wc -l
4

I get 4 unique addresses. If I ran with dcache misses:

# perf record -d -e r0008001E:u ./bw2001 1M
  ^
  SDAR dcache miss

# perf report -D|grep PERF_RECORD_SAMPLE| sed 's/.*addr: //'|sort -u | wc -l
5217

I get 5217 unique addresses. No surprises here, but it does show why
TLB misses is the wrong event to default to - we get very little useful
information out of it.

Anton

> Signed-off-by: Madhavan Srinivasan 
> ---
>  arch/powerpc/perf/isa207-common.c | 2 +-
>  arch/powerpc/perf/isa207-common.h | 1 +
>  2 files changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
> index 3f3aa9a7063a..582ed2c9bc56 100644
> --- a/arch/powerpc/perf/isa207-common.c
> +++ b/arch/powerpc/perf/isa207-common.c
> @@ -99,7 +99,7 @@ static void mmcra_sdar_mode(u64 event, unsigned long *mmcra)
>  	else if (!cpu_has_feature(CPU_FTR_POWER9_DD1) && p9_SDAR_MODE(event))
>  		*mmcra |=  p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT;
>  	else
> -		*mmcra |= MMCRA_SDAR_MODE_TLB;
> +		*mmcra |= MMCRA_SDAR_MODE_DCACHE;
>  	} else
>  		*mmcra |= MMCRA_SDAR_MODE_TLB;
>  }
> diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
> index 8acbe6e802c7..7a0228bf283c 100644
> --- a/arch/powerpc/perf/isa207-common.h
> +++ b/arch/powerpc/perf/isa207-common.h
> @@ -247,6 +247,7 @@
>  #define MMCRA_SDAR_MODE_SHIFT		42
>  #define MMCRA_SDAR_MODE_TLB		(1ull << MMCRA_SDAR_MODE_SHIFT)
>  #define MMCRA_SDAR_MODE_NO_UPDATES	~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
> +#define MMCRA_SDAR_MODE_DCACHE		(2ull << MMCRA_SDAR_MODE_SHIFT)
>  #define MMCRA_IFM_SHIFT			30
>  #define MMCRA_THR_CTR_MANT_SHIFT	19
>  #define MMCRA_THR_CTR_MANT_MASK		0x7Ful



Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's

2017-07-25 Thread Michael Ellerman
Nicholas Piggin  writes:

> On Mon, 24 Jul 2017 23:46:44 +1000
> Michael Ellerman  wrote:
>
>> Nicholas Piggin  writes:
>> 
>> > On Mon, 24 Jul 2017 14:28:02 +1000
>> > Benjamin Herrenschmidt  wrote:
>> >  
>> >> Instead of comparing the whole CPU mask every time, let's
>> >> keep a counter of how many bits are set in the mask. Thus
>> >> testing for a local mm only requires testing if that counter
>> >> is 1 and the current CPU bit is set in the mask.  
>> ...
>> >
>> > Also does it make sense to define it based on NR_CPUS > BITS_PER_LONG?
>> > If it's <= then it should be similar load and compare, no?  
>> 
>> Do we make a machine with that few CPUs? ;)
>> 
>> I don't think it's worth special casing, all the distros run with much
>> much larger NR_CPUs than that.
>
> Not further special-casing, but just casing it based on NR_CPUS
> rather than BOOK3S.

The problem is the mm_context_t is defined based on BookE vs BookS etc.
not based on NR_CPUS.

So we'd have to add the atomic_t to all mm_context_t's, but #ifdef'ed
based on NR_CPUS.

But then some platforms don't support SMP, so it's a waste there. The
existing cpumask check compiles to ~= nothing on UP.

cheers
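
For reference, a sketch of the check the quoted description implies, in
kernel-style C. The atomic active_cpus counter in mm_context_t is an assumption
taken from the patch description; the real patch may differ in naming and
placement.

#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/mm_types.h>
#include <linux/smp.h>

/* Sketch only: true if this mm has only ever been active on this CPU. */
static inline bool mm_is_thread_local(struct mm_struct *mm)
{
	/* The counter tracks how many CPUs have set their bit in mm_cpumask(). */
	if (atomic_read(&mm->context.active_cpus) > 1)
		return false;

	return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
}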


[PATCH v6] powerpc/mm/radix: Workaround prefetch issue with KVM

2017-07-25 Thread Michael Ellerman
From: Benjamin Herrenschmidt 

There's a somewhat architectural issue with Radix MMU and KVM.

When coming out of a guest with AIL (Alternate Interrupt Location, ie,
MMU enabled), we start executing hypervisor code with the PID register
still containing whatever the guest has been using.

The problem is that the CPU can (and will) then start prefetching or
speculatively load from whatever host context has that same PID (if
any), thus bringing translations for that context into the TLB, which
Linux doesn't know about.

This can cause stale translations and subsequent crashes.

Fixing this in a way that is neither racy nor a huge performance
impact is difficult. We could just make the host invalidations always
use broadcast forms but that would hurt single threaded programs for
example.

We chose to fix it instead by partitioning the PID space between guest
and host. This is possible because today Linux only uses 19 out of the
20 bits of PID space, so existing guests will work if we make the host
use the top half of the 20-bit space.

We additionally add support for a property to indicate to Linux the
size of the PID register which will be useful if we eventually have
processors with a larger PID space available.

There is still an issue with malicious guests purposefully setting the
PID register to a value in the host's PID range. Hopefully future HW
can prevent that, but in the meantime, we handle it with a pair of
kludges:

 - On the way out of a guest, before we clear the current VCPU in the
   PACA, we check the PID and if it's outside of the permitted range
   we flush the TLB for that PID.

 - When context switching, if the mm is "new" on that CPU (the
   corresponding bit was set for the first time in the mm cpumask), we
   check if any sibling thread is in KVM (has a non-NULL VCPU pointer
   in the PACA). If that is the case, we also flush the PID for that
   CPU (core).

This second part is needed to handle the case where a process is
migrated (or starts a new pthread) on a sibling thread of the CPU
coming out of KVM, as there's a window where stale translations can
exist before we detect it and flush them out.

A future optimization could be added by keeping track of whether the
PID has ever been used and avoid doing that for completely fresh PIDs.
We could similarly mark PIDs that have been the subject of a global
invalidation as "fresh". But for now this will do.

Signed-off-by: Benjamin Herrenschmidt 
[mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop
  unneeded include of kvm_book3s_asm.h]
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 15 
 arch/powerpc/include/asm/mmu_context.h   | 18 --
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 59 +++-
 arch/powerpc/mm/mmu_context_book3s64.c   |  5 +--
 arch/powerpc/mm/pgtable-radix.c  | 34 +-
 arch/powerpc/mm/tlb-radix.c  | 45 ++--
 6 files changed, 154 insertions(+), 22 deletions(-)

v6: Fix CONFIG_PPC_RADIX_MMU=n build.

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 77529a3e3811..5b4023c616f7 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -59,13 +59,14 @@ extern struct patb_entry *partition_tb;
 #define PRTS_MASK  0x1f/* process table size field */
 #define PRTB_MASK  0x0000UL
 
-/*
- * Limit process table to PAGE_SIZE table. This
- * also limit the max pid we can support.
- * MAX_USER_CONTEXT * 16 bytes of space.
- */
-#define PRTB_SIZE_SHIFT	(CONTEXT_BITS + 4)
-#define PRTB_ENTRIES	(1ul << CONTEXT_BITS)
+/* Number of supported PID bits */
+extern unsigned int mmu_pid_bits;
+
+/* Base PID to allocate from */
+extern unsigned int mmu_base_pid;
+
+#define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
+#define PRTB_ENTRIES	(1ul << mmu_pid_bits)
 
 /*
  * Power9 currently only support 64K partition table size.
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index da7e9432fa8f..0c76675394c5 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -45,7 +45,7 @@ extern void set_context(unsigned long id, pgd_t *pgd);
 
 #ifdef CONFIG_PPC_BOOK3S_64
 extern void radix__switch_mmu_context(struct mm_struct *prev,
-struct mm_struct *next);
+ struct mm_struct *next);
 static inline void switch_mmu_context(struct mm_struct *prev,
  struct mm_struct *next,
  struct task_struct *tsk)
@@ -67,6 +67,12 @@ extern void __destroy_context(unsigned long context_id);
 extern void mmu_context_init(void);
 #endif
 
+#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && 
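
To make the PID-space split described above concrete, here is a small
stand-alone sketch. The 20 PID bits and the halfway split are assumptions taken
from the commit text; the kernel derives the actual values from firmware and
the KVM configuration.

#include <stdio.h>

int main(void)
{
	unsigned int mmu_pid_bits = 20;				/* assumed, per the text */
	unsigned int mmu_base_pid = 1u << (mmu_pid_bits - 1);	/* host takes the top half */
	unsigned long prtb_entries = 1ul << mmu_pid_bits;

	/* Guests keep using PIDs below the split; the host allocates above it. */
	printf("process table entries: %lu\n", prtb_entries);
	printf("guest PIDs: 1..%u\n", mmu_base_pid - 1);
	printf("host  PIDs: %u..%lu\n", mmu_base_pid, prtb_entries - 1);
	return 0;
}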

[PATCH v2 2/2] 44x/fsp2: enable eMMC arasan for fsp2 platform

2017-07-25 Thread Ivan Mikhaylov
Add mmc0 node changes to enable the Arasan eMMC controller and update
the defconfig accordingly.

Signed-off-by: Ivan Mikhaylov 
---
 arch/powerpc/boot/dts/fsp2.dts  |   33 +-
 arch/powerpc/configs/44x/fsp2_defconfig |2 +
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/boot/dts/fsp2.dts b/arch/powerpc/boot/dts/fsp2.dts
index 475953a..6a63026 100644
--- a/arch/powerpc/boot/dts/fsp2.dts
+++ b/arch/powerpc/boot/dts/fsp2.dts
@@ -52,6 +52,7 @@
clocks {
mmc_clk: mmc_clk {
compatible = "fixed-clock";
+   #clock-cells = <0>;
clock-frequency = <5000>;
clock-output-names = "mmc_clk";
};
@@ -359,20 +360,6 @@
interrupts = <31 0x4 15 0x84>;
};
 
-   mmc0: sdhci@020c {
-   compatible  = "st,sdhci-stih407", "st,sdhci";
-   status  = "disabled";
-   reg = <0x020c 0x2>;
-   reg-names   = "mmc";
-   interrupt-parent = <_3>;
-   interrupts  = <21 0x4 22 0x4>;
-   interrupt-names = "mmcirq";
-   pinctrl-names   = "default";
-   pinctrl-0   = <>;
-   clock-names = "mmc";
-   clocks  = <&mmc_clk>;
-   };
-
plb6 {
compatible = "ibm,plb6";
#address-cells = <2>;
@@ -501,6 +488,24 @@
 /*RXDE*/  4 _2 13 0x4>;
};
 
+   mmc0: mmc@20c {
+   compatible  = "st,sdhci-stih407", "st,sdhci";
+   reg = <0x020c 0x2>;
+   reg-names   = "mmc";
+   interrupts  = <21 0x4>;
+   interrupt-parent = <_3>;
+   interrupt-names = "mmcirq";
+   pinctrl-names   = "default";
+   pinctrl-0   = <>;
+   clock-names = "mmc";
+   clocks  = <&mmc_clk>;
+   bus-width   = <4>;
+   non-removable;
+   sd-uhs-sdr50;
+   sd-uhs-sdr104;
+   sd-uhs-ddr50;
+   };
+
opb {
compatible = "ibm,opb";
#address-cells = <1>;
diff --git a/arch/powerpc/configs/44x/fsp2_defconfig 
b/arch/powerpc/configs/44x/fsp2_defconfig
index e8e6a69..935aabe 100644
--- a/arch/powerpc/configs/44x/fsp2_defconfig
+++ b/arch/powerpc/configs/44x/fsp2_defconfig
@@ -92,8 +92,10 @@ CONFIG_MMC_DEBUG=y
 CONFIG_MMC_SDHCI=y
 CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_SDHCI_OF_ARASAN=y
+CONFIG_MMC_SDHCI_ST=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_M41T80=y
+CONFIG_RESET_CONTROLLER=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
-- 
1.7.1



Re: [PATCH 23/24] powerpc/mm: Cleanup check for stack expansion

2017-07-25 Thread Michael Ellerman
LEROY Christophe  writes:

> Michael Ellerman  a écrit :
>
>> LEROY Christophe  writes:
>>
>>> Benjamin Herrenschmidt  a écrit :
>>>
 When hitting below a VM_GROWSDOWN vma (typically growing the stack),
 we check whether it's a valid stack-growing instruction and we
 check the distance to GPR1. This is largely open coded with lots
 of comments, so move it out to a helper.
>>>
>>> Did you have a look at the following patch ? It's been waiting for
>>> application for some weeks now.
>>> https://patchwork.ozlabs.org/patch/771869
>>
>> I actually merged it last merge window, but found I had no good way to
>> test it, so I took it out again until I can write a test case for it.
>>
>> The way I realised it wasn't being tested was by removing all the
>> store_updates_sp logic entirely and having my system run happily for
>> several days :}
>
> Which demonstrates how unlikely this is, hence doing that get_user()
> at every fault is a waste of time.

Yes I agree.

> How do you plan to handle that in parallel with Ben's series?

Not sure :)

> I'll be back from vacation next week and may help find a way to
> test that. (A test program using alloca()?)

I was thinking hand-crafted asm, but that might be a pain to get working
for 32 & 64-bit, in which case alloca() might work.

cheers
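
Something along these lines is presumably what the alloca() idea would look
like; it is only a rough user-space sketch, and whether it reliably hits the
store-update-of-r1 path depends on how the compiler expands the large alloca.

#include <alloca.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/*
	 * Ask for a large chunk of stack; on ppc64 the compiler typically
	 * grows the stack with a store-with-update of r1, which is the kind
	 * of instruction the stack-expansion check is looking for.
	 */
	size_t sz = 1024 * 1024;
	char *p = alloca(sz);

	memset(p, 0, sz);	/* touch it so the pages are actually faulted in */
	printf("stack grew by %zu bytes at %p\n", sz, (void *)p);
	return 0;
}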


Re: [PATCH 5/6] powerpc/mm: Optimize detection of thread local mm's

2017-07-25 Thread Nicholas Piggin
On Tue, 25 Jul 2017 11:03:45 +1000
Benjamin Herrenschmidt  wrote:

> On Tue, 2017-07-25 at 10:44 +1000, Nicholas Piggin wrote:
> > The two variants are just cleaner versions of the two variants you
> > already introduced.
> > 
> > static inline bool mm_activate_cpu(struct mm_struct *mm)
> > {
> > if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) {
> > cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
> > #if CONFIG_PPC_BOOK3S_64
> > atomic_inc(&next->context.active_cpus);
> > #endif
> > smp_mb();
> > return true;
> > }
> > return false;
> > }  
> 
> Well the above is what I originally wrote, which Michael encouraged me
> to turn into a helper ;-) I was removing ifdef's from switch_mm in
> this series...

Well I won't harp on about it if you guys prefer the increment helper.
Just the comment would be good. The rest of the series seems okay to
me.

Thanks,
Nick


Re: [PATCH 2/2] powernv/powerpc: Clear PECE1 in LPCR via stop-api only on Hotplug

2017-07-25 Thread Michael Ellerman
Nicholas Piggin  writes:
...
> I wonder if we should think about a more structured kernel API for
> modifying these kind of system registers so we always have the
> up-to-date values stored in memory. Many of them do need to be
> restored after sleep, but they don't need to be saved per-thread
> or saved every time we go to sleep.

Yes that's on my mental TODO. Paul and I have talked about it in the
past for KVM.

I'll write it down sometime so we at least remember :D

cheers


Re: [RFC Part1 PATCH v3 02/17] x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature

2017-07-25 Thread Borislav Petkov
On Mon, Jul 24, 2017 at 02:07:42PM -0500, Brijesh Singh wrote:
> From: Tom Lendacky 
> 
> Update the CPU features to include identifying and reporting on the
> Secure Encrypted Virtualization (SEV) feature.  SME is identified by
> CPUID 0x8000001f, but requires BIOS support to enable it (set bit 23 of
> MSR_K8_SYSCFG and set bit 0 of MSR_K7_HWCR).  Only show the SEV feature
> as available if reported by CPUID and enabled by BIOS.
> 
> Signed-off-by: Tom Lendacky 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/include/asm/cpufeatures.h |  1 +
>  arch/x86/include/asm/msr-index.h   |  2 ++
>  arch/x86/kernel/cpu/amd.c  | 30 +-
>  arch/x86/kernel/cpu/scattered.c|  1 +
>  4 files changed, 29 insertions(+), 5 deletions(-)

...

> @@ -637,6 +642,21 @@ static void early_init_amd(struct cpuinfo_x86 *c)
>   clear_cpu_cap(c, X86_FEATURE_SME);
>   }
>   }
> +
> + if (cpu_has(c, X86_FEATURE_SEV)) {
> + if (IS_ENABLED(CONFIG_X86_32)) {
> + clear_cpu_cap(c, X86_FEATURE_SEV);
> + } else {
> + u64 syscfg, hwcr;
> +
> + /* Check if SEV is enabled */
> + rdmsrl(MSR_K8_SYSCFG, syscfg);
> + rdmsrl(MSR_K7_HWCR, hwcr);
> + if (!(syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT) ||
> + !(hwcr & MSR_K7_HWCR_SMMLOCK))
> + clear_cpu_cap(c, X86_FEATURE_SEV);
> + }
> + }

Let's simplify this and read the MSRs only once. Diff ontop. Please
check if I'm missing a case:

---
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c413f04bdd41..79af07731ab1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -546,6 +546,48 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
 }
 
+static void early_detect_mem_enc(struct cpuinfo_x86 *c)
+{
+   u64 syscfg, hwcr;
+
+   /*
+* BIOS support is required for SME and SEV.
+*   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+*the SME physical address space reduction value.
+*If BIOS has not enabled SME then don't advertise the
+*SME feature (set in scattered.c).
+*   For SEV: If BIOS has not enabled SEV then don't advertise the
+*SEV feature (set in scattered.c).
+*
+*   In all cases, since support for SME and SEV requires long mode,
+*   don't advertise the feature under CONFIG_X86_32.
+*/
+   if (cpu_has(c, X86_FEATURE_SME) ||
+   cpu_has(c, X86_FEATURE_SEV)) {
+
+   if (IS_ENABLED(CONFIG_X86_32))
+   goto clear;
+
+   /* Check if SME is enabled */
+   rdmsrl(MSR_K8_SYSCFG, syscfg);
+   if (!(syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT))
+   goto clear;
+
+   c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+   /* Check if SEV is enabled */
+   rdmsrl(MSR_K7_HWCR, hwcr);
+   if (!(hwcr & MSR_K7_HWCR_SMMLOCK))
+   goto clear_sev;
+
+   return;
+clear:
+   clear_cpu_cap(c, X86_FEATURE_SME);
+clear_sev:
+   clear_cpu_cap(c, X86_FEATURE_SEV);
+   }
+}
+
 static void early_init_amd(struct cpuinfo_x86 *c)
 {
u32 dummy;
@@ -617,46 +659,8 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_E400);
 
-   /*
-* BIOS support is required for SME and SEV.
-*   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
-*the SME physical address space reduction value.
-*If BIOS has not enabled SME then don't advertise the
-*SME feature (set in scattered.c).
-*   For SEV: If BIOS has not enabled SEV then don't advertise the
-*SEV feature (set in scattered.c).
-*
-*   In all cases, since support for SME and SEV requires long mode,
-*   don't advertise the feature under CONFIG_X86_32.
-*/
-   if (cpu_has(c, X86_FEATURE_SME)) {
-   u64 msr;
-
-   /* Check if SME is enabled */
-   rdmsrl(MSR_K8_SYSCFG, msr);
-   if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) {
-   c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
-   if (IS_ENABLED(CONFIG_X86_32))
-   clear_cpu_cap(c, X86_FEATURE_SME);
-   } else {
-   clear_cpu_cap(c, X86_FEATURE_SME);
-   }
-   }
+   early_detect_mem_enc(c);
 
-   if (cpu_has(c, X86_FEATURE_SEV)) {
-   if 

RE: [PATCH v3 2/5] powerpc/lib/sstep: Add popcnt instruction emulation

2017-07-25 Thread David Laight
From: Linuxppc-dev [mailto:linuxppc-dev-bounces+david.laight=aculab@lists.ozlabs.org] On Behalf Of
> Matt Brown
> Sent: 25 July 2017 04:33
> To: linuxppc-dev@lists.ozlabs.org
> Subject: [PATCH v3 2/5] powerpc/lib/sstep: Add popcnt instruction emulation
> 
> This adds emulations for the popcntb, popcntw, and popcntd instructions.
> Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.
> 
> Signed-off-by: Matt Brown 
> ---
> v3:
>   - optimised using the Giles-Miller method of side-ways addition
> v2:
>   - fixed opcodes
>   - fixed typecasting
>   - fixed bitshifting error for both 32 and 64bit arch
> ---
>  arch/powerpc/lib/sstep.c | 40 +++-
>  1 file changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 87d277f..c1f9cdb 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -612,6 +612,32 @@ static nokprobe_inline void do_cmpb(struct pt_regs 
> *regs, unsigned long v1,
>   regs->gpr[rd] = out_val;
>  }
> 
> +/*
> + * The size parameter is used to adjust the equivalent popcnt instruction.
> + * popcntb = 8, popcntw = 32, popcntd = 64
> + */
> +static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long v1,
> + int size, int ra)
> +{
> + unsigned long long out = v1;
> +
> + out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));
> + out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));
> + out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));
> + if (size == 8) {/* popcntb */
> + regs->gpr[ra] = out;

I'm pretty sure you need to mask the result with 7.

David



Re: [PATCH v3] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-25 Thread Benjamin Herrenschmidt
On Tue, 2017-07-25 at 12:26 +0530, Santosh Sivaraj wrote:

> +static notrace void kernel_get_tspec(struct timespec *tp,
> +  struct vdso_data *vdata, u32 *wtom_sec,
> +  u32 *wtom_nsec)
> +{
> + u64 tb;
> + u32 update_count;

This is broken:

> + do {
> + /* check for update count & load values */
> + update_count = vdata->tb_update_count;
> +
> + /* Get TB, offset it and scale result */
> + tb = mulhdu((get_tb() - vdata->tb_orig_stamp) << 12,
> + vdata->tb_to_xs) + vdata->stamp_sec_fraction;
> + tp->tv_sec = vdata->stamp_xtime.tv_sec;
> + if (wtom_sec)
> + *wtom_sec = vdata->wtom_clock_sec;
> + if (wtom_nsec)
> + *wtom_nsec = vdata->wtom_clock_nsec;
> + } while (update_count != vdata->tb_update_count);

The assembly code is carefully crafted to create a chain of data
dependencies in order to enforce the ordering in this function,
you completely broke it.

IE. the pointer used to access tb_orig_stamp etc... depends on the
initial update count, and the final read of the update count depends
on all the previously read values (or should), thus ordering those
loads. Without that you'll need more expensive lwsyncs.

Additionally, you broke another semantic of the seqlock which is
to spin on the first update count if it has an odd value.

The same problem exist in all your other implementations.

I am really NOT a fan of that attempt at converting to C. The code is
hand crafted assembly for a number of reasons, including the above ones
and maximum performance.

As it is, it's deeply broken.
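
To show the read-side pattern being described (spin while the count is odd,
re-check it after the data loads), here is a generic C11 sketch. It is not the
vDSO code: it uses explicit fences, whereas the hand-written ppc64 asm gets the
same ordering more cheaply from the data dependency described above.

#include <stdatomic.h>
#include <stdint.h>

/* Simplified stand-in for the vDSO data page, not the real layout. */
struct snapshot {
	atomic_uint seq;	/* even = stable, odd = writer mid-update */
	uint64_t sec;
	uint64_t nsec;
};

static void read_snapshot(struct snapshot *s, uint64_t *sec, uint64_t *nsec)
{
	unsigned int start;

	for (;;) {
		start = atomic_load_explicit(&s->seq, memory_order_acquire);
		if (start & 1)
			continue;	/* writer in progress: spin */

		/* Plain loads; a real implementation must make these tear-free. */
		*sec = s->sec;
		*nsec = s->nsec;

		/* Keep the data loads ordered before the re-read of seq. */
		atomic_thread_fence(memory_order_acquire);
		if (atomic_load_explicit(&s->seq, memory_order_relaxed) == start)
			return;		/* nothing changed while we were reading */
	}
}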

> +
> + tp->tv_nsec = ((u64)mulhwu(tb, NSEC_PER_SEC) << 32) >> 32;
> + tp->tv_sec += (tb >> 32);
> +}
> +
> +static notrace int clock_get_realtime(struct timespec *tp,
> +   struct vdso_data *vdata)
> +{
> + kernel_get_tspec(tp, vdata, NULL, NULL);
> +
> + return 0;
> +}
> +
> +static notrace int clock_get_monotonic(struct timespec *tp,
> +struct vdso_data *vdata)
> +{
> + __s32 wtom_sec, wtom_nsec;
> + u64 nsec;
> +
> + kernel_get_tspec(tp, vdata, &wtom_sec, &wtom_nsec);
> +
> + tp->tv_sec += wtom_sec;
> +
> + nsec = tp->tv_nsec;
> + tp->tv_nsec = 0;
> + timespec_add_ns(tp, nsec + wtom_nsec);
> +
> + return 0;
> +}
> +
> +static notrace int clock_realtime_coarse(struct timespec *tp,
> +  struct vdso_data *vdata)
> +{
> + u32 update_count;
> +
> + do {
> + /* check for update count & load values */
> + update_count = vdata->tb_update_count;
> +
> + tp->tv_sec = vdata->stamp_xtime.tv_sec;
> + tp->tv_nsec = vdata->stamp_xtime.tv_nsec;
> + } while (update_count != vdata->tb_update_count);
> +
> + return 0;
> +}
> +
> +static notrace int clock_monotonic_coarse(struct timespec *tp,
> +   struct vdso_data *vdata)
> +{
> + __s32 wtom_sec, wtom_nsec;
> + u64 nsec;
> + u32 update_count;
> +
> + do {
> + /* check for update count & load values */
> + update_count = vdata->tb_update_count;
> +
> + tp->tv_sec = vdata->stamp_xtime.tv_sec;
> + tp->tv_nsec = vdata->stamp_xtime.tv_nsec;
> + wtom_sec = vdata->wtom_clock_sec;
> + wtom_nsec = vdata->wtom_clock_nsec;
> + } while (update_count != vdata->tb_update_count);
> +
> + tp->tv_sec += wtom_sec;
> + nsec = tp->tv_nsec;
> + tp->tv_nsec = 0;
> + timespec_add_ns(tp, nsec + wtom_nsec);
> +
> + return 0;
> +}
> +
> +notrace int kernel_clock_gettime(clockid_t clk_id, struct timespec *tp)
> +{
> + int ret;
> + struct vdso_data *vdata = __get_datapage();
> +
> + if (!tp || !vdata)
> + return -EBADR;
> +
> + switch (clk_id) {
> + case CLOCK_REALTIME:
> + ret = clock_get_realtime(tp, vdata);
> + break;
> + case CLOCK_MONOTONIC:
> + ret = clock_get_monotonic(tp, vdata);
> + break;
> + case CLOCK_REALTIME_COARSE:
> + ret = clock_realtime_coarse(tp, vdata);
> + break;
> + case CLOCK_MONOTONIC_COARSE:
> + ret = clock_monotonic_coarse(tp, vdata);
> + break;
> + default:
> + /* fallback to syscall */
> + ret = -1;
> + break;
> + }
> +
> + return ret;
> +}
> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
> b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index 3820213..c3f6b24 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -16,6 +16,8 @@
>  #include 
>  #include 
>  
> +.global  kernel_clock_gettime
> +
>   .text
>  /*
>   * Exact prototype of gettimeofday
> @@ -60,71 

RE: [RFC Part1 PATCH v3 13/17] x86/io: Unroll string I/O when SEV is active

2017-07-25 Thread David Laight
From: Brijesh Singh
> Sent: 24 July 2017 20:08
> From: Tom Lendacky 
> 
> Secure Encrypted Virtualization (SEV) does not support string I/O, so
> unroll the string I/O operation into a loop operating on one element at
> a time.
> 
> Signed-off-by: Tom Lendacky 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/include/asm/io.h | 26 ++
>  1 file changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
> index e080a39..2f3c002 100644
> --- a/arch/x86/include/asm/io.h
> +++ b/arch/x86/include/asm/io.h
> @@ -327,14 +327,32 @@ static inline unsigned type in##bwl##_p(int port)   
> \
>   \
>  static inline void outs##bwl(int port, const void *addr, unsigned long 
> count) \
>  {

Is it even worth leaving these as inline functions?
Given the speed of IO cycles it is unlikely that the cost of calling a real
function will be significant.
The code bloat reduction will be significant.

David
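
As a rough sketch (not the patch itself), the unrolled byte variant the commit
message describes boils down to something like this; the real code is generated
by the BUILDIO-style macros and, per the series, only needs the element-at-a-time
loop when SEV is in use.

#include <linux/types.h>
#include <asm/io.h>

/* One port write per element instead of a single "rep outsb". */
static void outsb_unrolled(int port, const void *addr, unsigned long count)
{
	const u8 *p = addr;

	while (count--)
		outb(*p++, port);
}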



Re: [PATCH v3 4/5] powerpc/lib/sstep: Add prty instruction emulation

2017-07-25 Thread Balbir Singh
On Tue, 2017-07-25 at 13:33 +1000, Matt Brown wrote:
> This adds emulation for the prtyw and prtyd instructions.
> Tested for logical correctness against the prtyw and prtyd instructions
> on ppc64le.
> 
> Signed-off-by: Matt Brown 
> ---
> v3:
>   - optimised using the Giles-Miller method of side-ways addition
> v2:
>   - fixed opcodes
>   - fixed bitshifting and typecast errors
>   - merged do_prtyw and do_prtyd into single function
> ---
>  arch/powerpc/lib/sstep.c | 27 +++
>  1 file changed, 27 insertions(+)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 6a79618..0bcf631 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -655,6 +655,25 @@ static nokprobe_inline void do_bpermd(struct pt_regs 
> *regs, unsigned long v1,
>   regs->gpr[ra] = perm;
>  }
>  #endif /* __powerpc64__ */
> +/*
> + * The size parameter adjusts the equivalent prty instruction.
> + * prtyw = 32, prtyd = 64
> + */
> +static nokprobe_inline void do_prty(struct pt_regs *regs, unsigned long v,
> + int size, int ra)
> +{
> + unsigned long long res = v;
> +
> + res = (0x0001000100010001 & res) + (0x0001000100010001 & (res >> 8));
> + res = (0x0000000700000007 & res) + (0x0000000700000007 & (res >> 16));
> + if (size == 32) {   /* prtyw */
> + regs->gpr[ra] = (0x0000000100000001 & res);
> + return;
> + }
> +
> + res = (0x000000000000000f & res) + (0x000000000000000f & (res >> 32));
> + regs->gpr[ra] = res & 1;/*prtyd */

Looks good, you can also xor instead of adding, but the masks would need
to change in that case.

Balbir Singh.
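
For what it's worth, the xor variant suggested above can be checked with a
small user-space sketch. The masks here are assumptions consistent with the
prtyw/prtyd semantics (parity of the least-significant bit of each byte), not
lines taken from the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* prtyd: parity of the least-significant bit of each of the 8 bytes. */
static uint64_t prtyd(uint64_t v)
{
	v ^= v >> 8;
	v ^= v >> 16;
	v ^= v >> 32;
	return v & 1;
}

/* prtyw: same folding, but keep one parity bit per 32-bit word. */
static uint64_t prtyw(uint64_t v)
{
	v ^= v >> 8;
	v ^= v >> 16;
	return v & 0x0000000100000001ULL;
}

int main(void)
{
	/* Byte LSBs are 1,0,1,0,0,0,1,1: four ones, so even parity overall. */
	uint64_t even = 0x0100010000000101ULL;
	/* A single set LSB gives odd parity. */
	uint64_t odd = 0x0000000000000001ULL;

	assert(prtyd(even) == 0 && prtyd(odd) == 1);
	printf("prtyw(even)=%016llx prtyw(odd)=%016llx\n",
	       (unsigned long long)prtyw(even), (unsigned long long)prtyw(odd));
	return 0;
}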



Re: [PATCH v3 2/5] powerpc/lib/sstep: Add popcnt instruction emulation

2017-07-25 Thread Balbir Singh
On Tue, 2017-07-25 at 13:33 +1000, Matt Brown wrote:
> This adds emulations for the popcntb, popcntw, and popcntd instructions.
> Tested for correctness against the popcnt{b,w,d} instructions on ppc64le.
> 
> Signed-off-by: Matt Brown 
> ---
> v3:
>   - optimised using the Giles-Miller method of side-ways addition
> v2:
>   - fixed opcodes
>   - fixed typecasting
>   - fixed bitshifting error for both 32 and 64bit arch
> ---
>  arch/powerpc/lib/sstep.c | 40 +++-
>  1 file changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 87d277f..c1f9cdb 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -612,6 +612,32 @@ static nokprobe_inline void do_cmpb(struct pt_regs 
> *regs, unsigned long v1,
>   regs->gpr[rd] = out_val;
>  }
>  
> +/*
> + * The size parameter is used to adjust the equivalent popcnt instruction.
> + * popcntb = 8, popcntw = 32, popcntd = 64
> + */
> +static nokprobe_inline void do_popcnt(struct pt_regs *regs, unsigned long v1,
> + int size, int ra)
> +{
> + unsigned long long out = v1;
> +
> + out = (0x5555555555555555 & out) + (0x5555555555555555 & (out >> 1));
> + out = (0x3333333333333333 & out) + (0x3333333333333333 & (out >> 2));
> + out = (0x0f0f0f0f0f0f0f0f & out) + (0x0f0f0f0f0f0f0f0f & (out >> 4));
> + if (size == 8) {/* popcntb */
> + regs->gpr[ra] = out;
> + return;
> + }
> + out = (0x001f001f001f001f & out) + (0x001f001f001f001f & (out >> 8));

Why are we using 0x001f001f here? Now that we've got things in the
bytes with 0's prefixing, we can directly use

out = out + out >> 8

> + out = (0x0000003f0000003f & out) + (0x0000003f0000003f & (out >> 16));

Same as above

> + if (size == 32) {   /* popcntw */
> + regs->gpr[ra] = out;
> + return;
> + }
> + out = (0x000000000000007f & out) + (0x000000000000007f & (out >> 32));
> + regs->gpr[ra] = out;/* popcntd */

Ditto

Otherwise looks good!

Balbir Singh.
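
A user-space sketch of the same per-byte side-ways addition, checkable against
the compiler builtin. This is not the kernel patch: the final fold uses a
multiply instead of the masked adds discussed above, which works once every
byte holds its own count.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Side-ways addition: reduce to one bit count per byte. */
static uint64_t popcnt_bytes(uint64_t v)
{
	v = (v & 0x5555555555555555ULL) + ((v >> 1) & 0x5555555555555555ULL);
	v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
	v = (v & 0x0f0f0f0f0f0f0f0fULL) + ((v >> 4) & 0x0f0f0f0f0f0f0f0fULL);
	return v;		/* each byte now holds its own bit count (0..8) */
}

static uint64_t popcnt64(uint64_t v)
{
	/* The multiply sums all byte counts into the top byte. */
	return (popcnt_bytes(v) * 0x0101010101010101ULL) >> 56;
}

int main(void)
{
	uint64_t x = 0xdeadbeefcafef00dULL;

	/* __builtin_popcountll is the GCC/Clang builtin used as a reference. */
	assert(popcnt64(x) == (uint64_t)__builtin_popcountll(x));
	printf("popcount(%016llx) = %llu\n",
	       (unsigned long long)x, (unsigned long long)popcnt64(x));
	return 0;
}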



[PATCH v3] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-25 Thread Santosh Sivaraj
The current vDSO64 implementation does not support the coarse clocks
(CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls back
to the system call, increasing the response time; a vDSO implementation
reduces the cycle time. Below is a benchmark of the difference in execution
time with and without vDSO support.

(Non-coarse clocks are also included just for completion)

Without vDSO support:

clock-gettime-realtime: syscall: 1547 nsec/call
clock-gettime-realtime:libc: 258 nsec/call
clock-gettime-realtime:vdso: 180 nsec/call

clock-gettime-monotonic: syscall: 1399 nsec/call
clock-gettime-monotonic:libc: 317 nsec/call
clock-gettime-monotonic:vdso: 249 nsec/call

clock-gettime-realtime-coarse: syscall: 1228 nsec/call
clock-gettime-realtime-coarse:libc: 1320 nsec/call
clock-gettime-realtime-coarse:vdso: 1330 nsec/call

clock-gettime-monotonic-coarse: syscall: 1263 nsec/call
clock-gettime-monotonic-coarse:libc: 1368 nsec/call
clock-gettime-monotonic-coarse:vdso: 1258 nsec/call

With vDSO support:
--
clock-gettime-realtime: syscall: 1660 nsec/call
clock-gettime-realtime:libc: 251 nsec/call
clock-gettime-realtime:vdso: 180 nsec/call

clock-gettime-monotonic: syscall: 1514 nsec/call
clock-gettime-monotonic:libc: 309 nsec/call
clock-gettime-monotonic:vdso: 239 nsec/call

clock-gettime-realtime-coarse: syscall: 1228 nsec/call
clock-gettime-realtime-coarse:libc: 172 nsec/call
clock-gettime-realtime-coarse:vdso: 101 nsec/call

clock-gettime-monotonic-coarse: syscall: 1347 nsec/call
clock-gettime-monotonic-coarse:libc: 187 nsec/call
clock-gettime-monotonic-coarse:vdso: 125 nsec/call

Used https://github.com/nlynch-mentor/vdsotest.git for the benchmarks.

CC: Benjamin Herrenschmidt 
Signed-off-by: Santosh Sivaraj 
---
V2 update:
 - moved syscall fallback to assembly.
V3 update:
 - Restored "exact prototype" comment for __kernel_clock_gettime
 - Remove .hidden/.protected directives from __get_datapage to allow it to be called from C.

 arch/powerpc/include/asm/vdso.h   |   1 +
 arch/powerpc/kernel/vdso64/Makefile   |   2 +-
 arch/powerpc/kernel/vdso64/datapage.S |   6 --
 arch/powerpc/kernel/vdso64/gettime.c  | 143 ++
 arch/powerpc/kernel/vdso64/gettimeofday.S |  78 
 5 files changed, 161 insertions(+), 69 deletions(-)
 create mode 100644 arch/powerpc/kernel/vdso64/gettime.c

diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index c53f5f6..721e4cf 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -23,6 +23,7 @@ extern unsigned long vdso32_sigtramp;
 extern unsigned long vdso32_rt_sigtramp;
 
 int vdso_getcpu_init(void);
+struct vdso_data *__get_datapage(void);
 
 #else /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/kernel/vdso64/Makefile 
b/arch/powerpc/kernel/vdso64/Makefile
index 31107bf..8958d87 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -1,6 +1,6 @@
 # List of files in the vdso, has to be asm only for now
 
-obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
+obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o gettime.o
 
 # Build rules
 
diff --git a/arch/powerpc/kernel/vdso64/datapage.S 
b/arch/powerpc/kernel/vdso64/datapage.S
index abf17fe..0a2ee63 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -22,12 +22,6 @@ __kernel_datapage_offset:
 
 V_FUNCTION_BEGIN(__get_datapage)
   .cfi_startproc
-   /* We don't want that exposed or overridable as we want other objects
-* to be able to bl directly to here
-*/
-   .protected __get_datapage
-   .hidden __get_datapage
-
	mflr	r0
   .cfi_register lr,r0
 
diff --git a/arch/powerpc/kernel/vdso64/gettime.c 
b/arch/powerpc/kernel/vdso64/gettime.c
new file mode 100644
index 000..ef8f75c
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/gettime.c
@@ -0,0 +1,143 @@
+/*
+ * Userland implementation of gettimeofday() for 64 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2017 Santosh Sivaraj (sant...@fossix.org), IBM.
+ *
+ * Originally implemented in assembly by:
+ *   Benjamin Herrenschmidt (b...@kernel.crashing.org),
+ *IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static notrace void kernel_get_tspec(struct timespec *tp,
+struct vdso_data *vdata, u32 *wtom_sec,
+u32 *wtom_nsec)
+{
+