[PATCH 2/4] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

2017-06-16 Thread Len Brown
From: Len Brown 

The goal of this change is to give users a uniform and meaningful
result when they read /sys/...cpufreq/scaling_cur_freq
on modern x86 hardware, as compared to what they get today.

Modern x86 processors include the hardware needed
to accurately calculate frequency over an interval --
APERF, MPERF, and the TSC.

Here we provide an x86 routine to make this calculation
on supported hardware, and use it in preference to any
driver-specific cpufreq_driver.get() routine.

MHz is computed like so:

MHz = base_MHz * delta_APERF / delta_MPERF

MHz is the average frequency of the busy processor
over a measurement interval.  The interval is
defined to be the time between successive invocations
of aperfmperf_khz_on_cpu(), which are expected to
happen on-demand when users read sysfs attribute
cpufreq/scaling_cur_freq.

As with previous methods of calculating MHz,
idle time is excluded.

base_MHz above is from TSC calibration global "cpu_khz".

This x86 native method to calculate MHz returns a meaningful result
no matter if P-states are controlled by hardware or firmware
and/or if the Linux cpufreq sub-system is or is-not installed.

When this routine is invoked more frequently, the measurement
interval becomes shorter.  However, the code limits re-computation
to 10ms intervals so that average frequency remains meaningful.

Discerning users are encouraged to take advantage of
the turbostat(8) utility, which can gracefully handle
concurrent measurement intervals of arbitrary length.

Signed-off-by: Len Brown 
---
 arch/x86/kernel/cpu/Makefile |  1 +
 arch/x86/kernel/cpu/aperfmperf.c | 82 
 drivers/cpufreq/cpufreq.c|  7 +++-
 include/linux/cpufreq.h  | 13 +++
 4 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/aperfmperf.c

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 521..cdf8249 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -21,6 +21,7 @@ obj-y += common.o
 obj-y  += rdrand.o
 obj-y  += match.o
 obj-y  += bugs.o
+obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
 
 obj-$(CONFIG_PROC_FS)  += proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000..5ccf63a
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,82 @@
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown 
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct aperfmperf_sample {
+   unsigned int khz;
+   unsigned long jiffies;
+   u64 aperf;
+   u64 mperf;
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF, and jiffies
+ * unless we already did it within 10ms
+ * calculate kHz, save snapshot
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+   u64 aperf, aperf_delta;
+   u64 mperf, mperf_delta;
+   struct aperfmperf_sample *s = &get_cpu_var(samples);
+
+   /* Don't bother re-computing within 10 ms */
+   if (time_before(jiffies, s->jiffies + HZ/100))
+   goto out;
+
+   rdmsrl(MSR_IA32_APERF, aperf);
+   rdmsrl(MSR_IA32_MPERF, mperf);
+
+   aperf_delta = aperf - s->aperf;
+   mperf_delta = mperf - s->mperf;
+
+   /*
+* There is no architectural guarantee that MPERF
+* increments faster than we can read it.
+*/
+   if (mperf_delta == 0)
+   goto out;
+
+   /*
+* if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
+*  khz = (cpu_khz * aperf_delta) / mperf_delta
+*/
+   if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
+   s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+   else/* khz = aperf_delta / (mperf_delta / cpu_khz) */
+   s->khz = div64_u64(aperf_delta,
+   div64_u64(mperf_delta, cpu_khz));
+   s->jiffies = jiffies;
+   s->aperf = aperf;
+   s->mperf = mperf;
+
+out:
+   put_cpu_var(samples);
+}
+
+unsigned int aperfmperf_khz_on_cpu(unsigned int cpu)
+{
+   if (!cpu_khz)
+   return 0;
+
+   if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+   return 0;
+
+   smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+
+   return per_cpu(samples.khz, cpu);
+}
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 26b643d..a667fac 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -635,8 +635,13 @@ show_one(scaling_max_freq, max);
 static ssize_t 

[PATCH 2/4] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

2017-06-16 Thread Len Brown
From: Len Brown 

The goal of this change is to give users a uniform and meaningful
result when they read /sys/...cpufreq/scaling_cur_freq
on modern x86 hardware, as compared to what they get today.

Modern x86 processors include the hardware needed
to accurately calculate frequency over an interval --
APERF, MPERF, and the TSC.

Here we provide an x86 routine to make this calculation
on supported hardware, and use it in preference to any
driver-specific cpufreq_driver.get() routine.

MHz is computed like so:

MHz = base_MHz * delta_APERF / delta_MPERF

MHz is the average frequency of the busy processor
over a measurement interval.  The interval is
defined to be the time between successive invocations
of aperfmperf_khz_on_cpu(), which are expected to
happen on-demand when users read sysfs attribute
cpufreq/scaling_cur_freq.

As with previous methods of calculating MHz,
idle time is excluded.

base_MHz above is from TSC calibration global "cpu_khz".

This x86 native method to calculate MHz returns a meaningful result
no matter if P-states are controlled by hardware or firmware
and/or if the Linux cpufreq sub-system is or is-not installed.

When this routine is invoked more frequently, the measurement
interval becomes shorter.  However, the code limits re-computation
to 10ms intervals so that average frequency remains meaningful.

Discerning users are encouraged to take advantage of
the turbostat(8) utility, which can gracefully handle
concurrent measurement intervals of arbitrary length.

Signed-off-by: Len Brown 
---
 arch/x86/kernel/cpu/Makefile |  1 +
 arch/x86/kernel/cpu/aperfmperf.c | 82 
 drivers/cpufreq/cpufreq.c|  7 +++-
 include/linux/cpufreq.h  | 13 +++
 4 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/aperfmperf.c

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 521..cdf8249 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -21,6 +21,7 @@ obj-y += common.o
 obj-y  += rdrand.o
 obj-y  += match.o
 obj-y  += bugs.o
+obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
 
 obj-$(CONFIG_PROC_FS)  += proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000..5ccf63a
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,82 @@
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown 
+ *
+ * This file is licensed under GPLv2.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct aperfmperf_sample {
+   unsigned int khz;
+   unsigned long jiffies;
+   u64 aperf;
+   u64 mperf;
+};
+
+static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
+
+/*
+ * aperfmperf_snapshot_khz()
+ * On the current CPU, snapshot APERF, MPERF, and jiffies
+ * unless we already did it within 10ms
+ * calculate kHz, save snapshot
+ */
+static void aperfmperf_snapshot_khz(void *dummy)
+{
+   u64 aperf, aperf_delta;
+   u64 mperf, mperf_delta;
+   struct aperfmperf_sample *s = &get_cpu_var(samples);
+
+   /* Don't bother re-computing within 10 ms */
+   if (time_before(jiffies, s->jiffies + HZ/100))
+   goto out;
+
+   rdmsrl(MSR_IA32_APERF, aperf);
+   rdmsrl(MSR_IA32_MPERF, mperf);
+
+   aperf_delta = aperf - s->aperf;
+   mperf_delta = mperf - s->mperf;
+
+   /*
+* There is no architectural guarantee that MPERF
+* increments faster than we can read it.
+*/
+   if (mperf_delta == 0)
+   goto out;
+
+   /*
+* if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
+*  khz = (cpu_khz * aperf_delta) / mperf_delta
+*/
+   if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
+   s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+   else/* khz = aperf_delta / (mperf_delta / cpu_khz) */
+   s->khz = div64_u64(aperf_delta,
+   div64_u64(mperf_delta, cpu_khz));
+   s->jiffies = jiffies;
+   s->aperf = aperf;
+   s->mperf = mperf;
+
+out:
+   put_cpu_var(samples);
+}
+
+unsigned int aperfmperf_khz_on_cpu(unsigned int cpu)
+{
+   if (!cpu_khz)
+   return 0;
+
+   if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+   return 0;
+
+   smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
+
+   return per_cpu(samples.khz, cpu);
+}
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 26b643d..a667fac 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -635,8 +635,13 @@ show_one(scaling_max_freq, max);
 static ssize_t show_scaling_cur_freq(struct cpufreq_policy *policy, char *buf)
 {
ssize_t 

Re: [PATCH v4 4/5] watchdog: provide watchdog_reconfigure() for arch watchdogs

2017-06-16 Thread Nicholas Piggin
On Fri, 16 Jun 2017 11:24:07 -0700
Andrew Morton  wrote:

> On Fri, 16 Jun 2017 16:57:14 +1000 Nicholas Piggin  wrote:
> 
> > After reconfiguring watchdog sysctls etc., architecture specific
> > watchdogs may not get all their parameters updated.
> > 
> > watchdog_reconfigure() can be implemented to pull the new values
> > in and set the arch NMI watchdog.
> >   
> 
> I'll update the title and changelog to say "watchdog_nmi_reconfigure".

Thanks.


> > --- a/kernel/watchdog.c
> > +++ b/kernel/watchdog.c
> > @@ -123,6 +123,11 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
> >  {
> >  }
> >  
> > +void __weak watchdog_nmi_reconfigure(void)
> > +{
> > +}  
> 
> Can we please get some documentation in here describing what it's for? 
> How arch maintainers might use this?  When and why it is called, what
> it must do?  etc.

Good point, how's this?

---
 kernel/watchdog.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 06cd965f64d2..36531025496f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended;
 /*
  * These functions can be overridden if an architecture implements its
  * own hardlockup detector.
+ *
+ * watchdog_nmi_enable/disable can be implemented to start and stop when
+ * softlockup watchdog threads start and stop. The arch must select the
+ * SOFTLOCKUP_DETECTOR Kconfig.
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
@@ -123,6 +127,17 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
 {
 }
 
+/*
+ * watchdog_nmi_reconfigure can be implemented to be notified after any
+ * watchdog configuration change. The arch hardlockup watchdog should
+ * respond to the following variables:
+ * - nmi_watchdog_enabled
+ * - watchdog_thresh
+ * - watchdog_cpumask
+ * - sysctl_hardlockup_all_cpu_backtrace
+ * - hardlockup_panic
+ * - watchdog_suspended
+ */
 void __weak watchdog_nmi_reconfigure(void)
 {
 }
-- 
2.11.0



Re: [PATCH v4 4/5] watchdog: provide watchdog_reconfigure() for arch watchdogs

2017-06-16 Thread Nicholas Piggin
On Fri, 16 Jun 2017 11:24:07 -0700
Andrew Morton  wrote:

> On Fri, 16 Jun 2017 16:57:14 +1000 Nicholas Piggin  wrote:
> 
> > After reconfiguring watchdog sysctls etc., architecture specific
> > watchdogs may not get all their parameters updated.
> > 
> > watchdog_reconfigure() can be implemented to pull the new values
> > in and set the arch NMI watchdog.
> >   
> 
> I'll update the title and changelog to say "watchdog_nmi_reconfigure".

Thanks.


> > --- a/kernel/watchdog.c
> > +++ b/kernel/watchdog.c
> > @@ -123,6 +123,11 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
> >  {
> >  }
> >  
> > +void __weak watchdog_nmi_reconfigure(void)
> > +{
> > +}  
> 
> Can we please get some documentation in here describing what it's for? 
> How arch maintainers might use this?  When and why it is called, what
> it must do?  etc.

Good point, how's this?

---
 kernel/watchdog.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 06cd965f64d2..36531025496f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -114,6 +114,10 @@ int __read_mostly watchdog_suspended;
 /*
  * These functions can be overridden if an architecture implements its
  * own hardlockup detector.
+ *
+ * watchdog_nmi_enable/disable can be implemented to start and stop when
+ * softlockup watchdog threads start and stop. The arch must select the
+ * SOFTLOCKUP_DETECTOR Kconfig.
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
@@ -123,6 +127,17 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
 {
 }
 
+/*
+ * watchdog_nmi_reconfigure can be implemented to be notified after any
+ * watchdog configuration change. The arch hardlockup watchdog should
+ * respond to the following variables:
+ * - nmi_watchdog_enabled
+ * - watchdog_thresh
+ * - watchdog_cpumask
+ * - sysctl_hardlockup_all_cpu_backtrace
+ * - hardlockup_panic
+ * - watchdog_suspended
+ */
 void __weak watchdog_nmi_reconfigure(void)
 {
 }
-- 
2.11.0



[PATCH V3] staging: vt6655 - style fix

2017-06-16 Thread Derek Robson
Fix checkpatch.pl warnings of the form "function definition argument
'foo' should also have an identifier name" in header files.

Signed-off-by: Derek Robson 

V1 and V2 had vague subject line
---
 drivers/staging/vt6655/card.h| 30 ++---
 drivers/staging/vt6655/channel.h |  4 +--
 drivers/staging/vt6655/mac.h | 58 
 drivers/staging/vt6655/power.h   | 16 +++
 drivers/staging/vt6655/rf.h  | 16 +--
 5 files changed, 57 insertions(+), 67 deletions(-)

diff --git a/drivers/staging/vt6655/card.h b/drivers/staging/vt6655/card.h
index 03369ffaa4d6..1a04dbb57d42 100644
--- a/drivers/staging/vt6655/card.h
+++ b/drivers/staging/vt6655/card.h
@@ -64,25 +64,25 @@ typedef enum _CARD_STATUS_TYPE {
 struct vnt_private;
 
 void CARDvSetRSPINF(struct vnt_private *priv, u8 bb_type);
-void CARDvUpdateBasicTopRate(struct vnt_private *);
-bool CARDbIsOFDMinBasicRate(struct vnt_private *);
-void CARDvSetLoopbackMode(struct vnt_private *, unsigned short wLoopbackMode);
-bool CARDbSoftwareReset(struct vnt_private *);
-void CARDvSetFirstNextTBTT(struct vnt_private *,
+void CARDvUpdateBasicTopRate(struct vnt_private *priv);
+bool CARDbIsOFDMinBasicRate(struct vnt_private *priv);
+void CARDvSetLoopbackMode(struct vnt_private *priv, unsigned short 
wLoopbackMode);
+bool CARDbSoftwareReset(struct vnt_private *priv);
+void CARDvSetFirstNextTBTT(struct vnt_private *priv,
   unsigned short wBeaconInterval);
-void CARDvUpdateNextTBTT(struct vnt_private *, u64 qwTSF,
+void CARDvUpdateNextTBTT(struct vnt_private *priv, u64 qwTSF,
 unsigned short wBeaconInterval);
-bool CARDbGetCurrentTSF(struct vnt_private *, u64 *pqwCurrTSF);
+bool CARDbGetCurrentTSF(struct vnt_private *priv, u64 *pqwCurrTSF);
 u64 CARDqGetNextTBTT(u64 qwTSF, unsigned short wBeaconInterval);
 u64 CARDqGetTSFOffset(unsigned char byRxRate, u64 qwTSF1, u64 qwTSF2);
-unsigned char CARDbyGetPktType(struct vnt_private *);
-void CARDvSafeResetTx(struct vnt_private *);
-void CARDvSafeResetRx(struct vnt_private *);
-bool CARDbRadioPowerOff(struct vnt_private *);
-bool CARDbRadioPowerOn(struct vnt_private *);
-bool CARDbSetPhyParameter(struct vnt_private *, u8);
-bool CARDbUpdateTSF(struct vnt_private *, unsigned char byRxRate,
+unsigned char CARDbyGetPktType(struct vnt_private *priv);
+void CARDvSafeResetTx(struct vnt_private *priv);
+void CARDvSafeResetRx(struct vnt_private *priv);
+bool CARDbRadioPowerOff(struct vnt_private *priv);
+bool CARDbRadioPowerOn(struct vnt_private *priv);
+bool CARDbSetPhyParameter(struct vnt_private *priv, u8 bb_type);
+bool CARDbUpdateTSF(struct vnt_private *priv, unsigned char byRxRate,
u64 qwBSSTimestamp);
-bool CARDbSetBeaconPeriod(struct vnt_private *, unsigned short 
wBeaconInterval);
+bool CARDbSetBeaconPeriod(struct vnt_private *priv, unsigned short 
wBeaconInterval);
 
 #endif /* __CARD_H__ */
diff --git a/drivers/staging/vt6655/channel.h b/drivers/staging/vt6655/channel.h
index 2621dfabff06..8fe70760e548 100644
--- a/drivers/staging/vt6655/channel.h
+++ b/drivers/staging/vt6655/channel.h
@@ -21,8 +21,8 @@
 
 #include "card.h"
 
-void vnt_init_bands(struct vnt_private *);
+void vnt_init_bands(struct vnt_private *priv);
 
-bool set_channel(struct vnt_private *, struct ieee80211_channel *);
+bool set_channel(struct vnt_private *priv, struct ieee80211_channel *ch);
 
 #endif /* _CHANNEL_H_ */
diff --git a/drivers/staging/vt6655/mac.h b/drivers/staging/vt6655/mac.h
index 33b758cb79d4..db401e32ae23 100644
--- a/drivers/staging/vt6655/mac.h
+++ b/drivers/staging/vt6655/mac.h
@@ -885,57 +885,57 @@ do {  
\
 #define MACvSetRFLE_LatchBase(iobase) \
MACvWordRegBitsOn(iobase, MAC_REG_SOFTPWRCTL, SOFTPWRCTL_RFLEOPT)
 
-bool MACbIsRegBitsOn(struct vnt_private *, unsigned char byRegOfs,
+bool MACbIsRegBitsOn(struct vnt_private *priv, unsigned char byRegOfs,
 unsigned char byTestBits);
-bool MACbIsRegBitsOff(struct vnt_private *, unsigned char byRegOfs,
+bool MACbIsRegBitsOff(struct vnt_private *priv, unsigned char byRegOfs,
  unsigned char byTestBits);
 
-bool MACbIsIntDisable(struct vnt_private *);
+bool MACbIsIntDisable(struct vnt_private *priv);
 
-void MACvSetShortRetryLimit(struct vnt_private *, unsigned char byRetryLimit);
+void MACvSetShortRetryLimit(struct vnt_private *priv, unsigned char 
byRetryLimit);
 
-void MACvSetLongRetryLimit(struct vnt_private *, unsigned char byRetryLimit);
-void MACvGetLongRetryLimit(struct vnt_private *,
+void MACvSetLongRetryLimit(struct vnt_private *priv, unsigned char 
byRetryLimit);
+void MACvGetLongRetryLimit(struct vnt_private *priv,
   unsigned char *pbyRetryLimit);
 
-void MACvSetLoopbackMode(struct vnt_private *, unsigned char byLoopbackMode);
+void MACvSetLoopbackMode(struct 

[PATCH V3] staging: vt6655 - style fix

2017-06-16 Thread Derek Robson
Fix checkpatch.pl warnings of the form "function definition argument
'foo' should also have an identifier name" in header files.

Signed-off-by: Derek Robson 

V1 and V2 had vague subject line
---
 drivers/staging/vt6655/card.h| 30 ++---
 drivers/staging/vt6655/channel.h |  4 +--
 drivers/staging/vt6655/mac.h | 58 
 drivers/staging/vt6655/power.h   | 16 +++
 drivers/staging/vt6655/rf.h  | 16 +--
 5 files changed, 57 insertions(+), 67 deletions(-)

diff --git a/drivers/staging/vt6655/card.h b/drivers/staging/vt6655/card.h
index 03369ffaa4d6..1a04dbb57d42 100644
--- a/drivers/staging/vt6655/card.h
+++ b/drivers/staging/vt6655/card.h
@@ -64,25 +64,25 @@ typedef enum _CARD_STATUS_TYPE {
 struct vnt_private;
 
 void CARDvSetRSPINF(struct vnt_private *priv, u8 bb_type);
-void CARDvUpdateBasicTopRate(struct vnt_private *);
-bool CARDbIsOFDMinBasicRate(struct vnt_private *);
-void CARDvSetLoopbackMode(struct vnt_private *, unsigned short wLoopbackMode);
-bool CARDbSoftwareReset(struct vnt_private *);
-void CARDvSetFirstNextTBTT(struct vnt_private *,
+void CARDvUpdateBasicTopRate(struct vnt_private *priv);
+bool CARDbIsOFDMinBasicRate(struct vnt_private *priv);
+void CARDvSetLoopbackMode(struct vnt_private *priv, unsigned short 
wLoopbackMode);
+bool CARDbSoftwareReset(struct vnt_private *priv);
+void CARDvSetFirstNextTBTT(struct vnt_private *priv,
   unsigned short wBeaconInterval);
-void CARDvUpdateNextTBTT(struct vnt_private *, u64 qwTSF,
+void CARDvUpdateNextTBTT(struct vnt_private *priv, u64 qwTSF,
 unsigned short wBeaconInterval);
-bool CARDbGetCurrentTSF(struct vnt_private *, u64 *pqwCurrTSF);
+bool CARDbGetCurrentTSF(struct vnt_private *priv, u64 *pqwCurrTSF);
 u64 CARDqGetNextTBTT(u64 qwTSF, unsigned short wBeaconInterval);
 u64 CARDqGetTSFOffset(unsigned char byRxRate, u64 qwTSF1, u64 qwTSF2);
-unsigned char CARDbyGetPktType(struct vnt_private *);
-void CARDvSafeResetTx(struct vnt_private *);
-void CARDvSafeResetRx(struct vnt_private *);
-bool CARDbRadioPowerOff(struct vnt_private *);
-bool CARDbRadioPowerOn(struct vnt_private *);
-bool CARDbSetPhyParameter(struct vnt_private *, u8);
-bool CARDbUpdateTSF(struct vnt_private *, unsigned char byRxRate,
+unsigned char CARDbyGetPktType(struct vnt_private *priv);
+void CARDvSafeResetTx(struct vnt_private *priv);
+void CARDvSafeResetRx(struct vnt_private *priv);
+bool CARDbRadioPowerOff(struct vnt_private *priv);
+bool CARDbRadioPowerOn(struct vnt_private *priv);
+bool CARDbSetPhyParameter(struct vnt_private *priv, u8 bb_type);
+bool CARDbUpdateTSF(struct vnt_private *priv, unsigned char byRxRate,
u64 qwBSSTimestamp);
-bool CARDbSetBeaconPeriod(struct vnt_private *, unsigned short 
wBeaconInterval);
+bool CARDbSetBeaconPeriod(struct vnt_private *priv, unsigned short 
wBeaconInterval);
 
 #endif /* __CARD_H__ */
diff --git a/drivers/staging/vt6655/channel.h b/drivers/staging/vt6655/channel.h
index 2621dfabff06..8fe70760e548 100644
--- a/drivers/staging/vt6655/channel.h
+++ b/drivers/staging/vt6655/channel.h
@@ -21,8 +21,8 @@
 
 #include "card.h"
 
-void vnt_init_bands(struct vnt_private *);
+void vnt_init_bands(struct vnt_private *priv);
 
-bool set_channel(struct vnt_private *, struct ieee80211_channel *);
+bool set_channel(struct vnt_private *priv, struct ieee80211_channel *ch);
 
 #endif /* _CHANNEL_H_ */
diff --git a/drivers/staging/vt6655/mac.h b/drivers/staging/vt6655/mac.h
index 33b758cb79d4..db401e32ae23 100644
--- a/drivers/staging/vt6655/mac.h
+++ b/drivers/staging/vt6655/mac.h
@@ -885,57 +885,57 @@ do {  
\
 #define MACvSetRFLE_LatchBase(iobase) \
MACvWordRegBitsOn(iobase, MAC_REG_SOFTPWRCTL, SOFTPWRCTL_RFLEOPT)
 
-bool MACbIsRegBitsOn(struct vnt_private *, unsigned char byRegOfs,
+bool MACbIsRegBitsOn(struct vnt_private *priv, unsigned char byRegOfs,
 unsigned char byTestBits);
-bool MACbIsRegBitsOff(struct vnt_private *, unsigned char byRegOfs,
+bool MACbIsRegBitsOff(struct vnt_private *priv, unsigned char byRegOfs,
  unsigned char byTestBits);
 
-bool MACbIsIntDisable(struct vnt_private *);
+bool MACbIsIntDisable(struct vnt_private *priv);
 
-void MACvSetShortRetryLimit(struct vnt_private *, unsigned char byRetryLimit);
+void MACvSetShortRetryLimit(struct vnt_private *priv, unsigned char 
byRetryLimit);
 
-void MACvSetLongRetryLimit(struct vnt_private *, unsigned char byRetryLimit);
-void MACvGetLongRetryLimit(struct vnt_private *,
+void MACvSetLongRetryLimit(struct vnt_private *priv, unsigned char 
byRetryLimit);
+void MACvGetLongRetryLimit(struct vnt_private *priv,
   unsigned char *pbyRetryLimit);
 
-void MACvSetLoopbackMode(struct vnt_private *, unsigned char byLoopbackMode);
+void MACvSetLoopbackMode(struct vnt_private *priv, 

[git pull] vfs fixes

2017-06-16 Thread Al Viro
A couple of fixes; a leak in mntns_install() caught by Andrei
(this cycle regression) + d_invalidate() softlockup fix - that had
been reported by a bunch of people lately, but the problem is pretty
old.

The following changes since commit 32c1431eea4881a6b17bd7c639315010aeefa452:

  Linux 4.12-rc5 (2017-06-11 16:48:20 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git for-linus

for you to fetch changes up to 4068367c9ca7b515a209f9c0c8741309a1e90495:

  fs: don't forget to put old mntns in mntns_install (2017-06-15 06:53:05 -0400)


Al Viro (1):
  Hang/soft lockup in d_invalidate with simultaneous calls

Andrei Vagin (1):
  fs: don't forget to put old mntns in mntns_install

 fs/dcache.c| 10 --
 fs/namespace.c |  2 ++
 2 files changed, 6 insertions(+), 6 deletions(-)


[git pull] vfs fixes

2017-06-16 Thread Al Viro
A couple of fixes; a leak in mntns_install() caught by Andrei
(this cycle regression) + d_invalidate() softlockup fix - that had
been reported by a bunch of people lately, but the problem is pretty
old.

The following changes since commit 32c1431eea4881a6b17bd7c639315010aeefa452:

  Linux 4.12-rc5 (2017-06-11 16:48:20 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git for-linus

for you to fetch changes up to 4068367c9ca7b515a209f9c0c8741309a1e90495:

  fs: don't forget to put old mntns in mntns_install (2017-06-15 06:53:05 -0400)


Al Viro (1):
  Hang/soft lockup in d_invalidate with simultaneous calls

Andrei Vagin (1):
  fs: don't forget to put old mntns in mntns_install

 fs/dcache.c| 10 --
 fs/namespace.c |  2 ++
 2 files changed, 6 insertions(+), 6 deletions(-)


[PATCH V2] staging: ccree: - style fix, spaces and tabs

2017-06-16 Thread Derek Robson
Changed code indent to be tabs across whole driver
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had vague subject.
---
 drivers/staging/ccree/ssi_cipher.c | 45 +-
 drivers/staging/ccree/ssi_driver.c |  6 ++---
 drivers/staging/ccree/ssi_driver.h |  6 ++---
 drivers/staging/ccree/ssi_fips.h   |  8 +++---
 drivers/staging/ccree/ssi_fips_ext.c   |  4 +--
 drivers/staging/ccree/ssi_fips_ll.c| 40 +++---
 drivers/staging/ccree/ssi_fips_local.c | 28 ++---
 drivers/staging/ccree/ssi_fips_local.h | 12 -
 8 files changed, 75 insertions(+), 74 deletions(-)

diff --git a/drivers/staging/ccree/ssi_cipher.c 
b/drivers/staging/ccree/ssi_cipher.c
index 2dfc6a3bd4c1..34450a5e6573 100644
--- a/drivers/staging/ccree/ssi_cipher.c
+++ b/drivers/staging/ccree/ssi_cipher.c
@@ -258,45 +258,45 @@ static void ssi_blkcipher_exit(struct crypto_tfm *tfm)
 
 
 typedef struct tdes_keys{
-u8  key1[DES_KEY_SIZE];
-u8  key2[DES_KEY_SIZE];
-u8  key3[DES_KEY_SIZE];
+   u8  key1[DES_KEY_SIZE];
+   u8  key2[DES_KEY_SIZE];
+   u8  key3[DES_KEY_SIZE];
 }tdes_keys_t;
 
-static const u8 zero_buff[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static const u8 zero_buff[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
 
 /* The function verifies that tdes keys are not weak.*/
 static int ssi_fips_verify_3des_keys(const u8 *key, unsigned int keylen)
 {
 #ifdef CCREE_FIPS_SUPPORT
-tdes_keys_t *tdes_key = (tdes_keys_t*)key;
+   tdes_keys_t *tdes_key = (tdes_keys_t*)key;
 
/* verify key1 != key2 and key3 != key2*/
-if (unlikely( (memcmp((u8*)tdes_key->key1, (u8*)tdes_key->key2, 
sizeof(tdes_key->key1)) == 0) ||
+   if (unlikely( (memcmp((u8*)tdes_key->key1, (u8*)tdes_key->key2, 
sizeof(tdes_key->key1)) == 0) ||
  (memcmp((u8*)tdes_key->key3, (u8*)tdes_key->key2, 
sizeof(tdes_key->key3)) == 0) )) {
-return -ENOEXEC;
-}
+   return -ENOEXEC;
+   }
 #endif /* CCREE_FIPS_SUPPORT */
 
-return 0;
+   return 0;
 }
 
 /* The function verifies that xts keys are not weak.*/
 static int ssi_fips_verify_xts_keys(const u8 *key, unsigned int keylen)
 {
 #ifdef CCREE_FIPS_SUPPORT
-/* Weak key is define as key that its first half (128/256 lsb) equals 
its second half (128/256 msb) */
-int singleKeySize = keylen >> 1;
+   /* Weak key is define as key that its first half (128/256 lsb) equals 
its second half (128/256 msb) */
+   int singleKeySize = keylen >> 1;
 
	if (unlikely(memcmp(key, &key[singleKeySize], singleKeySize) == 0)) {
return -ENOEXEC;
}
 #endif /* CCREE_FIPS_SUPPORT */
 
-return 0;
+   return 0;
 }
 
 static enum cc_hw_crypto_key hw_key_to_cc_hw_key(int slot_num)
@@ -720,12 +720,13 @@ ssi_blkcipher_create_data_desc(
 }
 
 static int ssi_blkcipher_complete(struct device *dev,
-  struct ssi_ablkcipher_ctx *ctx_p,
-  struct blkcipher_req_ctx *req_ctx,
-  struct scatterlist *dst, struct scatterlist 
*src,
-  unsigned int ivsize,
-  void *areq,
-  void __iomem *cc_base)
+   struct ssi_ablkcipher_ctx *ctx_p,
+   struct blkcipher_req_ctx *req_ctx,
+   struct scatterlist *dst,
+   struct scatterlist *src,
+   unsigned int ivsize,
+   void *areq,
+   void __iomem *cc_base)
 {
int completion_error = 0;
u32 inflight_counter;
@@ -779,7 +780,7 @@ static int ssi_blkcipher_process(
/* No data to process is valid */
return 0;
}
-/*For CTS in case of data size aligned to 16 use CBC mode*/
+   /*For CTS in case of data size aligned to 16 use CBC mode*/
if (((nbytes % AES_BLOCK_SIZE) == 0) && (ctx_p->cipher_mode == 
DRV_CIPHER_CBC_CTS)){
 
ctx_p->cipher_mode = DRV_CIPHER_CBC;
diff --git a/drivers/staging/ccree/ssi_driver.c 
b/drivers/staging/ccree/ssi_driver.c
index 190922970bf0..b9d0dd27e853 100644
--- a/drivers/staging/ccree/ssi_driver.c
+++ b/drivers/staging/ccree/ssi_driver.c
@@ -437,9 +437,9 @@ static void 

[PATCH V2] staging: ccree: - style fix, spaces and tabs

2017-06-16 Thread Derek Robson
Changed code indent to be tabs across whole driver
Found using checkpatch

Signed-off-by: Derek Robson 

V1 had vague subject.
---
 drivers/staging/ccree/ssi_cipher.c | 45 +-
 drivers/staging/ccree/ssi_driver.c |  6 ++---
 drivers/staging/ccree/ssi_driver.h |  6 ++---
 drivers/staging/ccree/ssi_fips.h   |  8 +++---
 drivers/staging/ccree/ssi_fips_ext.c   |  4 +--
 drivers/staging/ccree/ssi_fips_ll.c| 40 +++---
 drivers/staging/ccree/ssi_fips_local.c | 28 ++---
 drivers/staging/ccree/ssi_fips_local.h | 12 -
 8 files changed, 75 insertions(+), 74 deletions(-)

diff --git a/drivers/staging/ccree/ssi_cipher.c 
b/drivers/staging/ccree/ssi_cipher.c
index 2dfc6a3bd4c1..34450a5e6573 100644
--- a/drivers/staging/ccree/ssi_cipher.c
+++ b/drivers/staging/ccree/ssi_cipher.c
@@ -258,45 +258,45 @@ static void ssi_blkcipher_exit(struct crypto_tfm *tfm)
 
 
 typedef struct tdes_keys{
-u8  key1[DES_KEY_SIZE];
-u8  key2[DES_KEY_SIZE];
-u8  key3[DES_KEY_SIZE];
+   u8  key1[DES_KEY_SIZE];
+   u8  key2[DES_KEY_SIZE];
+   u8  key3[DES_KEY_SIZE];
 }tdes_keys_t;
 
-static const u8 zero_buff[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
-   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static const u8 zero_buff[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
 
 /* The function verifies that tdes keys are not weak.*/
 static int ssi_fips_verify_3des_keys(const u8 *key, unsigned int keylen)
 {
 #ifdef CCREE_FIPS_SUPPORT
-tdes_keys_t *tdes_key = (tdes_keys_t*)key;
+   tdes_keys_t *tdes_key = (tdes_keys_t*)key;
 
/* verify key1 != key2 and key3 != key2*/
-if (unlikely( (memcmp((u8*)tdes_key->key1, (u8*)tdes_key->key2, 
sizeof(tdes_key->key1)) == 0) ||
+   if (unlikely( (memcmp((u8*)tdes_key->key1, (u8*)tdes_key->key2, 
sizeof(tdes_key->key1)) == 0) ||
  (memcmp((u8*)tdes_key->key3, (u8*)tdes_key->key2, 
sizeof(tdes_key->key3)) == 0) )) {
-return -ENOEXEC;
-}
+   return -ENOEXEC;
+   }
 #endif /* CCREE_FIPS_SUPPORT */
 
-return 0;
+   return 0;
 }
 
 /* The function verifies that xts keys are not weak.*/
 static int ssi_fips_verify_xts_keys(const u8 *key, unsigned int keylen)
 {
 #ifdef CCREE_FIPS_SUPPORT
-/* Weak key is define as key that its first half (128/256 lsb) equals 
its second half (128/256 msb) */
-int singleKeySize = keylen >> 1;
+   /* Weak key is define as key that its first half (128/256 lsb) equals 
its second half (128/256 msb) */
+   int singleKeySize = keylen >> 1;
 
	if (unlikely(memcmp(key, &key[singleKeySize], singleKeySize) == 0)) {
return -ENOEXEC;
}
 #endif /* CCREE_FIPS_SUPPORT */
 
-return 0;
+   return 0;
 }
 
 static enum cc_hw_crypto_key hw_key_to_cc_hw_key(int slot_num)
@@ -720,12 +720,13 @@ ssi_blkcipher_create_data_desc(
 }
 
 static int ssi_blkcipher_complete(struct device *dev,
-  struct ssi_ablkcipher_ctx *ctx_p,
-  struct blkcipher_req_ctx *req_ctx,
-  struct scatterlist *dst, struct scatterlist 
*src,
-  unsigned int ivsize,
-  void *areq,
-  void __iomem *cc_base)
+   struct ssi_ablkcipher_ctx *ctx_p,
+   struct blkcipher_req_ctx *req_ctx,
+   struct scatterlist *dst,
+   struct scatterlist *src,
+   unsigned int ivsize,
+   void *areq,
+   void __iomem *cc_base)
 {
int completion_error = 0;
u32 inflight_counter;
@@ -779,7 +780,7 @@ static int ssi_blkcipher_process(
/* No data to process is valid */
return 0;
}
-/*For CTS in case of data size aligned to 16 use CBC mode*/
+   /*For CTS in case of data size aligned to 16 use CBC mode*/
if (((nbytes % AES_BLOCK_SIZE) == 0) && (ctx_p->cipher_mode == 
DRV_CIPHER_CBC_CTS)){
 
ctx_p->cipher_mode = DRV_CIPHER_CBC;
diff --git a/drivers/staging/ccree/ssi_driver.c 
b/drivers/staging/ccree/ssi_driver.c
index 190922970bf0..b9d0dd27e853 100644
--- a/drivers/staging/ccree/ssi_driver.c
+++ b/drivers/staging/ccree/ssi_driver.c
@@ -437,9 +437,9 @@ static void cleanup_cc_resources(struct 

Re: [PATCH v4 2/5] watchdog: introduce arch_touch_nmi_watchdog()

2017-06-16 Thread Nicholas Piggin
On Fri, 16 Jun 2017 11:21:17 -0700
Andrew Morton  wrote:

> On Fri, 16 Jun 2017 16:57:12 +1000 Nicholas Piggin  wrote:
> 
> > For architectures that define HAVE_NMI_WATCHDOG, instead of having
> > them provide the complete touch_nmi_watchdog() function, just have
> > them provide arch_touch_nmi_watchdog().
> > 
> > This gives the generic code more flexibility in implementing this
> > function, and arch implementations don't miss out on touching the
> > softlockup watchdog or other generic details.
> > 
> > ...
> >
> > --- a/arch/blackfin/include/asm/nmi.h
> > +++ b/arch/blackfin/include/asm/nmi.h
> > @@ -9,4 +9,6 @@
> >  
> >  #include 
> >  
> > +extern void arch_touch_nmi_watchdog(void);  
> 
> Do we actually need to add this to the arch header files...

[snip]

> > +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || 
> > defined(CONFIG_HAVE_NMI_WATCHDOG)
> > +extern void arch_touch_nmi_watchdog(void);
> > +#else
> > +static inline void arch_touch_nmi_watchdog(void) {}
> > +#endif
> > +  
> 
> given that we have a global declaration here?

Probably not. I think it was a holdover from an earlier version where
I tried to let the arch declare it (one of the little embedded ones
had a comment somewhere saying it would be nice if they could make it
inline).

There was some difficulty with it, so yes let's remove these and do
that next time.

Thanks,
Nick


Re: [PATCH v4 2/5] watchdog: introduce arch_touch_nmi_watchdog()

2017-06-16 Thread Nicholas Piggin
On Fri, 16 Jun 2017 11:21:17 -0700
Andrew Morton  wrote:

> On Fri, 16 Jun 2017 16:57:12 +1000 Nicholas Piggin  wrote:
> 
> > For architectures that define HAVE_NMI_WATCHDOG, instead of having
> > them provide the complete touch_nmi_watchdog() function, just have
> > them provide arch_touch_nmi_watchdog().
> > 
> > This gives the generic code more flexibility in implementing this
> > function, and arch implementations don't miss out on touching the
> > softlockup watchdog or other generic details.
> > 
> > ...
> >
> > --- a/arch/blackfin/include/asm/nmi.h
> > +++ b/arch/blackfin/include/asm/nmi.h
> > @@ -9,4 +9,6 @@
> >  
> >  #include 
> >  
> > +extern void arch_touch_nmi_watchdog(void);  
> 
> Do we actually need to add this to the arch header files...

[snip]

> > +#if defined(CONFIG_HARDLOCKUP_DETECTOR) || 
> > defined(CONFIG_HAVE_NMI_WATCHDOG)
> > +extern void arch_touch_nmi_watchdog(void);
> > +#else
> > +static inline void arch_touch_nmi_watchdog(void) {}
> > +#endif
> > +  
> 
> given that we have a global declaration here?

Probably not. I think it was a holdover from an earlier version where
I tried to let the arch declare it (one of the little embedded ones
had a comment somewhere saying it would be nice if they could make it
inline).

There was some difficulty with it, so yes let's remove these and do
that next time.

Thanks,
Nick


Re: [git pull] first batch of ufs fixes

2017-06-16 Thread Al Viro
On Fri, Jun 16, 2017 at 07:29:00AM -0700, Richard Narron wrote:

> The 8 patches in the ufs-fixes group were applied to Linux 4.12-rc5.
> They seem to work fine with the simple testing that I do.
> 
> I tested all 3 BSDs, FreeBSD 11.0, OpenBSD 6.1 and NetBSD 7.1 using 2
> filesystems, 44bsd (ufs1) and ufs2.
> I found no errors doing a Linux mkdir, copy large file, BSD fsck, Linux rm
> large file, rmdir and BSD fsck in any of the 6 combinations.

FWIW, with xfstests I see the following:
* _require_metadata_journaling needs to be told not to bother with
our ufs
* generic/{409,410,411} lack $MOUNT_OPTIONS in several invocations
of mount -t $FSTYP; for ufs it's obviously a problem.  Trivial bug in tests,
fixing it makes them pass.
* generic/258 (timestamp wraparound) fails; fs is left intact
* generic/426 (fhandle stuff) fails and buggers the filesystem
Everything else passes with no fs corruption that could be detected by
fsck.ufs.  Note that crash simulation tests are *not* run - no point
even trying.  I tried several of those with -o sync; they seem to pass,
but there's free blocks, etc. stats summaries being slightly incorrect.
Inevitable, AFAICS.

> Doing a "df" on BSD and Linux now match on the counts including the
> "Available" counts.
> 
> It might be worth testing with ufs filesystems using softdep and/or
> journaling.  Should the Linux mount command reject such filesystems?

Depends...  Do you mean rejecting mount of fs with pending log replay?

> Now that ufs write access is working more or less, we're dangerous.

*snort*
It's marked experimental for a very good reason.  The stuff in #ufs-fixes
deals with fairly obvious bugs and with those the thing is reasonably
well-behaved on xfstests, but it needs a lot more testing (and probably
somebody doing journalling side of things as well) before it can be
considered fully safe for sharing with *BSD.

And I'm still not entirely convinced that mixing tail-unpacking, truncate
and mmap works correctly.  FWIW, switch to iomap-based approach would
probably deal with all of that nicely, but that's well beyond "obvious
bugfixes"; definitely not this cycle fodder.

I'm not even sure if #ufs-fixes is OK for this stage in the cycle - it
is local to fs/ufs, doesn't affect anything else and fixes fairly
obvious bugs, but we are nearly at -rc6.  Pull request for that would
be as below, but it's really up to Linus (and Evgeniy, if he is not
completely MIA) whether to pull it now or wait until the next window.
Linus, what do you think of that?  I'm honestly not sure...

As for my immediate plans, I'll look into the two failing tests,
but any further active work on ufs will have to wait for the next
cycle.  It had been a fun couple of weeks, but I have more than
enough other stuff to deal with.  And I would still very much prefer
for somebody to adopt that puppy.


Fix assorted ufs bugs; a couple of deadlocks, fs corruption in truncate(),
oopsen on tail unpacking and truncate when racing with vmscan, mild fs
corruption (free blocks stats summary buggered, *BSD fsck would
complain and fix), several instances of broken logics around reserved
blocks (starting with "check almost never triggers when it should" and
then there are issues with sufficiently large UFS2).

The following changes since commit 67a70017fa0a152657bc7e337e69bb9c9f5549bf:

  ufs: we need to sync inode before freeing it (2017-06-10 12:02:28 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git ufs-fixes

for you to fetch changes up to a8fad984833832d5ca11a9ed64ddc55646da30e3:

  ufs_truncate_blocks(): fix the case when size is in the last direct block 
(2017-06-15 03:57:46 -0400)


Al Viro (8):
  ufs: fix logics in "ufs: make fsck -f happy"
  ufs: make ufs_freespace() return signed
  ufs: fix reserved blocks check
  ufs: fix s_size/s_dsize users
  ufs_get_locked_page(): make sure we have buffer_heads
  ufs: avoid grabbing ->truncate_mutex if possible
  ufs: more deadlock prevention on tail unpacking
  ufs_truncate_blocks(): fix the case when size is in the last direct block

 fs/ufs/balloc.c | 22 
 fs/ufs/inode.c  | 47 --
 fs/ufs/super.c  | 64 +++--
 fs/ufs/ufs_fs.h |  7 +++
 fs/ufs/util.c   | 17 ---
 fs/ufs/util.h   |  9 ++--
 6 files changed, 98 insertions(+), 68 deletions(-)


Re: [git pull] first batch of ufs fixes

2017-06-16 Thread Al Viro
On Fri, Jun 16, 2017 at 07:29:00AM -0700, Richard Narron wrote:

> The 8 patches in the ufs-fixes group were applied to Linux 4.12-rc5.
> They seem to work fine with the simple testing that I do.
> 
> I tested all 3 BSDs, FreeBSD 11.0, OpenBSD 6.1 and NetBSD 7.1 using 2
> filesystems, 44bsd (ufs1) and ufs2.
> I found no errors doing a Linux mkdir, copy large file, BSD fsck, Linux rm
> large file, rmdir and BSD fsck in any of the 6 combinations.

FWIW, with xfstests I see the following:
* _require_metadata_journaling needs to be told not to bother with
our ufs
* generic/{409,410,411} lack $MOUNT_OPTIONS in several invocations
of mount -t $FSTYP; for ufs it's obviously a problem.  Trivial bug in tests,
fixing it makes them pass.
* generic/258 (timestamp wraparound) fails; fs is left intact
* generic/426 (fhandle stuff) fails and buggers the filesystem
Everything else passes with no fs corruption that could be detected by
fsck.ufs.  Note that crash simulation tests are *not* run - no point
even trying.  I tried several of those with -o sync; they seem to pass,
but there's free blocks, etc. stats summaries being slightly incorrect.
Inevitable, AFAICS.

> Doing a "df" on BSD and Linux now match on the counts including the
> "Available" counts.
> 
> It might be worth testing with ufs filesystems using softdep and/or
> journaling.  Should the Linux mount command reject such filesystems?

Depends...  Do you mean rejecting mount of fs with pending log replay?

> Now that ufs write access is working more or less, we're dangerous.

*snort*
It's marked experimental for a very good reason.  The stuff in #ufs-fixes
deals with fairly obvious bugs and with those the thing is reasonably
well-behaved on xfstests, but it needs a lot more testing (and probably
somebody doing journalling side of things as well) before it can be
considered fully safe for sharing with *BSD.

And I'm still not entirely convinced that mixing tail-unpacking, truncate
and mmap works correctly.  FWIW, switch to iomap-based approach would
probably deal with all of that nicely, but that's well beyond "obvious
bugfixes"; definitely not this cycle fodder.

I'm not even sure if #ufs-fixes is OK for this stage in the cycle - it
is local to fs/ufs, doesn't affect anything else and fixes fairly
obvious bugs, but we are nearly at -rc6.  Pull request for that would
be as below, but it's really up to Linus (and Evgeniy, if he is not
completely MIA) whether to pull it now or wait until the next window.
Linus, what do you think of that?  I'm honestly not sure...

As for my immediate plans, I'll look into the two failing tests,
but any further active work on ufs will have to wait for the next
cycle.  It had been a fun couple of weeks, but I have more than
enough other stuff to deal with.  And I would still very much prefer
for somebody to adopt that puppy.


Fix assorted ufs bugs; a couple of deadlocks, fs corruption in truncate(),
oopsen on tail unpacking and truncate when racing with vmscan, mild fs
corruption (free blocks stats summary buggered, *BSD fsck would
complain and fix), several instances of broken logics around reserved
blocks (starting with "check almost never triggers when it should" and
then there are issues with sufficiently large UFS2).

The following changes since commit 67a70017fa0a152657bc7e337e69bb9c9f5549bf:

  ufs: we need to sync inode before freeing it (2017-06-10 12:02:28 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git ufs-fixes

for you to fetch changes up to a8fad984833832d5ca11a9ed64ddc55646da30e3:

  ufs_truncate_blocks(): fix the case when size is in the last direct block 
(2017-06-15 03:57:46 -0400)


Al Viro (8):
  ufs: fix logics in "ufs: make fsck -f happy"
  ufs: make ufs_freespace() return signed
  ufs: fix reserved blocks check
  ufs: fix s_size/s_dsize users
  ufs_get_locked_page(): make sure we have buffer_heads
  ufs: avoid grabbing ->truncate_mutex if possible
  ufs: more deadlock prevention on tail unpacking
  ufs_truncate_blocks(): fix the case when size is in the last direct block

 fs/ufs/balloc.c | 22 
 fs/ufs/inode.c  | 47 --
 fs/ufs/super.c  | 64 +++--
 fs/ufs/ufs_fs.h |  7 +++
 fs/ufs/util.c   | 17 ---
 fs/ufs/util.h   |  9 ++--
 6 files changed, 98 insertions(+), 68 deletions(-)


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Len Brown
> On a second thought, in order to compute the frequency, user space
> needs to know the scaling and the max_pstate_physical value too, which
> may not be straightforward to obtain (on some Atoms, for example).

unless you run turbostat:-)

> So why don't we leave the tracepoint as is for now?

sure, no problem.
-- 
Len Brown, Intel Open Source Technology Center


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Len Brown
> On a second thought, in order to compute the frequency, user space
> needs to know the scaling and the max_pstate_physical value too, which
> may not be straightforward to obtain (on some Atoms, for example).

unless you run turbostat:-)

> So why don't we leave the tracepoint as is for now?

sure, no problem.
-- 
Len Brown, Intel Open Source Technology Center


Re: [PATCH 30/31] ext4: eliminate xattr entry e_hash recalculation for removes

2017-06-16 Thread Tahsin Erdogan
On Thu, Jun 15, 2017 at 2:10 AM, Jan Kara  wrote:
> I agree with moving ext4_xattr_rehash_entry() out of ext4_xattr_rehash().
> However how about just keeping ext4_xattr_rehash() in
> ext4_xattr_block_set() (so that you don't have to pass additional argument
> to ext4_xattr_set_entry()) and calling ext4_xattr_rehash_entry() when
> i->value != NULL? That would seem easier and cleaner as well...

The is_block parameter is also used to decide whether block reserve
check should be performed:

@@ -1500,8 +1502,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 * attribute block so that a long value does not occupy the
 * whole space and prevent futher entries being added.
 */
-   if (ext4_has_feature_ea_inode(inode->i_sb) && new_size &&
-   (s->end - s->base) == i_blocksize(inode) &&
+   if (ext4_has_feature_ea_inode(inode->i_sb) &&
+   new_size && is_block &&
(min_offs + old_size - new_size) <
EXT4_XATTR_BLOCK_RESERVE(inode)) {
ret = -ENOSPC;

Because of that, I think moving ext4_xattr_rehash to caller makes it
a bit more complicated. Let me know if you disagree.


Re: [PATCH 30/31] ext4: eliminate xattr entry e_hash recalculation for removes

2017-06-16 Thread Tahsin Erdogan
On Thu, Jun 15, 2017 at 2:10 AM, Jan Kara  wrote:
> I agree with moving ext4_xattr_rehash_entry() out of ext4_xattr_rehash().
> However how about just keeping ext4_xattr_rehash() in
> ext4_xattr_block_set() (so that you don't have to pass additional argument
> to ext4_xattr_set_entry()) and calling ext4_xattr_rehash_entry() when
> i->value != NULL? That would seem easier and cleaner as well...

The is_block parameter is also used to decide whether block reserve
check should be performed:

@@ -1500,8 +1502,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 * attribute block so that a long value does not occupy the
 * whole space and prevent futher entries being added.
 */
-   if (ext4_has_feature_ea_inode(inode->i_sb) && new_size &&
-   (s->end - s->base) == i_blocksize(inode) &&
+   if (ext4_has_feature_ea_inode(inode->i_sb) &&
+   new_size && is_block &&
(min_offs + old_size - new_size) <
EXT4_XATTR_BLOCK_RESERVE(inode)) {
ret = -ENOSPC;

Because of that, I think moving ext4_xattr_rehash to caller makes it
a bit more complicated. Let me know if you disagree.


Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions

2017-06-16 Thread Tahsin Erdogan
On Thu, Jun 15, 2017 at 12:57 AM, Jan Kara  wrote:
> Hum, rather handle this similarly to how we handle delalloc reserved space.
> Add a callback to dq_ops to get "inode usage" of an inode and then use it
> in dquot_transfer(), dquot_free_inode(), dquot_alloc_inode().

I tried that approach by adding a "int get_inode_usage(struct inode
*inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
ext4 code that calculates the number of internal inodes
(ext4_xattr_inode_count()) is subject to failures so the callback has
to be able to report errors. And, that itself is problematic because
we can't afford to have errors in dquot_free_inode(). If you have
thoughts about how to address this please let me know.

Alternatively, I could try to make this patch less intrusive by
keeping the existing dquot_transfer() signature and add a new
dquot_transfer_usage() that accepts inode_usage as a parameter. What
do you think?


Re: [PATCH 28/28] quota: add extra inode count to dquot transfer functions

2017-06-16 Thread Tahsin Erdogan
On Thu, Jun 15, 2017 at 12:57 AM, Jan Kara  wrote:
> Hum, rather handle this similarly to how we handle delalloc reserved space.
> Add a callback to dq_ops to get "inode usage" of an inode and then use it
> in dquot_transfer(), dquot_free_inode(), dquot_alloc_inode().

I tried that approach by adding a "int get_inode_usage(struct inode
*inode, qsize_t *usage)" callback to dquot_operations. Unfortunately,
ext4 code that calculates the number of internal inodes
(ext4_xattr_inode_count()) is subject to failures so the callback has
to be able to report errors. And, that itself is problematic because
we can't afford to have errors in dquot_free_inode(). If you have
thoughts about how to address this please let me know.

Alternatively, I could try to make this patch less intrusive by
keeping the existing dquot_transfer() signature and add a new
dquot_transfer_usage() that accepts inode_usage as a parameter. What
do you think?


Re: [PATCH 2/5] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 8:30 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:13 PM Len Brown wrote:
>> From: Len Brown 
>>
>> The goal of this change is to give users a uniform and meaningful
>> result when they read /sys/...cpufreq/scaling_cur_freq
>> on modern x86 hardware, as compared to what they get today.
>>
>> Modern x86 processors include the hardware needed
>> to accurately calculate frequency over an interval --
>> APERF, MPERF, and the TSC.
>>
>> Here we provide an x86 routine to make this calculation
>> on supported hardware, and use it in preference to any
>> driver-specific cpufreq_driver.get() routine.
>>
>> MHz is computed like so:
>>
>> MHz = base_MHz * delta_APERF / delta_MPERF
>>
>> MHz is the average frequency of the busy processor
>> over a measurement interval.  The interval is
>> defined to be the time between successive invocations
>> of aperfmperf_khz_on_cpu(), which are expected to
>> happen on-demand when users read sysfs attribute
>> cpufreq/scaling_cur_freq.
>>
>> As with previous methods of calculating MHz,
>> idle time is excluded.
>>
>> base_MHz above is from TSC calibration global "cpu_khz".
>>
>> This x86 native method to calculate MHz returns a meaningful result
>> no matter if P-states are controlled by hardware or firmware
>> and/or if the Linux cpufreq sub-system is or is-not installed.
>>
>> When this routine is invoked more frequently, the measurement
>> interval becomes shorter.  However, the code limits re-computation
>> to 10ms intervals so that average frequency remains meaningful.
>>
>> Discerning users are encouraged to take advantage of
>> the turbostat(8) utility, which can gracefully handle
>> concurrent measurement intervals of arbitrary length.
>>
>> Signed-off-by: Len Brown 
>> ---
>>  arch/x86/kernel/cpu/Makefile |  1 +
>>  arch/x86/kernel/cpu/aperfmperf.c | 82 
>> 
>>  drivers/cpufreq/cpufreq.c|  7 +++-
>>  include/linux/cpufreq.h  | 13 +++
>>  4 files changed, 102 insertions(+), 1 deletion(-)
>>  create mode 100644 arch/x86/kernel/cpu/aperfmperf.c
>>
>> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
>> index 521..cdf8249 100644
>> --- a/arch/x86/kernel/cpu/Makefile
>> +++ b/arch/x86/kernel/cpu/Makefile
>> @@ -21,6 +21,7 @@ obj-y   += common.o
>>  obj-y+= rdrand.o
>>  obj-y+= match.o
>>  obj-y+= bugs.o
>> +obj-$(CONFIG_CPU_FREQ)   += aperfmperf.o
>>
>>  obj-$(CONFIG_PROC_FS)+= proc.o
>>  obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
>> diff --git a/arch/x86/kernel/cpu/aperfmperf.c 
>> b/arch/x86/kernel/cpu/aperfmperf.c
>> new file mode 100644
>> index 000..5ccf63a
>> --- /dev/null
>> +++ b/arch/x86/kernel/cpu/aperfmperf.c
>> @@ -0,0 +1,82 @@
>> +/*
>> + * x86 APERF/MPERF KHz calculation for
>> + * /sys/.../cpufreq/scaling_cur_freq
>> + *
>> + * Copyright (C) 2017 Intel Corp.
>> + * Author: Len Brown 
>> + *
>> + * This file is licensed under GPLv2.
>> + */
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +struct aperfmperf_sample {
>> + unsigned int khz;
>> + unsigned long jiffies;
>> + u64 aperf;
>> + u64 mperf;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
>> +
>> +/*
>> + * aperfmperf_snapshot_khz()
>> + * On the current CPU, snapshot APERF, MPERF, and jiffies
>> + * unless we already did it within 10ms
>> + * calculate kHz, save snapshot
>> + */
>> +static void aperfmperf_snapshot_khz(void *dummy)
>> +{
>> + u64 aperf, aperf_delta;
>> + u64 mperf, mperf_delta;
>> + struct aperfmperf_sample *s = _cpu_var(samples);
>> +
>> + /* Don't bother re-computing within 10 ms */
>> + if (time_before(jiffies, s->jiffies + HZ/100))
>> + goto out;
>> +
>> + rdmsrl(MSR_IA32_APERF, aperf);
>> + rdmsrl(MSR_IA32_MPERF, mperf);
>> +
>> + aperf_delta = aperf - s->aperf;
>> + mperf_delta = mperf - s->mperf;
>> +
>> + /*
>> +  * There is no architectural guarantee that MPERF
>> +  * increments faster than we can read it.
>> +  */
>> + if (mperf_delta == 0)
>> + goto out;
>> +
>> + /*
>> +  * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
>> +  *  khz = (cpu_khz * aperf_delta) / mperf_delta
>> +  */
>> + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
>> + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
>> + else/* khz = aperf_delta / (mperf_delta / cpu_khz) */
>> + s->khz = div64_u64(aperf_delta,
>> + div64_u64(mperf_delta, cpu_khz));
>> + s->jiffies = jiffies;
>> + s->aperf = aperf;
>> + s->mperf = mperf;
>> +
>> +out:
>> + put_cpu_var(samples);
>> +}
>> +
>> +unsigned int 

Re: [PATCH 2/5] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 8:30 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:13 PM Len Brown wrote:
>> From: Len Brown 
>>
>> The goal of this change is to give users a uniform and meaningful
>> result when they read /sys/...cpufreq/scaling_cur_freq
>> on modern x86 hardware, as compared to what they get today.
>>
>> Modern x86 processors include the hardware needed
>> to accurately calculate frequency over an interval --
>> APERF, MPERF, and the TSC.
>>
>> Here we provide an x86 routine to make this calculation
>> on supported hardware, and use it in preference to any
>> driver-specific cpufreq_driver.get() routine.
>>
>> MHz is computed like so:
>>
>> MHz = base_MHz * delta_APERF / delta_MPERF
>>
>> MHz is the average frequency of the busy processor
>> over a measurement interval.  The interval is
>> defined to be the time between successive invocations
>> of aperfmperf_khz_on_cpu(), which are expected to
>> happen on-demand when users read sysfs attribute
>> cpufreq/scaling_cur_freq.
>>
>> As with previous methods of calculating MHz,
>> idle time is excluded.
>>
>> base_MHz above is from TSC calibration global "cpu_khz".
>>
>> This x86 native method to calculate MHz returns a meaningful result
>> no matter if P-states are controlled by hardware or firmware
>> and/or if the Linux cpufreq sub-system is or is-not installed.
>>
>> When this routine is invoked more frequently, the measurement
>> interval becomes shorter.  However, the code limits re-computation
>> to 10ms intervals so that average frequency remains meaningful.
>>
>> Discerning users are encouraged to take advantage of
>> the turbostat(8) utility, which can gracefully handle
>> concurrent measurement intervals of arbitrary length.
>>
>> Signed-off-by: Len Brown 
>> ---
>>  arch/x86/kernel/cpu/Makefile |  1 +
>>  arch/x86/kernel/cpu/aperfmperf.c | 82 
>> 
>>  drivers/cpufreq/cpufreq.c|  7 +++-
>>  include/linux/cpufreq.h  | 13 +++
>>  4 files changed, 102 insertions(+), 1 deletion(-)
>>  create mode 100644 arch/x86/kernel/cpu/aperfmperf.c
>>
>> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
>> index 521..cdf8249 100644
>> --- a/arch/x86/kernel/cpu/Makefile
>> +++ b/arch/x86/kernel/cpu/Makefile
>> @@ -21,6 +21,7 @@ obj-y   += common.o
>>  obj-y+= rdrand.o
>>  obj-y+= match.o
>>  obj-y+= bugs.o
>> +obj-$(CONFIG_CPU_FREQ)   += aperfmperf.o
>>
>>  obj-$(CONFIG_PROC_FS)+= proc.o
>>  obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
>> diff --git a/arch/x86/kernel/cpu/aperfmperf.c 
>> b/arch/x86/kernel/cpu/aperfmperf.c
>> new file mode 100644
>> index 000..5ccf63a
>> --- /dev/null
>> +++ b/arch/x86/kernel/cpu/aperfmperf.c
>> @@ -0,0 +1,82 @@
>> +/*
>> + * x86 APERF/MPERF KHz calculation for
>> + * /sys/.../cpufreq/scaling_cur_freq
>> + *
>> + * Copyright (C) 2017 Intel Corp.
>> + * Author: Len Brown 
>> + *
>> + * This file is licensed under GPLv2.
>> + */
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +struct aperfmperf_sample {
>> + unsigned int khz;
>> + unsigned long jiffies;
>> + u64 aperf;
>> + u64 mperf;
>> +};
>> +
>> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
>> +
>> +/*
>> + * aperfmperf_snapshot_khz()
>> + * On the current CPU, snapshot APERF, MPERF, and jiffies
>> + * unless we already did it within 10ms
>> + * calculate kHz, save snapshot
>> + */
>> +static void aperfmperf_snapshot_khz(void *dummy)
>> +{
>> + u64 aperf, aperf_delta;
>> + u64 mperf, mperf_delta;
>> + struct aperfmperf_sample *s = _cpu_var(samples);
>> +
>> + /* Don't bother re-computing within 10 ms */
>> + if (time_before(jiffies, s->jiffies + HZ/100))
>> + goto out;
>> +
>> + rdmsrl(MSR_IA32_APERF, aperf);
>> + rdmsrl(MSR_IA32_MPERF, mperf);
>> +
>> + aperf_delta = aperf - s->aperf;
>> + mperf_delta = mperf - s->mperf;
>> +
>> + /*
>> +  * There is no architectural guarantee that MPERF
>> +  * increments faster than we can read it.
>> +  */
>> + if (mperf_delta == 0)
>> + goto out;
>> +
>> + /*
>> +  * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
>> +  *  khz = (cpu_khz * aperf_delta) / mperf_delta
>> +  */
>> + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
>> + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
>> + else/* khz = aperf_delta / (mperf_delta / cpu_khz) */
>> + s->khz = div64_u64(aperf_delta,
>> + div64_u64(mperf_delta, cpu_khz));
>> + s->jiffies = jiffies;
>> + s->aperf = aperf;
>> + s->mperf = mperf;
>> +
>> +out:
>> + put_cpu_var(samples);
>> +}
>> +
>> +unsigned int aperfmperf_khz_on_cpu(unsigned int cpu)
>> +{
>> + if (!cpu_khz)
>> + return 0;
>> +
>> + 

Re: [PATCH v2 3/4] KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf

2017-06-16 Thread Wanpeng Li
2017-06-16 23:38 GMT+08:00 Radim Krčmář :
> 2017-06-16 22:24+0800, Wanpeng Li:
>> 2017-06-16 21:37 GMT+08:00 Radim Krčmář :
>> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> From: Wanpeng Li 
>> >>
>> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> is async page fault.
>> >>
>> >> Cc: Paolo Bonzini 
>> >> Cc: Radim Krčmář 
>> >> Signed-off-by: Wanpeng Li 
>> >> ---
>> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >>  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
>> >> *fault)
>> >>  {
>> >>   ++vcpu->stat.pf_guest;
>> >> - vcpu->arch.cr2 = fault->address;
>> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >
>> > I think we need to act as if arch.exception.async_page_fault was not
>> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events().  Otherwise, if we
>> > migrate with pending async_page_fault exception, we'd inject it as a
>> > normal #PF, which could confuse/kill the nested guest.
>> >
>> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> > sanity as well.
>>
>> Do you mean we should add a field like async_page_fault to
>> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> restores events->exception.async_page_fault to
>> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>
> No, I thought we could get away with a disgusting hack of hiding the
> exception from userspace, which would work for migration, but not if
> local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>
> Extending the userspace interface would work, but I'd do it as a last
> resort, after all conservative solutions have failed.
> async_pf migration is very crude, so exposing the exception is just an
> ugly workaround for the local case.  Adding the flag would also require
> userspace configuration of async_pf features for the guest to keep
> compatibility.
>
> I see two options that might be simpler than adding the userspace flag:
>
>  1) do the nested VM exit sooner, at the place where we now queue #PF,
>  2) queue the #PF later, save the async_pf in some intermediate
> structure and consume it at the place where you proposed the nested
> VM exit.

Yeah, this hidden looks rational to me, even if we lost a
PAGE_NOTREADY async_pf to L1, that just a hint to L1 reschedule
optimization, and PAGE_READY async_pf will be guaranteed by the wakeup
all after live migration.

Regards,
Wanpeng Li


Re: [PATCH v2 3/4] KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf

2017-06-16 Thread Wanpeng Li
2017-06-16 23:38 GMT+08:00 Radim Krčmář :
> 2017-06-16 22:24+0800, Wanpeng Li:
>> 2017-06-16 21:37 GMT+08:00 Radim Krčmář :
>> > 2017-06-14 19:26-0700, Wanpeng Li:
>> >> From: Wanpeng Li 
>> >>
>> >> Add an async_page_fault field to vcpu->arch.exception to identify an async
>> >> page fault, and constructs the expected vm-exit information fields. Force
>> >> a nested VM exit from nested_vmx_check_exception() if the injected #PF
>> >> is async page fault.
>> >>
>> >> Cc: Paolo Bonzini 
>> >> Cc: Radim Krčmář 
>> >> Signed-off-by: Wanpeng Li 
>> >> ---
>> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> >> @@ -452,7 +452,11 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
>> >>  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
>> >> *fault)
>> >>  {
>> >>   ++vcpu->stat.pf_guest;
>> >> - vcpu->arch.cr2 = fault->address;
>> >> + vcpu->arch.exception.async_page_fault = fault->async_page_fault;
>> >
>> > I think we need to act as if arch.exception.async_page_fault was not
>> > pending in kvm_vcpu_ioctl_x86_get_vcpu_events().  Otherwise, if we
>> > migrate with pending async_page_fault exception, we'd inject it as a
>> > normal #PF, which could confuse/kill the nested guest.
>> >
>> > And kvm_vcpu_ioctl_x86_set_vcpu_events() should clean the flag for
>> > sanity as well.
>>
>> Do you mean we should add a field like async_page_fault to
>> kvm_vcpu_events::exception, then saves arch.exception.async_page_fault
>> to events->exception.async_page_fault through KVM_GET_VCPU_EVENTS and
>> restores events->exception.async_page_fault to
>> arch.exception.async_page_fault through KVM_SET_VCPU_EVENTS?
>
> No, I thought we could get away with a disgusting hack of hiding the
> exception from userspace, which would work for migration, but not if
> local userspace did KVM_GET_VCPU_EVENTS and KVM_SET_VCPU_EVENTS ...
>
> Extending the userspace interface would work, but I'd do it as a last
> resort, after all conservative solutions have failed.
> async_pf migration is very crude, so exposing the exception is just an
> ugly workaround for the local case.  Adding the flag would also require
> userspace configuration of async_pf features for the guest to keep
> compatibility.
>
> I see two options that might be simpler than adding the userspace flag:
>
>  1) do the nested VM exit sooner, at the place where we now queue #PF,
>  2) queue the #PF later, save the async_pf in some intermediate
> structure and consume it at the place where you proposed the nested
> VM exit.

Yeah, this approach looks rational to me; even if we lose a
PAGE_NOTREADY async_pf to L1, that is just a hint for the L1 reschedule
optimization, and the PAGE_READY async_pf will be guaranteed by the
wakeup-all after live migration.

Regards,
Wanpeng Li


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Sat, Jun 17, 2017 at 3:21 AM, Rafael J. Wysocki  wrote:
> On Friday, June 16, 2017 09:10:39 PM Len Brown wrote:
>> >> >> - get_avg_frequency(cpu),
>> >> >> + aperfmperf_khz_on_cpu(cpu->cpu),
>> >>
>> >> Note that I deleted the line above in an updated version of this patch
>> >> that I'm ready to send out.
>> >>
>> >> There were a couple of problems with it.
>> >> The first is that it was  ugly that tracing (which occurs only in the
>> >> SW governor case)
>> >> could shorten the measurement interval as seen by the sysfs interface.
>> >>
>> >> The second is that this trace point can be called with irqs off,
>> >> and smp_call_function_single() will WARN when called with irqs off.
>> >>
>> >> Srinivas Acked that I simply remove this field from the tracepoint --
>> >> as it is redundant to calculate it in the kernel when we are already
>> >> exporting the raw values of aperf and mperf.
>> >
>> > This changes the tracepoint format and I know about a couple of user space
>> > scripts that consume these tracepoints though.
>> > What would be wrong with leaving it as is?
>>
>> I'm fine keeping get_avg_frequency() for the purpose of just this tracepoint
>> compatibility, if you think it is useful to do so.
>>
>> I removed it because its only function was the (removed) intel_pstate.get()
>> and this tracepoint.  And this tracepoint already includes aperf and mperf,
>> which can be used to calculate frequency in user-space, if desired.
>> Srinivas Acked' that updating his user-space script would be fine --
>> dunno if that is sufficient.
>
> Well, we can try to make this change and see if there are any complaints
> about it.
>
> But the Srinivas' script is in the tree, so it would be good to update it too
> along with the tracepoint.

On a second thought, in order to compute the frequency, user space
needs to know the scaling and the max_pstate_physical value too, which
may not be straightforward to obtain (on some Atoms, for example).

So why don't we leave the tracepoint as is for now?

Thanks,
Rafael


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Sat, Jun 17, 2017 at 3:21 AM, Rafael J. Wysocki  wrote:
> On Friday, June 16, 2017 09:10:39 PM Len Brown wrote:
>> >> >> - get_avg_frequency(cpu),
>> >> >> + aperfmperf_khz_on_cpu(cpu->cpu),
>> >>
>> >> Note that I deleted the line above in an updated version of this patch
>> >> that I'm ready to send out.
>> >>
>> >> There were a couple of problems with it.
>> >> The first is that it was  ugly that tracing (which occurs only in the
>> >> SW governor case)
>> >> could shorten the measurement interval as seen by the sysfs interface.
>> >>
>> >> The second is that this trace point can be called with irqs off,
>> >> and smp_call_function_single() will WARN when called with irqs off.
>> >>
>> >> Srinivas Acked that I simply remove this field from the tracepoint --
>> >> as it is redundant to calculate it in the kernel when we are already
>> >> exporting the raw values of aperf and mperf.
>> >
>> > This changes the tracepoint format and I know about a couple of user space
>> > scripts that consume these tracepoints though.
>> > What would be wrong with leaving it as is?
>>
>> I'm fine keeping get_avg_frequency() for the purpose of just this tracepoint
>> compatibility, if you think it is useful to do so.
>>
>> I removed it because its only function was the (removed) intel_pstate.get()
>> and this tracepoint.  And this tracepoint already includes aperf and mperf,
>> which can be used to calculate frequency in user-space, if desired.
>> Srinivas Acked' that updating his user-space script would be fine --
>> dunno if that is sufficient.
>
> Well, we can try to make this change and see if there are any complaints
> about it.
>
> But the Srinivas' script is in the tree, so it would be good to update it too
> along with the tracepoint.

On a second thought, in order to compute the frequency, user space
needs to know the scaling and the max_pstate_physical value too, which
may not be straightforward to obtain (on some Atoms, for example).

So why don't we leave the tracepoint as is for now?

Thanks,
Rafael


Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
sorry, that was a premature send...

>>> > What about update_turbo_pstate()?
>>> >
>>> > In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
>>> > wouldn't that become problematic after this change?
>>>
>>> yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
>>> update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE
>>
>> If that was the only way it could change, I wouldn't worry about it, but what
>> about changes by BMCs and similar?  Are they not a concern?
>>
>>> But how is the presence or change in turbo related to the lack of a
>>> need to hook the scheduler callback in "performance" mode?  The hook
>>> literally does nothing in this case, except consume cycles, no?
>>
>> No.
>>
>> It actually sets the P-state to the current maximum (which admittedly is
>> excessive) exactly because the maximum may change on the fly in theory.
>
> There are 2 cases.
>
> If turbo was enabled and we were requesting max turbo
> and "somebody" disabled turbo in an MSR, then the HW would
> simply clip our excessive
request to what the hardware supports.

if turbo was disabled and we were requesting max non-turbo,
and "somebody" enabled turbo in the MSR,
then it is highly likely that current request is already sufficiently
high enough
to enable turbo.  "highly likely" = would work on 100% of the
machines I have ever seen, including those with mis-configured TAR.

ie. it should not matter.

>> If it can't change on the fly (or we don't care), we can do some more
>> simplifications there. :-)
>
> I do not think it is Linux's responsibility to monitor changes to MSRs
> such as Turbo enable/disable done behind its back by a BMC at run-time.
> (if this is even possible)
>
>
> --
> Len Brown, Intel Open Source Technology Center


Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
sorry, that was a premature send...

>>> > What about update_turbo_pstate()?
>>> >
>>> > In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
>>> > wouldn't that become problematic after this change?
>>>
>>> yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
>>> update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE
>>
>> If that was the only way it could change, I wouldn't worry about it, but what
>> about changes by BMCs and similar?  Are they not a concern?
>>
>>> But how is the presence or change in turbo related to the lack of a
>>> need to hook the scheduler callback in "performance" mode?  The hook
>>> literally does nothing in this case, except consume cycles, no?
>>
>> No.
>>
>> It actually sets the P-state to the current maximum (which admittedly is
>> excessive) exactly because the maximum may change on the fly in theory.
>
> There are 2 cases.
>
> If turbo was enabled and we were requesting max turbo
> and "somebody" disabled turbo in an MSR, then the HW would
> simply clip our excessive
request to what the hardware supports.

if turbo was disabled and we were requesting max non-turbo,
and "somebody" enabled turbo in the MSR,
then it is highly likely that current request is already sufficiently
high enough
to enable turbo.  "highly likely" = would work on 100% of the
machines I have ever seen, including those with mis-configured TAR.

ie. it should not matter.

>> If it can't change on the fly (or we don't care), we can do some more
>> simplifications there. :-)
>
> I do not think it is Linux's responsibility to monitor changes to MSRs
> such as Turbo enable/disable done behind its back by a BMC at run-time.
> (if this is even possible)
>
>
> --
> Len Brown, Intel Open Source Technology Center


Re: [PATCH v7 21/26] x86: Add emulation code for UMIP instructions

2017-06-16 Thread Ricardo Neri
On Thu, 2017-06-08 at 20:38 +0200, Borislav Petkov wrote:
> On Fri, May 05, 2017 at 11:17:19AM -0700, Ricardo Neri wrote:
> > The feature User-Mode Instruction Prevention present in recent Intel
> > processor prevents a group of instructions from being executed with
> > CPL > 0. Otherwise, a general protection fault is issued.
> 
> This is one of the best opening paragraphs of a commit message I've
> read this year! This is how you open: short, succinct, to the point, no
> marketing bullshit. Good!

Thank you!
> 
> > Rather than relaying this fault to the user space (in the form of a SIGSEGV
> > signal), the instructions protected by UMIP can be emulated to provide
> > dummy results. This allows to conserve the current kernel behavior and not
> > reveal the system resources that UMIP intends to protect (the global
> > descriptor and interrupt descriptor tables, the segment selectors of the
> > local descriptor table and the task state and the machine status word).
> > 
> > This emulation is needed because certain applications (e.g., WineHQ and
> > DOSEMU2) rely on this subset of instructions to function.
> > 
> > The instructions protected by UMIP can be split in two groups. Those who
> 
> s/who/which/

I will correct.
> 
> > return a kernel memory address (sgdt and sidt) and those who return a
> 
> ditto.

I will correct here also.
> 
> > value (sldt, str and smsw).
> >
> > For the instructions that return a kernel memory address, applications
> > such as WineHQ rely on the result being located in the kernel memory space.
> > The result is emulated as a hard-coded value that, lies close to the top
> > of the kernel memory. The limit for the GDT and the IDT are set to zero.
> 
> Nice.
> 
> > Given that sldt and str are not used in common in programs supported by
> 
> You wanna say "in common programs" here? Or "not commonly used in programs" ?

I will rephrase this comment.
> 
> > WineHQ and DOSEMU2, they are not emulated.
> > 
> > The instruction smsw is emulated to return the value that the register CR0
> > has at boot time as set in the head_32.
> > 
> > Care is taken to appropriately emulate the results when segmentation is
> > used. This is, rather than relying on USER_DS and USER_CS, the function
> 
>   "That is,... "

I will correct it.
> 
> > insn_get_addr_ref() inspects the segment descriptor pointed by the
> > registers in pt_regs. This ensures that we correctly obtain the segment
> > base address and the address and operand sizes even if the user space
> > application uses local descriptor table.
> 
> Btw, I could very well use all that nice explanation in umip.c too so
> that the high-level behavior is documented.

Sure, I will include a high-level description in the file itself.

> 
> > Cc: Andy Lutomirski 
> > Cc: Andrew Morton 
> > Cc: H. Peter Anvin 
> > Cc: Borislav Petkov 
> > Cc: Brian Gerst 
> > Cc: Chen Yucong 
> > Cc: Chris Metcalf 
> > Cc: Dave Hansen 
> > Cc: Fenghua Yu 
> > Cc: Huang Rui 
> > Cc: Jiri Slaby 
> > Cc: Jonathan Corbet 
> > Cc: Michael S. Tsirkin 
> > Cc: Paul Gortmaker 
> > Cc: Peter Zijlstra 
> > Cc: Ravi V. Shankar 
> > Cc: Shuah Khan 
> > Cc: Vlastimil Babka 
> > Cc: Tony Luck 
> > Cc: Paolo Bonzini 
> > Cc: Liang Z. Li 
> > Cc: Alexandre Julliard 
> > Cc: Stas Sergeev 
> > Cc: x...@kernel.org
> > Cc: linux-ms...@vger.kernel.org
> > Signed-off-by: Ricardo Neri 
> > ---
> >  arch/x86/include/asm/umip.h |  15 +++
> >  arch/x86/kernel/Makefile|   1 +
> >  arch/x86/kernel/umip.c  | 245 
> > 
> >  3 files changed, 261 insertions(+)
> >  create mode 100644 arch/x86/include/asm/umip.h
> >  create mode 100644 arch/x86/kernel/umip.c
> > 
> > diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h
> > new file mode 100644
> > index 000..077b236
> > --- /dev/null
> > +++ b/arch/x86/include/asm/umip.h
> > @@ -0,0 +1,15 @@
> > +#ifndef _ASM_X86_UMIP_H
> > +#define _ASM_X86_UMIP_H
> > +
> > +#include 
> > +#include 
> > +
> > +#ifdef CONFIG_X86_INTEL_UMIP
> > +bool fixup_umip_exception(struct pt_regs *regs);
> > +#else
> > +static inline bool fixup_umip_exception(struct pt_regs *regs)
> > +{
> > +   return false;
> > +}
> 
> Let's save some header lines:
> 
> static inline bool fixup_umip_exception(struct pt_regs *regs) { 
> return false; }
> 
> those trunks take too much space as it is.

I will correct.
> 
> > +#endif  /* CONFIG_X86_INTEL_UMIP */
> > +#endif  /* _ASM_X86_UMIP_H */
> > diff --git 

Re: [PATCH v7 21/26] x86: Add emulation code for UMIP instructions

2017-06-16 Thread Ricardo Neri
On Thu, 2017-06-08 at 20:38 +0200, Borislav Petkov wrote:
> On Fri, May 05, 2017 at 11:17:19AM -0700, Ricardo Neri wrote:
> > The feature User-Mode Instruction Prevention present in recent Intel
> > processor prevents a group of instructions from being executed with
> > CPL > 0. Otherwise, a general protection fault is issued.
> 
> This is one of the best opening paragraphs of a commit message I've
> read this year! This is how you open: short, succinct, to the point, no
> marketing bullshit. Good!

Thank you!
> 
> > Rather than relaying this fault to the user space (in the form of a SIGSEGV
> > signal), the instructions protected by UMIP can be emulated to provide
> > dummy results. This allows to conserve the current kernel behavior and not
> > reveal the system resources that UMIP intends to protect (the global
> > descriptor and interrupt descriptor tables, the segment selectors of the
> > local descriptor table and the task state and the machine status word).
> > 
> > This emulation is needed because certain applications (e.g., WineHQ and
> > DOSEMU2) rely on this subset of instructions to function.
> > 
> > The instructions protected by UMIP can be split in two groups. Those who
> 
> s/who/which/

I will correct.
> 
> > return a kernel memory address (sgdt and sidt) and those who return a
> 
> ditto.

I will correct here also.
> 
> > value (sldt, str and smsw).
> >
> > For the instructions that return a kernel memory address, applications
> > such as WineHQ rely on the result being located in the kernel memory space.
> > The result is emulated as a hard-coded value that, lies close to the top
> > of the kernel memory. The limit for the GDT and the IDT are set to zero.
> 
> Nice.
> 
> > Given that sldt and str are not used in common in programs supported by
> 
> You wanna say "in common programs" here? Or "not commonly used in programs" ?

I will rephrase this comment.
> 
> > WineHQ and DOSEMU2, they are not emulated.
> > 
> > The instruction smsw is emulated to return the value that the register CR0
> > has at boot time as set in the head_32.
> > 
> > Care is taken to appropriately emulate the results when segmentation is
> > used. This is, rather than relying on USER_DS and USER_CS, the function
> 
>   "That is,... "

I will correct it.
> 
> > insn_get_addr_ref() inspects the segment descriptor pointed by the
> > registers in pt_regs. This ensures that we correctly obtain the segment
> > base address and the address and operand sizes even if the user space
> > application uses local descriptor table.
> 
> Btw, I could very well use all that nice explanation in umip.c too so
> that the high-level behavior is documented.

Sure, I will include a high-level description in the file itself.

> 
> > Cc: Andy Lutomirski 
> > Cc: Andrew Morton 
> > Cc: H. Peter Anvin 
> > Cc: Borislav Petkov 
> > Cc: Brian Gerst 
> > Cc: Chen Yucong 
> > Cc: Chris Metcalf 
> > Cc: Dave Hansen 
> > Cc: Fenghua Yu 
> > Cc: Huang Rui 
> > Cc: Jiri Slaby 
> > Cc: Jonathan Corbet 
> > Cc: Michael S. Tsirkin 
> > Cc: Paul Gortmaker 
> > Cc: Peter Zijlstra 
> > Cc: Ravi V. Shankar 
> > Cc: Shuah Khan 
> > Cc: Vlastimil Babka 
> > Cc: Tony Luck 
> > Cc: Paolo Bonzini 
> > Cc: Liang Z. Li 
> > Cc: Alexandre Julliard 
> > Cc: Stas Sergeev 
> > Cc: x...@kernel.org
> > Cc: linux-ms...@vger.kernel.org
> > Signed-off-by: Ricardo Neri 
> > ---
> >  arch/x86/include/asm/umip.h |  15 +++
> >  arch/x86/kernel/Makefile|   1 +
> >  arch/x86/kernel/umip.c  | 245 
> > 
> >  3 files changed, 261 insertions(+)
> >  create mode 100644 arch/x86/include/asm/umip.h
> >  create mode 100644 arch/x86/kernel/umip.c
> > 
> > diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h
> > new file mode 100644
> > index 000..077b236
> > --- /dev/null
> > +++ b/arch/x86/include/asm/umip.h
> > @@ -0,0 +1,15 @@
> > +#ifndef _ASM_X86_UMIP_H
> > +#define _ASM_X86_UMIP_H
> > +
> > +#include 
> > +#include 
> > +
> > +#ifdef CONFIG_X86_INTEL_UMIP
> > +bool fixup_umip_exception(struct pt_regs *regs);
> > +#else
> > +static inline bool fixup_umip_exception(struct pt_regs *regs)
> > +{
> > +   return false;
> > +}
> 
> Let's save some header lines:
> 
> static inline bool fixup_umip_exception(struct pt_regs *regs) { 
> return false; }
> 
> those trunks take too much space as it is.

I will correct.
> 
> > +#endif  /* CONFIG_X86_INTEL_UMIP */
> > +#endif  /* _ASM_X86_UMIP_H */
> > diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> > index 4b99423..cc1b7cc 100644
> > --- a/arch/x86/kernel/Makefile
> > +++ b/arch/x86/kernel/Makefile
> > @@ -123,6 +123,7 @@ obj-$(CONFIG_EFI)   += sysfb_efi.o
> >  obj-$(CONFIG_PERF_EVENTS)  += perf_regs.o
> >  obj-$(CONFIG_TRACING)  += tracepoint.o
> >  obj-$(CONFIG_SCHED_MC_PRIO)+= itmt.o
> > +obj-$(CONFIG_X86_INTEL_UMIP)   += umip.o
> >  
> >  ifdef 

Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 9:06 PM, Rafael J. Wysocki  wrote:
> On Friday, June 16, 2017 08:52:53 PM Len Brown wrote:
>> On Fri, Jun 16, 2017 at 8:04 PM, Rafael J. Wysocki  
>> wrote:
>> > On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
>> >> From: Len Brown 
>> >>
>> >> When the governor is set to "performance", intel_pstate does not
>> >> need the scheduler hook for doing any calculations.  Under these
>> >> conditions, its only purpose is to continue to maintain
>> >> cpufreq/scaling_cur_freq.
>> >>
>> >> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
>> >> the x86 cpufreq core on all modern x86 systems, including
>> >> all systems supported by the intel_pstate driver.
>> >>
>> >> So in "performance" governor mode, the scheduler hook can be skipped.
>> >> This applies to both in Software and Hardware P-state control modes.
>> >>
>> >> Suggested-by: Srinivas Pandruvada 
>> >> Signed-off-by: Len Brown 
>> >> ---
>> >>  drivers/cpufreq/intel_pstate.c | 4 ++--
>> >>  1 file changed, 2 insertions(+), 2 deletions(-)
>> >>
>> >> diff --git a/drivers/cpufreq/intel_pstate.c 
>> >> b/drivers/cpufreq/intel_pstate.c
>> >> index 5d67780..0ff3a4b 100644
>> >> --- a/drivers/cpufreq/intel_pstate.c
>> >> +++ b/drivers/cpufreq/intel_pstate.c
>> >> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
>> >> cpufreq_policy *policy)
>> >>*/
>> >>   intel_pstate_clear_update_util_hook(policy->cpu);
>> >
>> > The statement above shouldn't be necessary any more after the change below.
>>
>> The policy can change at run time from something other than performance
>> to performance, so we want to clear the hook in that case, no?
>
> Yes.
>
>> >>   intel_pstate_max_within_limits(cpu);
>> >> + } else {
>> >> + intel_pstate_set_update_util_hook(policy->cpu);
>> >>   }
>> >>
>> >> - intel_pstate_set_update_util_hook(policy->cpu);
>> >> -
>> >>   if (hwp_active)
>> >>   intel_pstate_hwp_set(policy->cpu);
>> >>
>> >
>> > What about update_turbo_pstate()?
>> >
>> > In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
>> > wouldn't that become problematic after this change?
>>
>> yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
>> update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE
>
> If that was the only way it could change, I wouldn't worry about it, but what
> about changes by BMCs and similar?  Are they not a concern?
>
>> But how is the presence or change in turbo related to the lack of a
>> need to hook the scheduler callback in "performance" mode?  The hook
>> literally does nothing in this case, except consume cycles, no?
>
> No.
>
> It actually sets the P-state to the current maximum (which admittedly is
> excessive) exactly because the maximum may change on the fly in theory.

There are 2 cases.

If turbo was enabled and we were requesting max turbo
and "somebody" disabled turbo in an MSR, then the HW would
simply clip our excessive req
> If it can't change on the fly (or we don't care), we can do some more
> simplifications there. :-)

I do not think it is Linux's responsibility to monitor changes to MSRs
such as Turbo enable/disable done behind its back by a BMC at run-time.
(if this is even possible)


-- 
Len Brown, Intel Open Source Technology Center


Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 9:06 PM, Rafael J. Wysocki  wrote:
> On Friday, June 16, 2017 08:52:53 PM Len Brown wrote:
>> On Fri, Jun 16, 2017 at 8:04 PM, Rafael J. Wysocki  
>> wrote:
>> > On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
>> >> From: Len Brown 
>> >>
>> >> When the governor is set to "performance", intel_pstate does not
>> >> need the scheduler hook for doing any calculations.  Under these
>> >> conditions, its only purpose is to continue to maintain
>> >> cpufreq/scaling_cur_freq.
>> >>
>> >> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
>> >> the x86 cpufreq core on all modern x86 systems, including
>> >> all systems supported by the intel_pstate driver.
>> >>
>> >> So in "performance" governor mode, the scheduler hook can be skipped.
>> >> This applies to both in Software and Hardware P-state control modes.
>> >>
>> >> Suggested-by: Srinivas Pandruvada 
>> >> Signed-off-by: Len Brown 
>> >> ---
>> >>  drivers/cpufreq/intel_pstate.c | 4 ++--
>> >>  1 file changed, 2 insertions(+), 2 deletions(-)
>> >>
>> >> diff --git a/drivers/cpufreq/intel_pstate.c 
>> >> b/drivers/cpufreq/intel_pstate.c
>> >> index 5d67780..0ff3a4b 100644
>> >> --- a/drivers/cpufreq/intel_pstate.c
>> >> +++ b/drivers/cpufreq/intel_pstate.c
>> >> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
>> >> cpufreq_policy *policy)
>> >>*/
>> >>   intel_pstate_clear_update_util_hook(policy->cpu);
>> >
>> > The statement above shouldn't be necessary any more after the change below.
>>
>> The policy can change at run time from something other than performance
>> to performance, so we want to clear the hook in that case, no?
>
> Yes.
>
>> >>   intel_pstate_max_within_limits(cpu);
>> >> + } else {
>> >> + intel_pstate_set_update_util_hook(policy->cpu);
>> >>   }
>> >>
>> >> - intel_pstate_set_update_util_hook(policy->cpu);
>> >> -
>> >>   if (hwp_active)
>> >>   intel_pstate_hwp_set(policy->cpu);
>> >>
>> >
>> > What about update_turbo_pstate()?
>> >
>> > In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
>> > wouldn't that become problematic after this change?
>>
>> yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
>> update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE
>
> If that was the only way it could change, I wouldn't worry about it, but what
> about changes by BMCs and similar?  Are they not a concern?
>
>> But how is the presence or change in turbo related to the lack of a
>> need to hook the scheduler callback in "performance" mode?  The hook
>> literally does nothing in this case, except consume cycles, no?
>
> No.
>
> It actually sets the P-state to the current maximum (which admittedly is
> excessive) exactly because the maximum may change on the fly in theory.

There are 2 cases.

If turbo was enabled and we were requesting max turbo
and "somebody" disabled turbo in an MSR, then the HW would
simply clip our excessive req
> If it can't change on the fly (or we don't care), we can do some more
> simplifications there. :-)

I do not think it is Linux's responsibility to monitor changes to MSRs
such as Turbo enable/disable done behind its back by a BMC at run-time.
(if this is even possible)


-- 
Len Brown, Intel Open Source Technology Center


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Friday, June 16, 2017 09:10:39 PM Len Brown wrote:
> >> >> - get_avg_frequency(cpu),
> >> >> + aperfmperf_khz_on_cpu(cpu->cpu),
> >>
> >> Note that I deleted the line above in an updated version of this patch
> >> that I'm ready to send out.
> >>
> >> There were a couple of problems with it.
> >> The first is that it was  ugly that tracing (which occurs only in the
> >> SW governor case)
> >> could shorten the measurement interval as seen by the sysfs interface.
> >>
> >> The second is that this trace point can be called with irqs off,
> >> and smp_call_function_single() will WARN when called with irqs off.
> >>
> >> Srinivas Acked that I simply remove this field from the tracepoint --
> >> as it is redundant to calculate it in the kernel when we are already
> >> exporting the raw values of aperf and mperf.
> >
> > This changes the tracepoint format and I know about a couple of user space
> > scripts that consume these tracepoints though.
> > What would be wrong with leaving it as is?
> 
> I'm fine keeping get_avg_frequency() for the purpose of just this tracepoint
> compatibility, if you think it is useful to do so.
> 
> I removed it because its only function was the (removed) intel_pstate.get()
> and this tracepoint.  And this tracepoint already includes aperf and mperf,
> which can be used to calculate frequency in user-space, if desired.
> Srinivas Acked' that updating his user-space script would be fine --
> dunno if that is sufficient.

Well, we can try to make this change and see if there are any complaints
about it.

But the Srinivas' script is in the tree, so it would be good to update it too
along with the tracepoint.

Thanks,
Rafael



Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Friday, June 16, 2017 09:10:39 PM Len Brown wrote:
> >> >> - get_avg_frequency(cpu),
> >> >> + aperfmperf_khz_on_cpu(cpu->cpu),
> >>
> >> Note that I deleted the line above in an updated version of this patch
> >> that I'm ready to send out.
> >>
> >> There were a couple of problems with it.
> >> The first is that it was  ugly that tracing (which occurs only in the
> >> SW governor case)
> >> could shorten the measurement interval as seen by the sysfs interface.
> >>
> >> The second is that this trace point can be called with irqs off,
> >> and smp_call_function_single() will WARN when called with irqs off.
> >>
> >> Srinivas Acked that I simply remove this field from the tracepoint --
> >> as it is redundant to calculate it in the kernel when we are already
> >> exporting the raw values of aperf and mperf.
> >
> > This changes the tracepoint format and I know about a couple of user space
> > scripts that consume these tracepoints though.
> > What would be wrong with leaving it as is?
> 
> I'm fine keeping get_avg_frequency() for the purpose of just this tracepoint
> compatibility, if you think it is useful to do so.
> 
> I removed it because its only function was the (removed) intel_pstate.get()
> and this tracepoint.  And this tracepoint already includes aperf and mperf,
> which can be used to calculate frequency in user-space, if desired.
> Srinivas Acked' that updating his user-space script would be fine --
> dunno if that is sufficient.

Well, we can try to make this change and see if there are any complaints
about it.

But the Srinivas' script is in the tree, so it would be good to update it too
along with the tracepoint.

Thanks,
Rafael



Re: [PATCH 5/5] intel_pstate: delete scheduler hook in HWP mode

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 8:09 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:16 PM Len Brown wrote:
>> From: Len Brown 
>>
>> The cpufreq/scaling_cur_freq sysfs attribute is now provided by
>> the x86 cpufreq core on all modern x86 systems, including
>> all systems supported by the intel_pstate driver.
>
> Not sure what you mean by "x86 cpufreq core"?

I refer to code that builds if (CONFIG_X86 && CONFIG_CPU_FREQ)

Since it was enough to provoke a comment from you, how about this wording?:

The cpufreq/scaling_cur_freq sysfs attribute is now provided by
shared x86 cpufreq code on modern x86 systems, including
all systems supported by the intel_pstate driver.

> Besides, I'd reorder this change with respect to patch [4/5] as this
> eliminates the hook entirely and then the "performance"-related change
> would only affect non-HWP.

I don't actually see a problem with either order,
but i'll send the refresh with the order you suggest.

thanks,
Len Brown, Intel Open Source Technology Center


Re: [PATCH 5/5] intel_pstate: delete scheduler hook in HWP mode

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 8:09 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:16 PM Len Brown wrote:
>> From: Len Brown 
>>
>> The cpufreq/scaling_cur_freq sysfs attribute is now provided by
>> the x86 cpufreq core on all modern x86 systems, including
>> all systems supported by the intel_pstate driver.
>
> Not sure what you mean by "x86 cpufreq core"?

I refer to code that builds if (CONFIG_X86 && CONFIG_CPU_FREQ)

Since it was enough to provoke a comment from you, how about this wording?:

The cpufreq/scaling_cur_freq sysfs attribute is now provided by
shared x86 cpufreq code on modern x86 systems, including
all systems supported by the intel_pstate driver.

> Besides, I'd reorder this change with respect to patch [4/5] as this
> eliminates the hook entirely and then the "performance"-related change
> would only affect non-HWP.

I don't actually see a problem with either order,
but i'll send the refresh with the order you suggest.

thanks,
Len Brown, Intel Open Source Technology Center


[RFC PATCH 0/2] daxfile: enable byte-addressable updates to pmem

2017-06-16 Thread Dan Williams
Quoting PATCH 2/2:

To date, the full promise of byte-addressable access to persistent
memory has only been half realized via the filesystem-dax interface. The
current filesystem-dax mechanism allows an application to consume (read)
data from persistent storage at byte-size granularity, bypassing the
full page reads required by traditional storage devices.

Now, for writes, applications still need to contend with
page-granularity dirtying and flushing semantics as well as filesystem
coordination for metadata updates after any mmap write. The current
situation precludes use cases that leverage byte-granularity / in-place
updates to persistent media.

To get around this limitation there are some specialized applications
that are using the device-dax interface to bypass the overhead and
data-safety problems of the current filesystem-dax mmap-write path.
QEMU-KVM is forced to use device-dax to safely pass through persistent
memory to a guest [1]. Some specialized databases are using device-dax
for byte-granularity writes. Outside of those cases, device-dax is
difficult for general purpose persistent memory applications to consume.
There is demand for access to pmem without needing to contend with
special device configuration and other device-dax limitations.

The 'daxfile' interface satisfies this demand and realizes one of Dave
Chinner's ideas for allowing pmem applications to safely bypass
fsync/msync requirements. The idea is to make the file immutable with
respect to the offset-to-block mappings for every extent in the file
[2]. It turns out that filesystems already need to make this guarantee
today. This property is needed for files marked as swap files.

The new daxctl() syscall manages setting a file into 'static-dax' mode
whereby it arranges for the file to be treated as a swapfile as far as
the filesystem is concerned, but not registered with the core-mm as
swapfile space. A file in this mode is then safe to be mapped and
written without the requirement to fsync/msync the writes.  The cpu
cache management for flushing data to persistence can be handled
completely in userspace.
   
As can be seen in the patches there are still some TODOs to resolve in
the code, but this otherwise appears to solve the problem of persistent
memory applications needing to coordinate any and all writes to a file
mapping with fsync/msync.

[1]: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg01207.html
[2]: https://lkml.org/lkml/2016/9/11/159

---

Dan Williams (2):
  mm: introduce bmap_walk()
  mm, fs: daxfile, an interface for byte-addressable updates to pmem


 arch/x86/entry/syscalls/syscall_64.tbl |1 
 include/linux/dax.h|9 ++
 include/linux/fs.h |3 +
 include/linux/syscalls.h   |1 
 include/uapi/linux/dax.h   |8 +
 mm/Kconfig |5 +
 mm/Makefile|1 
 mm/daxfile.c   |  186 
 mm/page_io.c   |  117 +---
 9 files changed, 312 insertions(+), 19 deletions(-)
 create mode 100644 include/uapi/linux/dax.h
 create mode 100644 mm/daxfile.c


[RFC PATCH 1/2] mm: introduce bmap_walk()

2017-06-16 Thread Dan Williams
Refactor the core of generic_swapfile_activate() into bmap_walk() so
that it can be used by a new daxfile_activate() helper (to be added).

There should be no functional differences as a result of this change,
although it does add the capability to perform the bmap with a given
page-size. This is in support of daxfile users that want to ensure huge
page usage.

Cc: Jan Kara 
Cc: Jeff Moyer 
Cc: Christoph Hellwig 
Cc: Dave Chinner 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Signed-off-by: Dan Williams 
---
 mm/page_io.c |   86 +-
 1 file changed, 67 insertions(+), 19 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 23f6d0d3470f..5cec9a3d49f2 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -135,11 +135,22 @@ static void end_swap_bio_read(struct bio *bio)
bio_put(bio);
 }
 
-int generic_swapfile_activate(struct swap_info_struct *sis,
-   struct file *swap_file,
-   sector_t *span)
+enum bmap_check {
+   BMAP_WALK_UNALIGNED,
+   BMAP_WALK_DISCONTIG,
+   BMAP_WALK_FULLPAGE,
+   BMAP_WALK_DONE,
+};
+
+typedef int (*bmap_check_fn)(sector_t block, unsigned long page_no,
+   enum bmap_check type, void *ctx);
+
+static int bmap_walk(struct file *file, const unsigned page_size,
+   const unsigned long page_max, sector_t *span,
+   bmap_check_fn check, void *ctx)
 {
-   struct address_space *mapping = swap_file->f_mapping;
+   struct address_space *mapping = file->f_mapping;
+   const unsigned page_shift = ilog2(page_size);
struct inode *inode = mapping->host;
unsigned blocks_per_page;
unsigned long page_no;
@@ -152,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
int ret;
 
blkbits = inode->i_blkbits;
-   blocks_per_page = PAGE_SIZE >> blkbits;
+   blocks_per_page = page_size >> blkbits;
 
/*
 * Map all the blocks into the extent list.  This code doesn't try
@@ -162,7 +173,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
page_no = 0;
last_block = i_size_read(inode) >> blkbits;
while ((probe_block + blocks_per_page) <= last_block &&
-   page_no < sis->max) {
+   page_no < page_max) {
unsigned block_in_page;
sector_t first_block;
 
@@ -173,11 +184,15 @@ int generic_swapfile_activate(struct swap_info_struct 
*sis,
goto bad_bmap;
 
/*
-* It must be PAGE_SIZE aligned on-disk
+* It must be @page_size aligned on-disk
 */
if (first_block & (blocks_per_page - 1)) {
probe_block++;
-   goto reprobe;
+   ret = check(first_block, page_no,
+   BMAP_WALK_UNALIGNED, ctx);
+   if (ret == -EAGAIN)
+   goto reprobe;
+   goto bad_bmap;
}
 
for (block_in_page = 1; block_in_page < blocks_per_page;
@@ -190,11 +205,15 @@ int generic_swapfile_activate(struct swap_info_struct 
*sis,
if (block != first_block + block_in_page) {
/* Discontiguity */
probe_block++;
-   goto reprobe;
+   ret = check(first_block, page_no,
+   BMAP_WALK_DISCONTIG, ctx);
+   if (ret == -EAGAIN)
+   goto reprobe;
+   goto bad_bmap;
}
}
 
-   first_block >>= (PAGE_SHIFT - blkbits);
+   first_block >>= (page_shift - blkbits);
if (page_no) {  /* exclude the header page */
if (first_block < lowest_block)
lowest_block = first_block;
@@ -203,9 +222,9 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
}
 
/*
-* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+* We found a @page_size-{length,aligned} run of blocks
 */
-   ret = add_swap_extent(sis, page_no, 1, first_block);
+   ret = check(first_block, page_no, BMAP_WALK_FULLPAGE, ctx);
if (ret < 0)
goto out;
nr_extents += ret;
@@ -215,20 +234,49 @@ int generic_swapfile_activate(struct swap_info_struct 
*sis,
continue;
}
ret = nr_extents;
-   *span = 1 + highest_block - 

[RFC PATCH 2/2] mm, fs: daxfile, an interface for byte-addressable updates to pmem

2017-06-16 Thread Dan Williams
To date, the full promise of byte-addressable access to persistent
memory has only been half realized via the filesystem-dax interface. The
current filesystem-dax mechanism allows an application to consume (read)
data from persistent storage at byte-size granularity, bypassing the
full page reads required by traditional storage devices.

Now, for writes, applications still need to contend with
page-granularity dirtying and flushing semantics as well as filesystem
coordination for metadata updates after any mmap write. The current
situation precludes use cases that leverage byte-granularity / in-place
updates to persistent media.

To get around this limitation there are some specialized applications
that are using the device-dax interface to bypass the overhead and
data-safety problems of the current filesystem-dax mmap-write path.
QEMU-KVM is forced to use device-dax to safely pass through persistent
memory to a guest [1]. Some specialized databases are using device-dax
for byte-granularity writes. Outside of those cases, device-dax is
difficult for general purpose persistent memory applications to consume.
There is demand for access to pmem without needing to contend with
special device configuration and other device-dax limitations.

The 'daxfile' interface satisfies this demand and realizes one of Dave
Chinner's ideas for allowing pmem applications to safely bypass
fsync/msync requirements. The idea is to make the file immutable with
respect to the offset-to-block mappings for every extent in the file
[2]. It turns out that filesystems already need to make this guarantee
today. This property is needed for files marked as swap files.

The new daxctl() syscall manages setting a file into 'static-dax' mode
whereby it arranges for the file to be treated as a swapfile as far as
the filesystem is concerned, but not registered with the core-mm as
swapfile space. A file in this mode is then safe to be mapped and
written without the requirement to fsync/msync the writes.  The cpu
cache management for flushing data to persistence can be handled
completely in userspace.

[1]: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg01207.html
[2]: https://lkml.org/lkml/2016/9/11/159

Cc: Jan Kara 
Cc: Jeff Moyer 
Cc: Christoph Hellwig 
Cc: Dave Chinner 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Signed-off-by: Dan Williams 
---
 arch/x86/entry/syscalls/syscall_64.tbl |1 
 include/linux/dax.h|9 ++
 include/linux/fs.h |3 +
 include/linux/syscalls.h   |1 
 include/uapi/linux/dax.h   |8 +
 mm/Kconfig |5 +
 mm/Makefile|1 
 mm/daxfile.c   |  186 
 mm/page_io.c   |   31 +
 9 files changed, 245 insertions(+)
 create mode 100644 include/uapi/linux/dax.h
 create mode 100644 mm/daxfile.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..795eb93d6beb 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330common  pkey_alloc  sys_pkey_alloc
 331common  pkey_free   sys_pkey_free
 332common  statx   sys_statx
+33364  daxctl  sys_daxctl
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 5ec1f6c47716..5f1d0e0ed30f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -4,8 +4,17 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
+/*
+ * TODO: make sys_daxctl() be the generic interface for toggling S_DAX
+ * across filesystems. For now, mark DAXCTL_F_DAX as an invalid flag
+ */
+#define DAXCTL_VALID_FLAGS (DAXCTL_F_GET | DAXCTL_F_STATIC)
+
+int daxfile_activate(struct file *daxfile, unsigned align);
+
 struct iomap_ops;
 struct dax_device;
 struct dax_operations {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e68cabb8457..3af649fb669f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1824,8 +1824,10 @@ struct super_operations {
 #define S_NOSEC4096/* no suid or xattr security attributes 
*/
 #ifdef CONFIG_FS_DAX
 #define S_DAX  8192/* Direct Access, avoiding the page cache */
+#define S_DAXFILE  16384   /* no truncate (swapfile) semantics + dax */
 #else
 #define S_DAX  0   /* Make all the DAX code disappear */
+#define S_DAXFILE  0
 #endif
 
 /*
@@ -1865,6 +1867,7 @@ struct super_operations {
 #define IS_AUTOMOUNT(inode)((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)((inode)->i_flags & S_NOSEC)
 #define IS_DAX(inode)  ((inode)->i_flags & S_DAX)

[RFC PATCH 0/2] daxfile: enable byte-addressable updates to pmem

2017-06-16 Thread Dan Williams
Quoting PATCH 2/2:

To date, the full promise of byte-addressable access to persistent
memory has only been half realized via the filesystem-dax interface. The
current filesystem-dax mechanism allows an application to consume (read)
data from persistent storage at byte-size granularity, bypassing the
full page reads required by traditional storage devices.

Now, for writes, applications still need to contend with
page-granularity dirtying and flushing semantics as well as filesystem
coordination for metadata updates after any mmap write. The current
situation precludes use cases that leverage byte-granularity / in-place
updates to persistent media.

To get around this limitation there are some specialized applications
that are using the device-dax interface to bypass the overhead and
data-safety problems of the current filesystem-dax mmap-write path.
QEMU-KVM is forced to use device-dax to safely pass through persistent
memory to a guest [1]. Some specialized databases are using device-dax
for byte-granularity writes. Outside of those cases, device-dax is
difficult for general purpose persistent memory applications to consume.
There is demand for access to pmem without needing to contend with
special device configuration and other device-dax limitations.

The 'daxfile' interface satisfies this demand and realizes one of Dave
Chinner's ideas for allowing pmem applications to safely bypass
fsync/msync requirements. The idea is to make the file immutable with
respect to the offset-to-block mappings for every extent in the file
[2]. It turns out that filesystems already need to make this guarantee
today. This property is needed for files marked as swap files.

The new daxctl() syscall manages setting a file into 'static-dax' mode
whereby it arranges for the file to be treated as a swapfile as far as
the filesystem is concerned, but not registered with the core-mm as
swapfile space. A file in this mode is then safe to be mapped and
written without the requirement to fsync/msync the writes.  The cpu
cache management for flushing data to persistence can be handled
completely in userspace.
   
As can be seen in the patches there are still some TODOs to resolve in
the code, but this otherwise appears to solve the problem of persistent
memory applications needing to coordinate any and all writes to a file
mapping with fsync/msync.

[1]: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg01207.html
[2]: https://lkml.org/lkml/2016/9/11/159

---

Dan Williams (2):
  mm: introduce bmap_walk()
  mm, fs: daxfile, an interface for byte-addressable updates to pmem


 arch/x86/entry/syscalls/syscall_64.tbl |1 
 include/linux/dax.h|9 ++
 include/linux/fs.h |3 +
 include/linux/syscalls.h   |1 
 include/uapi/linux/dax.h   |8 +
 mm/Kconfig |5 +
 mm/Makefile|1 
 mm/daxfile.c   |  186 
 mm/page_io.c   |  117 +---
 9 files changed, 312 insertions(+), 19 deletions(-)
 create mode 100644 include/uapi/linux/dax.h
 create mode 100644 mm/daxfile.c


[RFC PATCH 1/2] mm: introduce bmap_walk()

2017-06-16 Thread Dan Williams
Refactor the core of generic_swapfile_activate() into bmap_walk() so
that it can be used by a new daxfile_activate() helper (to be added).

There should be no functional differences as a result of this change,
although it does add the capability to perform the bmap with a given
page-size. This is in support of daxfile users that want to ensure huge
page usage.

Cc: Jan Kara 
Cc: Jeff Moyer 
Cc: Christoph Hellwig 
Cc: Dave Chinner 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Signed-off-by: Dan Williams 
---
 mm/page_io.c |   86 +-
 1 file changed, 67 insertions(+), 19 deletions(-)

diff --git a/mm/page_io.c b/mm/page_io.c
index 23f6d0d3470f..5cec9a3d49f2 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -135,11 +135,22 @@ static void end_swap_bio_read(struct bio *bio)
bio_put(bio);
 }
 
-int generic_swapfile_activate(struct swap_info_struct *sis,
-   struct file *swap_file,
-   sector_t *span)
+enum bmap_check {
+   BMAP_WALK_UNALIGNED,
+   BMAP_WALK_DISCONTIG,
+   BMAP_WALK_FULLPAGE,
+   BMAP_WALK_DONE,
+};
+
+typedef int (*bmap_check_fn)(sector_t block, unsigned long page_no,
+   enum bmap_check type, void *ctx);
+
+static int bmap_walk(struct file *file, const unsigned page_size,
+   const unsigned long page_max, sector_t *span,
+   bmap_check_fn check, void *ctx)
 {
-   struct address_space *mapping = swap_file->f_mapping;
+   struct address_space *mapping = file->f_mapping;
+   const unsigned page_shift = ilog2(page_size);
struct inode *inode = mapping->host;
unsigned blocks_per_page;
unsigned long page_no;
@@ -152,7 +163,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
int ret;
 
blkbits = inode->i_blkbits;
-   blocks_per_page = PAGE_SIZE >> blkbits;
+   blocks_per_page = page_size >> blkbits;
 
/*
 * Map all the blocks into the extent list.  This code doesn't try
@@ -162,7 +173,7 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
page_no = 0;
last_block = i_size_read(inode) >> blkbits;
while ((probe_block + blocks_per_page) <= last_block &&
-   page_no < sis->max) {
+   page_no < page_max) {
unsigned block_in_page;
sector_t first_block;
 
@@ -173,11 +184,15 @@ int generic_swapfile_activate(struct swap_info_struct 
*sis,
goto bad_bmap;
 
/*
-* It must be PAGE_SIZE aligned on-disk
+* It must be @page_size aligned on-disk
 */
if (first_block & (blocks_per_page - 1)) {
probe_block++;
-   goto reprobe;
+   ret = check(first_block, page_no,
+   BMAP_WALK_UNALIGNED, ctx);
+   if (ret == -EAGAIN)
+   goto reprobe;
+   goto bad_bmap;
}
 
for (block_in_page = 1; block_in_page < blocks_per_page;
@@ -190,11 +205,15 @@ int generic_swapfile_activate(struct swap_info_struct 
*sis,
if (block != first_block + block_in_page) {
/* Discontiguity */
probe_block++;
-   goto reprobe;
+   ret = check(first_block, page_no,
+   BMAP_WALK_DISCONTIG, ctx);
+   if (ret == -EAGAIN)
+   goto reprobe;
+   goto bad_bmap;
}
}
 
-   first_block >>= (PAGE_SHIFT - blkbits);
+   first_block >>= (page_shift - blkbits);
if (page_no) {  /* exclude the header page */
if (first_block < lowest_block)
lowest_block = first_block;
@@ -203,9 +222,9 @@ int generic_swapfile_activate(struct swap_info_struct *sis,
}
 
/*
-* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+* We found a @page_size-{length,aligned} run of blocks
 */
-   ret = add_swap_extent(sis, page_no, 1, first_block);
+   ret = check(first_block, page_no, BMAP_WALK_FULLPAGE, ctx);
if (ret < 0)
goto out;
nr_extents += ret;
@@ -215,20 +234,49 @@ int generic_swapfile_activate(struct swap_info_struct 
*sis,
continue;
}
ret = nr_extents;
-   *span = 1 + highest_block - lowest_block;
-   if (page_no == 0)
-   page_no = 1;/* force Empty message */
-   sis->max = page_no;
-   sis->pages = page_no 

[RFC PATCH 2/2] mm, fs: daxfile, an interface for byte-addressable updates to pmem

2017-06-16 Thread Dan Williams
To date, the full promise of byte-addressable access to persistent
memory has only been half realized via the filesystem-dax interface. The
current filesystem-dax mechanism allows an application to consume (read)
data from persistent storage at byte-size granularity, bypassing the
full page reads required by traditional storage devices.

Now, for writes, applications still need to contend with
page-granularity dirtying and flushing semantics as well as filesystem
coordination for metadata updates after any mmap write. The current
situation precludes use cases that leverage byte-granularity / in-place
updates to persistent media.

To get around this limitation there are some specialized applications
that are using the device-dax interface to bypass the overhead and
data-safety problems of the current filesystem-dax mmap-write path.
QEMU-KVM is forced to use device-dax to safely pass through persistent
memory to a guest [1]. Some specialized databases are using device-dax
for byte-granularity writes. Outside of those cases, device-dax is
difficult for general purpose persistent memory applications to consume.
There is demand for access to pmem without needing to contend with
special device configuration and other device-dax limitations.

The 'daxfile' interface satisfies this demand and realizes one of Dave
Chinner's ideas for allowing pmem applications to safely bypass
fsync/msync requirements. The idea is to make the file immutable with
respect to the offset-to-block mappings for every extent in the file
[2]. It turns out that filesystems already need to make this guarantee
today. This property is needed for files marked as swap files.

The new daxctl() syscall manages setting a file into 'static-dax' mode
whereby it arranges for the file to be treated as a swapfile as far as
the filesystem is concerned, but not registered with the core-mm as
swapfile space. A file in this mode is then safe to be mapped and
written without the requirement to fsync/msync the writes.  The cpu
cache management for flushing data to persistence can be handled
completely in userspace.

[1]: https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg01207.html
[2]: https://lkml.org/lkml/2016/9/11/159

Cc: Jan Kara 
Cc: Jeff Moyer 
Cc: Christoph Hellwig 
Cc: Dave Chinner 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Signed-off-by: Dan Williams 
---
 arch/x86/entry/syscalls/syscall_64.tbl |1 
 include/linux/dax.h|9 ++
 include/linux/fs.h |3 +
 include/linux/syscalls.h   |1 
 include/uapi/linux/dax.h   |8 +
 mm/Kconfig |5 +
 mm/Makefile|1 
 mm/daxfile.c   |  186 
 mm/page_io.c   |   31 +
 9 files changed, 245 insertions(+)
 create mode 100644 include/uapi/linux/dax.h
 create mode 100644 mm/daxfile.c

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..795eb93d6beb 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330common  pkey_alloc  sys_pkey_alloc
 331common  pkey_free   sys_pkey_free
 332common  statx   sys_statx
+33364  daxctl  sys_daxctl
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 5ec1f6c47716..5f1d0e0ed30f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -4,8 +4,17 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
+/*
+ * TODO: make sys_daxctl() be the generic interface for toggling S_DAX
+ * across filesystems. For now, mark DAXCTL_F_DAX as an invalid flag
+ */
+#define DAXCTL_VALID_FLAGS (DAXCTL_F_GET | DAXCTL_F_STATIC)
+
+int daxfile_activate(struct file *daxfile, unsigned align);
+
 struct iomap_ops;
 struct dax_device;
 struct dax_operations {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e68cabb8457..3af649fb669f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1824,8 +1824,10 @@ struct super_operations {
 #define S_NOSEC4096/* no suid or xattr security attributes 
*/
 #ifdef CONFIG_FS_DAX
 #define S_DAX  8192/* Direct Access, avoiding the page cache */
+#define S_DAXFILE  16384   /* no truncate (swapfile) semantics + dax */
 #else
 #define S_DAX  0   /* Make all the DAX code disappear */
+#define S_DAXFILE  0
 #endif
 
 /*
@@ -1865,6 +1867,7 @@ struct super_operations {
 #define IS_AUTOMOUNT(inode)((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)((inode)->i_flags & S_NOSEC)
 #define IS_DAX(inode)  ((inode)->i_flags & S_DAX)
+#define IS_DAXFILE(inode)  ((inode)->i_flags & S_DAXFILE)
 
 #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
  

Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Rafael J. Wysocki
On Friday, June 16, 2017 08:52:53 PM Len Brown wrote:
> On Fri, Jun 16, 2017 at 8:04 PM, Rafael J. Wysocki  wrote:
> > On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
> >> From: Len Brown 
> >>
> >> When the governor is set to "performance", intel_pstate does not
> >> need the scheduler hook for doing any calculations.  Under these
> >> conditions, its only purpose is to continue to maintain
> >> cpufreq/scaling_cur_freq.
> >>
> >> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
> >> the x86 cpufreq core on all modern x86 systems, including
> >> all systems supported by the intel_pstate driver.
> >>
> >> So in "performance" governor mode, the scheduler hook can be skipped.
> >> This applies to both in Software and Hardware P-state control modes.
> >>
> >> Suggested-by: Srinivas Pandruvada 
> >> Signed-off-by: Len Brown 
> >> ---
> >>  drivers/cpufreq/intel_pstate.c | 4 ++--
> >>  1 file changed, 2 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/cpufreq/intel_pstate.c 
> >> b/drivers/cpufreq/intel_pstate.c
> >> index 5d67780..0ff3a4b 100644
> >> --- a/drivers/cpufreq/intel_pstate.c
> >> +++ b/drivers/cpufreq/intel_pstate.c
> >> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
> >> cpufreq_policy *policy)
> >>*/
> >>   intel_pstate_clear_update_util_hook(policy->cpu);
> >
> > The statement above shouldn't be necessary any more after the change below.
> 
> The policy can change at run time from something other than performance
> to performance, so we want to clear the hook in that case, no?

Yes.

> >>   intel_pstate_max_within_limits(cpu);
> >> + } else {
> >> + intel_pstate_set_update_util_hook(policy->cpu);
> >>   }
> >>
> >> - intel_pstate_set_update_util_hook(policy->cpu);
> >> -
> >>   if (hwp_active)
> >>   intel_pstate_hwp_set(policy->cpu);
> >>
> >
> > What about update_turbo_pstate()?
> >
> > In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
> > wouldn't that become problematic after this change?
> 
> yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
> update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE

If that was the only way it could change, I wouldn't worry about it, but what
about changes by BMCs and similar?  Are they not a concern?

> But how is the presence or change in turbo related to the lack of a
> need to hook the scheduler callback in "performance" mode?  The hook
> literally does nothing in this case, except consume cycles, no?

No.

It actually sets the P-state to the current maximum (which admittedly is
excessive) exactly because the maximum may change on the fly in theory.

If it can't change on the fly (or we don't care), we can do some more
simplifications there. :-)

Thanks,
Rafael



Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Rafael J. Wysocki
On Friday, June 16, 2017 08:52:53 PM Len Brown wrote:
> On Fri, Jun 16, 2017 at 8:04 PM, Rafael J. Wysocki  wrote:
> > On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
> >> From: Len Brown 
> >>
> >> When the governor is set to "performance", intel_pstate does not
> >> need the scheduler hook for doing any calculations.  Under these
> >> conditions, its only purpose is to continue to maintain
> >> cpufreq/scaling_cur_freq.
> >>
> >> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
> >> the x86 cpufreq core on all modern x86 systems, including
> >> all systems supported by the intel_pstate driver.
> >>
> >> So in "performance" governor mode, the scheduler hook can be skipped.
> >> This applies to both in Software and Hardware P-state control modes.
> >>
> >> Suggested-by: Srinivas Pandruvada 
> >> Signed-off-by: Len Brown 
> >> ---
> >>  drivers/cpufreq/intel_pstate.c | 4 ++--
> >>  1 file changed, 2 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/cpufreq/intel_pstate.c 
> >> b/drivers/cpufreq/intel_pstate.c
> >> index 5d67780..0ff3a4b 100644
> >> --- a/drivers/cpufreq/intel_pstate.c
> >> +++ b/drivers/cpufreq/intel_pstate.c
> >> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
> >> cpufreq_policy *policy)
> >>*/
> >>   intel_pstate_clear_update_util_hook(policy->cpu);
> >
> > The statement above shouldn't be necessary any more after the change below.
> 
> The policy can change at run time from something other than performance
> to performance, so we want to clear the hook in that case, no?

Yes.

> >>   intel_pstate_max_within_limits(cpu);
> >> + } else {
> >> + intel_pstate_set_update_util_hook(policy->cpu);
> >>   }
> >>
> >> - intel_pstate_set_update_util_hook(policy->cpu);
> >> -
> >>   if (hwp_active)
> >>   intel_pstate_hwp_set(policy->cpu);
> >>
> >
> > What about update_turbo_pstate()?
> >
> > In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
> > wouldn't that become problematic after this change?
> 
> yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
> update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE

If that was the only way it could change, I wouldn't worry about it, but what
about changes by BMCs and similar?  Are they not a concern?

> But how is the presence or change in turbo related to the lack of a
> need to hook the scheduler callback in "performance" mode?  The hook
> literally does nothing in this case, except consume cycles, no?

No.

It actually sets the P-state to the current maximum (which admittedly is
excessive) exactly because the maximum may change on the fly in theory.

If it can't change on the fly (or we don't care), we can do some more
simplifications there. :-)

Thanks,
Rafael



Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Len Brown
>> >> - get_avg_frequency(cpu),
>> >> + aperfmperf_khz_on_cpu(cpu->cpu),
>>
>> Note that I deleted the line above in an updated version of this patch
>> that I'm ready to send out.
>>
>> There were a couple of problems with it.
>> The first is that it was  ugly that tracing (which occurs only in the
>> SW governor case)
>> could shorten the measurement interval as seen by the sysfs interface.
>>
>> The second is that this trace point can be called with irqs off,
>> and smp_call_function_single() will WARN when called with irqs off.
>>
>> Srinivas Acked that I simply remove this field from the tracepoint --
>> as it is redundant to calculate it in the kernel when we are already
>> exporting the raw values of aperf and mperf.
>
> This changes the tracepoint format and I know about a couple of user space
> scripts that consume these tracepoints though.
> What would be wrong with leaving it as is?

I'm fine keeping get_avg_frequency() for the purpose of just this tracepoint
compatibility, if you think it is useful to do so.

I removed it because its only function was the (removed) intel_pstate.get()
and this tracepoint.  And this tracepoint already includes aperf and mperf,
which can be used to calculate frequency in user-space, if desired.
Srinivas Acked that updating his user-space script would be fine --
dunno if that is sufficient.

cheers,
-Len


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Len Brown
>> >> - get_avg_frequency(cpu),
>> >> + aperfmperf_khz_on_cpu(cpu->cpu),
>>
>> Note that I deleted the line above in an updated version of this patch
>> that I'm ready to send out.
>>
>> There were a couple of problems with it.
>> The first is that it was  ugly that tracing (which occurs only in the
>> SW governor case)
>> could shorten the measurement interval as seen by the sysfs interface.
>>
>> The second is that this trace point can be called with irqs off,
>> and smp_call_function_single() will WARN when called with irqs off.
>>
>> Srinivas Acked that I simply remove this field from the tracepoint --
>> as it is redundant to calculate it in the kernel when we are already
>> exporting the raw values of aperf and mperf.
>
> This changes the tracepoint format and I know about a couple of user space
> scripts that consume these tracepoints though.
> What would be wrong with leaving it as is?

I'm fine keeping get_avg_frequency() for the purpose of just this tracepoint
compatibility, if you think it is useful to do so.

I removed it because its only function was the (removed) intel_pstate.get()
and this tracepoint.  And this tracepoint already includes aperf and mperf,
which can be used to calculate frequency in user-space, if desired.
Srinivas Acked that updating his user-space script would be fine --
dunno if that is sufficient.

cheers,
-Len


Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 8:04 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
>> From: Len Brown 
>>
>> When the governor is set to "performance", intel_pstate does not
>> need the scheduler hook for doing any calculations.  Under these
>> conditions, its only purpose is to continue to maintain
>> cpufreq/scaling_cur_freq.
>>
>> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
>> the x86 cpufreq core on all modern x86 systems, including
>> all systems supported by the intel_pstate driver.
>>
>> So in "performance" governor mode, the scheduler hook can be skipped.
>> This applies to both in Software and Hardware P-state control modes.
>>
>> Suggested-by: Srinivas Pandruvada 
>> Signed-off-by: Len Brown 
>> ---
>>  drivers/cpufreq/intel_pstate.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
>> index 5d67780..0ff3a4b 100644
>> --- a/drivers/cpufreq/intel_pstate.c
>> +++ b/drivers/cpufreq/intel_pstate.c
>> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
>> cpufreq_policy *policy)
>>*/
>>   intel_pstate_clear_update_util_hook(policy->cpu);
>
> The statement above shouldn't be necessary any more after the change below.

The policy can change at run time from something other than performance
to performance, so we want to clear the hook in that case, no?

>>   intel_pstate_max_within_limits(cpu);
>> + } else {
>> + intel_pstate_set_update_util_hook(policy->cpu);
>>   }
>>
>> - intel_pstate_set_update_util_hook(policy->cpu);
>> -
>>   if (hwp_active)
>>   intel_pstate_hwp_set(policy->cpu);
>>
>
> What about update_turbo_pstate()?
>
> In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
> wouldn't that become problematic after this change?

yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE

But how is the presence or change in turbo related to the lack of a
need to hook the scheduler callback in "performance" mode?  The hook
literally does nothing in this case, except consume cycles, no?

-- 
Len Brown, Intel Open Source Technology Center


Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 8:04 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
>> From: Len Brown 
>>
>> When the governor is set to "performance", intel_pstate does not
>> need the scheduler hook for doing any calculations.  Under these
>> conditions, its only purpose is to continue to maintain
>> cpufreq/scaling_cur_freq.
>>
>> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
>> the x86 cpufreq core on all modern x86 systems, including
>> all systems supported by the intel_pstate driver.
>>
>> So in "performance" governor mode, the scheduler hook can be skipped.
>> This applies to both in Software and Hardware P-state control modes.
>>
>> Suggested-by: Srinivas Pandruvada 
>> Signed-off-by: Len Brown 
>> ---
>>  drivers/cpufreq/intel_pstate.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
>> index 5d67780..0ff3a4b 100644
>> --- a/drivers/cpufreq/intel_pstate.c
>> +++ b/drivers/cpufreq/intel_pstate.c
>> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
>> cpufreq_policy *policy)
>>*/
>>   intel_pstate_clear_update_util_hook(policy->cpu);
>
> The statement above shouldn't be necessary any more after the change below.

The policy can change at run time from something other than performance
to performance, so we want to clear the hook in that case, no?

>>   intel_pstate_max_within_limits(cpu);
>> + } else {
>> + intel_pstate_set_update_util_hook(policy->cpu);
>>   }
>>
>> - intel_pstate_set_update_util_hook(policy->cpu);
>> -
>>   if (hwp_active)
>>   intel_pstate_hwp_set(policy->cpu);
>>
>
> What about update_turbo_pstate()?
>
> In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
> wouldn't that become problematic after this change?

yes, the sysfs "no_turbo" attribute can be modified at any time, invoking
update_turbo_state(), which will update MSR_IA32_MISC_ENABLE_TURBO_DISABLE

But how is the presence or change in turbo related to the lack of a
need to hook the scheduler callback in "performance" mode?  The hook
literally does nothing in this case, except consume cycles, no?

-- 
Len Brown, Intel Open Source Technology Center


RE: [PATCH v2] libnvdimm, pmem: Add sysfs notifications to badblocks

2017-06-16 Thread Kani, Toshimitsu
> On Fri, Jun 16, 2017 at 5:35 PM, Kani, Toshimitsu  wrote:
> >> On Mon, Jun 12, 2017 at 3:25 PM, Toshi Kani  wrote:
> >> > Sysfs "badblocks" information may be updated during run-time that:
> >> >  - MCE, SCI, and sysfs "scrub" may add new bad blocks
> >> >  - Writes and ioctl() may clear bad blocks
> >> >
> >> > Add support to send sysfs notifications to sysfs "badblocks" file
> >> > under region and pmem directories when their badblocks information
> >> > is re-evaluated (but is not necessarily changed) during run-time.
> >> >
> >> > Signed-off-by: Toshi Kani 
> >> > Cc: Dan Williams 
> >> > Cc: Vishal Verma 
> >> > Cc: Linda Knippers 
> >> > ---
> >> > v2: Send notifications for the clearing case
> >> > ---
> >>
> >> This looks good to me, I've applied it, but I also want to extend the
> >> ndctl unit tests to cover this mechanism.
> >
> > Right.  For the time being, would you mind to use the attached test
> > program for your sanity tests?  It simply monitors sysfs notifications
> > and prints badblocks info...  Sorry for inconvenience.
> >
> > Since I am not familiar with the ndctl unit tests and I am traveling for
> > the rest of the month, I may have to look into it after I am back.
> >
> 
> No worries, I'll take a look at integrating this. Have a nice trip!

Thanks Dan!!  I really appreciate it! 
-Toshi


RE: [PATCH v2] libnvdimm, pmem: Add sysfs notifications to badblocks

2017-06-16 Thread Kani, Toshimitsu
> On Fri, Jun 16, 2017 at 5:35 PM, Kani, Toshimitsu  wrote:
> >> On Mon, Jun 12, 2017 at 3:25 PM, Toshi Kani  wrote:
> >> > Sysfs "badblocks" information may be updated during run-time that:
> >> >  - MCE, SCI, and sysfs "scrub" may add new bad blocks
> >> >  - Writes and ioctl() may clear bad blocks
> >> >
> >> > Add support to send sysfs notifications to sysfs "badblocks" file
> >> > under region and pmem directories when their badblocks information
> >> > is re-evaluated (but is not necessarily changed) during run-time.
> >> >
> >> > Signed-off-by: Toshi Kani 
> >> > Cc: Dan Williams 
> >> > Cc: Vishal Verma 
> >> > Cc: Linda Knippers 
> >> > ---
> >> > v2: Send notifications for the clearing case
> >> > ---
> >>
> >> This looks good to me, I've applied it, but I also want to extend the
> >> ndctl unit tests to cover this mechanism.
> >
> > Right.  For the time being, would you mind to use the attached test
> > program for your sanity tests?  It simply monitors sysfs notifications
> > and prints badblocks info...  Sorry for inconvenience.
> >
> > Since I am not familiar with the ndctl unit tests and I am traveling for
> > the rest of the month, I may have to look into it after I am back.
> >
> 
> No worries, I'll take a look at integrating this. Have a nice trip!

Thanks Dan!!  I really appreciate it! 
-Toshi


Re: [PATCH v2] libnvdimm, pmem: Add sysfs notifications to badblocks

2017-06-16 Thread Dan Williams
On Fri, Jun 16, 2017 at 5:35 PM, Kani, Toshimitsu  wrote:
>> On Mon, Jun 12, 2017 at 3:25 PM, Toshi Kani  wrote:
>> > Sysfs "badblocks" information may be updated during run-time that:
>> >  - MCE, SCI, and sysfs "scrub" may add new bad blocks
>> >  - Writes and ioctl() may clear bad blocks
>> >
>> > Add support to send sysfs notifications to sysfs "badblocks" file
>> > under region and pmem directories when their badblocks information
>> > is re-evaluated (but is not necessarily changed) during run-time.
>> >
>> > Signed-off-by: Toshi Kani 
>> > Cc: Dan Williams 
>> > Cc: Vishal Verma 
>> > Cc: Linda Knippers 
>> > ---
>> > v2: Send notifications for the clearing case
>> > ---
>>
>> This looks good to me, I've applied it, but I also want to extend the
>> ndctl unit tests to cover this mechanism.
>
> Right.  For the time being, would you mind to use the attached test
> program for your sanity tests?  It simply monitors sysfs notifications
> and prints badblocks info...  Sorry for inconvenience.
>
> Since I am not familiar with the ndctl unit tests and I am traveling for
> the rest of the month, I may have to look into it after I am back.
>

No worries, I'll take a look at integrating this. Have a nice trip!


Re: [PATCH v2] libnvdimm, pmem: Add sysfs notifications to badblocks

2017-06-16 Thread Dan Williams
On Fri, Jun 16, 2017 at 5:35 PM, Kani, Toshimitsu  wrote:
>> On Mon, Jun 12, 2017 at 3:25 PM, Toshi Kani  wrote:
>> > Sysfs "badblocks" information may be updated during run-time that:
>> >  - MCE, SCI, and sysfs "scrub" may add new bad blocks
>> >  - Writes and ioctl() may clear bad blocks
>> >
>> > Add support to send sysfs notifications to sysfs "badblocks" file
>> > under region and pmem directories when their badblocks information
>> > is re-evaluated (but is not necessarily changed) during run-time.
>> >
>> > Signed-off-by: Toshi Kani 
>> > Cc: Dan Williams 
>> > Cc: Vishal Verma 
>> > Cc: Linda Knippers 
>> > ---
>> > v2: Send notifications for the clearing case
>> > ---
>>
>> This looks good to me, I've applied it, but I also want to extend the
>> ndctl unit tests to cover this mechanism.
>
> Right.  For the time being, would you mind to use the attached test
> program for your sanity tests?  It simply monitors sysfs notifications
> and prints badblocks info...  Sorry for inconvenience.
>
> Since I am not familiar with the ndctl unit tests and I am traveling for
> the rest of the month, I may have to look into it after I am back.
>

No worries, I'll take a look at integrating this. Have a nice trip!


RE: [PATCH v2 2/2] acpi/nfit: Issue Start ARS to retrieve existing records

2017-06-16 Thread Kani, Toshimitsu
> On Fri, Jun 16, 2017 at 5:01 PM, Kani, Toshimitsu  wrote:
> >> > --- a/drivers/acpi/nfit/core.c
> >> > +++ b/drivers/acpi/nfit/core.c
> >> > @@ -1031,7 +1031,7 @@ static ssize_t scrub_store(struct device *dev,
> >> > if (nd_desc) {
> >> > struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
> >> >
> >> > -   rc = acpi_nfit_ars_rescan(acpi_desc);
> >> > +   rc = acpi_nfit_ars_rescan(acpi_desc, 0);
> >> > }
> >> > device_unlock(dev);
> >> > if (rc)
> >> > @@ -2057,6 +2057,10 @@ static int ars_start(struct acpi_nfit_desc
> >> *acpi_desc, struct nfit_spa *nfit_spa
> >> > ars_start.type = ND_ARS_VOLATILE;
> >> > else
> >> > return -ENOTTY;
> >> > +   if (nfit_spa->ars_prev_data) {
> >> > +   ars_start.flags |= ND_ARS_RETURN_PREV_DATA;
> >> > +   nfit_spa->ars_prev_data = 0;
> >> > +   }
> >>
> >> I'd rather you plumb a new 'flags' parameter all the way through from
> >> acpi_nfit_ars_rescan() to ars_start() rather than carrying this as a
> >> property of nfit_spa.
> >
> > Yes, I wanted to carry 'flags' all the way, but since acpi_nfit_ars_rescan()
> > calls acpi_nfit_scrub() via acpi_desc->work, all info needs to be marshalled
> > into struct acpi_nfit_desc.  Using nfit_spa allows a request to be carried 
> > as
> > per-spa basis...
> 
> Ah ok, but I still think it does not belong to a spa. This is control
> / context information for the workqueue and that belongs with
> acpi_nfit_desc. I think it's fine if all spas get re-scrubbed with
> the "prev_data" flag in the notification case.

Sounds good.  I will add the flags info to acpi_nfit_desc and update
this patch.

Thanks!
-Toshi



RE: [PATCH v2 2/2] acpi/nfit: Issue Start ARS to retrieve existing records

2017-06-16 Thread Kani, Toshimitsu
> On Fri, Jun 16, 2017 at 5:01 PM, Kani, Toshimitsu  wrote:
> >> > --- a/drivers/acpi/nfit/core.c
> >> > +++ b/drivers/acpi/nfit/core.c
> >> > @@ -1031,7 +1031,7 @@ static ssize_t scrub_store(struct device *dev,
> >> > if (nd_desc) {
> >> > struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
> >> >
> >> > -   rc = acpi_nfit_ars_rescan(acpi_desc);
> >> > +   rc = acpi_nfit_ars_rescan(acpi_desc, 0);
> >> > }
> >> > device_unlock(dev);
> >> > if (rc)
> >> > @@ -2057,6 +2057,10 @@ static int ars_start(struct acpi_nfit_desc
> >> *acpi_desc, struct nfit_spa *nfit_spa
> >> > ars_start.type = ND_ARS_VOLATILE;
> >> > else
> >> > return -ENOTTY;
> >> > +   if (nfit_spa->ars_prev_data) {
> >> > +   ars_start.flags |= ND_ARS_RETURN_PREV_DATA;
> >> > +   nfit_spa->ars_prev_data = 0;
> >> > +   }
> >>
> >> I'd rather you plumb a new 'flags' parameter all the way through from
> >> acpi_nfit_ars_rescan() to ars_start() rather than carrying this as a
> >> property of nfit_spa.
> >
> > Yes, I wanted to carry 'flags' all the way, but since acpi_nfit_ars_rescan()
> > calls acpi_nfit_scrub() via acpi_desc->work, all info needs to be marshalled
> > into struct acpi_nfit_desc.  Using nfit_spa allows a request to be carried 
> > as
> > per-spa basis...
> 
> Ah ok, but I still think it does not belong to a spa. This is control
> / context information for the workqueue and that belongs with
> acpi_nfit_desc. I think it's fine if all spas get re-scrubbed with
> the "prev_data" flag in the notification case.

Sounds good.  I will add the flags info to acpi_nfit_desc and update
this patch.

Thanks!
-Toshi



Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Friday, June 16, 2017 08:35:19 PM Len Brown wrote:
> On Fri, Jun 16, 2017 at 7:53 PM, Rafael J. Wysocki  wrote:
> > On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
> >> From: Len Brown 
> >>
> >> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
> >> to supply /sys/.../cpufreq/scaling_cur_freq
> >> on all x86 systems supporting APERF/MPERF.
> >>
> >> That includes 100% of systems supported by intel_pstate,
> >> and so intel_pstate.get() is now a NOP -- remove it.
> >>
> >> Invoke aperfmperf_khz_on_cpu() directly,
> >> if legacy-mode p-state tracing is enabled.
> >>
> >> Signed-off-by: Len Brown 
> >> ---
> >>  drivers/cpufreq/intel_pstate.c | 16 +---
> >>  1 file changed, 1 insertion(+), 15 deletions(-)
> >>
> >> diff --git a/drivers/cpufreq/intel_pstate.c 
> >> b/drivers/cpufreq/intel_pstate.c
> >> index b7de5bd..5d67780 100644
> >> --- a/drivers/cpufreq/intel_pstate.c
> >> +++ b/drivers/cpufreq/intel_pstate.c
> >> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct 
> >> cpudata *cpu, u64 time)
> >>   return false;
> >>  }
> >>
> >> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
> >> -{
> >> - return mul_ext_fp(cpu->sample.core_avg_perf,
> >> -   cpu->pstate.max_pstate_physical * 
> >> cpu->pstate.scaling);
> >> -}
> >> -
> >>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
> >>  {
> >>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
> >> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct 
> >> cpudata *cpu, int target_pstate)
> >>   sample->mperf,
> >>   sample->aperf,
> >>   sample->tsc,
> >> - get_avg_frequency(cpu),
> >> + aperfmperf_khz_on_cpu(cpu->cpu),
> 
> Note that I deleted the line above in an updated version of this patch
> that I'm ready to send out.
> 
> There were a couple of problems with it.
> The first is that it was  ugly that tracing (which occurs only in the
> SW governor case)
> could shorten the measurement interval as seen by the sysfs interface.
> 
> The second is that this trace point can be called with irqs off,
> and smp_call_function_single() will WARN when called with irqs off.
> 
> Srinivas Acked that I simply remove this field from the tracepoint --
> as it is redundant to calculate it in the kernel when we are already
> exporting the raw values of aperf and mperf.

This changes the tracepoint format and I know about a couple of user space
scripts that consume these tracepoints though.

What would be wrong with leaving it as is?

> >>   fp_toint(cpu->iowait_boost * 100));
> >>  }
> >>
> >> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int 
> >> cpunum)
> >>   return 0;
> >>  }
> >>
> >> -static unsigned int intel_pstate_get(unsigned int cpu_num)
> >> -{
> >> - struct cpudata *cpu = all_cpu_data[cpu_num];
> >> -
> >> - return cpu ? get_avg_frequency(cpu) : 0;
> >> -}
> >> -
> >>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
> >>  {
> >>   struct cpudata *cpu = all_cpu_data[cpu_num];
> >> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
> >>   .setpolicy  = intel_pstate_set_policy,
> >>   .suspend= intel_pstate_hwp_save_state,
> >>   .resume = intel_pstate_resume,
> >> - .get= intel_pstate_get,
> >>   .init   = intel_pstate_cpu_init,
> >>   .exit   = intel_pstate_cpu_exit,
> >>   .stop_cpu   = intel_pstate_stop_cpu,
> >>
> >
> > This change will cause cpufreq_quick_get() to work differently and it is
> > called by KVM among other things.  Will that still work?
> 
> Neither invocation of cpufreq_quick_get() (ARM cpu-cooling and X86 variable 
> TSC)
> are aligned with the hardware supported by the intel_pstate driver.

OK

Thanks,
Rafael



Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Friday, June 16, 2017 08:35:19 PM Len Brown wrote:
> On Fri, Jun 16, 2017 at 7:53 PM, Rafael J. Wysocki  wrote:
> > On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
> >> From: Len Brown 
> >>
> >> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
> >> to supply /sys/.../cpufreq/scaling_cur_freq
> >> on all x86 systems supporting APERF/MPERF.
> >>
> >> That includes 100% of systems supported by intel_pstate,
> >> and so intel_pstate.get() is now a NOP -- remove it.
> >>
> >> Invoke aperfmperf_khz_on_cpu() directly,
> >> if legacy-mode p-state tracing is enabled.
> >>
> >> Signed-off-by: Len Brown 
> >> ---
> >>  drivers/cpufreq/intel_pstate.c | 16 +---
> >>  1 file changed, 1 insertion(+), 15 deletions(-)
> >>
> >> diff --git a/drivers/cpufreq/intel_pstate.c 
> >> b/drivers/cpufreq/intel_pstate.c
> >> index b7de5bd..5d67780 100644
> >> --- a/drivers/cpufreq/intel_pstate.c
> >> +++ b/drivers/cpufreq/intel_pstate.c
> >> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct 
> >> cpudata *cpu, u64 time)
> >>   return false;
> >>  }
> >>
> >> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
> >> -{
> >> - return mul_ext_fp(cpu->sample.core_avg_perf,
> >> -   cpu->pstate.max_pstate_physical * 
> >> cpu->pstate.scaling);
> >> -}
> >> -
> >>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
> >>  {
> >>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
> >> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct 
> >> cpudata *cpu, int target_pstate)
> >>   sample->mperf,
> >>   sample->aperf,
> >>   sample->tsc,
> >> - get_avg_frequency(cpu),
> >> + aperfmperf_khz_on_cpu(cpu->cpu),
> 
> Note that I deleted the line above in an updated version of this patch
> that I'm ready to send out.
> 
> There were a couple of problems with it.
> The first is that it was  ugly that tracing (which occurs only in the
> SW governor case)
> could shorten the measurement interval as seen by the sysfs interface.
> 
> The second is that this trace point can be called with irqs off,
> and smp_call_function_single() will WARN when called with irqs off.
> 
> Srinivas Acked that I simply remove this field from the tracepoint --
> as it is redundant to calculate it in the kernel when we are already
> exporting the raw values of aperf and mperf.

This changes the tracepoint format and I know about a couple of user space
scripts that consume these tracepoints though.

What would be wrong with leaving it as is?

> >>   fp_toint(cpu->iowait_boost * 100));
> >>  }
> >>
> >> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int 
> >> cpunum)
> >>   return 0;
> >>  }
> >>
> >> -static unsigned int intel_pstate_get(unsigned int cpu_num)
> >> -{
> >> - struct cpudata *cpu = all_cpu_data[cpu_num];
> >> -
> >> - return cpu ? get_avg_frequency(cpu) : 0;
> >> -}
> >> -
> >>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
> >>  {
> >>   struct cpudata *cpu = all_cpu_data[cpu_num];
> >> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
> >>   .setpolicy  = intel_pstate_set_policy,
> >>   .suspend= intel_pstate_hwp_save_state,
> >>   .resume = intel_pstate_resume,
> >> - .get= intel_pstate_get,
> >>   .init   = intel_pstate_cpu_init,
> >>   .exit   = intel_pstate_cpu_exit,
> >>   .stop_cpu   = intel_pstate_stop_cpu,
> >>
> >
> > This change will cause cpufreq_quick_get() to work differently and it is
> > called by KVM among other things.  Will that still work?
> 
> Neither invocation of cpufreq_quick_get() (ARM cpu-cooling and X86 variable 
> TSC)
> are aligned with the hardware supported by the intel_pstate driver.

OK

Thanks,
Rafael



Re: [kernel-hardening] Re: [PATCH v4 06/13] iscsi: ensure RNG is seeded before use

2017-06-16 Thread Jason A. Donenfeld
Hi Lee,

On Fri, Jun 16, 2017 at 11:58 PM, Lee Duncan  wrote:
> It seems like what you are doing is basically "good", i.e. if there is
> not enough random data, don't use it. But what happens in that case? The
> authentication fails? How does the user know to wait and try again?

The process just remains in interruptible (kill-able) sleep until
there is enough entropy, so the process doesn't need to do anything.
If the waiting is interrupted by a signal, it returns -ERESTARTSYS,
which follows the usual semantics of restartable syscalls.

Jason


Re: [kernel-hardening] Re: [PATCH v4 06/13] iscsi: ensure RNG is seeded before use

2017-06-16 Thread Jason A. Donenfeld
Hi Lee,

On Fri, Jun 16, 2017 at 11:58 PM, Lee Duncan  wrote:
> It seems like what you are doing is basically "good", i.e. if there is
> not enough random data, don't use it. But what happens in that case? The
> authentication fails? How does the user know to wait and try again?

The process just remains in interruptible (kill-able) sleep until
there is enough entropy, so the process doesn't need to do anything.
If the waiting is interrupted by a signal, it returns -ERESTARTSYS,
which follows the usual semantics of restartable syscalls.

Jason


Re: [PATCH] random: silence compiler warnings and fix race

2017-06-16 Thread Jason A. Donenfeld
On Fri, Jun 16, 2017 at 4:35 PM, Sebastian Andrzej Siewior
 wrote:
> I wouldn't just push the lock one up as is but move that write part to
> crng_init to remain within the locked section. Like that:

We can't quite do that, because invalidate_batched_entropy() needs to
be called _before_ crng_init. Otherwise a concurrent call to
get_random_u32/u64() will have crng_init being the wrong value when
the batched entropy is still old.

> Are use about that? I am not sure that the gcc will inline "crng_init"
> read twice. It is not a local variable. READ_ONCE() is usually used
> where gcc could cache a memory access but you do not want this. But hey!
> If someone knows better I am here to learn.

The whole purpose is that I _want_ it to cache the memory access so
that it is _not_ inlined. So, based on your understanding, it does
exactly what I intended it to do. The reason is that I'd like to avoid
a lock imbalance, which could happen if the read is inlined.

Jason


Re: [PATCH] random: silence compiler warnings and fix race

2017-06-16 Thread Jason A. Donenfeld
On Fri, Jun 16, 2017 at 4:35 PM, Sebastian Andrzej Siewior
 wrote:
> I wouldn't just push the lock one up as is but move that write part to
> crng_init to remain within the locked section. Like that:

We can't quite do that, because invalidate_batched_entropy() needs to
be called _before_ crng_init. Otherwise a concurrent call to
get_random_u32/u64() will have crng_init being the wrong value when
the batched entropy is still old.

> Are use about that? I am not sure that the gcc will inline "crng_init"
> read twice. It is not a local variable. READ_ONCE() is usually used
> where gcc could cache a memory access but you do not want this. But hey!
> If someone knows better I am here to learn.

The whole purpose is that I _want_ it to cache the memory access so
that it is _not_ inlined. So, based on your understanding, it does
exactly what I intended it to do. The reason is that I'd like to avoid
a lock imbalance, which could happen if the read is inlined.

Jason


Re: [PATCH 2/5] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:13 PM Len Brown wrote:
> From: Len Brown 
> 
> The goal of this change is to give users a uniform and meaningful
> result when they read /sys/...cpufreq/scaling_cur_freq
> on modern x86 hardware, as compared to what they get today.
> 
> Modern x86 processors include the hardware needed
> to accurately calculate frequency over an interval --
> APERF, MPERF, and the TSC.
> 
> Here we provide an x86 routine to make this calculation
> on supported hardware, and use it in preference to any
> driver driver-specific cpufreq_driver.get() routine.
> 
> MHz is computed like so:
> 
> MHz = base_MHz * delta_APERF / delta_MPERF
> 
> MHz is the average frequency of the busy processor
> over a measurement interval.  The interval is
> defined to be the time between successive invocations
> of aperfmperf_khz_on_cpu(), which are expected to to
> happen on-demand when users read sysfs attribute
> cpufreq/scaling_cur_freq.
> 
> As with previous methods of calculating MHz,
> idle time is excluded.
> 
> base_MHz above is from TSC calibration global "cpu_khz".
> 
> This x86 native method to calculate MHz returns a meaningful result
> no matter if P-states are controlled by hardware or firmware
> and/or if the Linux cpufreq sub-system is or is-not installed.
> 
> When this routine is invoked more frequently, the measurement
> interval becomes shorter.  However, the code limits re-computation
> to 10ms intervals so that average frequency remains meaningful.
> 
> Discerning users are encouraged to take advantage of
> the turbostat(8) utility, which can gracefully handle
> concurrent measurement intervals of arbitrary length.
> 
> Signed-off-by: Len Brown 
> ---
>  arch/x86/kernel/cpu/Makefile |  1 +
>  arch/x86/kernel/cpu/aperfmperf.c | 82 
> 
>  drivers/cpufreq/cpufreq.c|  7 +++-
>  include/linux/cpufreq.h  | 13 +++
>  4 files changed, 102 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/kernel/cpu/aperfmperf.c
> 
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index 521..cdf8249 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -21,6 +21,7 @@ obj-y   += common.o
>  obj-y+= rdrand.o
>  obj-y+= match.o
>  obj-y+= bugs.o
> +obj-$(CONFIG_CPU_FREQ)   += aperfmperf.o
>  
>  obj-$(CONFIG_PROC_FS)+= proc.o
>  obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
> diff --git a/arch/x86/kernel/cpu/aperfmperf.c 
> b/arch/x86/kernel/cpu/aperfmperf.c
> new file mode 100644
> index 000..5ccf63a
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/aperfmperf.c
> @@ -0,0 +1,82 @@
> +/*
> + * x86 APERF/MPERF KHz calculation for
> + * /sys/.../cpufreq/scaling_cur_freq
> + *
> + * Copyright (C) 2017 Intel Corp.
> + * Author: Len Brown 
> + *
> + * This file is licensed under GPLv2.
> + */
> +
> +#include <linux/jiffies.h>
> +#include <linux/math64.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct aperfmperf_sample {
> + unsigned int khz;
> + unsigned long jiffies;
> + u64 aperf;
> + u64 mperf;
> +};
> +
> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
> +
> +/*
> + * aperfmperf_snapshot_khz()
> + * On the current CPU, snapshot APERF, MPERF, and jiffies
> + * unless we already did it within 10ms
> + * calculate kHz, save snapshot
> + */
> +static void aperfmperf_snapshot_khz(void *dummy)
> +{
> + u64 aperf, aperf_delta;
> + u64 mperf, mperf_delta;
> + struct aperfmperf_sample *s = &get_cpu_var(samples);
> +
> + /* Don't bother re-computing within 10 ms */
> + if (time_before(jiffies, s->jiffies + HZ/100))
> + goto out;
> +
> + rdmsrl(MSR_IA32_APERF, aperf);
> + rdmsrl(MSR_IA32_MPERF, mperf);
> +
> + aperf_delta = aperf - s->aperf;
> + mperf_delta = mperf - s->mperf;
> +
> + /*
> +  * There is no architectural guarantee that MPERF
> +  * increments faster than we can read it.
> +  */
> + if (mperf_delta == 0)
> + goto out;
> +
> + /*
> +  * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
> +  *  khz = (cpu_khz * aperf_delta) / mperf_delta
> +  */
> + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
> + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
> + else/* khz = aperf_delta / (mperf_delta / cpu_khz) */
> + s->khz = div64_u64(aperf_delta,
> + div64_u64(mperf_delta, cpu_khz));
> + s->jiffies = jiffies;
> + s->aperf = aperf;
> + s->mperf = mperf;
> +
> +out:
> + put_cpu_var(samples);
> +}
> +
> +unsigned int aperfmperf_khz_on_cpu(unsigned int cpu)
> +{
> + if (!cpu_khz)
> + return 0;
> +
> + if (!static_cpu_has(X86_FEATURE_APERFMPERF))
> + return 0;
> +
> + smp_call_function_single(cpu, 

Re: [PATCH 2/5] x86: use common aperfmperf_khz_on_cpu() to calculate KHz using APERF/MPERF

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:13 PM Len Brown wrote:
> From: Len Brown 
> 
> The goal of this change is to give users a uniform and meaningful
> result when they read /sys/...cpufreq/scaling_cur_freq
> on modern x86 hardware, as compared to what they get today.
> 
> Modern x86 processors include the hardware needed
> to accurately calculate frequency over an interval --
> APERF, MPERF, and the TSC.
> 
> Here we provide an x86 routine to make this calculation
> on supported hardware, and use it in preference to any
> driver driver-specific cpufreq_driver.get() routine.
> 
> MHz is computed like so:
> 
> MHz = base_MHz * delta_APERF / delta_MPERF
> 
> MHz is the average frequency of the busy processor
> over a measurement interval.  The interval is
> defined to be the time between successive invocations
> of aperfmperf_khz_on_cpu(), which are expected to to
> happen on-demand when users read sysfs attribute
> cpufreq/scaling_cur_freq.
> 
> As with previous methods of calculating MHz,
> idle time is excluded.
> 
> base_MHz above is from TSC calibration global "cpu_khz".
> 
> This x86 native method to calculate MHz returns a meaningful result
> no matter if P-states are controlled by hardware or firmware
> and/or if the Linux cpufreq sub-system is or is-not installed.
> 
> When this routine is invoked more frequently, the measurement
> interval becomes shorter.  However, the code limits re-computation
> to 10ms intervals so that average frequency remains meaningful.
> 
> Discerning users are encouraged to take advantage of
> the turbostat(8) utility, which can gracefully handle
> concurrent measurement intervals of arbitrary length.
> 
> Signed-off-by: Len Brown 
> ---
>  arch/x86/kernel/cpu/Makefile |  1 +
>  arch/x86/kernel/cpu/aperfmperf.c | 82 
> 
>  drivers/cpufreq/cpufreq.c|  7 +++-
>  include/linux/cpufreq.h  | 13 +++
>  4 files changed, 102 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/kernel/cpu/aperfmperf.c
> 
> diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
> index 521..cdf8249 100644
> --- a/arch/x86/kernel/cpu/Makefile
> +++ b/arch/x86/kernel/cpu/Makefile
> @@ -21,6 +21,7 @@ obj-y   += common.o
>  obj-y+= rdrand.o
>  obj-y+= match.o
>  obj-y+= bugs.o
> +obj-$(CONFIG_CPU_FREQ)   += aperfmperf.o
>  
>  obj-$(CONFIG_PROC_FS)+= proc.o
>  obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
> diff --git a/arch/x86/kernel/cpu/aperfmperf.c 
> b/arch/x86/kernel/cpu/aperfmperf.c
> new file mode 100644
> index 000..5ccf63a
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/aperfmperf.c
> @@ -0,0 +1,82 @@
> +/*
> + * x86 APERF/MPERF KHz calculation for
> + * /sys/.../cpufreq/scaling_cur_freq
> + *
> + * Copyright (C) 2017 Intel Corp.
> + * Author: Len Brown 
> + *
> + * This file is licensed under GPLv2.
> + */
> +
> +#include <linux/jiffies.h>
> +#include <linux/math64.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct aperfmperf_sample {
> + unsigned int khz;
> + unsigned long jiffies;
> + u64 aperf;
> + u64 mperf;
> +};
> +
> +static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
> +
> +/*
> + * aperfmperf_snapshot_khz()
> + * On the current CPU, snapshot APERF, MPERF, and jiffies
> + * unless we already did it within 10ms
> + * calculate kHz, save snapshot
> + */
> +static void aperfmperf_snapshot_khz(void *dummy)
> +{
> + u64 aperf, aperf_delta;
> + u64 mperf, mperf_delta;
> + struct aperfmperf_sample *s = &get_cpu_var(samples);
> +
> + /* Don't bother re-computing within 10 ms */
> + if (time_before(jiffies, s->jiffies + HZ/100))
> + goto out;
> +
> + rdmsrl(MSR_IA32_APERF, aperf);
> + rdmsrl(MSR_IA32_MPERF, mperf);
> +
> + aperf_delta = aperf - s->aperf;
> + mperf_delta = mperf - s->mperf;
> +
> + /*
> +  * There is no architectural guarantee that MPERF
> +  * increments faster than we can read it.
> +  */
> + if (mperf_delta == 0)
> + goto out;
> +
> + /*
> +  * if (cpu_khz * aperf_delta) fits into ULLONG_MAX, then
> +  *  khz = (cpu_khz * aperf_delta) / mperf_delta
> +  */
> + if (div64_u64(ULLONG_MAX, cpu_khz) > aperf_delta)
> + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
> + else/* khz = aperf_delta / (mperf_delta / cpu_khz) */
> + s->khz = div64_u64(aperf_delta,
> + div64_u64(mperf_delta, cpu_khz));
> + s->jiffies = jiffies;
> + s->aperf = aperf;
> + s->mperf = mperf;
> +
> +out:
> + put_cpu_var(samples);
> +}
> +
> +unsigned int aperfmperf_khz_on_cpu(unsigned int cpu)
> +{
> + if (!cpu_khz)
> + return 0;
> +
> + if (!static_cpu_has(X86_FEATURE_APERFMPERF))
> + return 0;
> +
> + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
> +
> + return per_cpu(samples.khz, cpu);

Re: [PATCH v5 0/2] add MSI support for PCIe port services and DPC IRQ support

2017-06-16 Thread Bjorn Helgaas
On Tue, May 23, 2017 at 03:23:57PM +0100, Gabriele Paoloni wrote:
> From: gabriele paoloni 
> 
> This patchset:
> 1) adds support for MSI interrupt vectors to be used for Roor Port services
> 2) adds support for DPC Root Port service interrupt
> 
> The patchset has been tested on Hisilicon Hip08 Chipset
> 
> Changes from v4:
> - removed meaningless comment
> 
> Changes from v3:
> - removed 2 extra lines at the bottom of comments
> 
> Changes from v2:
> - Fixed comment mismatch for function pcie_port_enable_irq_vec
> - removed redundant commet on pci_irq_vector()
> 
> Changes from v1:
> According to comments from Christoph Hellwig in
> https://www.spinics.net/lists/kernel/msg2508842.html
> and
> https://www.spinics.net/lists/kernel/msg2508850.html
> 
> - reduced the calls of pci_alloc_irq_vectors by ORing PCI_IRQ_MSIX and 
>   PCI_IRQ_MSI
> - used a unique macro for the max number of MSI/MSIx interrupt vectors
> - reworked pcie_init_service_irqs() to call pci_alloc_irq_vectors() only
>   for legacy IRQ
> 
> 
> Gabriele Paoloni (1):
>   PCI/portdrv: add support for different MSI interrupts for PCIe port
> services
> 
> gabriele paoloni (1):
>   PCI/portdrv: allocate MSI/MSIx vector for DPC RP service
> 
>  drivers/pci/pcie/portdrv.h  |  8 --
>  drivers/pci/pcie/portdrv_core.c | 63 
> +
>  2 files changed, 50 insertions(+), 21 deletions(-)

Applied to pci/portdrv for v4.13, thanks!

I tweaked the PME/hotplug no-MSI logic in pcie_init_service_irqs()
because I thought it was getting hard to read with the negations and
conjunctions.  Please double-check to make sure I didn't break it.

https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git/log/?h=pci/portdrv

I also capitalized your name consistently, Gabriele :)


Re: [PATCH v5 0/2] add MSI support for PCIe port services and DPC IRQ support

2017-06-16 Thread Bjorn Helgaas
On Tue, May 23, 2017 at 03:23:57PM +0100, Gabriele Paoloni wrote:
> From: gabriele paoloni 
> 
> This patchset:
> 1) adds support for MSI interrupt vectors to be used for Roor Port services
> 2) adds support for DPC Root Port service interrupt
> 
> The patchset has been tested on Hisilicon Hip08 Chipset
> 
> Changes from v4:
> - removed meaningless comment
> 
> Changes from v3:
> - removed 2 extra lines at the bottom of comments
> 
> Changes from v2:
> - Fixed comment mismatch for function pcie_port_enable_irq_vec
> - removed redundant commet on pci_irq_vector()
> 
> Changes from v1:
> According to comments from Christoph Hellwig in
> https://www.spinics.net/lists/kernel/msg2508842.html
> and
> https://www.spinics.net/lists/kernel/msg2508850.html
> 
> - reduced the calls of pci_alloc_irq_vectors by ORing PCI_IRQ_MSIX and 
>   PCI_IRQ_MSI
> - used a unique macro for the max number of MSI/MSIx interrupt vectors
> - reworked pcie_init_service_irqs() to call pci_alloc_irq_vectors() only
>   for legacy IRQ
> 
> 
> Gabriele Paoloni (1):
>   PCI/portdrv: add support for different MSI interrupts for PCIe port
> services
> 
> gabriele paoloni (1):
>   PCI/portdrv: allocate MSI/MSIx vector for DPC RP service
> 
>  drivers/pci/pcie/portdrv.h  |  8 --
>  drivers/pci/pcie/portdrv_core.c | 63 
> +
>  2 files changed, 50 insertions(+), 21 deletions(-)

Applied to pci/portdrv for v4.13, thanks!

I tweaked the PME/hotplug no-MSI logic in pcie_init_service_irqs()
because I thought it was getting hard to read with the negations and
conjunctions.  Please double-check to make sure I didn't break it.

https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git/log/?h=pci/portdrv

I also capitalized your name consistently, Gabriele :)


RE: [PATCH v2] libnvdimm, pmem: Add sysfs notifications to badblocks

2017-06-16 Thread Kani, Toshimitsu
> On Mon, Jun 12, 2017 at 3:25 PM, Toshi Kani  wrote:
> > Sysfs "badblocks" information may be updated during run-time that:
> >  - MCE, SCI, and sysfs "scrub" may add new bad blocks
> >  - Writes and ioctl() may clear bad blocks
> >
> > Add support to send sysfs notifications to sysfs "badblocks" file
> > under region and pmem directories when their badblocks information
> > is re-evaluated (but is not necessarily changed) during run-time.
> >
> > Signed-off-by: Toshi Kani 
> > Cc: Dan Williams 
> > Cc: Vishal Verma 
> > Cc: Linda Knippers 
> > ---
> > v2: Send notifications for the clearing case
> > ---
> 
> This looks good to me, I've applied it, but I also want to extend the
> ndctl unit tests to cover this mechanism.

Right.  For the time being, would you mind to use the attached test
program for your sanity tests?  It simply monitors sysfs notifications
and prints badblocks info...  Sorry for inconvenience.

Since I am not familiar with the ndctl unit tests and I am traveling for
the rest of the month, I may have to look into it after I am back.

Thanks!
-Toshi


/*
 * Copyright (C) 2017  Hewlett Packard Enterprise Development LP
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/select.h>

/*
 * test_sysfs_notify - monitor a sysfs "badblocks" attribute for
 * notifications.
 *
 * Opens the file named by argv[1], prints its current contents, then
 * blocks in select() waiting for a sysfs notification on the attribute.
 * Each notification re-reads the file from offset 0 and prints it again.
 * Runs until interrupted (ctrl-c) or until select() fails.
 */
int main(int argc, char **argv)
{
	int fd, ret;
	fd_set fds;
	char buf[256];

	if (argc != 2) {
		printf("USAGE: test_sysfs_notify badblocks-path\n");
		exit(1);
	}

	if ((fd = open(argv[1], O_RDONLY)) < 0) {
		printf("Unable to open %s\n", argv[1]);
		exit(1);
	}
	printf("Monitoring %s - ctl-c to stop\n", argv[1]);

	while (1) {
		memset(buf, 0, sizeof(buf));
		/* sysfs attributes must be re-read from the start */
		ret = lseek(fd, 0, SEEK_SET);
		ret = read(fd, buf, sizeof(buf));
		printf("%s\n", buf);

		FD_ZERO(&fds);
		FD_SET(fd, &fds);

		/*
		 * A sysfs_notify() surfaces as an "exceptional" condition,
		 * so the fd goes in the exceptfds set, not readfds.
		 */
		ret = select(fd + 1, NULL, NULL, &fds, NULL);
		if (ret <= 0) {
			printf("error (%d)\n", ret);
			exit(1);
		} else if (FD_ISSET(fd, &fds)) {
			printf("NOTIFIED!!\n");
		}
	}

	close(fd);
}


RE: [PATCH v2] libnvdimm, pmem: Add sysfs notifications to badblocks

2017-06-16 Thread Kani, Toshimitsu
> On Mon, Jun 12, 2017 at 3:25 PM, Toshi Kani  wrote:
> > Sysfs "badblocks" information may be updated during run-time that:
> >  - MCE, SCI, and sysfs "scrub" may add new bad blocks
> >  - Writes and ioctl() may clear bad blocks
> >
> > Add support to send sysfs notifications to sysfs "badblocks" file
> > under region and pmem directories when their badblocks information
> > is re-evaluated (but is not necessarily changed) during run-time.
> >
> > Signed-off-by: Toshi Kani 
> > Cc: Dan Williams 
> > Cc: Vishal Verma 
> > Cc: Linda Knippers 
> > ---
> > v2: Send notifications for the clearing case
> > ---
> 
> This looks good to me, I've applied it, but I also want to extend the
> ndctl unit tests to cover this mechanism.

Right.  For the time being, would you mind to use the attached test
program for your sanity tests?  It simply monitors sysfs notifications
and prints badblocks info...  Sorry for inconvenience.

Since I am not familiar with the ndctl unit tests and I am traveling for
the rest of the month, I may have to look into it after I am back.

Thanks!
-Toshi


/*
 * Copyright (C) 2017  Hewlett Packard Enterprise Development LP
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 */

/*
 * test_sysfs_notify - monitor a sysfs "badblocks" attribute for
 * sysfs_notify() wakeups and print its contents on each wakeup.
 *
 * Usage: test_sysfs_notify <badblocks-path>
 *
 * sysfs_notify() is delivered to pollers as an *exceptional* condition,
 * so the descriptor is placed in the exceptfds set of select(2), not
 * the readfds set.  Runs until interrupted (ctl-c); exits 1 on error.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/select.h>

int main(int argc, char **argv)
{
	int fd, ret;
	fd_set fds;
	char buf[256];

	if (argc != 2) {
		printf("USAGE: test_sysfs_notify badblocks-path\n");
		exit(1);
	}

	if ((fd = open(argv[1], O_RDONLY)) < 0) {
		printf("Unable to open %s\n", argv[1]);
		exit(1);
	}
	printf("Monitoring %s - ctl-c to stop\n", argv[1]);

	while (1) {
		memset(buf, 0, sizeof(buf));
		/* sysfs attributes must be re-read from offset 0 */
		ret = lseek(fd, 0, SEEK_SET);
		/* leave room for the NUL terminator before printf("%s") */
		ret = read(fd, buf, sizeof(buf) - 1);
		printf("%s\n", buf);

		FD_ZERO(&fds);
		FD_SET(fd, &fds);

		/* block until sysfs_notify() flags an exceptional condition */
		ret = select(fd + 1, NULL, NULL, &fds, NULL);
		if (ret <= 0) {
			printf("error (%d)\n", ret);
			exit(1);
		} else if (FD_ISSET(fd, &fds)) {
			printf("NOTIFIED!!\n");
		}
	}

	/* not reached: loop is terminated by ctl-c */
	close(fd);
	return 0;
}


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 7:53 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
>> From: Len Brown 
>>
>> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
>> to supply /sys/.../cpufreq/scaling_cur_freq
>> on all x86 systems supporting APERF/MPERF.
>>
>> That includes 100% of systems supported by intel_pstate,
>> and so intel_pstate.get() is now a NOP -- remove it.
>>
>> Invoke aperfmperf_khz_on_cpu() directly,
>> if legacy-mode p-state tracing is enabled.
>>
>> Signed-off-by: Len Brown 
>> ---
>>  drivers/cpufreq/intel_pstate.c | 16 +---
>>  1 file changed, 1 insertion(+), 15 deletions(-)
>>
>> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
>> index b7de5bd..5d67780 100644
>> --- a/drivers/cpufreq/intel_pstate.c
>> +++ b/drivers/cpufreq/intel_pstate.c
>> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct cpudata 
>> *cpu, u64 time)
>>   return false;
>>  }
>>
>> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
>> -{
>> - return mul_ext_fp(cpu->sample.core_avg_perf,
>> -   cpu->pstate.max_pstate_physical * 
>> cpu->pstate.scaling);
>> -}
>> -
>>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
>>  {
>>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
>> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct cpudata 
>> *cpu, int target_pstate)
>>   sample->mperf,
>>   sample->aperf,
>>   sample->tsc,
>> - get_avg_frequency(cpu),
>> + aperfmperf_khz_on_cpu(cpu->cpu),

Note that I deleted the line above in an updated version of this patch
that I'm ready to send out.

There were a couple of problems with it.
The first is that it was  ugly that tracing (which occurs only in the
SW governor case)
could shorten the measurement interval as seen by the sysfs interface.

The second is that this trace point can be called with irqs off,
and smp_call_function_single() will WARN when called with irqs off.

Srinivas Acked that I simply remove this field from the tracepoint --
as it is redundant to calculate it in the kernel when we are already
exporting the raw values of aperf and mperf.

>>   fp_toint(cpu->iowait_boost * 100));
>>  }
>>
>> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
>>   return 0;
>>  }
>>
>> -static unsigned int intel_pstate_get(unsigned int cpu_num)
>> -{
>> - struct cpudata *cpu = all_cpu_data[cpu_num];
>> -
>> - return cpu ? get_avg_frequency(cpu) : 0;
>> -}
>> -
>>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
>>  {
>>   struct cpudata *cpu = all_cpu_data[cpu_num];
>> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
>>   .setpolicy  = intel_pstate_set_policy,
>>   .suspend= intel_pstate_hwp_save_state,
>>   .resume = intel_pstate_resume,
>> - .get= intel_pstate_get,
>>   .init   = intel_pstate_cpu_init,
>>   .exit   = intel_pstate_cpu_exit,
>>   .stop_cpu   = intel_pstate_stop_cpu,
>>
>
> This change will cause cpufreq_quick_get() to work differently and it is
> called by KVM among other things.  Will that still work?

Neither invocation of cpufreq_quick_get() (ARM cpu-cooling and X86 variable TSC)
are aligned with the hardware supported by the intel_pstate driver.

If it were, then  yes, cpufreq_quick_get() would not call the (absent)
intel_pstate.get,
and instead would return policy->cur.

acpi_cpufreq() retains its internal .get routine, and so cpufreq_quick_get()
doesn't change at all on those legacy systems.  As it turns out this
is moot, since that
driver returns the cached frequency request in either case.

thanks,
Len Brown, Intel Open Source Technology Center


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Len Brown
On Fri, Jun 16, 2017 at 7:53 PM, Rafael J. Wysocki  wrote:
> On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
>> From: Len Brown 
>>
>> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
>> to supply /sys/.../cpufreq/scaling_cur_freq
>> on all x86 systems supporting APERF/MPERF.
>>
>> That includes 100% of systems supported by intel_pstate,
>> and so intel_pstate.get() is now a NOP -- remove it.
>>
>> Invoke aperfmperf_khz_on_cpu() directly,
>> if legacy-mode p-state tracing is enabled.
>>
>> Signed-off-by: Len Brown 
>> ---
>>  drivers/cpufreq/intel_pstate.c | 16 +---
>>  1 file changed, 1 insertion(+), 15 deletions(-)
>>
>> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
>> index b7de5bd..5d67780 100644
>> --- a/drivers/cpufreq/intel_pstate.c
>> +++ b/drivers/cpufreq/intel_pstate.c
>> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct cpudata 
>> *cpu, u64 time)
>>   return false;
>>  }
>>
>> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
>> -{
>> - return mul_ext_fp(cpu->sample.core_avg_perf,
>> -   cpu->pstate.max_pstate_physical * 
>> cpu->pstate.scaling);
>> -}
>> -
>>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
>>  {
>>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
>> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct cpudata 
>> *cpu, int target_pstate)
>>   sample->mperf,
>>   sample->aperf,
>>   sample->tsc,
>> - get_avg_frequency(cpu),
>> + aperfmperf_khz_on_cpu(cpu->cpu),

Note that I deleted the line above in an updated version of this patch
that I'm ready to send out.

There were a couple of problems with it.
The first is that it was  ugly that tracing (which occurs only in the
SW governor case)
could shorten the measurement interval as seen by the sysfs interface.

The second is that this trace point can be called with irqs off,
and smp_call_function_single() will WARN when called with irqs off.

Srinivas Acked that I simply remove this field from the tracepoint --
as it is redundant to calculate it in the kernel when we are already
exporting the raw values of aperf and mperf.

>>   fp_toint(cpu->iowait_boost * 100));
>>  }
>>
>> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
>>   return 0;
>>  }
>>
>> -static unsigned int intel_pstate_get(unsigned int cpu_num)
>> -{
>> - struct cpudata *cpu = all_cpu_data[cpu_num];
>> -
>> - return cpu ? get_avg_frequency(cpu) : 0;
>> -}
>> -
>>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
>>  {
>>   struct cpudata *cpu = all_cpu_data[cpu_num];
>> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
>>   .setpolicy  = intel_pstate_set_policy,
>>   .suspend= intel_pstate_hwp_save_state,
>>   .resume = intel_pstate_resume,
>> - .get= intel_pstate_get,
>>   .init   = intel_pstate_cpu_init,
>>   .exit   = intel_pstate_cpu_exit,
>>   .stop_cpu   = intel_pstate_stop_cpu,
>>
>
> This change will cause cpufreq_quick_get() to work differently and it is
> called by KVM among other things.  Will that still work?

Neither invocation of cpufreq_quick_get() (ARM cpu-cooling and X86 variable TSC)
are aligned with the hardware supported by the intel_pstate driver.

If it were, then  yes, cpufreq_quick_get() would not call the (absent)
intel_pstate.get,
and instead would return policy->cur.

acpi_cpufreq() retains its internal .get routine, and so cpufreq_quick_get()
doesn't change at all on those legacy systems.  As it turns out this
is moot, since that
driver returns the cached frequency request in either case.

thanks,
Len Brown, Intel Open Source Technology Center


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
> From: Len Brown 
> 
> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
> to supply /sys/.../cpufreq/scaling_cur_freq
> on all x86 systems supporting APERF/MPERF.
> 
> That includes 100% of systems supported by intel_pstate,
> and so intel_pstate.get() is now a NOP -- remove it.
> 
> Invoke aperfmperf_khz_on_cpu() directly,
> if legacy-mode p-state tracing is enabled.
> 
> Signed-off-by: Len Brown 
> ---
>  drivers/cpufreq/intel_pstate.c | 16 +---
>  1 file changed, 1 insertion(+), 15 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index b7de5bd..5d67780 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct cpudata 
> *cpu, u64 time)
>   return false;
>  }
>  
> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
> -{
> - return mul_ext_fp(cpu->sample.core_avg_perf,
> -   cpu->pstate.max_pstate_physical * 
> cpu->pstate.scaling);
> -}
> -
>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
>  {
>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct cpudata 
> *cpu, int target_pstate)
>   sample->mperf,
>   sample->aperf,
>   sample->tsc,
> - get_avg_frequency(cpu),
> + aperfmperf_khz_on_cpu(cpu->cpu),

I don't think this change is an improvement.

We compute cpu->sample.core_avg_perf anyway and calling aperfmperf_khz_on_cpu()
here means quite a bit of additional overhead.

Besides, the sample->mperf and sample->aperf values here are expected to be
the ones used for computing the frequency.

>   fp_toint(cpu->iowait_boost * 100));
>  }
>  
> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
>   return 0;
>  }
>  
> -static unsigned int intel_pstate_get(unsigned int cpu_num)
> -{
> - struct cpudata *cpu = all_cpu_data[cpu_num];
> -
> - return cpu ? get_avg_frequency(cpu) : 0;
> -}
> -
>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
>  {
>   struct cpudata *cpu = all_cpu_data[cpu_num];
> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
>   .setpolicy  = intel_pstate_set_policy,
>   .suspend= intel_pstate_hwp_save_state,
>   .resume = intel_pstate_resume,
> - .get= intel_pstate_get,
>   .init   = intel_pstate_cpu_init,
>   .exit   = intel_pstate_cpu_exit,
>   .stop_cpu   = intel_pstate_stop_cpu,
> 

Thanks,
Rafael



Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
> From: Len Brown 
> 
> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
> to supply /sys/.../cpufreq/scaling_cur_freq
> on all x86 systems supporting APERF/MPERF.
> 
> That includes 100% of systems supported by intel_pstate,
> and so intel_pstate.get() is now a NOP -- remove it.
> 
> Invoke aperfmperf_khz_on_cpu() directly,
> if legacy-mode p-state tracing is enabled.
> 
> Signed-off-by: Len Brown 
> ---
>  drivers/cpufreq/intel_pstate.c | 16 +---
>  1 file changed, 1 insertion(+), 15 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index b7de5bd..5d67780 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct cpudata 
> *cpu, u64 time)
>   return false;
>  }
>  
> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
> -{
> - return mul_ext_fp(cpu->sample.core_avg_perf,
> -   cpu->pstate.max_pstate_physical * 
> cpu->pstate.scaling);
> -}
> -
>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
>  {
>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct cpudata 
> *cpu, int target_pstate)
>   sample->mperf,
>   sample->aperf,
>   sample->tsc,
> - get_avg_frequency(cpu),
> + aperfmperf_khz_on_cpu(cpu->cpu),

I don't think this change is an improvement.

We compute cpu->sample.core_avg_perf anyway and calling aperfmperf_khz_on_cpu()
here means quite a bit of additional overhead.

Besides, the sample->mperf and sample->aperf values here are expected to be
the ones used for computing the frequency.

>   fp_toint(cpu->iowait_boost * 100));
>  }
>  
> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
>   return 0;
>  }
>  
> -static unsigned int intel_pstate_get(unsigned int cpu_num)
> -{
> - struct cpudata *cpu = all_cpu_data[cpu_num];
> -
> - return cpu ? get_avg_frequency(cpu) : 0;
> -}
> -
>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
>  {
>   struct cpudata *cpu = all_cpu_data[cpu_num];
> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
>   .setpolicy  = intel_pstate_set_policy,
>   .suspend= intel_pstate_hwp_save_state,
>   .resume = intel_pstate_resume,
> - .get= intel_pstate_get,
>   .init   = intel_pstate_cpu_init,
>   .exit   = intel_pstate_cpu_exit,
>   .stop_cpu   = intel_pstate_stop_cpu,
> 

Thanks,
Rafael



Re: [PATCH 5/5] intel_pstate: delete scheduler hook in HWP mode

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:16 PM Len Brown wrote:
> From: Len Brown 
> 
> The cpufreq/scaling_cur_freq sysfs attribute is now provided by
> the x86 cpufreq core on all modern x86 systems, including
> all systems supported by the intel_pstate driver.

Not sure what you mean by "x86 cpufreq core"?

Besides, I'd reorder this change with respect to patch [4/5] as this
eliminates the hook entirely and then the "performance"-related change
would only affect non-HWP.

Thanks,
Rafael




Re: [PATCH 5/5] intel_pstate: delete scheduler hook in HWP mode

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:16 PM Len Brown wrote:
> From: Len Brown 
> 
> The cpufreq/scaling_cur_freq sysfs attribute is now provided by
> the x86 cpufreq core on all modern x86 systems, including
> all systems supported by the intel_pstate driver.

Not sure what you mean by "x86 cpufreq core"?

Besides, I'd reorder this change with respect to patch [4/5] as this
eliminates the hook entirely and then the "performance"-related change
would only affect non-HWP.

Thanks,
Rafael




Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
> From: Len Brown 
> 
> When the governor is set to "performance", intel_pstate does not
> need the scheduler hook for doing any calculations.  Under these
> conditions, its only purpose is to continue to maintain
> cpufreq/scaling_cur_freq.
> 
> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
> the x86 cpufreq core on all modern x86 systems, including
> all systems supported by the intel_pstate driver.
> 
> So in "performance" governor mode, the scheduler hook can be skipped.
> This applies to both in Software and Hardware P-state control modes.
> 
> Suggested-by: Srinivas Pandruvada 
> Signed-off-by: Len Brown 
> ---
>  drivers/cpufreq/intel_pstate.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 5d67780..0ff3a4b 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
> cpufreq_policy *policy)
>*/
>   intel_pstate_clear_update_util_hook(policy->cpu);

The statement above shouldn't be necessary any more after the change below.

>   intel_pstate_max_within_limits(cpu);
> + } else {
> + intel_pstate_set_update_util_hook(policy->cpu);
>   }
>  
> - intel_pstate_set_update_util_hook(policy->cpu);
> -
>   if (hwp_active)
>   intel_pstate_hwp_set(policy->cpu);
>  
> 

What about update_turbo_pstate()?

In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
wouldn't that become problematic after this change?

Thanks,
Rafael



Re: [PATCH 4/5] intel_pstate: skip scheduler hook when in "performance" mode.

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:15 PM Len Brown wrote:
> From: Len Brown 
> 
> When the governor is set to "performance", intel_pstate does not
> need the scheduler hook for doing any calculations.  Under these
> conditions, its only purpose is to continue to maintain
> cpufreq/scaling_cur_freq.
> 
> But the cpufreq/scaling_cur_freq sysfs attribute is now provided by
> the x86 cpufreq core on all modern x86 systems, including
> all systems supported by the intel_pstate driver.
> 
> So in "performance" governor mode, the scheduler hook can be skipped.
> This applies to both in Software and Hardware P-state control modes.
> 
> Suggested-by: Srinivas Pandruvada 
> Signed-off-by: Len Brown 
> ---
>  drivers/cpufreq/intel_pstate.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index 5d67780..0ff3a4b 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -2025,10 +2025,10 @@ static int intel_pstate_set_policy(struct 
> cpufreq_policy *policy)
>*/
>   intel_pstate_clear_update_util_hook(policy->cpu);

The statement above shouldn't be necessary any more after the change below.

>   intel_pstate_max_within_limits(cpu);
> + } else {
> + intel_pstate_set_update_util_hook(policy->cpu);
>   }
>  
> - intel_pstate_set_update_util_hook(policy->cpu);
> -
>   if (hwp_active)
>   intel_pstate_hwp_set(policy->cpu);
>  
> 

What about update_turbo_pstate()?

In theory MSR_IA32_MISC_ENABLE_TURBO_DISABLE can be set at any time, so
wouldn't that become problematic after this change?

Thanks,
Rafael



Re: [PATCH v2 2/2] acpi/nfit: Issue Start ARS to retrieve existing records

2017-06-16 Thread Dan Williams
On Fri, Jun 16, 2017 at 5:01 PM, Kani, Toshimitsu  wrote:
>> > --- a/drivers/acpi/nfit/core.c
>> > +++ b/drivers/acpi/nfit/core.c
>> > @@ -1031,7 +1031,7 @@ static ssize_t scrub_store(struct device *dev,
>> > if (nd_desc) {
>> > struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
>> >
>> > -   rc = acpi_nfit_ars_rescan(acpi_desc);
>> > +   rc = acpi_nfit_ars_rescan(acpi_desc, 0);
>> > }
>> > device_unlock(dev);
>> > if (rc)
>> > @@ -2057,6 +2057,10 @@ static int ars_start(struct acpi_nfit_desc
>> *acpi_desc, struct nfit_spa *nfit_spa
>> > ars_start.type = ND_ARS_VOLATILE;
>> > else
>> > return -ENOTTY;
>> > +   if (nfit_spa->ars_prev_data) {
>> > +   ars_start.flags |= ND_ARS_RETURN_PREV_DATA;
>> > +   nfit_spa->ars_prev_data = 0;
>> > +   }
>>
>> I'd rather you plumb a new 'flags' parameter all the way through from
>> acpi_nfit_ars_rescan() to ars_start() rather than carrying this as a
>> property of nfit_spa.
>
> Yes, I wanted to carry 'flags' all the way, but since acpi_nfit_ars_rescan()
> calls acpi_nfit_scrub() via acpi_desc->work, all info needs to be marshalled
> into struct acpi_nfit_desc.  Using nfit_spa allows a request to be carried as
> per-spa basis...

Ah ok, but I still think it does not belong to a spa. This is control
/ context information for the workqueue and that belongs with
acpi_nfit_desc. I think it's fine if all spas get re-scrubbed with
the "prev_data" flag in the notification case.


Re: [PATCH v2 2/2] acpi/nfit: Issue Start ARS to retrieve existing records

2017-06-16 Thread Dan Williams
On Fri, Jun 16, 2017 at 5:01 PM, Kani, Toshimitsu  wrote:
>> > --- a/drivers/acpi/nfit/core.c
>> > +++ b/drivers/acpi/nfit/core.c
>> > @@ -1031,7 +1031,7 @@ static ssize_t scrub_store(struct device *dev,
>> > if (nd_desc) {
>> > struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
>> >
>> > -   rc = acpi_nfit_ars_rescan(acpi_desc);
>> > +   rc = acpi_nfit_ars_rescan(acpi_desc, 0);
>> > }
>> > device_unlock(dev);
>> > if (rc)
>> > @@ -2057,6 +2057,10 @@ static int ars_start(struct acpi_nfit_desc
>> *acpi_desc, struct nfit_spa *nfit_spa
>> > ars_start.type = ND_ARS_VOLATILE;
>> > else
>> > return -ENOTTY;
>> > +   if (nfit_spa->ars_prev_data) {
>> > +   ars_start.flags |= ND_ARS_RETURN_PREV_DATA;
>> > +   nfit_spa->ars_prev_data = 0;
>> > +   }
>>
>> I'd rather you plumb a new 'flags' parameter all the way through from
>> acpi_nfit_ars_rescan() to ars_start() rather than carrying this as a
>> property of nfit_spa.
>
> Yes, I wanted to carry 'flags' all the way, but since acpi_nfit_ars_rescan()
> calls acpi_nfit_scrub() via acpi_desc->work, all info needs to be marshalled
> into struct acpi_nfit_desc.  Using nfit_spa allows a request to be carried as
> per-spa basis...

Ah ok, but I still think it does not belong to a spa. This is control
/ context information for the workqueue and that belongs with
acpi_nfit_desc. I think it's fine if all spas get re-scrubbed with
the "prev_data" flag in the notification case.


[PATCH v3 3/4] arm64: Provide a fncpy implementation

2017-06-16 Thread Florian Fainelli
Utilize the asm-generic/fncpy.h implementation for ARM64 to allow the
use of drivers/misc/sram*.c on these platforms as well.

Signed-off-by: Florian Fainelli 
---
 arch/arm64/include/asm/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index a7a97a608033..5cfd822d3256 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -7,6 +7,7 @@ generic-y += dma-contiguous.h
 generic-y += early_ioremap.h
 generic-y += emergency-restart.h
 generic-y += errno.h
+generic-y += fncpy.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
-- 
2.9.3



[PATCH v3 3/4] arm64: Provide a fncpy implementation

2017-06-16 Thread Florian Fainelli
Utilize the asm-generic/fncpy.h implementation for ARM64 to allow the
use of drivers/misc/sram*.c on these platforms as well.

Signed-off-by: Florian Fainelli 
---
 arch/arm64/include/asm/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index a7a97a608033..5cfd822d3256 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -7,6 +7,7 @@ generic-y += dma-contiguous.h
 generic-y += early_ioremap.h
 generic-y += emergency-restart.h
 generic-y += errno.h
+generic-y += fncpy.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
-- 
2.9.3



[PATCH v3 4/4] misc: sram: Allow ARM64 to select SRAM_EXEC

2017-06-16 Thread Florian Fainelli
Now that ARM64 also has a fncpy() implementation, allow selection
SRAM_EXEC for ARM64 as well.

Signed-off-by: Florian Fainelli 
---
 drivers/misc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 07bbd4cc1852..ac8779278c0c 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -464,7 +464,7 @@ config SRAM
bool "Generic on-chip SRAM driver"
depends on HAS_IOMEM
select GENERIC_ALLOCATOR
-   select SRAM_EXEC if ARM
+   select SRAM_EXEC if ARM || ARM64
help
  This driver allows you to declare a memory region to be managed by
  the genalloc API. It is supposed to be used for small on-chip SRAM
-- 
2.9.3



[PATCH v3 2/4] asm-generic: Provide a fncpy() implementation

2017-06-16 Thread Florian Fainelli
Define a generic fncpy() implementation largely based on the ARM version
that requires an 8 bytes alignment for the destination address where to
copy this function as well as the function's own address.

Signed-off-by: Florian Fainelli 
---
 include/asm-generic/fncpy.h | 93 +
 1 file changed, 93 insertions(+)
 create mode 100644 include/asm-generic/fncpy.h

diff --git a/include/asm-generic/fncpy.h b/include/asm-generic/fncpy.h
new file mode 100644
index ..ec03b83b8535
--- /dev/null
+++ b/include/asm-generic/fncpy.h
@@ -0,0 +1,93 @@
+/*
+ * include/asm-generic/fncpy.h - helper macros for function body copying
+ *
+ * Copyright (C) 2011 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * These macros are intended for use when there is a need to copy a low-level
+ * function body into special memory.
+ *
+ * For example, when reconfiguring the SDRAM controller, the code doing the
+ * reconfiguration may need to run from SRAM.
+ *
+ * NOTE: that the copied function body must be entirely self-contained and
+ * position-independent in order for this to work properly.
+ *
+ * NOTE: in order for embedded literals and data to get referenced correctly,
+ * the alignment of functions must be preserved when copying.  To ensure this,
+ * the source and destination addresses for fncpy() must be aligned to a
+ * multiple of 8 bytes: you will be get a BUG() if this condition is not met.
+ * You will typically need a ".align 3" directive in the assembler where the
+ * function to be copied is defined, and ensure that your allocator for the
+ * destination buffer returns 8-byte-aligned pointers.
+ *
+ * Typical usage example:
+ *
+ * extern int f(args);
+ * extern uint32_t size_of_f;
+ * int (*copied_f)(args);
+ * void *sram_buffer;
+ *
+ * copied_f = fncpy(sram_buffer, , size_of_f);
+ *
+ * ... later, call the function: ...
+ *
+ * copied_f(args);
+ *
+ * The size of the function to be copied can't be determined from C:
+ * this must be determined by other means, such as adding assmbler directives
+ * in the file where f is defined.
+ */
+
+#ifndef __ASM_FNCPY_H
+#define __ASM_FNCPY_H
+
+#include 
+#include 
+
+#include 
+#include 
+
+/*
+ * Minimum alignment requirement for the source and destination addresses
+ * for function copying.
+ */
+#define FNCPY_ALIGN 8
+
+#define fncpy(dest_buf, funcp, size) ({
\
+   uintptr_t __funcp_address;  \
+   typeof(funcp) __result; \
+   \
+   asm("" : "=r" (__funcp_address) : "0" (funcp)); \
+   \
+   /*  \
+* Ensure alignment of source and destination addresses.\
+*/ \
+   BUG_ON((uintptr_t)(dest_buf) & (FNCPY_ALIGN - 1) || \
+   (__funcp_address & (FNCPY_ALIGN - 1))); \
+   \
+   memcpy(dest_buf, (void const *)__funcp_address, size);  \
+   flush_icache_range((unsigned long)(dest_buf),   \
+   (unsigned long)(dest_buf) + (size));\
+   \
+   asm("" : "=r" (__result)\
+   : "0" ((uintptr_t)(dest_buf))); \
+   \
+   __result;   \
+})
+
+#endif /* !__ASM_FNCPY_H */
-- 
2.9.3



[PATCH v3 0/4] Generalize fncpy availability

2017-06-16 Thread Florian Fainelli
Hi all,

This patch series makes ARM's fncpy() implementation more generic (dropping the
Thumb-specifics) and available in an asm-generic header file.

Tested on a Broadcom ARM64 STB platform with code that is written to SRAM.

Changes in v3 (thanks Doug!):
- correct include guard names in asm-generic/fncpy.h to __ASM_FNCPY_H
- utilize Kbuild to provide the fncpy.h header on ARM64

Changes in v2:
- leave the ARM implementation where it is
- make the generic truly generic (no)

This is helpful in making SoC-specific power management code become true drivers
that can be shared between different architectures.

Thanks!

Florian Fainelli (4):
  ARM: fncpy: Rename include guards
  asm-generic: Provide a fncpy() implementation
  arm64: Provide a fncpy implementation
  misc: sram: Allow ARM64 to select SRAM_EXEC

 arch/arm/include/asm/fncpy.h  |  6 +--
 arch/arm64/include/asm/Kbuild |  1 +
 drivers/misc/Kconfig  |  2 +-
 include/asm-generic/fncpy.h   | 93 +++
 4 files changed, 98 insertions(+), 4 deletions(-)
 create mode 100644 include/asm-generic/fncpy.h

-- 
2.9.3



[PATCH v3 1/4] ARM: fncpy: Rename include guards

2017-06-16 Thread Florian Fainelli
In preparation for allowing a generic fncpy() implementation to live
under include/asm-generic/fncpy.h, rename the current include guards to
be __ASM_ARM_FNCPY_H, this also makes the header file more consistent
with other headers in the same directory.

Signed-off-by: Florian Fainelli 
---
 arch/arm/include/asm/fncpy.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm/include/asm/fncpy.h b/arch/arm/include/asm/fncpy.h
index de5354746924..86a8fc14cde9 100644
--- a/arch/arm/include/asm/fncpy.h
+++ b/arch/arm/include/asm/fncpy.h
@@ -53,8 +53,8 @@
  * in the file where f is defined.
  */
 
-#ifndef __ASM_FNCPY_H
-#define __ASM_FNCPY_H
+#ifndef __ASM_ARM_FNCPY_H
+#define __ASM_ARM_FNCPY_H
 
 #include 
 #include 
@@ -91,4 +91,4 @@
__result;   \
 })
 
-#endif /* !__ASM_FNCPY_H */
+#endif /* !__ASM_ARM_FNCPY_H */
-- 
2.9.3



[PATCH v3 4/4] misc: sram: Allow ARM64 to select SRAM_EXEC

2017-06-16 Thread Florian Fainelli
Now that ARM64 also has a fncpy() implementation, allow selection
SRAM_EXEC for ARM64 as well.

Signed-off-by: Florian Fainelli 
---
 drivers/misc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 07bbd4cc1852..ac8779278c0c 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -464,7 +464,7 @@ config SRAM
bool "Generic on-chip SRAM driver"
depends on HAS_IOMEM
select GENERIC_ALLOCATOR
-   select SRAM_EXEC if ARM
+   select SRAM_EXEC if ARM || ARM64
help
  This driver allows you to declare a memory region to be managed by
  the genalloc API. It is supposed to be used for small on-chip SRAM
-- 
2.9.3



[PATCH v3 2/4] asm-generic: Provide a fncpy() implementation

2017-06-16 Thread Florian Fainelli
Define a generic fncpy() implementation largely based on the ARM version
that requires an 8 bytes alignment for the destination address where to
copy this function as well as the function's own address.

Signed-off-by: Florian Fainelli 
---
 include/asm-generic/fncpy.h | 93 +
 1 file changed, 93 insertions(+)
 create mode 100644 include/asm-generic/fncpy.h

diff --git a/include/asm-generic/fncpy.h b/include/asm-generic/fncpy.h
new file mode 100644
index ..ec03b83b8535
--- /dev/null
+++ b/include/asm-generic/fncpy.h
@@ -0,0 +1,93 @@
+/*
+ * include/asm-generic/fncpy.h - helper macros for function body copying
+ *
+ * Copyright (C) 2011 Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/*
+ * These macros are intended for use when there is a need to copy a low-level
+ * function body into special memory.
+ *
+ * For example, when reconfiguring the SDRAM controller, the code doing the
+ * reconfiguration may need to run from SRAM.
+ *
+ * NOTE: that the copied function body must be entirely self-contained and
+ * position-independent in order for this to work properly.
+ *
+ * NOTE: in order for embedded literals and data to get referenced correctly,
+ * the alignment of functions must be preserved when copying.  To ensure this,
+ * the source and destination addresses for fncpy() must be aligned to a
+ * multiple of 8 bytes: you will be get a BUG() if this condition is not met.
+ * You will typically need a ".align 3" directive in the assembler where the
+ * function to be copied is defined, and ensure that your allocator for the
+ * destination buffer returns 8-byte-aligned pointers.
+ *
+ * Typical usage example:
+ *
+ * extern int f(args);
+ * extern uint32_t size_of_f;
+ * int (*copied_f)(args);
+ * void *sram_buffer;
+ *
+ * copied_f = fncpy(sram_buffer, , size_of_f);
+ *
+ * ... later, call the function: ...
+ *
+ * copied_f(args);
+ *
+ * The size of the function to be copied can't be determined from C:
+ * this must be determined by other means, such as adding assmbler directives
+ * in the file where f is defined.
+ */
+
+#ifndef __ASM_FNCPY_H
+#define __ASM_FNCPY_H
+
+#include 
+#include 
+
+#include 
+#include 
+
+/*
+ * Minimum alignment requirement for the source and destination addresses
+ * for function copying.
+ */
+#define FNCPY_ALIGN 8
+
+#define fncpy(dest_buf, funcp, size) ({
\
+   uintptr_t __funcp_address;  \
+   typeof(funcp) __result; \
+   \
+   asm("" : "=r" (__funcp_address) : "0" (funcp)); \
+   \
+   /*  \
+* Ensure alignment of source and destination addresses.\
+*/ \
+   BUG_ON((uintptr_t)(dest_buf) & (FNCPY_ALIGN - 1) || \
+   (__funcp_address & (FNCPY_ALIGN - 1))); \
+   \
+   memcpy(dest_buf, (void const *)__funcp_address, size);  \
+   flush_icache_range((unsigned long)(dest_buf),   \
+   (unsigned long)(dest_buf) + (size));\
+   \
+   asm("" : "=r" (__result)\
+   : "0" ((uintptr_t)(dest_buf))); \
+   \
+   __result;   \
+})
+
+#endif /* !__ASM_FNCPY_H */
-- 
2.9.3



[PATCH v3 0/4] Generalize fncpy availability

2017-06-16 Thread Florian Fainelli
Hi all,

This patch series makes ARM's fncpy() implementation more generic (dropping the
Thumb-specifics) and available in an asm-generic header file.

Tested on a Broadcom ARM64 STB platform with code that is written to SRAM.

Changes in v3 (thanks Doug!):
- correct include guard names in asm-generic/fncpy.h to __ASM_FNCPY_H
- utilize Kbuild to provide the fncpy.h header on ARM64

Changes in v2:
- leave the ARM implementation where it is
- make the generic implementation truly generic (no Thumb-specifics)

This is helpful in making SoC-specific power management code become true drivers
that can be shared between different architectures.

Thanks!

Florian Fainelli (4):
  ARM: fncpy: Rename include guards
  asm-generic: Provide a fncpy() implementation
  arm64: Provide a fncpy implementation
  misc: sram: Allow ARM64 to select SRAM_EXEC

 arch/arm/include/asm/fncpy.h  |  6 +--
 arch/arm64/include/asm/Kbuild |  1 +
 drivers/misc/Kconfig  |  2 +-
 include/asm-generic/fncpy.h   | 93 +++
 4 files changed, 98 insertions(+), 4 deletions(-)
 create mode 100644 include/asm-generic/fncpy.h

-- 
2.9.3



[PATCH v3 1/4] ARM: fncpy: Rename include guards

2017-06-16 Thread Florian Fainelli
In preparation for allowing a generic fncpy() implementation to live
under include/asm-generic/fncpy.h, rename the current include guards to
be __ASM_ARM_FNCPY_H, this also makes the header file more consistent
with other headers in the same directory.

Signed-off-by: Florian Fainelli 
---
 arch/arm/include/asm/fncpy.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm/include/asm/fncpy.h b/arch/arm/include/asm/fncpy.h
index de5354746924..86a8fc14cde9 100644
--- a/arch/arm/include/asm/fncpy.h
+++ b/arch/arm/include/asm/fncpy.h
@@ -53,8 +53,8 @@
  * in the file where f is defined.
  */
 
-#ifndef __ASM_FNCPY_H
-#define __ASM_FNCPY_H
+#ifndef __ASM_ARM_FNCPY_H
+#define __ASM_ARM_FNCPY_H
 
 #include 
 #include 
@@ -91,4 +91,4 @@
__result;   \
 })
 
-#endif /* !__ASM_FNCPY_H */
+#endif /* !__ASM_ARM_FNCPY_H */
-- 
2.9.3



RE: [PATCH v2 2/2] acpi/nfit: Issue Start ARS to retrieve existing records

2017-06-16 Thread Kani, Toshimitsu
> > --- a/drivers/acpi/nfit/core.c
> > +++ b/drivers/acpi/nfit/core.c
> > @@ -1031,7 +1031,7 @@ static ssize_t scrub_store(struct device *dev,
> > if (nd_desc) {
> > struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
> >
> > -   rc = acpi_nfit_ars_rescan(acpi_desc);
> > +   rc = acpi_nfit_ars_rescan(acpi_desc, 0);
> > }
> > device_unlock(dev);
> > if (rc)
> > @@ -2057,6 +2057,10 @@ static int ars_start(struct acpi_nfit_desc
> *acpi_desc, struct nfit_spa *nfit_spa
> > ars_start.type = ND_ARS_VOLATILE;
> > else
> > return -ENOTTY;
> > +   if (nfit_spa->ars_prev_data) {
> > +   ars_start.flags |= ND_ARS_RETURN_PREV_DATA;
> > +   nfit_spa->ars_prev_data = 0;
> > +   }
> 
> I'd rather you plumb a new 'flags' parameter all the way through from
> acpi_nfit_ars_rescan() to ars_start() rather than carrying this as a
> property of nfit_spa.

Yes, I wanted to carry 'flags' all the way, but since acpi_nfit_ars_rescan()
calls acpi_nfit_scrub() via acpi_desc->work, all info needs to be marshalled
into struct acpi_nfit_desc.  Using nfit_spa allows a request to be carried as
per-spa basis...

Thanks,
-Toshi




RE: [PATCH v2 2/2] acpi/nfit: Issue Start ARS to retrieve existing records

2017-06-16 Thread Kani, Toshimitsu
> > --- a/drivers/acpi/nfit/core.c
> > +++ b/drivers/acpi/nfit/core.c
> > @@ -1031,7 +1031,7 @@ static ssize_t scrub_store(struct device *dev,
> > if (nd_desc) {
> > struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
> >
> > -   rc = acpi_nfit_ars_rescan(acpi_desc);
> > +   rc = acpi_nfit_ars_rescan(acpi_desc, 0);
> > }
> > device_unlock(dev);
> > if (rc)
> > @@ -2057,6 +2057,10 @@ static int ars_start(struct acpi_nfit_desc
> *acpi_desc, struct nfit_spa *nfit_spa
> > ars_start.type = ND_ARS_VOLATILE;
> > else
> > return -ENOTTY;
> > +   if (nfit_spa->ars_prev_data) {
> > +   ars_start.flags |= ND_ARS_RETURN_PREV_DATA;
> > +   nfit_spa->ars_prev_data = 0;
> > +   }
> 
> I'd rather you plumb a new 'flags' parameter all the way through from
> acpi_nfit_ars_rescan() to ars_start() rather than carrying this as a
> property of nfit_spa.

Yes, I wanted to carry 'flags' all the way, but since acpi_nfit_ars_rescan()
calls acpi_nfit_scrub() via acpi_desc->work, all info needs to be marshalled
into struct acpi_nfit_desc.  Using nfit_spa allows a request to be carried as
per-spa basis...

Thanks,
-Toshi




Re: [PATCH 1/2] loop: use filp_close() rather than fput()

2017-06-16 Thread Al Viro
On Fri, Jun 16, 2017 at 03:02:09PM +1000, NeilBrown wrote:
> When a loop device is being shutdown the backing file is
> closed with fput().  This is different from how close(2)
> closes files - it uses filp_close().
> 
> The difference is important for filesystems which provide a ->flush
> file operation such as NFS.  NFS assumes a flush will always
> be called on last close, and gets confused otherwise.

Huh?  You do realize that mmap() + close() + modify + msync() + munmap()
will have IO done *after* the last flush, right?


Re: [PATCH 1/2] loop: use filp_close() rather than fput()

2017-06-16 Thread Al Viro
On Fri, Jun 16, 2017 at 03:02:09PM +1000, NeilBrown wrote:
> When a loop device is being shutdown the backing file is
> closed with fput().  This is different from how close(2)
> closes files - it uses filp_close().
> 
> The difference is important for filesystems which provide a ->flush
> file operation such as NFS.  NFS assumes a flush will always
> be called on last close, and gets confused otherwise.

Huh?  You do realize that mmap() + close() + modify + msync() + munmap()
will have IO done *after* the last flush, right?


Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
> From: Len Brown 
> 
> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
> to supply /sys/.../cpufreq/scaling_cur_freq
> on all x86 systems supporting APERF/MPERF.
> 
> That includes 100% of systems supported by intel_pstate,
> and so intel_pstate.get() is now a NOP -- remove it.
> 
> Invoke aperfmperf_khz_on_cpu() directly,
> if legacy-mode p-state tracing is enabled.
> 
> Signed-off-by: Len Brown 
> ---
>  drivers/cpufreq/intel_pstate.c | 16 +---
>  1 file changed, 1 insertion(+), 15 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index b7de5bd..5d67780 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct cpudata 
> *cpu, u64 time)
>   return false;
>  }
>  
> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
> -{
> - return mul_ext_fp(cpu->sample.core_avg_perf,
> -   cpu->pstate.max_pstate_physical * 
> cpu->pstate.scaling);
> -}
> -
>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
>  {
>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct cpudata 
> *cpu, int target_pstate)
>   sample->mperf,
>   sample->aperf,
>   sample->tsc,
> - get_avg_frequency(cpu),
> + aperfmperf_khz_on_cpu(cpu->cpu),
>   fp_toint(cpu->iowait_boost * 100));
>  }
>  
> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
>   return 0;
>  }
>  
> -static unsigned int intel_pstate_get(unsigned int cpu_num)
> -{
> - struct cpudata *cpu = all_cpu_data[cpu_num];
> -
> - return cpu ? get_avg_frequency(cpu) : 0;
> -}
> -
>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
>  {
>   struct cpudata *cpu = all_cpu_data[cpu_num];
> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
>   .setpolicy  = intel_pstate_set_policy,
>   .suspend= intel_pstate_hwp_save_state,
>   .resume = intel_pstate_resume,
> - .get= intel_pstate_get,
>   .init   = intel_pstate_cpu_init,
>   .exit   = intel_pstate_cpu_exit,
>   .stop_cpu   = intel_pstate_stop_cpu,
> 

This change will cause cpufreq_quick_get() to work differently and it is
called by KVM among other things.  Will that still work?

Thanks,
Rafael



Re: [PATCH 3/5] intel_pstate: remove intel_pstate.get()

2017-06-16 Thread Rafael J. Wysocki
On Wednesday, June 07, 2017 07:39:14 PM Len Brown wrote:
> From: Len Brown 
> 
> The x86 cpufreq core now uses aperfmperf_khz_on_cpu()
> to supply /sys/.../cpufreq/scaling_cur_freq
> on all x86 systems supporting APERF/MPERF.
> 
> That includes 100% of systems supported by intel_pstate,
> and so intel_pstate.get() is now a NOP -- remove it.
> 
> Invoke aperfmperf_khz_on_cpu() directly,
> if legacy-mode p-state tracing is enabled.
> 
> Signed-off-by: Len Brown 
> ---
>  drivers/cpufreq/intel_pstate.c | 16 +---
>  1 file changed, 1 insertion(+), 15 deletions(-)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index b7de5bd..5d67780 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -1597,12 +1597,6 @@ static inline bool intel_pstate_sample(struct cpudata 
> *cpu, u64 time)
>   return false;
>  }
>  
> -static inline int32_t get_avg_frequency(struct cpudata *cpu)
> -{
> - return mul_ext_fp(cpu->sample.core_avg_perf,
> -   cpu->pstate.max_pstate_physical * 
> cpu->pstate.scaling);
> -}
> -
>  static inline int32_t get_avg_pstate(struct cpudata *cpu)
>  {
>   return mul_ext_fp(cpu->pstate.max_pstate_physical,
> @@ -1728,7 +1722,7 @@ static void intel_pstate_adjust_pstate(struct cpudata 
> *cpu, int target_pstate)
>   sample->mperf,
>   sample->aperf,
>   sample->tsc,
> - get_avg_frequency(cpu),
> + aperfmperf_khz_on_cpu(cpu->cpu),
>   fp_toint(cpu->iowait_boost * 100));
>  }
>  
> @@ -1922,13 +1916,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
>   return 0;
>  }
>  
> -static unsigned int intel_pstate_get(unsigned int cpu_num)
> -{
> - struct cpudata *cpu = all_cpu_data[cpu_num];
> -
> - return cpu ? get_avg_frequency(cpu) : 0;
> -}
> -
>  static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
>  {
>   struct cpudata *cpu = all_cpu_data[cpu_num];
> @@ -2157,7 +2144,6 @@ static struct cpufreq_driver intel_pstate = {
>   .setpolicy  = intel_pstate_set_policy,
>   .suspend= intel_pstate_hwp_save_state,
>   .resume = intel_pstate_resume,
> - .get= intel_pstate_get,
>   .init   = intel_pstate_cpu_init,
>   .exit   = intel_pstate_cpu_exit,
>   .stop_cpu   = intel_pstate_stop_cpu,
> 

This change will cause cpufreq_quick_get() to work differently and it is
called by KVM among other things.  Will that still work?

Thanks,
Rafael



Re: LTS testing with latest kselftests - some failures

2017-06-16 Thread Fengguang Wu

On Fri, Jun 16, 2017 at 06:46:51PM +0200, Luis R. Rodriguez wrote:

Kees, please review 47e0bbb7fa98 below.
Brian, please review be4a1326d12c below.

On Thu, Jun 15, 2017 at 11:26:53PM +0530, Sumit Semwal wrote:

Hello Greg, Shuah,

While testing 4.4.y and 4.9.y LTS kernels with latest kselftest,


To be clear it seems like you are taking the latest upstream ksefltest and run
it against older stable kernels. Furthermore you seem to only run the shell
script tests but are using older kselftests drivers? Is this all correct?
Otherwise it is unclear how you are running into the issues below.

Does 0-day do the same? I thought 0-day takes just the kselftest from each tree
submitted. That *seemed* to me like the way it was designed. Shuah ?


Yes in 0-day, we run the kselftest code corresponding to the current kernel.

Thanks,
Fengguang


Re: LTS testing with latest kselftests - some failures

2017-06-16 Thread Fengguang Wu

On Fri, Jun 16, 2017 at 06:46:51PM +0200, Luis R. Rodriguez wrote:

Kees, please review 47e0bbb7fa98 below.
Brian, please review be4a1326d12c below.

On Thu, Jun 15, 2017 at 11:26:53PM +0530, Sumit Semwal wrote:

Hello Greg, Shuah,

While testing 4.4.y and 4.9.y LTS kernels with latest kselftest,


To be clear it seems like you are taking the latest upstream ksefltest and run
it against older stable kernels. Furthermore you seem to only run the shell
script tests but are using older kselftests drivers? Is this all correct?
Otherwise it is unclear how you are running into the issues below.

Does 0-day do the same? I thought 0-day takes just the kselftest from each tree
submitted. That *seemed* to me like the way it was designed. Shuah ?


Yes in 0-day, we run the kselftest code corresponding to the current kernel.

Thanks,
Fengguang


[PATCH 3/3] block: order /proc/devices by major number

2017-06-16 Thread Logan Gunthorpe
Presently, the order of the block devices listed in /proc/devices is not
entirely sequential. If a block device has a major number greater than
BLKDEV_MAJOR_HASH_SIZE (255), it will be ordered as if its major were
modulo 255. For example, 511 appears after 1.

This patch cleans that up and prints each major number in the correct
order, regardless of where they are stored in the hash table.

In order to do this, we introduce BLKDEV_MAJOR_MAX as an artificial
limit (chosen to be 512). It will then print all devices in major
order number from 0 to the maximum.

Signed-off-by: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Jens Axboe 
Cc: Jeff Layton 
Cc: "J. Bruce Fields" 
---

This is a patch Greg requested after I proposed[1] the same to char devs.
It is based on the chardev patch I sent so it should be merged after it (to
avoid conflicts). If there are any changes requested I'll resend the
entire set as a whole.

[1] https://patchwork.kernel.org/patch/9790093/

 block/genhd.c  | 18 +-
 fs/proc/devices.c  |  4 ++--
 include/linux/fs.h |  4 ++--
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index d252d29fe837..1fc734b1a0e4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -242,6 +242,7 @@ EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
  * Can be deleted altogether. Later.
  *
  */
+#define BLKDEV_MAJOR_HASH_SIZE 255
 static struct blk_major_name {
struct blk_major_name *next;
int major;
@@ -259,12 +260,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 {
struct blk_major_name *dp;

-   if (offset < BLKDEV_MAJOR_HASH_SIZE) {
-   mutex_lock(_class_lock);
-   for (dp = major_names[offset]; dp; dp = dp->next)
+   mutex_lock(_class_lock);
+   for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
+   if (dp->major == offset)
seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
-   mutex_unlock(_class_lock);
-   }
+   mutex_unlock(_class_lock);
 }
 #endif /* CONFIG_PROC_FS */

@@ -309,6 +309,14 @@ int register_blkdev(unsigned int major, const char *name)
ret = major;
}

+   if (major >= BLKDEV_MAJOR_MAX) {
+   pr_err("register_blkdev: major requested (%d) is greater than 
the maximum (%d) for %s\n",
+  major, BLKDEV_MAJOR_MAX, name);
+
+   ret = -EINVAL;
+   goto out;
+   }
+
p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
if (p == NULL) {
ret = -ENOMEM;
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index d196e22c4f1c..e5709343feb7 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v)

 static void *devinfo_start(struct seq_file *f, loff_t *pos)
 {
-   if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX))
+   if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return pos;
return NULL;
 }
@@ -33,7 +33,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos)
 static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
 {
(*pos)++;
-   if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX))
+   if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return NULL;
return pos;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f1347c2ca3e9..8cc651807ea4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2478,14 +2478,14 @@ static inline void unregister_chrdev(unsigned int 
major, const char *name)
 #define BDEVT_SIZE 10  /* Largest string for MAJ:MIN for blkdev */

 #ifdef CONFIG_BLOCK
-#define BLKDEV_MAJOR_HASH_SIZE 255
+#define BLKDEV_MAJOR_MAX   512
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
 extern void blkdev_show(struct seq_file *,off_t);

 #else
-#define BLKDEV_MAJOR_HASH_SIZE 0
+#define BLKDEV_MAJOR_MAX   0
 #endif

 extern void init_special_inode(struct inode *, umode_t, dev_t);
--
2.11.0


[PATCH 3/3] block: order /proc/devices by major number

2017-06-16 Thread Logan Gunthorpe
Presently, the order of the block devices listed in /proc/devices is not
entirely sequential. If a block device has a major number greater than
BLKDEV_MAJOR_HASH_SIZE (255), it will be ordered as if its major were
modulo 255. For example, 511 appears after 1.

This patch cleans that up and prints each major number in the correct
order, regardless of where they are stored in the hash table.

In order to do this, we introduce BLKDEV_MAJOR_MAX as an artificial
limit (chosen to be 512). It will then print all devices in major
order number from 0 to the maximum.

Signed-off-by: Logan Gunthorpe 
Cc: Greg Kroah-Hartman 
Cc: Jens Axboe 
Cc: Jeff Layton 
Cc: "J. Bruce Fields" 
---

This is a patch Greg requested after I proposed[1] the same to char devs.
It is based on the chardev patch I sent so it should be merged after it (to
avoid conflicts). If there are any changes requested I'll resend the
entire set as a whole.

[1] https://patchwork.kernel.org/patch/9790093/

 block/genhd.c  | 18 +-
 fs/proc/devices.c  |  4 ++--
 include/linux/fs.h |  4 ++--
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index d252d29fe837..1fc734b1a0e4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -242,6 +242,7 @@ EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
  * Can be deleted altogether. Later.
  *
  */
+#define BLKDEV_MAJOR_HASH_SIZE 255
 static struct blk_major_name {
struct blk_major_name *next;
int major;
@@ -259,12 +260,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 {
struct blk_major_name *dp;

-   if (offset < BLKDEV_MAJOR_HASH_SIZE) {
-   mutex_lock(_class_lock);
-   for (dp = major_names[offset]; dp; dp = dp->next)
+   mutex_lock(_class_lock);
+   for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
+   if (dp->major == offset)
seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
-   mutex_unlock(_class_lock);
-   }
+   mutex_unlock(_class_lock);
 }
 #endif /* CONFIG_PROC_FS */

@@ -309,6 +309,14 @@ int register_blkdev(unsigned int major, const char *name)
ret = major;
}

+   if (major >= BLKDEV_MAJOR_MAX) {
+   pr_err("register_blkdev: major requested (%d) is greater than 
the maximum (%d) for %s\n",
+  major, BLKDEV_MAJOR_MAX, name);
+
+   ret = -EINVAL;
+   goto out;
+   }
+
p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
if (p == NULL) {
ret = -ENOMEM;
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index d196e22c4f1c..e5709343feb7 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v)

 static void *devinfo_start(struct seq_file *f, loff_t *pos)
 {
-   if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX))
+   if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return pos;
return NULL;
 }
@@ -33,7 +33,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos)
 static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
 {
(*pos)++;
-   if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_MAX))
+   if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX))
return NULL;
return pos;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f1347c2ca3e9..8cc651807ea4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2478,14 +2478,14 @@ static inline void unregister_chrdev(unsigned int 
major, const char *name)
 #define BDEVT_SIZE 10  /* Largest string for MAJ:MIN for blkdev */

 #ifdef CONFIG_BLOCK
-#define BLKDEV_MAJOR_HASH_SIZE 255
+#define BLKDEV_MAJOR_MAX   512
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
 extern void blkdev_show(struct seq_file *,off_t);

 #else
-#define BLKDEV_MAJOR_HASH_SIZE 0
+#define BLKDEV_MAJOR_MAX   0
 #endif

 extern void init_special_inode(struct inode *, umode_t, dev_t);
--
2.11.0


Re: [PATCH 1/2] platform/x86: silead_dmi: Add touchscreen info for PoV mobii wintab p800w

2017-06-16 Thread Darren Hart
On Fri, Jun 16, 2017 at 03:22:45PM +0200, Hans de Goede wrote:
> Hi,
> 
> On 16-06-17 14:44, Andy Shevchenko wrote:
> > On Thu, Jun 15, 2017 at 7:53 PM, Darren Hart  wrote:
> > > On Thu, Jun 15, 2017 at 08:48:31AM +0200, Hans de Goede wrote:
> > 
> > > > + /* Point of View mobii wintab p800w */
> > > > + .driver_data = (void *)_mobii_wintab_p800w_data,
> > > > + .matches = {
> > > > + DMI_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"),
> > > > + DMI_MATCH(DMI_BOARD_NAME, "Aptio CRB"),
> > > > + DMI_MATCH(DMI_BIOS_VERSION, "3BAIR1013"),
> > > > + /* Above matches are too generic, add bios-date 
> > > > match */
> > > > + DMI_MATCH(DMI_BIOS_DATE, "08/22/2014"),
> > > 
> > > This is the first time I've seen a BIOS date match used to determine 
> > > hardware
> > > features. DMI matching is a (necessary) hack to begin with (the vendors 
> > > should
> > > be providing this data via ACPI _DSD anyway) but a date match means we 
> > > would
> > > need a kernel patch every time one of these tablets gets a BIOS update...
> > > 
> > > With words like "Aptio CRB" it's clear the vendor isn't doing their job 
> > > and just
> > > using unmodified reference code. The problem with this of course is that 
> > > the
> > > vendor is not providing a way to identify this hardware.
> > > 
> > > Andy, I'd appreciate your thoughts on this... I'm leaning towards not 
> > > accepting
> > > bios date (or indeed, BIOS version) as a way to identify a platform.
> > 
> > The question is what is the anticipated amount of affected devices
> > with BIOS date included and otherwise?
> 
> I expect there to be collisions (false positive matches) without the
> BIOS_DATE check, a quick web-search finds other devices with a
> 3BAIR1013 bios version. Those don't necessarily also use a Silead
> touchscreen (which is needed for a collision to happen), but given
> the popularity of Silead touchscreens on cheap devices a collision
> is not unlikely.
> 
> With the bios-date check added, I expect this match to be unique,
> for it to not be unique we would need to be really unlucky.
> 
> > If Hans believes that there will be no update for some devices,
> 
> Yeah I'm pretty sure this specific device will not see any
> BIOS updates ever.
> 
> > while there are devices with the same DMI strings, but different date and
> > _fixed_ issue, I think we have no other choice for now.
> > Also can we use some other strings to distinguish group of devices
> > which are affected?
> 
> bios_date: 08/22/2014
> bios_vendor: American Megatrends Inc.
> bios_version: 3BAIR1013
> board_asset_tag: To be filled by O.E.M.
> board_name: Aptio CRB
> board_serial: T80091A4C11B0848
> board_vendor: AMI Corporation
> board_version: To be filled by O.E.M.
> chassis_asset_tag: To Be Filled By O.E.M.
> chassis_serial: To Be Filled By O.E.M.
> chassis_type: 3
> chassis_vendor: To Be Filled By O.E.M.
> chassis_version: To Be Filled By O.E.M.
> product_name: To be filled by O.E.M.
> product_serial: To be filled by O.E.M.
> product_uuid: 03000200-0400-0500-0006-000700080009
> product_version: To be filled by O.E.M.
> sys_vendor: To be filled by O.E.M.
> 
> The product-uuid is a known example uuid, so is
> no good. The board_serial might be useful, but
> only if it is unique for the model and not per
> tablet. Unfortunately I only have 1 of these
> tablets, so I cannot tell.

Do we have any indication that this BIOS Date isn't just the default value
provided by AMI? Does it offer any more information than the BIOS Version?

I suppose we may be able to do some kind of a partial match on the Board Serial
if even that is platform specific (I suspect it is with the T800 at the
beginning).

The sloppy handling of this firmware really irks me. That's obviously not Hans'
fault, so we'll take the patch. If we see a conflict in the future, we'll just
have to compare the other DMI strings for a match and see what we can do. I'm
even tempted to insert a printk on this match, dumping the DMI values and
requesting the user to copy/paste them into an email to this list.

I think we've already spent too much time on this patch based on this review:
https://www.notebookcheck.net/Point-of-View-Mobii-WinTab-800W-Tablet-Review.129561.0.html

Nice...

-- 
Darren Hart
VMware Open Source Technology Center


Re: [PATCH 1/2] platform/x86: silead_dmi: Add touchscreen info for PoV mobii wintab p800w

2017-06-16 Thread Darren Hart
On Fri, Jun 16, 2017 at 03:22:45PM +0200, Hans de Goede wrote:
> Hi,
> 
> On 16-06-17 14:44, Andy Shevchenko wrote:
> > On Thu, Jun 15, 2017 at 7:53 PM, Darren Hart  wrote:
> > > On Thu, Jun 15, 2017 at 08:48:31AM +0200, Hans de Goede wrote:
> > 
> > > > + /* Point of View mobii wintab p800w */
> > > > + .driver_data = (void *)_mobii_wintab_p800w_data,
> > > > + .matches = {
> > > > + DMI_MATCH(DMI_BOARD_VENDOR, "AMI Corporation"),
> > > > + DMI_MATCH(DMI_BOARD_NAME, "Aptio CRB"),
> > > > + DMI_MATCH(DMI_BIOS_VERSION, "3BAIR1013"),
> > > > + /* Above matches are too generic, add bios-date 
> > > > match */
> > > > + DMI_MATCH(DMI_BIOS_DATE, "08/22/2014"),
> > > 
> > > This is the first time I've seen a BIOS date match used to determine 
> > > hardware
> > > features. DMI matching is a (necessary) hack to begin with (the vendors 
> > > should
> > > be providing this data via ACPI _DSD anyway) but a date match means we 
> > > would
> > > need a kernel patch every time one of these tablets gets a BIOS update...
> > > 
> > > With words like "Aptio CRB" it's clear the vendor isn't doing their job 
> > > and just
> > > using unmodified reference code. The problem with this of course is that 
> > > the
> > > vendor is not providing a way to identify this hardware.
> > > 
> > > Andy, I'd appreciate your thoughts on this... I'm leaning towards not 
> > > accepting
> > > bios date (or indeed, BIOS version) as a way to identify a platform.
> > 
> > The question is what is the anticipated amount of affected devices
> > with BIOS date included and otherwise?
> 
> I expect there to be collisions (false positive matches) without the
> BIOS_DATE check, a quick web-search finds other devices with a
> 3BAIR1013 bios version. Those don't necessarily also use a Silead
> touchscreen (which is needed for a collision to happen), but given
> the popularity of Silead touchscreens on cheap devices a collision
> is not unlikely.
> 
> With the bios-date check added, I expect this match to be unique,
> for it to not be unique we would need to be really unlucky.
> 
> > If Hans believes that there will be no update for some devices,
> 
> Yeah I'm pretty sure this specific device will not see any
> BIOS updates ever.
> 
> > while there are devices with the same DMI strings, but different date and
> > _fixed_ issue, I think we have no other choice for now.
> > Also can we use some other strings to distinguish group of devices
> > which are affected?
> 
> bios_date: 08/22/2014
> bios_vendor: American Megatrends Inc.
> bios_version: 3BAIR1013
> board_asset_tag: To be filled by O.E.M.
> board_name: Aptio CRB
> board_serial: T80091A4C11B0848
> board_vendor: AMI Corporation
> board_version: To be filled by O.E.M.
> chassis_asset_tag: To Be Filled By O.E.M.
> chassis_serial: To Be Filled By O.E.M.
> chassis_type: 3
> chassis_vendor: To Be Filled By O.E.M.
> chassis_version: To Be Filled By O.E.M.
> product_name: To be filled by O.E.M.
> product_serial: To be filled by O.E.M.
> product_uuid: 03000200-0400-0500-0006-000700080009
> product_version: To be filled by O.E.M.
> sys_vendor: To be filled by O.E.M.
> 
> The product-uuid is a known example uuid, so is
> no good. The board_serial might be useful, but
> only if it is unique for the model and not per
> tablet. Unfortunately I only have 1 of these
> tablets, so I cannot tell.

Do we have any indication that this BIOS Date isn't just the default value
provided by AMI? Does it offer any more information than the BIOS Version?

I suppose we may be able to do some kind of a partial match on the Board Serial
if even that is platform specific (I suspect it is with the T800 at the
beginning).

The sloppy handling of this firmware really irks me. That's obviously not Hans'
fault, so we'll take the patch. If we see a conflict in the future, we'll just
have to compare the other DMI strings for a match and see what we can do. I'm
even tempted to insert a printk on this match, dumping the DMI values and
requesting the user to copy/paste them into an email to this list.

I think we've already spent too much time on this patch based on this review:
https://www.notebookcheck.net/Point-of-View-Mobii-WinTab-800W-Tablet-Review.129561.0.html

Nice...

-- 
Darren Hart
VMware Open Source Technology Center


[PATCHv3 1/3] firmware_class: move NO_CACHE from private to driver_data_req_params

2017-06-16 Thread yi1 . li
From: Yi Li 

This adds DRIVER_DATA_REQ_NO_CACHE flag with .req flag under struct
driver_data_req_params. When this flag is set, the driver_data driver
will not cache the firmware during the PM cycle, which is expensive. It
will be used by the streaming case and by other drivers which implement
their own cache thing. Also added the debugfs interface to selftest.

Signed-off-by: Yi Li 
---
 drivers/base/firmware_class-dbg.c | 108 ++
 drivers/base/firmware_class.c |  26 +
 include/linux/driver_data.h   |   4 ++
 3 files changed, 127 insertions(+), 11 deletions(-)
 create mode 100644 drivers/base/firmware_class-dbg.c

diff --git a/drivers/base/firmware_class-dbg.c 
b/drivers/base/firmware_class-dbg.c
new file mode 100644
index 000..102a4cd
--- /dev/null
+++ b/drivers/base/firmware_class-dbg.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 by Yi Li 
+ *
+ */
+/* This is part of firmware_class.c for testing firmware cache */
+
+#ifndef CONFIG_TEST_DRIVER_DATA
+static inline void create_debug_files(struct firmware_cache *cache) { }
+static inline void remove_debug_files(struct firmware_cache *cache) { }
+#else
+#include 
+#include 
+
+static int debug_cache_show(struct seq_file *s, void *v)
+{
+   struct firmware_cache *cache = s->private;
+   unsigned long flags;
+   struct fw_cache_entry *cache_entry;
+
+   spin_lock_irqsave(&cache->lock, flags);
+
+   list_for_each_entry(cache_entry, &cache->fw_names, list)
+   seq_printf(s, "cached %s\n", cache_entry->name);
+
+   spin_unlock_irqrestore(&cache->lock, flags);
+
+   return 0;
+}
+
+static int debug_cache_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, debug_cache_show, inode->i_private);
+}
+
+#define MAX_LEN 16
+/**
+ * test_cache - set value in the 'cache' control file
+ *
+ * The relevant values are:
+ *
+ *  1: Test the suspend and start the cache
+ *  0: Test the resume and clear the cache.
+ **/
+static ssize_t test_cache(struct file *fp, const char __user *user_buffer,
+ size_t size, loff_t *ppos)
+{
+   char buf[MAX_LEN];
+   size_t len;
+   long cmd;
+
+   len = min(size, (size_t)(MAX_LEN - 1));
+   if (copy_from_user(buf, user_buffer, len))
+   return -EFAULT;
+   buf[len] = 0;
+   if (kstrtol(buf, 10, &cmd))
+   return -EFAULT;
+
+#ifdef CONFIG_PM_SLEEP
+   switch (cmd) {
+   /* Simulate PM suspend prepare and start to cache */
+   case 1:
+   kill_pending_fw_fallback_reqs(true);
+   device_cache_fw_images();
+   disable_firmware();
+   break;
+   /* Simulate PM resume and un-cache */
+   case 0:
+   mutex_lock(&fw_lock);
+   fw_cache.state = FW_LOADER_NO_CACHE;
+   mutex_unlock(&fw_lock);
+   enable_firmware();
+   device_uncache_fw_images_delay(10);
+   break;
+   default:
+   pr_err("unexpected cmd\n");
+   }
+#endif
+   return size;
+}
+
+static const struct file_operations debug_cache_fops = {
+   .open = debug_cache_open,
+   .read = seq_read,
+   .write = test_cache,
+   .llseek = seq_lseek,
+   .release = single_release,
+};
+
+static void create_debug_files(struct firmware_cache *cache)
+{
+   cache->debug = debugfs_create_dir("firmware", NULL);
+   if (!cache->debug)
+   return;
+   if (!debugfs_create_file("cache", 0644, cache->debug,
+                            cache, &debug_cache_fops))
+   goto failed_create;
+   return;
+
+failed_create:
+   debugfs_remove_recursive(cache->debug);
+}
+
+static void remove_debug_files(struct firmware_cache *cache)
+{
+   debugfs_remove_recursive(cache->debug);
+   cache->debug = NULL;
+}
+#endif
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 7af430a..a70a2a7 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -72,14 +72,10 @@ enum driver_data_mode {
  * issue a uevent to userspace. Userspace in turn is expected to be
  * monitoring for uevents for the firmware_class and will use the
  * exposted sysfs interface to upload the driver data for the caller.
- * @DRIVER_DATA_PRIV_REQ_NO_CACHE: indicates that the driver data request
- * should not set up and use the internal caching mechanism to assist
- * drivers from fetching driver data at resume time after suspend.
  */
 enum driver_data_priv_reqs {
DRIVER_DATA_PRIV_REQ_FALLBACK   = 1 << 0,
DRIVER_DATA_PRIV_REQ_FALLBACK_UEVENT= 1 << 1,
-   DRIVER_DATA_PRIV_REQ_NO_CACHE   = 1 << 2,
 };
 
 /**
@@ -151,10 +147,12 @@ struct driver_data_params {
}
 
 #define __DATA_REQ_FIRMWARE_BUF(buf, size) \
+  

[PATCHv3 1/3] firmware_class: move NO_CACHE from private to driver_data_req_params

2017-06-16 Thread yi1 . li
From: Yi Li 

This adds DRIVER_DATA_REQ_NO_CACHE flag with .req flag under struct
driver_data_req_params. When this flag is set, the driver_data driver
will not cache the firmware during PM cycle, which is expensive. It
will be used by streaming case and other drivers which implement
their own cache thing. Also added the debugfs interface to selftest.

Signed-off-by: Yi Li 
---
 drivers/base/firmware_class-dbg.c | 108 ++
 drivers/base/firmware_class.c |  26 +
 include/linux/driver_data.h   |   4 ++
 3 files changed, 127 insertions(+), 11 deletions(-)
 create mode 100644 drivers/base/firmware_class-dbg.c

diff --git a/drivers/base/firmware_class-dbg.c 
b/drivers/base/firmware_class-dbg.c
new file mode 100644
index 000..102a4cd
--- /dev/null
+++ b/drivers/base/firmware_class-dbg.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 by Yi Li 
+ *
+ */
+/* This is part of firmware_class.c for testing firmware cache */
+
+#ifndef CONFIG_TEST_DRIVER_DATA
+static inline void create_debug_files(struct firmware_cache *cache) { }
+static inline void remove_debug_files(struct firmware_cache *cache) { }
+#else
+#include 
+#include 
+
+static int debug_cache_show(struct seq_file *s, void *v)
+{
+   struct firmware_cache *cache = s->private;
+   unsigned long flags;
+   struct fw_cache_entry *cache_entry;
+
+   spin_lock_irqsave(&cache->lock, flags);
+
+   list_for_each_entry(cache_entry, &cache->fw_names, list)
+   seq_printf(s, "cached %s\n", cache_entry->name);
+
+   spin_unlock_irqrestore(&cache->lock, flags);
+
+   return 0;
+}
+
+static int debug_cache_open(struct inode *inode, struct file *file)
+{
+   return single_open(file, debug_cache_show, inode->i_private);
+}
+
+#define MAX_LEN 16
+/**
+ * test_cache - set value in the 'cache' control file
+ *
+ * The relevant values are:
+ *
+ *  1: Test the suspend and start the cache
+ *  0: Test the resume and clear the cache.
+ **/
+static ssize_t test_cache(struct file *fp, const char __user *user_buffer,
+ size_t size, loff_t *ppos)
+{
+   char buf[MAX_LEN];
+   size_t len;
+   long cmd;
+
+   len = min(size, (size_t)(MAX_LEN - 1));
+   if (copy_from_user(buf, user_buffer, len))
+   return -EFAULT;
+   buf[len] = 0;
+   if (kstrtol(buf, 10, &cmd))
+   return -EFAULT;
+
+#ifdef CONFIG_PM_SLEEP
+   switch (cmd) {
+   /* Simulate PM suspend prepare and start to cache */
+   case 1:
+   kill_pending_fw_fallback_reqs(true);
+   device_cache_fw_images();
+   disable_firmware();
+   break;
+   /* Simulate PM resume and un-cache */
+   case 0:
+   mutex_lock(&fw_lock);
+   fw_cache.state = FW_LOADER_NO_CACHE;
+   mutex_unlock(&fw_lock);
+   enable_firmware();
+   device_uncache_fw_images_delay(10);
+   break;
+   default:
+   pr_err("unexpected cmd\n");
+   }
+#endif
+   return size;
+}
+
+static const struct file_operations debug_cache_fops = {
+   .open = debug_cache_open,
+   .read = seq_read,
+   .write = test_cache,
+   .llseek = seq_lseek,
+   .release = single_release,
+};
+
+static void create_debug_files(struct firmware_cache *cache)
+{
+   cache->debug = debugfs_create_dir("firmware", NULL);
+   if (!cache->debug)
+   return;
+   if (!debugfs_create_file("cache", 0644, cache->debug,
+                            cache, &debug_cache_fops))
+   goto failed_create;
+   return;
+
+failed_create:
+   debugfs_remove_recursive(cache->debug);
+}
+
+static void remove_debug_files(struct firmware_cache *cache)
+{
+   debugfs_remove_recursive(cache->debug);
+   cache->debug = NULL;
+}
+#endif
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index 7af430a..a70a2a7 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -72,14 +72,10 @@ enum driver_data_mode {
  * issue a uevent to userspace. Userspace in turn is expected to be
  * monitoring for uevents for the firmware_class and will use the
  * exposted sysfs interface to upload the driver data for the caller.
- * @DRIVER_DATA_PRIV_REQ_NO_CACHE: indicates that the driver data request
- * should not set up and use the internal caching mechanism to assist
- * drivers from fetching driver data at resume time after suspend.
  */
 enum driver_data_priv_reqs {
DRIVER_DATA_PRIV_REQ_FALLBACK   = 1 << 0,
DRIVER_DATA_PRIV_REQ_FALLBACK_UEVENT= 1 << 1,
-   DRIVER_DATA_PRIV_REQ_NO_CACHE   = 1 << 2,
 };
 
 /**
@@ -151,10 +147,12 @@ struct driver_data_params {
}
 
 #define __DATA_REQ_FIRMWARE_BUF(buf, size) \
+   .req_params = { \
+

<    1   2   3   4   5   6   7   8   9   10   >