RE: [PATCH v5 6/7] iommu/exynos: Add runtime pm support

2016-10-21 Thread Sricharan
Hi Marek,

>This patch adds runtime pm implementation, which is based on previous
>suspend/resume code. SYSMMU controller is now being enabled/disabled mainly
>from the runtime pm callbacks. System sleep callbacks relies on generic
>pm_runtime_force_suspend/pm_runtime_force_resume helpers. To ensure
>internal state consistency, additional lock for runtime pm transitions
>was introduced.
>
>Signed-off-by: Marek Szyprowski 
>---
> drivers/iommu/exynos-iommu.c | 45 +++-
> 1 file changed, 36 insertions(+), 9 deletions(-)
>
>diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
>index a959443e6f33..5e6d7bbf9b70 100644
>--- a/drivers/iommu/exynos-iommu.c
>+++ b/drivers/iommu/exynos-iommu.c
>@@ -206,6 +206,7 @@ struct sysmmu_fault_info {
> struct exynos_iommu_owner {
>   struct list_head controllers;   /* list of sysmmu_drvdata.owner_node */
>   struct iommu_domain *domain;/* domain this device is attached */
>+  struct mutex rpm_lock;  /* for runtime pm of all sysmmus */
> };
>
> /*
>@@ -594,40 +595,46 @@ static int __init exynos_sysmmu_probe(struct 
>platform_device *pdev)
>   return 0;
> }
>
>-#ifdef CONFIG_PM_SLEEP
>-static int exynos_sysmmu_suspend(struct device *dev)
>+static int __maybe_unused exynos_sysmmu_suspend(struct device *dev)
> {
>   struct sysmmu_drvdata *data = dev_get_drvdata(dev);
>   struct device *master = data->master;
>
>   if (master) {
>-  pm_runtime_put(dev);
>+  struct exynos_iommu_owner *owner = master->archdata.iommu;
>+
>+  mutex_lock(&owner->rpm_lock);
This is more of a device link question:
to understand, I see that with device links + runtime PM, the supplier
callbacks are not called for irqsafe clients, even if the supplier is irqsafe.
Why is that?

>   if (data->domain) {
>   dev_dbg(data->sysmmu, "saving state\n");
>   __sysmmu_disable(data);
>   }
>+  mutex_unlock(&owner->rpm_lock);
>   }
>   return 0;
> }
>
>-static int exynos_sysmmu_resume(struct device *dev)
>+static int __maybe_unused exynos_sysmmu_resume(struct device *dev)
> {
>   struct sysmmu_drvdata *data = dev_get_drvdata(dev);
>   struct device *master = data->master;
>
>   if (master) {
>-  pm_runtime_get_sync(dev);
>+  struct exynos_iommu_owner *owner = master->archdata.iommu;
>+
>+  mutex_lock(&owner->rpm_lock);
>   if (data->domain) {
>   dev_dbg(data->sysmmu, "restoring state\n");
>   __sysmmu_enable(data);
>   }
>+  mutex_unlock(&owner->rpm_lock);
>   }
>   return 0;
> }
>-#endif
>
> static const struct dev_pm_ops sysmmu_pm_ops = {
>-  SET_LATE_SYSTEM_SLEEP_PM_OPS(exynos_sysmmu_suspend, 
>exynos_sysmmu_resume)
>+  SET_RUNTIME_PM_OPS(exynos_sysmmu_suspend, exynos_sysmmu_resume, NULL)
>+  SET_LATE_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
>+   pm_runtime_force_resume)
> };
 Does this need to be LATE_SYSTEM_SLEEP_PM_OPS, with device links taking care
  of the ordering?

Regards,
 Sricharan



RE: [PATCH v5 6/7] iommu/exynos: Add runtime pm support

2016-10-21 Thread Sricharan
Hi Marek,

>This patch adds runtime pm implementation, which is based on previous
>suspend/resume code. SYSMMU controller is now being enabled/disabled mainly
>from the runtime pm callbacks. System sleep callbacks relies on generic
>pm_runtime_force_suspend/pm_runtime_force_resume helpers. To ensure
>internal state consistency, additional lock for runtime pm transitions
>was introduced.
>
>Signed-off-by: Marek Szyprowski 
>---
> drivers/iommu/exynos-iommu.c | 45 +++-
> 1 file changed, 36 insertions(+), 9 deletions(-)
>
>diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
>index a959443e6f33..5e6d7bbf9b70 100644
>--- a/drivers/iommu/exynos-iommu.c
>+++ b/drivers/iommu/exynos-iommu.c
>@@ -206,6 +206,7 @@ struct sysmmu_fault_info {
> struct exynos_iommu_owner {
>   struct list_head controllers;   /* list of sysmmu_drvdata.owner_node */
>   struct iommu_domain *domain;/* domain this device is attached */
>+  struct mutex rpm_lock;  /* for runtime pm of all sysmmus */
> };
>
> /*
>@@ -594,40 +595,46 @@ static int __init exynos_sysmmu_probe(struct 
>platform_device *pdev)
>   return 0;
> }
>
>-#ifdef CONFIG_PM_SLEEP
>-static int exynos_sysmmu_suspend(struct device *dev)
>+static int __maybe_unused exynos_sysmmu_suspend(struct device *dev)
> {
>   struct sysmmu_drvdata *data = dev_get_drvdata(dev);
>   struct device *master = data->master;
>
>   if (master) {
>-  pm_runtime_put(dev);
>+  struct exynos_iommu_owner *owner = master->archdata.iommu;
>+
>+  mutex_lock(&owner->rpm_lock);
This is more of a device link question:
to understand, I see that with device links + runtime PM, the supplier
callbacks are not called for irqsafe clients, even if the supplier is irqsafe.
Why is that?

>   if (data->domain) {
>   dev_dbg(data->sysmmu, "saving state\n");
>   __sysmmu_disable(data);
>   }
>+  mutex_unlock(&owner->rpm_lock);
>   }
>   return 0;
> }
>
>-static int exynos_sysmmu_resume(struct device *dev)
>+static int __maybe_unused exynos_sysmmu_resume(struct device *dev)
> {
>   struct sysmmu_drvdata *data = dev_get_drvdata(dev);
>   struct device *master = data->master;
>
>   if (master) {
>-  pm_runtime_get_sync(dev);
>+  struct exynos_iommu_owner *owner = master->archdata.iommu;
>+
>+  mutex_lock(&owner->rpm_lock);
>   if (data->domain) {
>   dev_dbg(data->sysmmu, "restoring state\n");
>   __sysmmu_enable(data);
>   }
>+  mutex_unlock(&owner->rpm_lock);
>   }
>   return 0;
> }
>-#endif
>
> static const struct dev_pm_ops sysmmu_pm_ops = {
>-  SET_LATE_SYSTEM_SLEEP_PM_OPS(exynos_sysmmu_suspend, 
>exynos_sysmmu_resume)
>+  SET_RUNTIME_PM_OPS(exynos_sysmmu_suspend, exynos_sysmmu_resume, NULL)
>+  SET_LATE_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
>+   pm_runtime_force_resume)
> };
 Does this need to be LATE_SYSTEM_SLEEP_PM_OPS, with device links taking care
  of the ordering?

Regards,
 Sricharan



Re: [PATCH 0/4] IIO wrapper drivers, dpot-dac and envelope-detector

2016-10-21 Thread Peter Rosin
On 2016-10-21 09:17, ji...@kernel.org wrote:
> On 20.10.2016 19:17, Peter Rosin wrote:
>> On 2016-10-20 19:37, Jonathan Cameron wrote:
>>> On 20 October 2016 18:30:19 BST, Jonathan Cameron 
>>>  wrote:
 On 20 October 2016 13:55:12 BST, Lars-Peter Clausen  
 wrote:
> On 10/20/2016 11:25 AM, Peter Rosin wrote:
>> Also, is there some agreed-upon way to dig out the maximum value 
>> from
>> an iio channel? If so, "dpot-dac,max-ohms" can be eliminated from 
>> the
>> dt bindings, which would have been nice...
>
> Yes, this is something we could really use. In a sense it exists for
> the
> devices with buffer-capable channels where there is the real_bits 
> field
> which tells us the data width of the channel. But a dedicated 
> mechanism
> for
> querying the maximum (and minimum) valid code seems like a useful
> feature.
> Not only for in-kernel clients, but also for userspace.

 This was something that was addressed by the rather ancient patch
 series i posted that added
 an available call back which provided info on range and values for 
 all info mask elements.
 Series got buried by there being a lot of precursors but quite a few of
 those have merged since.

 Hmm Google won't let me find it on my phone. Was a while back now. 
 Will
 try to get on pc with
 decent email archive later and dig out a reference.
>>> http://marc.info/?l=linux-iio&m=138469765309868&w=2 I think...
>>
>> Interesting, one issue with that is that it is all in real world
>> units, while I'd rather have the raw value.
> Um.. It's been a while, but the principle was (IIRC) that every
> _available would match the units of the associated info mask element.
> Thus if you have a _raw element it would be in adc counts (most likely).
> 
> _input would be in relevant real world units, scale etc in the whatever
> units the value itself is in.

Ok, so I forward ported that patch and added code so that the relevant
channels provide what is available. I also added code to turn the
rest of the parameter style devicetree properties into iio device/channel
attributes. So, it is now much neater from a bindings point of view.

Before I post the updated patches, I'm wondering what the status is
on that ancient patch? It didn't forward port without issues, but there
were no real difficulties that I noticed. Should I just start off my v2
series with that patch? I tend to think that that's the best option,
because I suspect that adding a "max-ohms" devicetree property as a
stop-gap pending some new infrastructure is pretty unrealistic...

Basically, my question is whether that ancient patch has any chance of living
at all in a form close to what it is, or if I should start looking for
an alternative right away?

Cheers,
Peter



Re: [PATCH 0/4] IIO wrapper drivers, dpot-dac and envelope-detector

2016-10-21 Thread Peter Rosin
On 2016-10-21 09:17, ji...@kernel.org wrote:
> On 20.10.2016 19:17, Peter Rosin wrote:
>> On 2016-10-20 19:37, Jonathan Cameron wrote:
>>> On 20 October 2016 18:30:19 BST, Jonathan Cameron 
>>>  wrote:
 On 20 October 2016 13:55:12 BST, Lars-Peter Clausen  
 wrote:
> On 10/20/2016 11:25 AM, Peter Rosin wrote:
>> Also, is there some agreed-upon way to dig out the maximum value 
>> from
>> an iio channel? If so, "dpot-dac,max-ohms" can be eliminated from 
>> the
>> dt bindings, which would have been nice...
>
> Yes, this is something we could really use. In a sense it exists for
> the
> devices with buffer-capable channels where there is the real_bits 
> field
> which tells us the data width of the channel. But a dedicated 
> mechanism
> for
> querying the maximum (and minimum) valid code seems like a useful
> feature.
> Not only for in-kernel clients, but also for userspace.

 This was something that was addressed by the rather ancient patch
 series i posted that added
 an available call back which provided info on range and values for 
 all info mask elements.
 Series got buried by there being a lot of precursors but quite a few of
 those have merged since.

 Hmm Google won't let me find it on my phone. Was a while back now. 
 Will
 try to get on pc with
 decent email archive later and dig out a reference.
>>> http://marc.info/?l=linux-iio&m=138469765309868&w=2 I think...
>>
>> Interesting, one issue with that is that it is all in real world
>> units, while I'd rather have the raw value.
> Um.. It's been a while, but the principle was (IIRC) that every
> _available would match the units of the associated info mask element.
> Thus if you have a _raw element it would be in adc counts (most likely).
> 
> _input would be in relevant real world units, scale etc in the whatever
> units the value itself is in.

Ok, so I forward ported that patch and added code so that the relevant
channels provide what is available. I also added code to turn the
rest of the parameter style devicetree properties into iio device/channel
attributes. So, it is now much neater from a bindings point of view.

Before I post the updated patches, I'm wondering what the status is
on that ancient patch? It didn't forward port without issues, but there
were no real difficulties that I noticed. Should I just start off my v2
series with that patch? I tend to think that that's the best option,
because I suspect that adding a "max-ohms" devicetree property as a
stop-gap pending some new infrastructure is pretty unrealistic...

Basically, my question is whether that ancient patch has any chance of living
at all in a form close to what it is, or if I should start looking for
an alternative right away?

Cheers,
Peter



Re: [PATCH 2/2] drm/i915/gvt: fix compilation

2016-10-21 Thread Zhenyu Wang
On 2016.10.21 17:25:50 +0200, Arnd Bergmann wrote:
> Two functions in the newly added gvt render code are obviously
> broken, as they reference a variable without initialization and
> don't reference another variable at all:
> 
> drivers/gpu/drm/i915/gvt/render.c: In function 
> ‘intel_gvt_load_render_mmio’:
> drivers/gpu/drm/i915/gvt/render.c:148:13: error: ‘offset.reg’ may be used 
> uninitialized in this function [-Werror=maybe-uninitialized]
> drivers/gpu/drm/i915/gvt/render.c: In function 
> ‘intel_gvt_restore_render_mmio’:
> drivers/gpu/drm/i915/gvt/render.c:185:13: error: ‘offset.reg’ may be used 
> uninitialized in this function [-Werror=maybe-uninitialized]
> 
> This is probably not a correct fix, but it gets us a clean build
> by removing the unused arrays and initializing the offset variable
> to something that potentially might be correct.
> 
> Fixes: 178657139307 ("drm/i915/gvt: vGPU context switch")
> Signed-off-by: Arnd Bergmann 
> ---

I think the correct fix is like

diff --git a/drivers/gpu/drm/i915/gvt/render.c 
b/drivers/gpu/drm/i915/gvt/render.c
index feebb65..cc23c3f 100644
--- a/drivers/gpu/drm/i915/gvt/render.c
+++ b/drivers/gpu/drm/i915/gvt/render.c
@@ -162,6 +162,7 @@ static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
if (!IS_SKYLAKE(dev_priv))
return;
 
+   offset.reg = regs[ring_id];
for (i = 0; i < 64; i++) {
gen9_render_mocs[ring_id][i] = I915_READ(offset);
I915_WRITE(offset, vgpu_vreg(vgpu, offset));
@@ -199,6 +200,7 @@ static void restore_mocs(struct intel_vgpu *vgpu, int 
ring_id)
if (!IS_SKYLAKE(dev_priv))
return;
 
+   offset.reg = regs[ring_id];
for (i = 0; i < 64; i++) {
vgpu_vreg(vgpu, offset) = I915_READ(offset);
I915_WRITE(offset, gen9_render_mocs[ring_id][i]);

Thanks for pointing this out, it's a mistake during our code preparation for 
upstream.
I'll queue this up.

>  drivers/gpu/drm/i915/gvt/render.c | 25 +++--
>  1 file changed, 3 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gvt/render.c 
> b/drivers/gpu/drm/i915/gvt/render.c
> index feebb65ba641..79e112288065 100644
> --- a/drivers/gpu/drm/i915/gvt/render.c
> +++ b/drivers/gpu/drm/i915/gvt/render.c
> @@ -147,29 +147,20 @@ static void load_mocs(struct intel_vgpu *vgpu, int 
> ring_id)
>  {
>   struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
>   i915_reg_t offset, l3_offset;
> - u32 regs[] = {
> - [RCS] = 0xc800,
> - [VCS] = 0xc900,
> - [VCS2] = 0xca00,
> - [BCS] = 0xcc00,
> - [VECS] = 0xcb00,
> - };
>   int i;
>  
> - if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
> - return;
> -
>   if (!IS_SKYLAKE(dev_priv))
>   return;
>  
>   for (i = 0; i < 64; i++) {
> + offset.reg = i * 4;
>   gen9_render_mocs[ring_id][i] = I915_READ(offset);
>   I915_WRITE(offset, vgpu_vreg(vgpu, offset));
>   POSTING_READ(offset);
> - offset.reg += 4;
>   }
>  
>   if (ring_id == RCS) {
> + offset.reg = 64 * 4;
>   l3_offset.reg = 0xb020;
>   for (i = 0; i < 32; i++) {
>   gen9_render_mocs_L3[i] = I915_READ(l3_offset);
> @@ -184,26 +175,16 @@ static void restore_mocs(struct intel_vgpu *vgpu, int 
> ring_id)
>  {
>   struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
>   i915_reg_t offset, l3_offset;
> - u32 regs[] = {
> - [RCS] = 0xc800,
> - [VCS] = 0xc900,
> - [VCS2] = 0xca00,
> - [BCS] = 0xcc00,
> - [VECS] = 0xcb00,
> - };
>   int i;
>  
> - if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
> - return;
> -
>   if (!IS_SKYLAKE(dev_priv))
>   return;
>  
>   for (i = 0; i < 64; i++) {
> + offset.reg = i * 4;
>   vgpu_vreg(vgpu, offset) = I915_READ(offset);
>   I915_WRITE(offset, gen9_render_mocs[ring_id][i]);
>   POSTING_READ(offset);
> - offset.reg += 4;
>   }
>  
>   if (ring_id == RCS) {
> -- 
> 2.9.0
> 

-- 
Open Source Technology Center, Intel ltd.

$gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827


signature.asc
Description: PGP signature


Re: [PATCH 2/2] drm/i915/gvt: fix compilation

2016-10-21 Thread Zhenyu Wang
On 2016.10.21 17:25:50 +0200, Arnd Bergmann wrote:
> Two functions in the newly added gvt render code are obviously
> broken, as they reference a variable without initialization and
> don't reference another variable at all:
> 
> drivers/gpu/drm/i915/gvt/render.c: In function 
> ‘intel_gvt_load_render_mmio’:
> drivers/gpu/drm/i915/gvt/render.c:148:13: error: ‘offset.reg’ may be used 
> uninitialized in this function [-Werror=maybe-uninitialized]
> drivers/gpu/drm/i915/gvt/render.c: In function 
> ‘intel_gvt_restore_render_mmio’:
> drivers/gpu/drm/i915/gvt/render.c:185:13: error: ‘offset.reg’ may be used 
> uninitialized in this function [-Werror=maybe-uninitialized]
> 
> This is probably not a correct fix, but it gets us a clean build
> by removing the unused arrays and initializing the offset variable
> to something that potentially might be correct.
> 
> Fixes: 178657139307 ("drm/i915/gvt: vGPU context switch")
> Signed-off-by: Arnd Bergmann 
> ---

I think the correct fix is like

diff --git a/drivers/gpu/drm/i915/gvt/render.c 
b/drivers/gpu/drm/i915/gvt/render.c
index feebb65..cc23c3f 100644
--- a/drivers/gpu/drm/i915/gvt/render.c
+++ b/drivers/gpu/drm/i915/gvt/render.c
@@ -162,6 +162,7 @@ static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
if (!IS_SKYLAKE(dev_priv))
return;
 
+   offset.reg = regs[ring_id];
for (i = 0; i < 64; i++) {
gen9_render_mocs[ring_id][i] = I915_READ(offset);
I915_WRITE(offset, vgpu_vreg(vgpu, offset));
@@ -199,6 +200,7 @@ static void restore_mocs(struct intel_vgpu *vgpu, int 
ring_id)
if (!IS_SKYLAKE(dev_priv))
return;
 
+   offset.reg = regs[ring_id];
for (i = 0; i < 64; i++) {
vgpu_vreg(vgpu, offset) = I915_READ(offset);
I915_WRITE(offset, gen9_render_mocs[ring_id][i]);

Thanks for pointing this out, it's a mistake during our code preparation for 
upstream.
I'll queue this up.

>  drivers/gpu/drm/i915/gvt/render.c | 25 +++--
>  1 file changed, 3 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gvt/render.c 
> b/drivers/gpu/drm/i915/gvt/render.c
> index feebb65ba641..79e112288065 100644
> --- a/drivers/gpu/drm/i915/gvt/render.c
> +++ b/drivers/gpu/drm/i915/gvt/render.c
> @@ -147,29 +147,20 @@ static void load_mocs(struct intel_vgpu *vgpu, int 
> ring_id)
>  {
>   struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
>   i915_reg_t offset, l3_offset;
> - u32 regs[] = {
> - [RCS] = 0xc800,
> - [VCS] = 0xc900,
> - [VCS2] = 0xca00,
> - [BCS] = 0xcc00,
> - [VECS] = 0xcb00,
> - };
>   int i;
>  
> - if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
> - return;
> -
>   if (!IS_SKYLAKE(dev_priv))
>   return;
>  
>   for (i = 0; i < 64; i++) {
> + offset.reg = i * 4;
>   gen9_render_mocs[ring_id][i] = I915_READ(offset);
>   I915_WRITE(offset, vgpu_vreg(vgpu, offset));
>   POSTING_READ(offset);
> - offset.reg += 4;
>   }
>  
>   if (ring_id == RCS) {
> + offset.reg = 64 * 4;
>   l3_offset.reg = 0xb020;
>   for (i = 0; i < 32; i++) {
>   gen9_render_mocs_L3[i] = I915_READ(l3_offset);
> @@ -184,26 +175,16 @@ static void restore_mocs(struct intel_vgpu *vgpu, int 
> ring_id)
>  {
>   struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
>   i915_reg_t offset, l3_offset;
> - u32 regs[] = {
> - [RCS] = 0xc800,
> - [VCS] = 0xc900,
> - [VCS2] = 0xca00,
> - [BCS] = 0xcc00,
> - [VECS] = 0xcb00,
> - };
>   int i;
>  
> - if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
> - return;
> -
>   if (!IS_SKYLAKE(dev_priv))
>   return;
>  
>   for (i = 0; i < 64; i++) {
> + offset.reg = i * 4;
>   vgpu_vreg(vgpu, offset) = I915_READ(offset);
>   I915_WRITE(offset, gen9_render_mocs[ring_id][i]);
>   POSTING_READ(offset);
> - offset.reg += 4;
>   }
>  
>   if (ring_id == RCS) {
> -- 
> 2.9.0
> 

-- 
Open Source Technology Center, Intel ltd.

$gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827


signature.asc
Description: PGP signature


Re: why getrandom blocking does not work with /dev/urandom

2016-10-21 Thread Theodore Ts'o
On Sat, Oct 22, 2016 at 05:43:36AM +0200, Stephan Mueller wrote:
> Hi Ted,
> 
> as mentioned, I looked a bit deeper into the issue of adding the blocking 
> behavior of getrandom to /dev/urandom.
> 
> As you and I already identified, moving that blocking behavior to 
> /dev/urandom 
> simply does not work. The system does not boot.
> 
> The reason to this issue is actually quite simple. The init process of 
> systemd 
> reads /dev/urandom for whatever purpose. Now, when /dev/urandom blocks during 
> boot, systemd will be blocked too. That means that user space (either in the 
> initramfs or with the regular root partition) is set up.

I think you mean "is blocked from being set up".

> When there is no user space initialized, there are no devices set up. The 
> network card is not initialized, the block devices are not mounted, other 
> devices are not initialized. That means that neither interrupts nor block 
> device events are registered.

Well, those devices which have already been initialized before
systemd starts will be fine.  Personally I believe in using built-in
drivers instead of depending on kernel modules for everything, but for
distribution kernels, yes, that's true.

In any case, yes, you're not telling me anything I didn't know.  What
I didn't know and still don't know is *why* systemd is trying to read
from /dev/urandom.  e.g., is it trying to initialize cryptographic
keys, the better to allow the Russians or the Chinese to set up
botnets which can take over IOT devices and paralyze root nameservers?
Or is it reading /dev/urandom for purely stupid, pointless,
non-cryptographic reasons?

I'm not sure which is worse, actually

- Ted


Re: why getrandom blocking does not work with /dev/urandom

2016-10-21 Thread Theodore Ts'o
On Sat, Oct 22, 2016 at 05:43:36AM +0200, Stephan Mueller wrote:
> Hi Ted,
> 
> as mentioned, I looked a bit deeper into the issue of adding the blocking 
> behavior of getrandom to /dev/urandom.
> 
> As you and I already identified, moving that blocking behavior to 
> /dev/urandom 
> simply does not work. The system does not boot.
> 
> The reason to this issue is actually quite simple. The init process of 
> systemd 
> reads /dev/urandom for whatever purpose. Now, when /dev/urandom blocks during 
> boot, systemd will be blocked too. That means that user space (either in the 
> initramfs or with the regular root partition) is set up.

I think you mean "is blocked from being set up".

> When there is no user space initialized, there are no devices set up. The 
> network card is not initialized, the block devices are not mounted, other 
> devices are not initialized. That means that neither interrupts nor block 
> device events are registered.

Well, those devices which have already been initialized before
systemd starts will be fine.  Personally I believe in using built-in
drivers instead of depending on kernel modules for everything, but for
distribution kernels, yes, that's true.

In any case, yes, you're not telling me anything I didn't know.  What
I didn't know and still don't know is *why* systemd is trying to read
from /dev/urandom.  e.g., is it trying to initialize cryptographic
keys, the better to allow the Russians or the Chinese to set up
botnets which can take over IOT devices and paralyze root nameservers?
Or is it reading /dev/urandom for purely stupid, pointless,
non-cryptographic reasons?

I'm not sure which is worse, actually

- Ted


Re: [PATCH 1/2] drm/i915/gvt: add ACPI and 64BIT dependencies

2016-10-21 Thread Zhenyu Wang
On 2016.10.21 17:25:49 +0200, Arnd Bergmann wrote:
> The newly added gvt code produces lots of serious warnings and errors
> when either built on 32-bit x86, or built with ACPI disabled, e.g.
> 
> drivers/gpu/drm/i915/gvt/gtt.c: In function ‘read_pte64’:
> drivers/gpu/drm/i915/gvt/gtt.c:277:2: error: left shift count >= width of 
> type [-Werror]
> drivers/gpu/drm/i915/gvt/gtt.c: In function ‘gen8_gtt_get_pfn’:
> drivers/gpu/drm/i915/gvt/gtt.c:360:3: error: left shift count >= width of 
> type [-Werror]
> drivers/gpu/drm/i915/gvt/opregion.c: In function 
> ‘intel_gvt_init_opregion’:
> drivers/gpu/drm/i915/gvt/opregion.c:183:2: error: implicit declaration of 
> function ‘acpi_os_ioremap’ [-Werror=implicit-function-declaration]
> 
> This avoids the problems by simply disallowing those configurations
> in Kconfig. I'm sure it's possible to make the code more portable
> and support building GVT without those options, but it might not be
> useful to do so.
> 
> Fixes: 4d60c5fd3f87 ("drm/i915/gvt: vGPU PCI configuration space 
> virtualization")
> Signed-off-by: Arnd Bergmann 
> ---
> If the code is meant to work on 32-bit and non-ACPI kernels, please
> treat this as a bug report and disregard the patch.
> ---

Thanks, Arnd. For now we do have to depend on 64BIT, but ACPI is not required:
we only used one ACPI function for the opregion memory mapping, which is not
necessary. So I queued one patch adding the 64BIT dependency and another
removing the ACPI dependency, for Daniel.

>  drivers/gpu/drm/i915/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 6d4194288d11..1b9308284dde 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -84,6 +84,7 @@ config DRM_I915_USERPTR
>  config DRM_I915_GVT
>  bool "Enable Intel GVT-g graphics virtualization host support"
>  depends on DRM_I915
> + depends on 64BIT && ACPI
>  default n
>  help
> Choose this option if you want to enable Intel GVT-g graphics
> -- 
> 2.9.0
> 

-- 
Open Source Technology Center, Intel ltd.

$gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827


signature.asc
Description: PGP signature


Re: [PATCH 1/2] drm/i915/gvt: add ACPI and 64BIT dependencies

2016-10-21 Thread Zhenyu Wang
On 2016.10.21 17:25:49 +0200, Arnd Bergmann wrote:
> The newly added gvt code produces lots of serious warnings and errors
> when either built on 32-bit x86, or built with ACPI disabled, e.g.
> 
> drivers/gpu/drm/i915/gvt/gtt.c: In function ‘read_pte64’:
> drivers/gpu/drm/i915/gvt/gtt.c:277:2: error: left shift count >= width of 
> type [-Werror]
> drivers/gpu/drm/i915/gvt/gtt.c: In function ‘gen8_gtt_get_pfn’:
> drivers/gpu/drm/i915/gvt/gtt.c:360:3: error: left shift count >= width of 
> type [-Werror]
> drivers/gpu/drm/i915/gvt/opregion.c: In function 
> ‘intel_gvt_init_opregion’:
> drivers/gpu/drm/i915/gvt/opregion.c:183:2: error: implicit declaration of 
> function ‘acpi_os_ioremap’ [-Werror=implicit-function-declaration]
> 
> This avoids the problems by simply disallowing those configurations
> in Kconfig. I'm sure it's possible to make the code more portable
> and support building GVT without those options, but it might not be
> useful to do so.
> 
> Fixes: 4d60c5fd3f87 ("drm/i915/gvt: vGPU PCI configuration space 
> virtualization")
> Signed-off-by: Arnd Bergmann 
> ---
> If the code is meant to work on 32-bit and non-ACPI kernels, please
> treat this as a bug report and disregard the patch.
> ---

Thanks, Arnd. For now we do have to depend on 64BIT, but ACPI is not required:
we only used one ACPI function for the opregion memory mapping, which is not
necessary. So I queued one patch adding the 64BIT dependency and another
removing the ACPI dependency, for Daniel.

>  drivers/gpu/drm/i915/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 6d4194288d11..1b9308284dde 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -84,6 +84,7 @@ config DRM_I915_USERPTR
>  config DRM_I915_GVT
>  bool "Enable Intel GVT-g graphics virtualization host support"
>  depends on DRM_I915
> + depends on 64BIT && ACPI
>  default n
>  help
> Choose this option if you want to enable Intel GVT-g graphics
> -- 
> 2.9.0
> 

-- 
Open Source Technology Center, Intel ltd.

$gpg --keyserver wwwkeys.pgp.net --recv-keys 4D781827


signature.asc
Description: PGP signature


Re: [PATCH] ARM: dts: rockchip: add i2c-bus subnode to edp

2016-10-21 Thread ayaka


On 10/21/2016 04:25 PM, Heiko Stuebner wrote:

Am Donnerstag, 20. Oktober 2016, 15:47:56 CEST schrieb Tomeu Vizoso:

On 10/20/2016 03:45 PM, Heiko Stübner wrote:

Am Donnerstag, 20. Oktober 2016, 10:07:25 schrieb Tomeu Vizoso:

Add an empty 'i2c-bus' subnode to the edp node just so that the I2C core
doesn't attempt to parse the 'ports' subnode as containing i2c devices.

This is to avoid spurious failure messages such as:

i2c i2c-6: of_i2c: modalias failure on /dp@ff97/ports

On the one hand, the edp really has an i2c bus - with its only client the
EDID listening at 0x50 (and maybe 0x30).

On the other hand, adding an empty bus to the (implementation independent)
devicetree just to make the Linux i2c subsystem happy sounds heavily like
an implementation-specific hack, as the edp i2c bus doesn't leak into the
outside world otherwise.

I guess this empty i2c bus not being part of the binding document points
heavily into the implementation-specific corner :-) .

My short search on other patches touching this didn't reveal anything but
maybe this was already discussed somewhere and found to be ok?

Here it is:

http://www.spinics.net/lists/linux-tegra/msg27862.html

thanks ... I'm still not sure about the placeholder though, aka needing an
undocumented subnode to make a Linux error message silent.

Sorry, I reported the wrong result; it does work.

And about the problem at the beginning of this thread: I found that I have to
use something like an X server to access DRM, or the panel will not be powered on.
The legacy fbdev won't help.
But there is still a problem to be solved, so the eDP panel for firefly is
not ready yet.


In the thread you pointed to I also did not see any dt-maintainer involvement
pointing one way or another, but spinics is often not easy to navigate
threads, so I may have missed that.



Another option could be to just make of_i2c_register_device silent if
of_modalias_node returns -ENODEV?


Heiko


Signed-off-by: Tomeu Vizoso 
Cc: Randy Li 
Cc: Jon Hunter 
---

  arch/arm/boot/dts/rk3288.dtsi | 5 +
  1 file changed, 5 insertions(+)

diff --git a/arch/arm/boot/dts/rk3288.dtsi
b/arch/arm/boot/dts/rk3288.dtsi
index 2f814ffeb605..94f4b7eecca2 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -1075,6 +1075,11 @@

};

};

};

+
+   i2c-bus {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   };

};

hdmi: hdmi@ff98 {



___
Linux-rockchip mailing list
linux-rockc...@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-rockchip





Re: [PATCH] ARM: dts: rockchip: add i2c-bus subnode to edp

2016-10-21 Thread ayaka


On 10/21/2016 04:25 PM, Heiko Stuebner wrote:

Am Donnerstag, 20. Oktober 2016, 15:47:56 CEST schrieb Tomeu Vizoso:

On 10/20/2016 03:45 PM, Heiko Stübner wrote:

Am Donnerstag, 20. Oktober 2016, 10:07:25 schrieb Tomeu Vizoso:

Add an empty 'i2c-bus' subnode to the edp node just so that the I2C core
doesn't attempt to parse the 'ports' subnode as containing i2c devices.

This is to avoid spurious failure messages such as:

i2c i2c-6: of_i2c: modalias failure on /dp@ff97/ports

On the one hand, the edp really has an i2c bus - with its only client the
EDID listening at 0x50 (and maybe 0x30).

On the other hand, adding an empty bus to the (implementation independent)
devicetree just to make the Linux i2c subsystem happy sounds heavily like
an implementation-specific hack, as the edp i2c bus doesn't leak into the
outside world otherwise.

I guess this empty i2c bus not being part of the binding document points
heavily into the implementation-specific corner :-) .

My short search on other patches touching this didn't reveal anything but
maybe this was already discussed somewhere and found to be ok?

Here it is:

http://www.spinics.net/lists/linux-tegra/msg27862.html

thanks ... I'm still not sure about the placeholder though, aka needing an
undocumented subnode to make a Linux error message silent.

Sorry, I reported the wrong result; it does work.

And about the problem at the beginning of this thread: I found that I have to
use something like an X server to access DRM, or the panel will not be powered on.
The legacy fbdev won't help.
But there is still a problem to be solved, so the eDP panel for firefly is
not ready yet.


In the thread you pointed to I also did not see any dt-maintainer involvement
pointing one way or another, but spinics is often not easy to navigate
threads, so I may have missed that.



Another option could be to just make of_i2c_register_device silent if
of_modalias_node returns -ENODEV?


Heiko


Signed-off-by: Tomeu Vizoso 
Cc: Randy Li 
Cc: Jon Hunter 
---

  arch/arm/boot/dts/rk3288.dtsi | 5 +
  1 file changed, 5 insertions(+)

diff --git a/arch/arm/boot/dts/rk3288.dtsi
b/arch/arm/boot/dts/rk3288.dtsi
index 2f814ffeb605..94f4b7eecca2 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -1075,6 +1075,11 @@

};

};

};

+
+   i2c-bus {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   };

};

hdmi: hdmi@ff98 {



___
Linux-rockchip mailing list
linux-rockc...@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-rockchip





why getrandom blocking does not work with /dev/urandom

2016-10-21 Thread Stephan Mueller
Hi Ted,

as mentioned, I looked a bit deeper into the issue of adding the blocking 
behavior of getrandom to /dev/urandom.

As you and I already identified, moving that blocking behavior to /dev/urandom 
simply does not work. The system does not boot.

The reason for this issue is actually quite simple. The init process of systemd 
reads /dev/urandom for whatever purpose. Now, when /dev/urandom blocks during 
boot, systemd will be blocked too. That means that user space (either in the 
initramfs or with the regular root partition) is not set up.

When there is no user space initialized, there are no devices set up. The 
network card is not initialized, the block devices are not mounted, other 
devices are not initialized. That means that neither interrupts nor block 
device events are registered.

The only device that does not need setup and that will generate events is a 
keyboard / mouse whose drivers are statically compiled (or other devices that 
trigger interrupts without requiring any user space setup procedure). Thus, if 
you use HID long enough, the kernel will eventually receive sufficient entropy 
from these events, release systemd and user space starts.

However, just by itself, user space will almost never start with a blocking 
behavior of /dev/urandom.

Ciao
Stephan


why getrandom blocking does not work with /dev/urandom

2016-10-21 Thread Stephan Mueller
Hi Ted,

as mentioned, I looked a bit deeper into the issue of adding the blocking 
behavior of getrandom to /dev/urandom.

As you and I already identified, moving that blocking behavior to /dev/urandom 
simply does not work. The system does not boot.

The reason for this issue is actually quite simple. The init process of systemd 
reads /dev/urandom for whatever purpose. Now, when /dev/urandom blocks during 
boot, systemd will be blocked too. That means that user space (either in the 
initramfs or with the regular root partition) is not set up.

When there is no user space initialized, there are no devices set up. The 
network card is not initialized, the block devices are not mounted, other 
devices are not initialized. That means that neither interrupts nor block 
device events are registered.

The only device that does not need setup and that will generate events is a 
keyboard / mouse whose drivers are statically compiled (or other devices that 
trigger interrupts without requiring any user space setup procedure). Thus, if 
you use HID long enough, the kernel will eventually receive sufficient entropy 
from these events, release systemd and user space starts.

However, just by itself, user space will almost never start with a blocking 
behavior of /dev/urandom.

Ciao
Stephan


Re: [PATCH v2 3/3] ARM: dts: imx6qdl-apalis: Use enable-gpios property for backlight

2016-10-21 Thread Shawn Guo
On Mon, Sep 19, 2016 at 10:41:53AM +0530, Sanchayan Maity wrote:
> Use enable-gpios property of PWM backlight driver for backlight
> control.
> 
> Signed-off-by: Sanchayan Maity 

Applied, thanks.


Re: [PATCH v2 3/3] ARM: dts: imx6qdl-apalis: Use enable-gpios property for backlight

2016-10-21 Thread Shawn Guo
On Mon, Sep 19, 2016 at 10:41:53AM +0530, Sanchayan Maity wrote:
> Use enable-gpios property of PWM backlight driver for backlight
> control.
> 
> Signed-off-by: Sanchayan Maity 

Applied, thanks.


Re: [PATCH v2 2/3] ARM: dts: imx6q-apalis-ixora: Remove use of pwm-leds

2016-10-21 Thread Shawn Guo
On Mon, Sep 19, 2016 at 10:41:52AM +0530, Sanchayan Maity wrote:
> Remove use of pwm-leds and use the standard /sys/class/pwm
> interface from PWM subsystem.
> 
> Signed-off-by: Sanchayan Maity 
> Acked-by: Marcel Ziswiler 

Applied, thanks.


Re: [PATCH v2 2/3] ARM: dts: imx6q-apalis-ixora: Remove use of pwm-leds

2016-10-21 Thread Shawn Guo
On Mon, Sep 19, 2016 at 10:41:52AM +0530, Sanchayan Maity wrote:
> Remove use of pwm-leds and use the standard /sys/class/pwm
> interface from PWM subsystem.
> 
> Signed-off-by: Sanchayan Maity 
> Acked-by: Marcel Ziswiler 

Applied, thanks.


Re: Re: [PATCH] pcie: aer: aerdrv: PCIe AER workaround and handling for ASR1K platforms.

2016-10-21 Thread David Singleton -X (davsingl - MONTA VISTA SOFTWARE INC at Cisco)
Bjorn,

   sorry for the delay in getting back to you.  The first patch was incomplete. 
 The patch was not a 
complete unit.  There was a second patch that has the callers of the
routines in question.

   Appended are the two patches merged into one new patch.  And here is an 
explanation from
the original Author,  Steve Shih.

Hi Steve & David,

On Mon, Oct 17, 2016 at 09:51:06AM -0700, David Singleton wrote:
> From: Steve Shih 


> 
> ASR1K FPGAs and ASICs are configured to raise SERR/PERR through PCIe AER.
> When an error is raised, it is detected at the root complex, but it is not
> detected by the AER driver. If the root complex bridge control register is
> configured to forward secondary bus errors to the primary bus (which is not
> the case by default), then the aerdrv.c:aer_irq() is invoked, but the id read
> from the PCI_ERR_ROOT_COR_SRC register is 0. When aer_isr_one_error()
> processes the work posted by aer_irq(), it subsequently complains that
> "aer_isr_one_err->can't find device of ID".
> 
> Modifications need to be made such that PCIe AER are propagated through the
> root complex detected by the AER driver and delivered to the ASR1K PCI error
> handler.
> 
> In addition, MCH5100 and 3500/5500 JF send broadcast EOI to subordinate
> devices. However, the Cisco FPGAs and ASICs don't handle the vendor (Intel)
> specific messages properly and raise Uncorrectable and Unsupported Request
> errors. Thus, need to disable EOI Broadcast.
> 
> This change is needed to support 1RU, FP40, Kingpin, FP80, and FP160.

Can you help me understand this?  I'm having trouble connecting the
changelog to the patch.  The patch adds a pci_aer_set_callbacks()
interface, but no users of it.  It also adds a pci_fixup_aer_enable
fixup phase, but it is also unused.

The changelog mentions a change to the root complex bridge control
register, but I don't see that in the patch.  It also mentions a
broadcast EOI change, which also doesn't appear in the patch.

We have another platform where AER doesn't work with the existing
Linux driver; see [1].  It'd be nice if it turned out that the same
sort of change would help both that system and your Cisco platforms.

I'm familiar with the normal PCI Bridge Control Register.  But I don't
know what the "root complex bridge control register" is.  Can you
point me to a section of the spec?  Since you mention forwarding
secondary bus errors to the primary bus, maybe you mean a Root Port
bridge control register?

Is this a case of the hardware not quite conforming to the spec, or is
it a case of spec-compliant hardware where Linux is just missing
support for this particular case?

I'm going to resist adding a new fixup phase, especially one as
special-purpose as this one appears to be.  Without seeing the way you
want to actually use it, it's hard to tell, but likely one of the
existing fixup phases would be enough.

Bjorn



  Yes, it’s the root port PCI bridge control register at offset 0x3e:
 
#define PCI_BRIDGE_CONTROL  0x3e
#define  PCI_BRIDGE_CTL_PARITY        0x01  /* Enable parity detection on secondary interface */
#define  PCI_BRIDGE_CTL_SERR          0x02  /* The same for SERR forwarding */
#define  PCI_BRIDGE_CTL_ISA           0x04  /* Enable ISA mode */
#define  PCI_BRIDGE_CTL_VGA           0x08  /* Forward VGA addresses */
#define  PCI_BRIDGE_CTL_MASTER_ABORT  0x20  /* Report master aborts */
#define  PCI_BRIDGE_CTL_BUS_RESET     0x40  /* Secondary bus reset */
#define  PCI_BRIDGE_CTL_FAST_BACK     0x80  /* Fast Back2Back enabled on secondary interface */
 
/*
 * We must also forward #SERR and #PERR from the secondary
 * to primary bus.  This will result in the AER driver
 * receiving an interrupt that can then be delivered to
 * the device specific driver.
 */
pci_read_config_word(pdev, PCI_BRIDGE_CONTROL, &reg16);
reg16 |= PCI_BRIDGE_CTL_PARITY | PCI_BRIDGE_CTL_SERR;
pci_write_config_word(pdev, PCI_BRIDGE_CONTROL, reg16);
 
 
Yes, the Cisco FPGA/ASIC is not conforming to the PCIe standard in handling 
vendor specific messages. Instead of ignoring the Intel specific messages, the 
FPGA/ASIC raises Uncorrectable and Unsupported Request errors.
 
/*
 * 3500/5500 series CPUs (JF) send broadcast EOI to
 * subordinate devices. It is a vendor (Intel) specific
 * message that should be ignored by non-Intel devices,
 * but our devices (Yoda etc) do not ignore it and
 * raise Uncorrectable and Unsupported Request
 * errors.
 *
 * The EOI is for the Intel IO APIC, which is not
 * present and therefore not required.
 *
 * Disable EOI Broadcast to avoid Uncorrectable and
 * Unsupported request errors from devices which do
 * not support the EOI and do not adhere to the PCIe
 * spec.
 */
pci_read_config_dword(pdev, MISCCTRLSTS_REG, &reg32);
reg32 |= 

Re: Re: [PATCH] pcie: aer: aerdrv: PCIe AER workaround and handling for ASR1K platforms.

2016-10-21 Thread David Singleton -X (davsingl - MONTA VISTA SOFTWARE INC at Cisco)
Bjorn,

   sorry for the delay in getting back to you.  The first patch was incomplete. 
 The patch was not a 
complete unit.  There was a second patch that has the callers of the
routines in question.

   Appended are the two patches merged into one new patch.  And here is an 
explanation from
the original Author,  Steve Shih.

Hi Steve & David,

On Mon, Oct 17, 2016 at 09:51:06AM -0700, David Singleton wrote:
> From: Steve Shih 


> 
> ASR1K FPGAs and ASICs are configured to raise SERR/PERR through PCIe AER.
> When an error is raised, it is detected at the root complex, but it is not
> detected by the AER driver. If the root complex bridge control register is
> configured to forward secondary bus errors to the primary bus (which is not
> the case by default), then the aerdrv.c:aer_irq() is invoked, but the id read
> from the PCI_ERR_ROOT_COR_SRC register is 0. When aer_isr_one_error()
> processes the work posted by aer_irq(), it subsequently complains that
> "aer_isr_one_err->can't find device of ID".
> 
> Modifications need to be made such that PCIe AER are propagated through the
> root complex detected by the AER driver and delivered to the ASR1K PCI error
> handler.
> 
> In addition, MCH5100 and 3500/5500 JF send broadcast EOI to subordinate
> devices. However, the Cisco FPGAs and ASICs don't handle the vendor (Intel)
> specific messages properly and raise Uncorrectable and Unsupported Request
> errors. Thus, need to disable EOI Broadcast.
> 
> This change is needed to support 1RU, FP40, Kingpin, FP80, and FP160.

Can you help me understand this?  I'm having trouble connecting the
changelog to the patch.  The patch adds a pci_aer_set_callbacks()
interface, but no users of it.  It also adds a pci_fixup_aer_enable
fixup phase, but it is also unused.

The changelog mentions a change to the root complex bridge control
register, but I don't see that in the patch.  It also mentions a
broadcast EOI change, which also doesn't appear in the patch.

We have another platform where AER doesn't work with the existing
Linux driver; see [1].  It'd be nice if it turned out that the same
sort of change would help both that system and your Cisco platforms.

I'm familiar with the normal PCI Bridge Control Register.  But I don't
know what the "root complex bridge control register" is.  Can you
point me to a section of the spec?  Since you mention forwarding
secondary bus errors to the primary bus, maybe you mean a Root Port
bridge control register?

Is this a case of the hardware not quite conforming to the spec, or is
it a case of spec-compliant hardware where Linux is just missing
support for this particular case?

I'm going to resist adding a new fixup phase, especially one as
special-purpose as this one appears to be.  Without seeing the way you
want to actually use it, it's hard to tell, but likely one of the
existing fixup phases would be enough.

Bjorn



  Yes, it’s the root port PCI bridge control register at offset 0x3e:
 
#define PCI_BRIDGE_CONTROL  0x3e
#define  PCI_BRIDGE_CTL_PARITY        0x01  /* Enable parity detection on secondary interface */
#define  PCI_BRIDGE_CTL_SERR          0x02  /* The same for SERR forwarding */
#define  PCI_BRIDGE_CTL_ISA           0x04  /* Enable ISA mode */
#define  PCI_BRIDGE_CTL_VGA           0x08  /* Forward VGA addresses */
#define  PCI_BRIDGE_CTL_MASTER_ABORT  0x20  /* Report master aborts */
#define  PCI_BRIDGE_CTL_BUS_RESET     0x40  /* Secondary bus reset */
#define  PCI_BRIDGE_CTL_FAST_BACK     0x80  /* Fast Back2Back enabled on secondary interface */
 
/*
 * We must also forward #SERR and #PERR from the secondary
 * to primary bus.  This will result in the AER driver
 * receiving an interrupt that can then be delivered to
 * the device specific driver.
 */
pci_read_config_word(pdev, PCI_BRIDGE_CONTROL, &reg16);
reg16 |= PCI_BRIDGE_CTL_PARITY | PCI_BRIDGE_CTL_SERR;
pci_write_config_word(pdev, PCI_BRIDGE_CONTROL, reg16);
 
 
Yes, the Cisco FPGA/ASIC is not conforming to the PCIe standard in handling 
vendor specific messages. Instead of ignoring the Intel specific messages, the 
FPGA/ASIC raises Uncorrectable and Unsupported Request errors.
 
/*
 * 3500/5500 series CPUs (JF) send broadcast EOI to
 * subordinate devices. It is a vendor (Intel) specific
 * message that should be ignored by non-Intel devices,
 * but our devices (Yoda etc) do not ignore it and
 * raise Uncorrectable and Unsupported Request
 * errors.
 *
 * The EOI is for the Intel IO APIC, which is not
 * present and therefore not required.
 *
 * Disable EOI Broadcast to avoid Uncorrectable and
 * Unsupported request errors from devices which do
 * not support the EOI and do not adhere to the PCIe
 * spec.
 */
pci_read_config_dword(pdev, MISCCTRLSTS_REG, &reg32);
reg32 |= 

Re: [PATCH v2 1/3] ARM: dts: imx6qdl-apalis: Do not rely on DDC I2C bus bitbang for HDMI

2016-10-21 Thread Shawn Guo
On Mon, Sep 19, 2016 at 10:41:51AM +0530, Sanchayan Maity wrote:
> Remove the use of DDC I2C bus bitbang to support reading of EDID
> and rely on support from internal HDMI I2C master controller instead.
> As a result remove the device tree property ddc-i2c-bus.
> 
> Signed-off-by: Sanchayan Maity 

I think that the dw-hdmi i2c support [1] is a prerequisite of this
patch.  I do not see that it has landed in v4.9-rc1.  Or am I missing something?

Shawn

[1] https://patchwork.kernel.org/patch/9296883/

> ---
> Changes since v1:
> 
> Change the ranking in i2c aliases
> 
> v1: https://lkml.org/lkml/2016/9/14/55
> ---
>  arch/arm/boot/dts/imx6q-apalis-ixora.dts | 12 +++-
>  arch/arm/boot/dts/imx6qdl-apalis.dtsi| 25 +
>  2 files changed, 12 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/arm/boot/dts/imx6q-apalis-ixora.dts 
> b/arch/arm/boot/dts/imx6q-apalis-ixora.dts
> index 207b85b..82b81e0 100644
> --- a/arch/arm/boot/dts/imx6q-apalis-ixora.dts
> +++ b/arch/arm/boot/dts/imx6q-apalis-ixora.dts
> @@ -55,10 +55,9 @@
>"fsl,imx6q";
>  
>   aliases {
> - i2c0 = 
> - i2c1 = 
> - i2c2 = 
> - i2c3 = 
> + i2c0 = 
> + i2c1 = 
> + i2c2 = 
>   };
>  
>   aliases {
> @@ -186,11 +185,6 @@
>  };
>  
>   {
> - ddc-i2c-bus = <>;
> - status = "okay";
> -};
> -
> - {
>   status = "okay";
>  };
>  
> diff --git a/arch/arm/boot/dts/imx6qdl-apalis.dtsi 
> b/arch/arm/boot/dts/imx6qdl-apalis.dtsi
> index 99e323b..8c67dd8 100644
> --- a/arch/arm/boot/dts/imx6qdl-apalis.dtsi
> +++ b/arch/arm/boot/dts/imx6qdl-apalis.dtsi
> @@ -53,18 +53,6 @@
>   status = "disabled";
>   };
>  
> - /* DDC_I2C: I2C2_SDA/SCL on MXM3 205/207 */
> - i2cddc: i2c@0 {
> - compatible = "i2c-gpio";
> - pinctrl-names = "default";
> - pinctrl-0 = <&pinctrl_i2c_ddc>;
> - gpios = <&gpio3 16 GPIO_ACTIVE_HIGH /* sda */
> -   &gpio2 30 GPIO_ACTIVE_HIGH /* scl */
> - >;
> - i2c-gpio,delay-us = <2>;/* ~100 kHz */
> - status = "disabled";
> - };
> -
>   reg_1p8v: regulator-1p8v {
>   compatible = "regulator-fixed";
>   regulator-name = "1P8V";
> @@ -209,6 +197,12 @@
>   };
>  };
>  
> + {
> + pinctrl-names = "default";
> + pinctrl-0 = <&pinctrl_hdmi_ddc>;
> + status = "disabled";
> +};
> +
>  /*
>   * GEN1_I2C: I2C1_SDA/SCL on MXM3 209/211 (e.g. RTC on carrier
>   * board)
> @@ -633,11 +627,10 @@
>   >;
>   };
>  
> - pinctrl_i2c_ddc: gpioi2cddcgrp {
> + pinctrl_hdmi_ddc: hdmiddcgrp {
>   fsl,pins = <
> - /* DDC bitbang */
> - MX6QDL_PAD_EIM_EB2__GPIO2_IO30 0x1b0b0
> - MX6QDL_PAD_EIM_D16__GPIO3_IO16 0x1b0b0
> + MX6QDL_PAD_EIM_EB2__HDMI_TX_DDC_SCL 0x4001b8b1
> + MX6QDL_PAD_EIM_D16__HDMI_TX_DDC_SDA 0x4001b8b1
>   >;
>   };
>  
> -- 
> 2.9.3
> 


Re: [PATCH v2 1/3] ARM: dts: imx6qdl-apalis: Do not rely on DDC I2C bus bitbang for HDMI

2016-10-21 Thread Shawn Guo
On Mon, Sep 19, 2016 at 10:41:51AM +0530, Sanchayan Maity wrote:
> Remove the use of DDC I2C bus bitbang to support reading of EDID
> and rely on support from internal HDMI I2C master controller instead.
> As a result remove the device tree property ddc-i2c-bus.
> 
> Signed-off-by: Sanchayan Maity 

I think that the dw-hdmi i2c support [1] is a prerequisite of this
patch.  I do not see that it has landed in v4.9-rc1.  Or am I missing something?

Shawn

[1] https://patchwork.kernel.org/patch/9296883/

> ---
> Changes since v1:
> 
> Change the ranking in i2c aliases
> 
> v1: https://lkml.org/lkml/2016/9/14/55
> ---
>  arch/arm/boot/dts/imx6q-apalis-ixora.dts | 12 +++-
>  arch/arm/boot/dts/imx6qdl-apalis.dtsi| 25 +
>  2 files changed, 12 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/arm/boot/dts/imx6q-apalis-ixora.dts 
> b/arch/arm/boot/dts/imx6q-apalis-ixora.dts
> index 207b85b..82b81e0 100644
> --- a/arch/arm/boot/dts/imx6q-apalis-ixora.dts
> +++ b/arch/arm/boot/dts/imx6q-apalis-ixora.dts
> @@ -55,10 +55,9 @@
>"fsl,imx6q";
>  
>   aliases {
> - i2c0 = 
> - i2c1 = 
> - i2c2 = 
> - i2c3 = 
> + i2c0 = 
> + i2c1 = 
> + i2c2 = 
>   };
>  
>   aliases {
> @@ -186,11 +185,6 @@
>  };
>  
>   {
> - ddc-i2c-bus = <>;
> - status = "okay";
> -};
> -
> - {
>   status = "okay";
>  };
>  
> diff --git a/arch/arm/boot/dts/imx6qdl-apalis.dtsi 
> b/arch/arm/boot/dts/imx6qdl-apalis.dtsi
> index 99e323b..8c67dd8 100644
> --- a/arch/arm/boot/dts/imx6qdl-apalis.dtsi
> +++ b/arch/arm/boot/dts/imx6qdl-apalis.dtsi
> @@ -53,18 +53,6 @@
>   status = "disabled";
>   };
>  
> - /* DDC_I2C: I2C2_SDA/SCL on MXM3 205/207 */
> - i2cddc: i2c@0 {
> - compatible = "i2c-gpio";
> - pinctrl-names = "default";
> - pinctrl-0 = <&pinctrl_i2c_ddc>;
> - gpios = <&gpio3 16 GPIO_ACTIVE_HIGH /* sda */
> -   &gpio2 30 GPIO_ACTIVE_HIGH /* scl */
> - >;
> - i2c-gpio,delay-us = <2>;/* ~100 kHz */
> - status = "disabled";
> - };
> -
>   reg_1p8v: regulator-1p8v {
>   compatible = "regulator-fixed";
>   regulator-name = "1P8V";
> @@ -209,6 +197,12 @@
>   };
>  };
>  
> + {
> + pinctrl-names = "default";
> + pinctrl-0 = <&pinctrl_hdmi_ddc>;
> + status = "disabled";
> +};
> +
>  /*
>   * GEN1_I2C: I2C1_SDA/SCL on MXM3 209/211 (e.g. RTC on carrier
>   * board)
> @@ -633,11 +627,10 @@
>   >;
>   };
>  
> - pinctrl_i2c_ddc: gpioi2cddcgrp {
> + pinctrl_hdmi_ddc: hdmiddcgrp {
>   fsl,pins = <
> - /* DDC bitbang */
> - MX6QDL_PAD_EIM_EB2__GPIO2_IO30 0x1b0b0
> - MX6QDL_PAD_EIM_D16__GPIO3_IO16 0x1b0b0
> + MX6QDL_PAD_EIM_EB2__HDMI_TX_DDC_SCL 0x4001b8b1
> + MX6QDL_PAD_EIM_D16__HDMI_TX_DDC_SDA 0x4001b8b1
>   >;
>   };
>  
> -- 
> 2.9.3
> 


Re: [PATCH] ARM: dts: imx6: Add support for Toradex Colibri iMX6 module

2016-10-21 Thread Shawn Guo
On Wed, Sep 21, 2016 at 04:54:38PM +0530, Sanchayan Maity wrote:
> Add support for Toradex Colibri iMX6 module.
> 
> Signed-off-by: Sanchayan Maity 

Applied, thanks.


Re: [PATCH] ARM: dts: imx6: Add support for Toradex Colibri iMX6 module

2016-10-21 Thread Shawn Guo
On Wed, Sep 21, 2016 at 04:54:38PM +0530, Sanchayan Maity wrote:
> Add support for Toradex Colibri iMX6 module.
> 
> Signed-off-by: Sanchayan Maity 

Applied, thanks.


perf: perf_fuzzer triggers vmalloc_fault (then crashes)

2016-10-21 Thread Vince Weaver

This is on an AMD a10 system.  With paranoid=1.  Think it's 
probably unrelated to the (unresolved) AMD IBS issues.
This is 4.9-rc0 just before rc1 (can't get actual rc1 to boot)

Machine locks hard after this.

[ 8098.085662] BAD LUCK: lost 42 message(s) from NMI context!
[ 8098.085663] [ cut here ]
[ 8098.085664] WARNING: CPU: 0 PID: 21338 at arch/x86/mm/fault.c:435 
vmalloc_fault+0x58/0x1f0
[ 8098.085668] CPU: 0 PID: 21338 Comm: perf_fuzzer Not tainted 4.8.0+ #37
[ 8098.085668] Hardware name: Hewlett-Packard HP Compaq Pro 6305 SFF/1850, BIOS 
K06 v02.57 08/16/2013
[ 8098.085670] Call Trace:
[ 8098.085670][] ? dump_stack+0x46/0x59
[ 8098.085670]  [] ? __warn+0xd5/0xee
[ 8098.085671]  [] ? vmalloc_fault+0x58/0x1f0
[ 8098.085671]  [] ? __do_page_fault+0x6d/0x48e
[ 8098.085671]  [] ? perf_log_throttle+0xa4/0xf4
[ 8098.085672]  [] ? trace_page_fault+0x22/0x30
[ 8098.085672]  [] ? __unwind_start+0x28/0x42
[ 8098.085672]  [] ? perf_callchain_kernel+0x75/0xac
[ 8098.085672]  [] ? get_perf_callchain+0x13a/0x1f0
[ 8098.085673]  [] ? perf_callchain+0x6a/0x6c
[ 8098.085673]  [] ? perf_prepare_sample+0x71/0x2eb
[ 8098.085673]  [] ? perf_event_output_forward+0x1a/0x54
[ 8098.085674]  [] ? __default_send_IPI_shortcut+0x10/0x2d
[ 8098.085674]  [] ? __perf_event_overflow+0xfb/0x167
[ 8098.085674]  [] ? x86_pmu_handle_irq+0x113/0x150
[ 8098.085675]  [] ? native_read_msr+0x6/0x34
[ 8098.085675]  [] ? perf_event_nmi_handler+0x22/0x39
[ 8098.085675]  [] ? perf_ibs_nmi_handler+0x4a/0x51
[ 8098.085676]  [] ? perf_event_nmi_handler+0x22/0x39
[ 8098.085676]  [] ? nmi_handle+0x4d/0xf0
[ 8098.085676]  [] ? perf_ibs_handle_irq+0x3d1/0x3d1
[ 8098.085676]  [] ? default_do_nmi+0x3c/0xd5
[ 8098.085677]  [] ? do_nmi+0x92/0x102
[ 8098.085677]  [] ? end_repeat_nmi+0x1a/0x1e
[ 8098.085677]  [] ? entry_SYSCALL_64_after_swapgs+0x12/0x4a
[ 8098.085678]  [] ? entry_SYSCALL_64_after_swapgs+0x12/0x4a
[ 8098.085678]  [] ? entry_SYSCALL_64_after_swapgs+0x12/0x4a
[ 8098.085678]   ^A4---[ end trace 632723104d47d31a ]---
[ 8098.085679] BUG: stack guard page was hit at c9000850 (stack is 
c900084fc000..c900084f)
[ 8098.085679] kernel stack overflow (page fault):  [#1] SMP
[ 8098.085683] CPU: 0 PID: 21338 Comm: perf_fuzzer Tainted: GW   
4.8.0+ #37
[ 8098.085683] Hardware name: Hewlett-Packard HP Compaq Pro 6305 SFF/1850, BIOS 
K06 v02.57 08/16/2013
[ 8098.085684] task: 8802265d2080 task.stack: c900084fc000
[ 8098.085684] RIP: 0010:[] ^Ac [] 
__unwind_start+0x28/0x42
[ 8098.085684] RSP: 0018:88022ec05af0  EFLAGS: 00010006
[ 8098.085685] RAX: ffea RBX: 88022ec05b08 RCX: c9000850
[ 8098.085685] RDX: 88022ec0 RSI: 1000 RDI: c4d0
[ 8098.085685] RBP: c9000850 R08: 88022ec08000 R09: 
[ 8098.085686] R10: 0002 R11: 0206 R12: 88022ec05b70
[ 8098.085686] R13: 88022ec05ef8 R14:  R15: 0001
[ 8098.085687] FS:  7f06e791c700() GS:88022ec0() 
knlGS:
[ 8098.085687] CS:  0010 DS:  ES:  CR0: 80050033
[ 8098.085687] CR2: c9000850 CR3: 000223c25000 CR4: 000407f0
[ 8098.085688] DR0:  DR1: 5fc8 DR2: 5fc8
[ 8098.085688] DR3:  DR6: 0ff0 DR7: 0600
[ 8098.085689] Call Trace:
[ 8098.085690]   ^Ad [] ? perf_callchain_kernel+0x75/0xac
[ 8098.085690]  [] ? vsnprintf+0x380/0x3b4
[ 8098.085690]  [] ? sprintf+0x42/0x4a
[ 8098.085691]  [] ? __sprint_symbol+0x9d/0xd1
[ 8098.085691]  [] ? symbol_string+0x51/0x5d
[ 8098.085691]  [] ? __sprint_symbol+0x9d/0xd1
[ 8098.085692]  [] ? symbol_string+0x51/0x5d
[ 8098.085692]  [] ? pointer+0x85/0x379
[ 8098.085692]  [] ? vsnprintf+0x80/0x3b4
[ 8098.085692]  [] ? irq_work_queue+0xa/0x66
[ 8098.085693]  [] ? vprintk_nmi+0x88/0x97
[ 8098.085693]  [] ? vprintk_nmi+0x88/0x97
[ 8098.085693]  [] ? printk+0x43/0x4b
[ 8098.085694]  [] ? __module_text_address+0x9/0x4f
[ 8098.085694]  [] ? is_module_text_address+0x5/0xc
[ 8098.085694]  [] ? show_trace_log_lvl+0x108/0x195
[ 8098.085694]  [] ? no_context+0x102/0x36c
[ 8098.085695]  [] ? no_context+0x102/0x36c
[ 8098.085695]  [] ? show_stack_log_lvl+0x15b/0x172
[ 8098.085695]  [] ? show_regs+0x64/0x136
[ 8098.085696]  [] ? __die+0x8c/0xc4
[ 8098.085696]  [] ? die+0x3d/0x56
[ 8098.085696]  [] ? handle_stack_overflow+0x47/0x51
[ 8098.085697]  [] ? no_context+0x102/0x36c
[ 8098.085697]   ^AdCode: ^A1BUG: unable to handle kernel ^AcNULL pointer 
dereference^Ac at 0008
[ 8098.085697] IP:^Ac [<0008>] 0x8
[ 8098.085698] PGD 2231d5067 PUD 225162067 PMD 0 
[ 8098.085698] Oops: 0010 [#2] SMP
[ 8098.085702] 
[ 8098.957250] ---[ end trace 632723104d47d31b ]---
[ 8098.957250] Kernel panic - not syncing: Fatal exception in interrupt
[ 8098.957301] Kernel Offset: disabled
[ 8098.973814] ---[ end Kernel panic - not syncing: Fatal 

perf: perf_fuzzer triggers vmalloc_fault (then crashes)

2016-10-21 Thread Vince Weaver

This is on an AMD a10 system.  With paranoid=1.  Think it's 
probably unrelated to the (unresolved) AMD IBS issues.
This is 4.9-rc0 just before rc1 (can't get actual rc1 to boot)

Machine locks hard after this.

[ 8098.085662] BAD LUCK: lost 42 message(s) from NMI context!
[ 8098.085663] [ cut here ]
[ 8098.085664] WARNING: CPU: 0 PID: 21338 at arch/x86/mm/fault.c:435 
vmalloc_fault+0x58/0x1f0
[ 8098.085668] CPU: 0 PID: 21338 Comm: perf_fuzzer Not tainted 4.8.0+ #37
[ 8098.085668] Hardware name: Hewlett-Packard HP Compaq Pro 6305 SFF/1850, BIOS 
K06 v02.57 08/16/2013
[ 8098.085670] Call Trace:
[ 8098.085670][] ? dump_stack+0x46/0x59
[ 8098.085670]  [] ? __warn+0xd5/0xee
[ 8098.085671]  [] ? vmalloc_fault+0x58/0x1f0
[ 8098.085671]  [] ? __do_page_fault+0x6d/0x48e
[ 8098.085671]  [] ? perf_log_throttle+0xa4/0xf4
[ 8098.085672]  [] ? trace_page_fault+0x22/0x30
[ 8098.085672]  [] ? __unwind_start+0x28/0x42
[ 8098.085672]  [] ? perf_callchain_kernel+0x75/0xac
[ 8098.085672]  [] ? get_perf_callchain+0x13a/0x1f0
[ 8098.085673]  [] ? perf_callchain+0x6a/0x6c
[ 8098.085673]  [] ? perf_prepare_sample+0x71/0x2eb
[ 8098.085673]  [] ? perf_event_output_forward+0x1a/0x54
[ 8098.085674]  [] ? __default_send_IPI_shortcut+0x10/0x2d
[ 8098.085674]  [] ? __perf_event_overflow+0xfb/0x167
[ 8098.085674]  [] ? x86_pmu_handle_irq+0x113/0x150
[ 8098.085675]  [] ? native_read_msr+0x6/0x34
[ 8098.085675]  [] ? perf_event_nmi_handler+0x22/0x39
[ 8098.085675]  [] ? perf_ibs_nmi_handler+0x4a/0x51
[ 8098.085676]  [] ? perf_event_nmi_handler+0x22/0x39
[ 8098.085676]  [] ? nmi_handle+0x4d/0xf0
[ 8098.085676]  [] ? perf_ibs_handle_irq+0x3d1/0x3d1
[ 8098.085676]  [] ? default_do_nmi+0x3c/0xd5
[ 8098.085677]  [] ? do_nmi+0x92/0x102
[ 8098.085677]  [] ? end_repeat_nmi+0x1a/0x1e
[ 8098.085677]  [] ? entry_SYSCALL_64_after_swapgs+0x12/0x4a
[ 8098.085678]  [] ? entry_SYSCALL_64_after_swapgs+0x12/0x4a
[ 8098.085678]  [] ? entry_SYSCALL_64_after_swapgs+0x12/0x4a
[ 8098.085678]   ^A4---[ end trace 632723104d47d31a ]---
[ 8098.085679] BUG: stack guard page was hit at c9000850 (stack is 
c900084fc000..c900084f)
[ 8098.085679] kernel stack overflow (page fault):  [#1] SMP
[ 8098.085683] CPU: 0 PID: 21338 Comm: perf_fuzzer Tainted: GW   
4.8.0+ #37
[ 8098.085683] Hardware name: Hewlett-Packard HP Compaq Pro 6305 SFF/1850, BIOS 
K06 v02.57 08/16/2013
[ 8098.085684] task: 8802265d2080 task.stack: c900084fc000
[ 8098.085684] RIP: 0010:[] ^Ac [] 
__unwind_start+0x28/0x42
[ 8098.085684] RSP: 0018:88022ec05af0  EFLAGS: 00010006
[ 8098.085685] RAX: ffea RBX: 88022ec05b08 RCX: c9000850
[ 8098.085685] RDX: 88022ec0 RSI: 1000 RDI: c4d0
[ 8098.085685] RBP: c9000850 R08: 88022ec08000 R09: 
[ 8098.085686] R10: 0002 R11: 0206 R12: 88022ec05b70
[ 8098.085686] R13: 88022ec05ef8 R14:  R15: 0001
[ 8098.085687] FS:  7f06e791c700() GS:88022ec0() 
knlGS:
[ 8098.085687] CS:  0010 DS:  ES:  CR0: 80050033
[ 8098.085687] CR2: c9000850 CR3: 000223c25000 CR4: 000407f0
[ 8098.085688] DR0:  DR1: 5fc8 DR2: 5fc8
[ 8098.085688] DR3:  DR6: 0ff0 DR7: 0600
[ 8098.085689] Call Trace:
[ 8098.085690]   ^Ad [] ? perf_callchain_kernel+0x75/0xac
[ 8098.085690]  [] ? vsnprintf+0x380/0x3b4
[ 8098.085690]  [] ? sprintf+0x42/0x4a
[ 8098.085691]  [] ? __sprint_symbol+0x9d/0xd1
[ 8098.085691]  [] ? symbol_string+0x51/0x5d
[ 8098.085691]  [] ? __sprint_symbol+0x9d/0xd1
[ 8098.085692]  [] ? symbol_string+0x51/0x5d
[ 8098.085692]  [] ? pointer+0x85/0x379
[ 8098.085692]  [] ? vsnprintf+0x80/0x3b4
[ 8098.085692]  [] ? irq_work_queue+0xa/0x66
[ 8098.085693]  [] ? vprintk_nmi+0x88/0x97
[ 8098.085693]  [] ? vprintk_nmi+0x88/0x97
[ 8098.085693]  [] ? printk+0x43/0x4b
[ 8098.085694]  [] ? __module_text_address+0x9/0x4f
[ 8098.085694]  [] ? is_module_text_address+0x5/0xc
[ 8098.085694]  [] ? show_trace_log_lvl+0x108/0x195
[ 8098.085694]  [] ? no_context+0x102/0x36c
[ 8098.085695]  [] ? no_context+0x102/0x36c
[ 8098.085695]  [] ? show_stack_log_lvl+0x15b/0x172
[ 8098.085695]  [] ? show_regs+0x64/0x136
[ 8098.085696]  [] ? __die+0x8c/0xc4
[ 8098.085696]  [] ? die+0x3d/0x56
[ 8098.085696]  [] ? handle_stack_overflow+0x47/0x51
[ 8098.085697]  [] ? no_context+0x102/0x36c
[ 8098.085697]   ^AdCode: ^A1BUG: unable to handle kernel ^AcNULL pointer 
dereference^Ac at 0008
[ 8098.085697] IP:^Ac [<0008>] 0x8
[ 8098.085698] PGD 2231d5067 PUD 225162067 PMD 0 
[ 8098.085698] Oops: 0010 [#2] SMP
[ 8098.085702] 
[ 8098.957250] ---[ end trace 632723104d47d31b ]---
[ 8098.957250] Kernel panic - not syncing: Fatal exception in interrupt
[ 8098.957301] Kernel Offset: disabled
[ 8098.973814] ---[ end Kernel panic - not syncing: Fatal 

Re: [PATCH][v12] PM / hibernate: Verify the consistent of e820 memory map by md5 digest

2016-10-21 Thread joeyli
Hi Chen Yu,

On Thu, Oct 20, 2016 at 04:14:52PM +0800, Chen Yu wrote:
> On some platforms, there is occasional panic triggered when
> trying to resume from hibernation, a typical panic looks like:
> 
> "BUG: unable to handle kernel paging request at 880085894000
> IP: [] load_image_lzo+0x8c2/0xe70"
> 
> Investigation carried out by Lee Chun-Yi shows that this is because
> e820 map has been changed by BIOS across hibernation, and one
> of the page frames from suspend kernel is right located in restore
> kernel's unmapped region, so panic comes out when accessing unmapped
> kernel address.
> 
> In order to expose this issue earlier, the md5 hash of e820 map
> is passed from suspend kernel to restore kernel, and the restore
> kernel will terminate the resume process once it finds the md5
> hash are not the same.
> 
> As the format of image header has been modified, the magic number
> should also be adjusted as kernels with the same RESTORE_MAGIC have
> to use the same header format and interpret all of the fields in
> it in the same way.
> 
> If the suspend kernel is built without md5 support, and the restore
> kernel has md5 support, then the latter will bypass the check process.
> Vice versa the restore kernel will bypass the check if it does not
> support md5 operation.
> 
> Note:
> 1. Without this patch applied, it is possible that BIOS has
>provided an inconsistent memory map, but the resume kernel is still
>able to restore the image anyway(e.g, E820_RAM region is the superset
>of the previous one), although the system might be unstable. So this
>patch tries to treat any inconsistent e820 as illegal.
> 
> 2. Another case is, this patch relies on comparing the e820_saved, but
>currently the e820_saved might not be strictly the same across
>hibernation, even if BIOS has provided consistent e820 map - In
>theory mptable might modify the BIOS-provided e820_saved dynamically
>in early_reserve_e820_mpc_new, which would allocate a buffer from
>E820_RAM, and marks it from E820_RAM to E820_RESERVED).
>This is a potential and rare case we need to deal with in OS in
>the future.
> 
> Suggested-by: Pavel Machek 
> Suggested-by: Rafael J. Wysocki 
> Cc: Rafael J. Wysocki 
> Cc: Pavel Machek 
> Cc: Lee Chun-Yi 
> Cc: Borislav Petkov 
> Cc: Len Brown 
> Cc: Denys Vlasenko 
> Cc: Dan Williams 
> Signed-off-by: Chen Yu 

Please feel free to add:
Reviewed-by: Lee, Chun-Yi 

> ---
> v12:
>  - Adding more user-friendly warnings when md5 confliction
>is detected.
>Use the actual e820_save size instead of the whole struct e820map
>to generate the md5.
>Use AHASH_REQUEST_ON_STACK as suggested by Denys Vlasenko.


Thanks
Joey Lee 


Re: [PATCH][v12] PM / hibernate: Verify the consistent of e820 memory map by md5 digest

2016-10-21 Thread joeyli
Hi Chen Yu,

On Thu, Oct 20, 2016 at 04:14:52PM +0800, Chen Yu wrote:
> On some platforms, there is occasional panic triggered when
> trying to resume from hibernation, a typical panic looks like:
> 
> "BUG: unable to handle kernel paging request at 880085894000
> IP: [] load_image_lzo+0x8c2/0xe70"
> 
> Investigation carried out by Lee Chun-Yi shows that this is because
> e820 map has been changed by BIOS across hibernation, and one
> of the page frames from suspend kernel is right located in restore
> kernel's unmapped region, so panic comes out when accessing unmapped
> kernel address.
> 
> In order to expose this issue earlier, the md5 hash of e820 map
> is passed from suspend kernel to restore kernel, and the restore
> kernel will terminate the resume process once it finds the md5
> hash are not the same.
> 
> As the format of image header has been modified, the magic number
> should also be adjusted as kernels with the same RESTORE_MAGIC have
> to use the same header format and interpret all of the fields in
> it in the same way.
> 
> If the suspend kernel is built without md5 support, and the restore
> kernel has md5 support, then the latter will bypass the check process.
> Vice versa the restore kernel will bypass the check if it does not
> support md5 operation.
> 
> Note:
> 1. Without this patch applied, it is possible that BIOS has
>provided an inconsistent memory map, but the resume kernel is still
>able to restore the image anyway(e.g, E820_RAM region is the superset
>of the previous one), although the system might be unstable. So this
>patch tries to treat any inconsistent e820 as illegal.
> 
> 2. Another case is, this patch relies on comparing the e820_saved, but
>currently the e820_save might not be strictly the same across
>hibernation, even if BIOS has provided consistent e820 map - In
>theory mptable might modify the BIOS-provided e820_saved dynamically
>in early_reserve_e820_mpc_new, which would allocate a buffer from
>E820_RAM, and marks it from E820_RAM to E820_RESERVED).
>This is a potential and rare case we need to deal with in OS in
>the future.
> 
> Suggested-by: Pavel Machek 
> Suggested-by: Rafael J. Wysocki 
> Cc: Rafael J. Wysocki 
> Cc: Pavel Machek 
> Cc: Lee Chun-Yi 
> Cc: Borislav Petkov 
> Cc: Len Brown 
> Cc: Denys Vlasenko 
> Cc: Dan Williams 
> Signed-off-by: Chen Yu 

Please feel free to add:
Reviewed-by: Lee, Chun-Yi 

> ---
> v12:
>  - Adding more user-friendly warnings when md5 confliction
>is detected.
>Use the actual e820_save size instead of the whole struct e820map
>to generate the md5.
>Use AHASH_REQUEST_ON_STACK as suggested by Denys Vlasenko.


Thanks
Joey Lee 


Geschäftsvorschlag!!!

2016-10-21 Thread postmaster



--
Geschäftsvorschlag!!!


Ich vermute das diese E-Mail eine Überraschung für Sie sein wird, aber 
es ist wahr.Ich bin bei einer routinen Überprüfung in meiner Bank (First 
National Bank von Süd Afrika) wo ich arbeite, auf einem Konto 
gestoßen,was nicht in anspruch genommen worden ist, wo derzeit USD$18.5M 
(Achtzehn Million, Fünf Hundert Tausend, US Dollar) gutgeschrieben sind.


Dieses Konto gehörte Herrn Fisher Thomas, der ein Kunde in unsere Bank 
war, der leider verstorben ist. Herr Thomas war ein gebürtiger 
Deutscher.Damit es mir möglich ist dieses Geld USD$18.5M (Achtzehn 
Million, Fünf Hundert Tausend, US Dollar) inanspruch zunehmen, benötige 
ich die zusammenarbeit eines  Ausländischen Partners wie Sie,den ich als 
Verwandter und Erbe des verstorbenen Herrn Thomas. vorstellen kann,damit 
wir das Geld inanspruch nehmen können.


Für diese Unterstützung erhalten Sie 30% der Erbschaftsumme und die 
restlichen 70% teile ich mir mit meinen zwei Arbeitskollegen, die mich 
bei dieser Transaktion ebenfalls unterstützen.Wenn Sie interessiert 
sind, können Sie mir bitte eine E-Mail schicken, damit ich Ihnen mehr 
Details zukommen lassen kann.


Schicken Sie bitte Ihre Antwort auf diese E-Mail Adresse: 
(louis.is...@aim.com)


Mit freundlichen Grüßen

Louis ISAKI


First National Bank. N.B.BITTE SENDEN SIE MIR Louis ISAKI ANTWORT ZU 
durch mein E-mail: (louis.is...@aim.com).fÜR VERTRAULICHEN GRUND. 
Schicken Sie
keine POST ZU MEINEM BÜRO-E-MAIL. If you understand english,please 
kindly reply with english.


Geschäftsvorschlag!!!

2016-10-21 Thread postmaster



--
Geschäftsvorschlag!!!


Ich vermute das diese E-Mail eine Überraschung für Sie sein wird, aber 
es ist wahr.Ich bin bei einer routinen Überprüfung in meiner Bank (First 
National Bank von Süd Afrika) wo ich arbeite, auf einem Konto 
gestoßen,was nicht in anspruch genommen worden ist, wo derzeit USD$18.5M 
(Achtzehn Million, Fünf Hundert Tausend, US Dollar) gutgeschrieben sind.


Dieses Konto gehörte Herrn Fisher Thomas, der ein Kunde in unsere Bank 
war, der leider verstorben ist. Herr Thomas war ein gebürtiger 
Deutscher.Damit es mir möglich ist dieses Geld USD$18.5M (Achtzehn 
Million, Fünf Hundert Tausend, US Dollar) inanspruch zunehmen, benötige 
ich die zusammenarbeit eines  Ausländischen Partners wie Sie,den ich als 
Verwandter und Erbe des verstorbenen Herrn Thomas. vorstellen kann,damit 
wir das Geld inanspruch nehmen können.


Für diese Unterstützung erhalten Sie 30% der Erbschaftsumme und die 
restlichen 70% teile ich mir mit meinen zwei Arbeitskollegen, die mich 
bei dieser Transaktion ebenfalls unterstützen.Wenn Sie interessiert 
sind, können Sie mir bitte eine E-Mail schicken, damit ich Ihnen mehr 
Details zukommen lassen kann.


Schicken Sie bitte Ihre Antwort auf diese E-Mail Adresse: 
(louis.is...@aim.com)


Mit freundlichen Grüßen

Louis ISAKI


First National Bank. N.B.BITTE SENDEN SIE MIR Louis ISAKI ANTWORT ZU 
durch mein E-mail: (louis.is...@aim.com).fÜR VERTRAULICHEN GRUND. 
Schicken Sie
keine POST ZU MEINEM BÜRO-E-MAIL. If you understand english,please 
kindly reply with english.


[PATCH] arch/x86: Don't try to poke disabled/non-existent APIC

2016-10-21 Thread ville . syrjala
From: Ville Syrjälä 

Apparently trying to poke a disabled or non-existent APIC
leads to a box that doesn't even boot. Let's not do that.

No real clue if this is the right fix, but at least my
P3 machine boots again.

Cc: sta...@vger.kernel.org
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Prarit Bhargava 
Cc: Peter Zijlstra 
Cc: Len Brown 
Cc: Borislav Petkov 
Cc: Andi Kleen 
Cc: Jiri Olsa 
Cc: Juergen Gross 
Cc: dyo...@redhat.com
Cc: Eric Biederman 
Cc: ke...@lists.infradead.org
Cc: Thomas Gleixner 
Fixes: 2a51fe083eba ("arch/x86: Handle non enumerated CPU after physical 
hotplug")
Signed-off-by: Ville Syrjälä 
---
 arch/x86/kernel/smpboot.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 951f093a96fe..42f5eb7b4f6c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1409,15 +1409,17 @@ __init void prefill_possible_map(void)
 
/* No boot processor was found in mptable or ACPI MADT */
if (!num_processors) {
-   int apicid = boot_cpu_physical_apicid;
-   int cpu = hard_smp_processor_id();
+   if (boot_cpu_has(X86_FEATURE_APIC)) {
+   int apicid = boot_cpu_physical_apicid;
+   int cpu = hard_smp_processor_id();
 
-   pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
+   pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
 
-   /* Make sure boot cpu is enumerated */
-   if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
-   apic->apic_id_valid(apicid))
-   generic_processor_info(apicid, boot_cpu_apic_version);
+   /* Make sure boot cpu is enumerated */
+   if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
+   apic->apic_id_valid(apicid))
+   generic_processor_info(apicid, 
boot_cpu_apic_version);
+   }
 
if (!num_processors)
num_processors = 1;
-- 
2.7.4



[PATCH] arch/x86: Don't try to poke disabled/non-existent APIC

2016-10-21 Thread ville . syrjala
From: Ville Syrjälä 

Apparently trying to poke a disabled or non-existent APIC
leads to a box that doesn't even boot. Let's not do that.

No real clue if this is the right fix, but at least my
P3 machine boots again.

Cc: sta...@vger.kernel.org
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Prarit Bhargava 
Cc: Peter Zijlstra 
Cc: Len Brown 
Cc: Borislav Petkov 
Cc: Andi Kleen 
Cc: Jiri Olsa 
Cc: Juergen Gross 
Cc: dyo...@redhat.com
Cc: Eric Biederman 
Cc: ke...@lists.infradead.org
Cc: Thomas Gleixner 
Fixes: 2a51fe083eba ("arch/x86: Handle non enumerated CPU after physical 
hotplug")
Signed-off-by: Ville Syrjälä 
---
 arch/x86/kernel/smpboot.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 951f093a96fe..42f5eb7b4f6c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1409,15 +1409,17 @@ __init void prefill_possible_map(void)
 
/* No boot processor was found in mptable or ACPI MADT */
if (!num_processors) {
-   int apicid = boot_cpu_physical_apicid;
-   int cpu = hard_smp_processor_id();
+   if (boot_cpu_has(X86_FEATURE_APIC)) {
+   int apicid = boot_cpu_physical_apicid;
+   int cpu = hard_smp_processor_id();
 
-   pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
+   pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
 
-   /* Make sure boot cpu is enumerated */
-   if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
-   apic->apic_id_valid(apicid))
-   generic_processor_info(apicid, boot_cpu_apic_version);
+   /* Make sure boot cpu is enumerated */
+   if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
+   apic->apic_id_valid(apicid))
+   generic_processor_info(apicid, 
boot_cpu_apic_version);
+   }
 
if (!num_processors)
num_processors = 1;
-- 
2.7.4



Re: [PATCH] flow_dissector: avoid uninitialized variable access

2016-10-21 Thread Linus Torvalds
On Fri, Oct 21, 2016 at 9:31 AM, Jiri Pirko  wrote:
>
> I don't see how vlan could be used uninitialized. But I understand that
> this is impossible for gcc to track it. Please just use uninitialized_var()

Actually, I think we should never use "uninitialized_var()" except
possibly for arrays or structures that gcc can complain about.

It's a horrible thing to use, in that it adds extra cruft to the
source code, and then shuts up a compiler warning (even the _reliable_
warnings from gcc).

It's much better to just initialize the variable, and if gcc some day
gets smarter and sees that it  is unnecessary and always overwritten,
so much the better. The cost of initializing a single word is
basically zero.

Linus


Re: [PATCH] flow_dissector: avoid uninitialized variable access

2016-10-21 Thread Linus Torvalds
On Fri, Oct 21, 2016 at 9:31 AM, Jiri Pirko  wrote:
>
> I don't see how vlan could be used uninitialized. But I understand that
> this is impossible for gcc to track it. Please just use uninitialized_var()

Actually, I think we should never use "uninitialized_var()" except
possibly for arrays or structures that gcc can complain about.

It's a horrible thing to use, in that it adds extra cruft to the
source code, and then shuts up a compiler warning (even the _reliable_
warnings from gcc).

It's much better to just initialize the variable, and if gcc some day
gets smarter and sees that it  is unnecessary and always overwritten,
so much the better. The cost of initializing a single word is
basically zero.

Linus


RE: [PATCH V2]usb: dwc2: Clear GUSBCFG.UsbTrdTim before setting

2016-10-21 Thread Lipengcheng


> -----Original Message-----
> From: Felipe Balbi [mailto:felipe.ba...@linux.intel.com]
> Sent: Monday, October 17, 2016 5:37 PM
> To: Lipengcheng; johny...@synopsys.com
> Cc: gre...@linuxfoundation.org; linux-...@vger.kernel.org; 
> linux-kernel@vger.kernel.org; Xuejiancheng; Lidongpo; Caizhiyong; Lipengcheng
> Subject: Re: [PATCH V2]usb: dwc2: Clear GUSBCFG.UsbTrdTim before setting
> 
> 
> Hi,
> 
> Pengcheng Li  writes:
> > The USBTRDTIM field needs to be cleared before setting a new value.
> > Otherwise it will result in an incorrect value if phyif == GUSBCFG_PHYIF8.
> >
> > Change-Id: Ib3e33cf4fd15ada41dc070ff7b93858daafbd10f
> > Signed-off-by: Pengcheng Li 
> > Acked-by: John Youn 
> 
> which commit are you fixing? Seems like you're missing a:
> 
commit c45574ff02d1d1f35a6bf4b8ad051fc06c001fc7.
> Fixes: foobar 
> 
> line here. Also, you need to remove Gerritisms from commit log ;-)
> 
Ok, I will remove at the next patch.
> --
> Balbi

Best Regards
Pengcheng Li


RE: [PATCH V2]usb: dwc2: Clear GUSBCFG.UsbTrdTim before setting

2016-10-21 Thread Lipengcheng


> -----Original Message-----
> From: Felipe Balbi [mailto:felipe.ba...@linux.intel.com]
> Sent: Monday, October 17, 2016 5:37 PM
> To: Lipengcheng; johny...@synopsys.com
> Cc: gre...@linuxfoundation.org; linux-...@vger.kernel.org; 
> linux-kernel@vger.kernel.org; Xuejiancheng; Lidongpo; Caizhiyong; Lipengcheng
> Subject: Re: [PATCH V2]usb: dwc2: Clear GUSBCFG.UsbTrdTim before setting
> 
> 
> Hi,
> 
> Pengcheng Li  writes:
> > The USBTRDTIM field needs to be cleared before setting a new value.
> > Otherwise it will result in an incorrect value if phyif == GUSBCFG_PHYIF8.
> >
> > Change-Id: Ib3e33cf4fd15ada41dc070ff7b93858daafbd10f
> > Signed-off-by: Pengcheng Li 
> > Acked-by: John Youn 
> 
> which commit are you fixing? Seems like you're missing a:
> 
commit c45574ff02d1d1f35a6bf4b8ad051fc06c001fc7.
> Fixes: foobar 
> 
> line here. Also, you need to remove Gerritisms from commit log ;-)
> 
Ok, I will remove at the next patch.
> --
> Balbi

Best Regards
Pengcheng Li


Re: [PATCH 00/12] Fix and update HOWTO Korean translation

2016-10-21 Thread SeongJae Park
On Sat, Oct 22, 2016 at 5:47 AM, Jonathan Corbet  wrote:
> On Sat, 22 Oct 2016 00:19:45 +0900
> SeongJae Park  wrote:
>
>> This patchset applies ReST conversion effort for HOWTO document to its Korean
>> translation.  It also contains fixup of trivial nitpicks in the document and
>> the translation.
>
> Thanks for doing these; let's definitely get this work in soon.  I have
> just a few comments:

Thanks for your opinion! :)


>
> - I think that bringing the (English) HOWTO into line with our posted
>   conventions makes sense, I'm happy to take patches to do that.  Let's
>   do that separately from the translation changes, though, and let's
>   think in terms of building on Mauro's work.  Either send me a patch
>   after his stuff is in, or send one for him to collect and send with the
>   rest, whatever works best for everybody else.

Agree.  I will send you the patch again after Mauro's work be merged.


>
> - You do the ReST conversion, but don't take the final step and actually
>   bring the document into the Sphinx world.
>
> I've been thinking that it would be good to make a translations/
> top-level directory and move the various translated documents underneath
> that.  Then we could maybe make a translations/index.rst and create a
> special subbook for those documents.  Make sense?

Yes, it makes sense to me.  I will send patches for that soon.


Thanks,
SeongJae Park

>
> If at all possible, I'd like an ack from somebody else who understands
> Korean.  Minchan, I see you on the CC, can you help out? :)  With that, I
> see no reason not to apply parts 2-12.
>
> Thanks,
>
> jon


Re: [PATCH 00/12] Fix and update HOWTO Korean translation

2016-10-21 Thread SeongJae Park
On Sat, Oct 22, 2016 at 5:47 AM, Jonathan Corbet  wrote:
> On Sat, 22 Oct 2016 00:19:45 +0900
> SeongJae Park  wrote:
>
>> This patchset applies ReST conversion effort for HOWTO document to its Korean
>> translation.  It also contains fixup of trivial nitpicks in the document and
>> the translation.
>
> Thanks for doing these; let's definitely get this work in soon.  I have
> just a few comments:

Thanks for your opinion! :)


>
> - I think that bringing the (English) HOWTO into line with our posted
>   conventions makes sense, I'm happy to take patches to do that.  Let's
>   do that separately from the translation changes, though, and let's
>   think in terms of building on Mauro's work.  Either send me a patch
>   after his stuff is in, or send one for him to collect and send with the
>   rest, whatever works best for everybody else.

Agree.  I will send you the patch again after Mauro's work be merged.


>
> - You do the ReST conversion, but don't take the final step and actually
>   bring the document into the Sphinx world.
>
> I've been thinking that it would be good to make a translations/
> top-level directory and move the various translated documents underneath
> that.  Then we could maybe make a translations/index.rst and create a
> special subbook for those documents.  Make sense?

Yes, it makes sense to me.  I will send patches for that soon.


Thanks,
SeongJae Park

>
> If at all possible, I'd like an ack from somebody else who understands
> Korean.  Minchan, I see you on the CC, can you help out? :)  With that, I
> see no reason not to apply parts 2-12.
>
> Thanks,
>
> jon


[RFC][PATCH] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-10-21 Thread Srinivas Pandruvada
Use get_target_pstate_use_cpu_load() to calculate target P-State for
devices, which uses preferred power management profile as PM_MOBILE
in ACPI FADT.
This may help in resolving some thermal issues caused by low sustained
cpu bound workloads. The current algorithm tends to over-provision in this
case as it doesn't look at the CPU busyness.

Signed-off-by: Srinivas Pandruvada 
---
If someone wants to look at test results, email me. I can't attach
pdf document here.

 drivers/cpufreq/intel_pstate.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d9a0196..637536d8 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1746,6 +1746,19 @@ static void __init copy_pid_params(struct 
pstate_adjust_policy *policy)
pid_params.setpoint = policy->setpoint;
 }
 
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+   if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+   pstate_funcs.get_target_pstate =
+   get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(struct pstate_funcs *funcs)
+{
+}
+#endif
+
 static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 {
pstate_funcs.get_max   = funcs->get_max;
@@ -1757,6 +1770,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
*funcs)
pstate_funcs.get_vid   = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
 
+   intel_pstate_use_acpi_profile();
 }
 
 #ifdef CONFIG_ACPI
-- 
2.5.5



[RFC][PATCH] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-10-21 Thread Srinivas Pandruvada
Use get_target_pstate_use_cpu_load() to calculate target P-State for
devices, which uses preferred power management profile as PM_MOBILE
in ACPI FADT.
This may help in resolving some thermal issues caused by low sustained
cpu bound workloads. The current algorithm tends to over-provision in this
case as it doesn't look at the CPU busyness.

Signed-off-by: Srinivas Pandruvada 
---
If someone wants to look at test results, email me. I can't attach
pdf document here.

 drivers/cpufreq/intel_pstate.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d9a0196..637536d8 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1746,6 +1746,19 @@ static void __init copy_pid_params(struct 
pstate_adjust_policy *policy)
pid_params.setpoint = policy->setpoint;
 }
 
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+   if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+   pstate_funcs.get_target_pstate =
+   get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(struct pstate_funcs *funcs)
+{
+}
+#endif
+
 static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 {
pstate_funcs.get_max   = funcs->get_max;
@@ -1757,6 +1770,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
*funcs)
pstate_funcs.get_vid   = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
 
+   intel_pstate_use_acpi_profile();
 }
 
 #ifdef CONFIG_ACPI
-- 
2.5.5



Re: [PATCH] uapi: linux: acct: Remove redundant type comp2_t from kernel

2016-10-21 Thread Chen Gang
On 10/22/16 06:53, Chen Gang wrote:
> 
> On 10/21/16 11:41, Andrew Morton wrote:
>> On Wed,  5 Oct 2016 21:40:10 +0800 cheng...@emindsoft.com.cn wrote:
>>
>>> In api itself, kernel does not use it -- it is divided into ac_etime_hi
>>> and ac_etime_lo. So kernel side only need generate the correct
>>> ac_etime_hi and ac_etime_lo, but need not know about comp2_t.
>>>
>>> At present, kernel use normal u64 type for it, when kernel provides it to
>>> outside, kernel can translate it into ac_etime_hi and ac_etime_lo,
>>> directly, but need not notice about comp2_t, in fact.
>>
>> hm.  Why is this an improvement?
>>
> 
> For me, it will let code a little more understanding, a little simpler,
> and let the code a little more extendable (when kernel members really
> needs comp2_t in future, they need not have to treat it as __u32).
> 
> Only when comp2_t is really used in api header in future, kernel has to
> know about it, but kernel still can keep original code no touch. So for
> me, our changing is harmless.
> 

Oh sorry, for "Only when comp2_t is really used in api header in future",
we may need encode_comp2_t, but in kernel wide, this changing is very
small.

At present, only encode_comp2_t uses comp2_t, and it is only called by
fill_ac in an area, and the goal of fill_ac is to encode etime to ac (
comp2_t is the intermediate generation).

And I guess, we have very small chance to use comp2_t in uapi header in
future, so now, encode_comp2_t can be removed, when we really need it,
we can revert to encode_comp2_t and let encode_ac_etime_hilo call it.

Thanks
-- 
Chen Gang (陈刚)

Managing Natural Environments is the Duty of Human Beings.


Re: [PATCH] uapi: linux: acct: Remove redundant type comp2_t from kernel

2016-10-21 Thread Chen Gang
On 10/22/16 06:53, Chen Gang wrote:
> 
> On 10/21/16 11:41, Andrew Morton wrote:
>> On Wed,  5 Oct 2016 21:40:10 +0800 cheng...@emindsoft.com.cn wrote:
>>
>>> In api itself, kernel does not use it -- it is divided into ac_etime_hi
>>> and ac_etime_lo. So kernel side only need generate the correct
>>> ac_etime_hi and ac_etime_lo, but need not know about comp2_t.
>>>
>>> At present, kernel use normal u64 type for it, when kernel provides it to
>>> outside, kernel can translate it into ac_etime_hi and ac_etime_lo,
>>> directly, but need not notice about comp2_t, in fact.
>>
>> hm.  Why is this an improvement?
>>
> 
> For me, it will let code a little more understanding, a little simpler,
> and let the code a little more extendable (when kernel members really
> needs comp2_t in future, they need not have to treat it as __u32).
> 
> Only when comp2_t is really used in api header in future, kernel has to
> know about it, but kernel still can keep original code no touch. So for
> me, our changing is harmless.
> 

Oh sorry, for "Only when comp2_t is really used in api header in future",
we may need encode_comp2_t, but in kernel wide, this changing is very
small.

At present, only encode_comp2_t uses comp2_t, and it is only called by
fill_ac in an area, and the goal of fill_ac is to encode etime to ac (
comp2_t is the intermediate generation).

And I guess, we have very small chance to use comp2_t in uapi header in
future, so now, encode_comp2_t can be removed, when we really need it,
we can revert to encode_comp2_t and let encode_ac_etime_hilo call it.

Thanks
-- 
Chen Gang (陈刚)

Managing Natural Environments is the Duty of Human Beings.


Re: [RFC][PATCH 2/2] usb: dwc2: Add a quirk to allow speed negotiation for Hisilicon Hi6220

2016-10-21 Thread Chen Yu
On 2016/10/22 4:00, John Youn wrote:
> On 10/20/2016 5:43 PM, Chen Yu wrote:
>> On 2016/10/19 6:21, John Youn wrote:
>>> On 10/16/2016 7:42 PM, Chen Yu wrote:


 On 2016/10/15 3:37, John Youn wrote:
> On 10/13/2016 4:36 PM, John Stultz wrote:
>> From: Chen Yu 
>>
>> The Hi6220's usb controller is limited in that it does not
>> automatically autonegotiate the usb speed. Thus it requires a
>> quirk so that we can manually negotiate the best usb speed for
>> the attached device.
>
> Hi,
>
> Could you expand more on this by explaining what exactly is the
> limitation and the workaround?
>

 The USB host limitation of Hisilicon Hi6220 is full-speed and low-speed
 devices can not be enumerated when gets plugged behind a hub.

> [snip]
>
>> +/*
>> + * HPRT0_SPD_HIGH_SPEED: high speed
>> + * HPRT0_SPD_FULL_SPEED: full speed
>> + */
>> +static void dwc2_change_bus_speed(struct usb_hcd *hcd, int speed)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (hsotg->core_params->speed == speed)
>> +return;
>> +
>> +hsotg->core_params->speed = speed;
>> +queue_work(hsotg->wq_otg, &hsotg->wf_otg);
>> +}
>> +
>> +static int dwc2_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (!hsotg->change_speed_quirk)
>> +return 1;
>> +
>> +hsotg->device_count++;
>
> Why do you need to track the device count?
>
>> +dev_info(hsotg->dev, "Device count is %u after alloc dev\n",
>> +hsotg->device_count);
>> +
>> +return 1;
>> +}
>> +
>> +static void dwc2_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (!hsotg->change_speed_quirk)
>> +return;
>> +
>> +if (hsotg->device_count)
>> +hsotg->device_count--;
>> +
>> +dev_info(hsotg->dev, "Device count is %u after free dev\n",
>> +hsotg->device_count);
>> +
>> +if (hsotg->device_count == 1 && udev->parent &&
>> +udev->parent->speed > USB_SPEED_UNKNOWN &&
>> +udev->parent->speed < USB_SPEED_HIGH) {
>> +dev_info(hsotg->dev, "Set speed to default 
>> high-speed\n");
>> +dwc2_change_bus_speed(hcd, HPRT0_SPD_HIGH_SPEED);
>> +}
>> +}
>> +
>> +static int dwc2_reset_device(struct usb_hcd *hcd, struct usb_device 
>> *udev)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (!hsotg->change_speed_quirk)
>> +return 0;
>> +
>> +if (udev->speed == USB_SPEED_HIGH) {
>> +dev_info(hsotg->dev, "Set speed to high-speed\n");
>> +dwc2_change_bus_speed(hcd, HPRT0_SPD_HIGH_SPEED);
>> +} else if (udev->speed == USB_SPEED_FULL
>> +|| udev->speed == USB_SPEED_LOW) {
>> +dev_info(hsotg->dev, "Set speed to full-speed\n");
>> +dwc2_change_bus_speed(hcd, HPRT0_SPD_FULL_SPEED);
>> +}
>
> It seems you are reinitializing the core every time a device is reset
> and the udev->speed does not match the core_param speed. But how is
> the udev->speed being set correctly if the hw cannot negotiate the
> speed in the first place?
>

 The hardware can negotiate the speed, but communication with a full-speed 
 or
 low-speed device behind a hub is the problem.

> Also should it be for every device? What about if a device gets
> plugged in behind a hub? I don't think you want to execute this code
> in that case.
>
> This should only affect devices plugged into the root hub, correct?
> And the hsotg controller only has one root hub port. It seems things
> could be simplified a bit.
>

 The patch is initially written for Hikey Hi6220 board, and there is a
 hub always connected to root hub, so the patch sets the configuration to
 HPRT0_SPD_HIGH_SPEED when there is only one device(the hub).
>>>
>>> Ok, I see.
>>>

 Thanks for your suggestions, the patch needs modified in these aspect:
 1. Change the speed setting only when the device is behind a hub in 
 dwc2_reset_device.
>>>
>>> I still think you will have issues with multiple devices. Since you
>>> have a built-in hub after root hub, it will always be behind the
>>> hub. So whenever you need to change speeds, it will always reset every
>>> device in the tree. Have 

Re: [RFC][PATCH 2/2] usb: dwc2: Add a quirk to allow speed negotiation for Hisilicon Hi6220

2016-10-21 Thread Chen Yu
On 2016/10/22 4:00, John Youn wrote:
> On 10/20/2016 5:43 PM, Chen Yu wrote:
>> On 2016/10/19 6:21, John Youn wrote:
>>> On 10/16/2016 7:42 PM, Chen Yu wrote:


 On 2016/10/15 3:37, John Youn wrote:
> On 10/13/2016 4:36 PM, John Stultz wrote:
>> From: Chen Yu 
>>
>> The Hi6220's usb controller is limited in that it does not
>> automatically autonegotiate the usb speed. Thus it requires a
>> quirk so that we can manually negotiate the best usb speed for
>> the attached device.
>
> Hi,
>
> Could you expand more on this by explaining what exactly is the
> limitation and the workaround?
>

 The USB host limitation of Hisilicon Hi6220 is full-speed and low-speed
 devices can not be enumerated when gets plugged behind a hub.

> [snip]
>
>> +/*
>> + * HPRT0_SPD_HIGH_SPEED: high speed
>> + * HPRT0_SPD_FULL_SPEED: full speed
>> + */
>> +static void dwc2_change_bus_speed(struct usb_hcd *hcd, int speed)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (hsotg->core_params->speed == speed)
>> +return;
>> +
>> +hsotg->core_params->speed = speed;
>> +queue_work(hsotg->wq_otg, &hsotg->wf_otg);
>> +}
>> +
>> +static int dwc2_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (!hsotg->change_speed_quirk)
>> +return 1;
>> +
>> +hsotg->device_count++;
>
> Why do you need to track the device count?
>
>> +dev_info(hsotg->dev, "Device count is %u after alloc dev\n",
>> +hsotg->device_count);
>> +
>> +return 1;
>> +}
>> +
>> +static void dwc2_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (!hsotg->change_speed_quirk)
>> +return;
>> +
>> +if (hsotg->device_count)
>> +hsotg->device_count--;
>> +
>> +dev_info(hsotg->dev, "Device count is %u after free dev\n",
>> +hsotg->device_count);
>> +
>> +if (hsotg->device_count == 1 && udev->parent &&
>> +udev->parent->speed > USB_SPEED_UNKNOWN &&
>> +udev->parent->speed < USB_SPEED_HIGH) {
>> +dev_info(hsotg->dev, "Set speed to default 
>> high-speed\n");
>> +dwc2_change_bus_speed(hcd, HPRT0_SPD_HIGH_SPEED);
>> +}
>> +}
>> +
>> +static int dwc2_reset_device(struct usb_hcd *hcd, struct usb_device 
>> *udev)
>> +{
>> +struct dwc2_hsotg *hsotg = dwc2_hcd_to_hsotg(hcd);
>> +
>> +if (!hsotg->change_speed_quirk)
>> +return 0;
>> +
>> +if (udev->speed == USB_SPEED_HIGH) {
>> +dev_info(hsotg->dev, "Set speed to high-speed\n");
>> +dwc2_change_bus_speed(hcd, HPRT0_SPD_HIGH_SPEED);
>> +} else if (udev->speed == USB_SPEED_FULL
>> +|| udev->speed == USB_SPEED_LOW) {
>> +dev_info(hsotg->dev, "Set speed to full-speed\n");
>> +dwc2_change_bus_speed(hcd, HPRT0_SPD_FULL_SPEED);
>> +}
>
> It seems you are reinitializing the core every time a device is reset
> and the udev->speed does not match the core_param speed. But how is
> the udev->speed being set correctly if the hw cannot negotiate the
> speed in the first place?
>

 The hardware can negotiate the speed, but communication with a full-speed 
 or
 low-speed device behind a hub is the problem.

> Also should it be for every device? What about if a device gets
> plugged in behind a hub? I don't think you want to execute this code
> in that case.
>
> This should only affect devices plugged into the root hub, correct?
> And the hsotg controller only has one root hub port. It seems things
> could be simplified a bit.
>

 The patch is initially written for Hikey Hi6220 board, and there is a
 hub always connected to root hub, so the patch sets the configuration to
 HPRT0_SPD_HIGH_SPEED when there is only one device(the hub).
>>>
>>> Ok, I see.
>>>

 Thanks for your suggestions, the patch needs modified in these aspect:
 1. Change the speed setting only when the device is behind a hub in 
 dwc2_reset_device.
>>>
>>> I still think you will have issues with multiple devices. Since you
>>> have a built-in hub after root hub, it will always be behind the
>>> hub. So whenever you need to change speeds, it will always reset every
>>> device in the tree. Have you tested with 

Re: [PATCH 1/2] net: phy: broadcom: Update Auxiliary Control Register macros

2016-10-21 Thread Joel Stanley
On Sat, Oct 22, 2016 at 3:50 AM, Xo Wang  wrote:
> Add the RXD-to-RXC skew (delay) time bit in the Miscellaneous Control
> shadow register and a mask for the shadow selector field.
>
> Remove a re-definition of MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL.
>
> Signed-off-by: Xo Wang 

Reviewed-by: Joel Stanley 

Cheers,

Joel

> ---
>  include/linux/brcmphy.h | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)


Re: [PATCH 1/2] net: phy: broadcom: Update Auxiliary Control Register macros

2016-10-21 Thread Joel Stanley
On Sat, Oct 22, 2016 at 3:50 AM, Xo Wang  wrote:
> Add the RXD-to-RXC skew (delay) time bit in the Miscellaneous Control
> shadow register and a mask for the shadow selector field.
>
> Remove a re-definition of MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL.
>
> Signed-off-by: Xo Wang 

Reviewed-by: Joel Stanley 

Cheers,

Joel

> ---
>  include/linux/brcmphy.h | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)


Re: [PATCH 2/2] net: phy: broadcom: Add support for BCM54612E

2016-10-21 Thread Joel Stanley
On Sat, Oct 22, 2016 at 3:50 AM, Xo Wang  wrote:
> This PHY has internal delays enabled after reset. This clears the
> internal delay enables unless the interface specifically requests them.
>
> Signed-off-by: Xo Wang 

Reviewed-by: Joel Stanley 

Cheers,

Joel

> ---
>  drivers/net/phy/broadcom.c | 48 
> ++
>  include/linux/brcmphy.h|  1 +
>  2 files changed, 49 insertions(+)


Re: [PATCH 2/2] net: phy: broadcom: Add support for BCM54612E

2016-10-21 Thread Joel Stanley
On Sat, Oct 22, 2016 at 3:50 AM, Xo Wang  wrote:
> This PHY has internal delays enabled after reset. This clears the
> internal delay enables unless the interface specifically requests them.
>
> Signed-off-by: Xo Wang 

Reviewed-by: Joel Stanley 

Cheers,

Joel

> ---
>  drivers/net/phy/broadcom.c | 48 
> ++
>  include/linux/brcmphy.h|  1 +
>  2 files changed, 49 insertions(+)


[PATCH] x86/cpufeatures: Enable new AVX512 cpu features

2016-10-21 Thread Gayatri Kammela
Modify cpufeatures.h to add new AVX512 instruction groups/features
for enumeration in /proc/cpuinfo: AVX512IFMA and AVX512VBMI

Also modify the xstate.c to clear the flags in
fpu__xstate_clear_all_cpu_caps().

CPUID.(EAX=7,ECX=0):EBX[bit 21] AVX512IFMA
CPUID.(EAX=7,ECX=0):ECX[bit 1]  AVX512VBMI

Detailed information of cpuid bits for the features can be found in
Intel Architecture Instruction Set Extensions Programming Reference.

Cc: Ravi Shankar 
Cc: Fenghua Yu 
Signed-off-by: Gayatri Kammela 
---
 arch/x86/include/asm/cpufeatures.h | 2 ++
 arch/x86/kernel/fpu/xstate.c   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 4a413485f9eb..c722fcc18a15 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -225,6 +225,7 @@
 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX( 9*32+19) /* The ADCX and ADOX 
instructions */
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
+#define X86_FEATURE_AVX512IFMA  ( 9*32+21) /* AVX-512 Integer Fused 
Multiply-Add instructions */
 #define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_CLWB   ( 9*32+24) /* CLWB instruction */
@@ -279,6 +280,7 @@
 #define X86_FEATURE_AVIC   (15*32+13) /* Virtual Interrupt Controller */
 
 /* Intel-defined CPU features, CPUID level 0x0007:0 (ecx), word 16 */
+#define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation 
instructions*/
 #define X86_FEATURE_PKU(16*32+ 3) /* Protection Keys for 
Userspace */
 #define X86_FEATURE_OSPKE  (16*32+ 4) /* OS Protection Keys Enable */
 
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 4ea2a59483c7..92411568e320 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -56,6 +56,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+   setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
@@ -64,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+   setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
setup_clear_cpu_cap(X86_FEATURE_PKU);
 }
 
-- 
2.7.4



[PATCH] x86/cpufeatures: Enable new AVX512 cpu features

2016-10-21 Thread Gayatri Kammela
Modify cpufeatures.h to add new AVX512 instruction groups/features
for enumeration in /proc/cpuinfo: AVX512IFMA and AVX512VBMI

Also modify the xstate.c to clear the flags in
fpu__xstate_clear_all_cpu_caps().

CPUID.(EAX=7,ECX=0):EBX[bit 21] AVX512IFMA
CPUID.(EAX=7,ECX=0):ECX[bit 1]  AVX512VBMI

Detailed information of cpuid bits for the features can be found in
Intel Architecture Instruction Set Extensions Programming Reference.

Cc: Ravi Shankar 
Cc: Fenghua Yu 
Signed-off-by: Gayatri Kammela 
---
 arch/x86/include/asm/cpufeatures.h | 2 ++
 arch/x86/kernel/fpu/xstate.c   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 4a413485f9eb..c722fcc18a15 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -225,6 +225,7 @@
 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX( 9*32+19) /* The ADCX and ADOX 
instructions */
 #define X86_FEATURE_SMAP   ( 9*32+20) /* Supervisor Mode Access Prevention 
*/
+#define X86_FEATURE_AVX512IFMA  ( 9*32+21) /* AVX-512 Integer Fused 
Multiply-Add instructions */
 #define X86_FEATURE_PCOMMIT( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_CLWB   ( 9*32+24) /* CLWB instruction */
@@ -279,6 +280,7 @@
 #define X86_FEATURE_AVIC   (15*32+13) /* Virtual Interrupt Controller */
 
 /* Intel-defined CPU features, CPUID level 0x0007:0 (ecx), word 16 */
+#define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation 
instructions*/
 #define X86_FEATURE_PKU(16*32+ 3) /* Protection Keys for 
Userspace */
 #define X86_FEATURE_OSPKE  (16*32+ 4) /* OS Protection Keys Enable */
 
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 4ea2a59483c7..92411568e320 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -56,6 +56,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+   setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
@@ -64,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+   setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
setup_clear_cpu_cap(X86_FEATURE_PKU);
 }
 
-- 
2.7.4



Re: [tip:x86/asm] x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y)

2016-10-21 Thread Andy Lutomirski
On Oct 21, 2016 5:32 AM, "Matt Fleming"  wrote:
>
> On Wed, 24 Aug, at 06:03:04AM, tip-bot for Andy Lutomirski wrote:
> > Commit-ID:  e37e43a497d5a8b7c0cc1736d56986f432c394c9
> > Gitweb: 
> > http://git.kernel.org/tip/e37e43a497d5a8b7c0cc1736d56986f432c394c9
> > Author: Andy Lutomirski 
> > AuthorDate: Thu, 11 Aug 2016 02:35:23 -0700
> > Committer:  Ingo Molnar 
> > CommitDate: Wed, 24 Aug 2016 12:11:42 +0200
> >
> > x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y)
> >
> > This allows x86_64 kernels to enable vmapped stacks by setting
> > HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y
> > high level Kconfig option.
> >
> > There are a couple of interesting bits:
>
> This commit broke booting EFI mixed mode kernels. Here's what I've got
> queued up to fix it.
>
> ---
> From acf11e55bfcef7a1dca7d1735f4a780e0cdb1c89 Mon Sep 17 00:00:00 2001
> From: Matt Fleming 
> Date: Thu, 20 Oct 2016 22:17:21 +0100
> Subject: [PATCH] x86/efi: Prevent mixed mode boot corruption with
>  CONFIG_VMAP_STACK
>
> Booting an EFI mixed mode kernel has been crashing since commit:
>
>   e37e43a497d5 ("x86/mm/64: Enable vmapped stacks 
> (CONFIG_HAVE_ARCH_VMAP_STACK=y)")
>
> The user-visible effect in my test setup was the kernel being unable
> to find the root file system ramdisk. This was likely caused by silent
> memory or page table corruption.
>
> Enabling CONFIG_DEBUG_VIRTUAL immediately flagged the thunking code as
> abusing virt_to_phys() because it was passing addresses that were not
> part of the kernel direct mapping.
>
> Use the slow version instead, which correctly handles all memory
> regions by performing a page table walk.
>
> Suggested-by: Andy Lutomirski 
> Cc: Ard Biesheuvel 
> Cc: Ingo Molnar 
> Cc: Thomas Gleixner 
> Cc: "H. Peter Anvin" 
> Signed-off-by: Matt Fleming 
> ---
>  arch/x86/platform/efi/efi_64.c | 59 
> +-
>  1 file changed, 35 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index 58b0f801f66f..e3569a00885b 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -211,6 +211,17 @@ void efi_sync_low_kernel_mappings(void)
> memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
>  }
>
> +/*
> + * Wrapper for slow_virt_to_phys() that handles NULL addresses.
> + */
> +static inline phys_addr_t virt_to_phys_or_null(void *va)
> +{
> +   if (!va)
> +   return 0;
> +
> +   return slow_virt_to_phys(va);
> +}
> +

This is asking for trouble if any of the variable length parameters
are on the stack.  How about adding a "size_t size" parameter and
doing:

if (!va) {
  return 0;
} else if (virt_addr_valid(va)) {
  return virt_to_phys(va);
} else {
  /* A fully aligned variable on the stack is guaranteed not to cross
a page boundary. */
  WARN_ON(!IS_ALIGNED((uintptr_t)va, size) || size > PAGE_SIZE);
  return slow_virt_to_phys(va);
}


Re: [tip:x86/asm] x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y)

2016-10-21 Thread Andy Lutomirski
On Oct 21, 2016 5:32 AM, "Matt Fleming"  wrote:
>
> On Wed, 24 Aug, at 06:03:04AM, tip-bot for Andy Lutomirski wrote:
> > Commit-ID:  e37e43a497d5a8b7c0cc1736d56986f432c394c9
> > Gitweb: 
> > http://git.kernel.org/tip/e37e43a497d5a8b7c0cc1736d56986f432c394c9
> > Author: Andy Lutomirski 
> > AuthorDate: Thu, 11 Aug 2016 02:35:23 -0700
> > Committer:  Ingo Molnar 
> > CommitDate: Wed, 24 Aug 2016 12:11:42 +0200
> >
> > x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y)
> >
> > This allows x86_64 kernels to enable vmapped stacks by setting
> > HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y
> > high level Kconfig option.
> >
> > There are a couple of interesting bits:
>
> This commit broke booting EFI mixed mode kernels. Here's what I've got
> queued up to fix it.
>
> ---
> From acf11e55bfcef7a1dca7d1735f4a780e0cdb1c89 Mon Sep 17 00:00:00 2001
> From: Matt Fleming 
> Date: Thu, 20 Oct 2016 22:17:21 +0100
> Subject: [PATCH] x86/efi: Prevent mixed mode boot corruption with
>  CONFIG_VMAP_STACK
>
> Booting an EFI mixed mode kernel has been crashing since commit:
>
>   e37e43a497d5 ("x86/mm/64: Enable vmapped stacks 
> (CONFIG_HAVE_ARCH_VMAP_STACK=y)")
>
> The user-visible effect in my test setup was the kernel being unable
> to find the root file system ramdisk. This was likely caused by silent
> memory or page table corruption.
>
> Enabling CONFIG_DEBUG_VIRTUAL immediately flagged the thunking code as
> abusing virt_to_phys() because it was passing addresses that were not
> part of the kernel direct mapping.
>
> Use the slow version instead, which correctly handles all memory
> regions by performing a page table walk.
>
> Suggested-by: Andy Lutomirski 
> Cc: Ard Biesheuvel 
> Cc: Ingo Molnar 
> Cc: Thomas Gleixner 
> Cc: "H. Peter Anvin" 
> Signed-off-by: Matt Fleming 
> ---
>  arch/x86/platform/efi/efi_64.c | 59 
> +-
>  1 file changed, 35 insertions(+), 24 deletions(-)
>
> diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
> index 58b0f801f66f..e3569a00885b 100644
> --- a/arch/x86/platform/efi/efi_64.c
> +++ b/arch/x86/platform/efi/efi_64.c
> @@ -211,6 +211,17 @@ void efi_sync_low_kernel_mappings(void)
> memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
>  }
>
> +/*
> + * Wrapper for slow_virt_to_phys() that handles NULL addresses.
> + */
> +static inline phys_addr_t virt_to_phys_or_null(void *va)
> +{
> +   if (!va)
> +   return 0;
> +
> +   return slow_virt_to_phys(va);
> +}
> +

This is asking for trouble if any of the variable length parameters
are on the stack.  How about adding a "size_t size" parameter and
doing:

if (!va) {
  return 0;
} else if (virt_addr_valid(va)) {
  return virt_to_phys(va);
} else {
  /* A fully aligned variable on the stack is guaranteed not to cross
a page boundary. */
  WARN_ON(!IS_ALIGNED((uintptr_t)va, size) || size > PAGE_SIZE);
  return slow_virt_to_phys(va);
}


Re: [PATCH 2/5] stop_machine: yield CPU during stop machine

2016-10-21 Thread Nicholas Piggin
On Fri, 21 Oct 2016 14:05:36 +0200
Peter Zijlstra  wrote:

> On Fri, Oct 21, 2016 at 01:58:55PM +0200, Christian Borntraeger wrote:
> > stop_machine can take a very long time if the hypervisor does
> > overcommitment for guest CPUs. When waiting for "the one", lets
> > give up our CPU by using the new cpu_relax_yield.  
> 
> This seems something that would apply to most other virt stuff. Lets Cc
> a few more lists for that.
> 
> > Signed-off-by: Christian Borntraeger 
> > ---
> >  kernel/stop_machine.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
> > index ec9ab2f..1eb8266 100644
> > --- a/kernel/stop_machine.c
> > +++ b/kernel/stop_machine.c
> > @@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data)
> > /* Simple state machine */
> > do {
> > /* Chill out and ensure we re-read multi_stop_state. */
> > -   cpu_relax();
> > +   cpu_relax_yield();
> > if (msdata->state != curstate) {
> > curstate = msdata->state;
> > switch (curstate) {
> > -- 
> > 2.5.5
> >   

This is the only caller of cpu_relax_yield()?

As a step to removing cpu_yield_lowlatency this series is nice so I
have no objection. But "general" kernel coders still have basically
no chance of using this properly.

I wonder what can be done about that. I've got that spin_do/while
series I'll rebase on top of this, but a spin_yield variant of them
is of no more help to the caller.

What makes this unique? Long latency and not performance critical?
Most places where we spin and maybe yield have been moved to arch
code, but I wonder whether we can make an easier to use architecture
independent API?

Thanks,
Nick


Re: [PATCH 2/5] stop_machine: yield CPU during stop machine

2016-10-21 Thread Nicholas Piggin
On Fri, 21 Oct 2016 14:05:36 +0200
Peter Zijlstra  wrote:

> On Fri, Oct 21, 2016 at 01:58:55PM +0200, Christian Borntraeger wrote:
> > stop_machine can take a very long time if the hypervisor does
> > overcommitment for guest CPUs. When waiting for "the one", lets
> > give up our CPU by using the new cpu_relax_yield.  
> 
> This seems something that would apply to most other virt stuff. Lets Cc
> a few more lists for that.
> 
> > Signed-off-by: Christian Borntraeger 
> > ---
> >  kernel/stop_machine.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
> > index ec9ab2f..1eb8266 100644
> > --- a/kernel/stop_machine.c
> > +++ b/kernel/stop_machine.c
> > @@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data)
> > /* Simple state machine */
> > do {
> > /* Chill out and ensure we re-read multi_stop_state. */
> > -   cpu_relax();
> > +   cpu_relax_yield();
> > if (msdata->state != curstate) {
> > curstate = msdata->state;
> > switch (curstate) {
> > -- 
> > 2.5.5
> >   

This is the only caller of cpu_relax_yield()?

As a step to removing cpu_yield_lowlatency this series is nice so I
have no objection. But "general" kernel coders still have basically
no chance of using this properly.

I wonder what can be done about that. I've got that spin_do/while
series I'll rebase on top of this, but a spin_yield variant of them
is of no more help to the caller.

What makes this unique? Long latency and not performance critical?
Most places where we spin and maybe yield have been moved to arch
code, but I wonder whether we can make an easier to use architecture
independent API?

Thanks,
Nick


Re: [RFC][PATCH] Add EXPORT_MACRO_SYMBOL() for asm

2016-10-21 Thread Nicholas Piggin
On Fri, 21 Oct 2016 12:17:59 -0400
Steven Rostedt  wrote:

> Commit 784d5699eddc5 ("x86: move exports to actual definitions") removed the
> EXPORT_SYMBOL(__fentry__) and EXPORT_SYMBOL(mcount) from x8664_ksyms_64.c,
> and added EXPORT_SYMBOL(function_hook) in mcount_64.S instead. The problem
> is that function_hook isn't a function at all, but a macro that is defined
> as either mcount or __fentry__ depending on the support from gcc. But instead
> of adding more #ifdefs like x8664_ksyms_64.c had, I suggest having another
> export that can handle this similar to the way __string() works with
> converting macros to strings. By having:
> 
>  EXPORT_MACRO_SYMBOL(function_hook)
> 
> Where we have:
> 
>  #define EXPORT_MACRO_SYMBOL(x) EXPORT_SYMBOL(x)
> 
> It will convert the macro into what it is defined as before calling
> EXPORT_SYMBOL(), and this will just work properly again.
> 
> Cc: sta...@vger.kernel.org
> Fixes: Commit 784d5699eddc5 ("x86: move exports to actual definitions")
> Signed-off-by: Steven Rostedt 
> ---
>  arch/x86/kernel/mcount_64.S  | 2 +-
>  include/asm-generic/export.h | 5 +
>  2 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
> index efe73aacf966..ccd9d912af27 100644
> --- a/arch/x86/kernel/mcount_64.S
> +++ b/arch/x86/kernel/mcount_64.S
> @@ -295,7 +295,7 @@ trace:
>   jmp fgraph_trace
>  END(function_hook)
>  #endif /* CONFIG_DYNAMIC_FTRACE */
> -EXPORT_SYMBOL(function_hook)
> +EXPORT_MACRO_SYMBOL(function_hook)
>  #endif /* CONFIG_FUNCTION_TRACER */
>  
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
> diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h
> index 43199a049da5..cb86e746865e 100644
> --- a/include/asm-generic/export.h
> +++ b/include/asm-generic/export.h
> @@ -90,5 +90,10 @@
>   __EXPORT_SYMBOL(name, KSYM(name),)
>  #define EXPORT_DATA_SYMBOL_GPL(name) \
>   __EXPORT_SYMBOL(name, KSYM(name),_gpl)
> +/*
> + * If "name" is a macro of a function and not a function itself,
> + * it needs a second pass.
> + */
> +#define EXPORT_MACRO_SYMBOL(x) EXPORT_SYMBOL(x)

Seems okay, but what about just calling it EXPORT_SYMBOL?

Thanks,
Nick


Re: [RFC][PATCH] Add EXPORT_MACRO_SYMBOL() for asm

2016-10-21 Thread Nicholas Piggin
On Fri, 21 Oct 2016 12:17:59 -0400
Steven Rostedt  wrote:

> Commit 784d5699eddc5 ("x86: move exports to actual definitions") removed the
> EXPORT_SYMBOL(__fentry__) and EXPORT_SYMBOL(mcount) from x8664_ksyms_64.c,
> and added EXPORT_SYMBOL(function_hook) in mcount_64.S instead. The problem
> is that function_hook isn't a function at all, but a macro that is defined
> as either mcount or __fentry__ depending on the support from gcc. But instead
> of adding more #ifdefs like x8664_ksyms_64.c had, I suggest having another
> export that can handle this similar to the way __string() works with
> converting macros to strings. By having:
> 
>  EXPORT_MACRO_SYMBOL(function_hook)
> 
> Where we have:
> 
>  #define EXPORT_MACRO_SYMBOL(x) EXPORT_SYMBOL(x)
> 
> It will convert the macro into what it is defined as before calling
> EXPORT_SYMBOL(), and this will just work properly again.
> 
> Cc: sta...@vger.kernel.org
> Fixes: Commit 784d5699eddc5 ("x86: move exports to actual definitions")
> Signed-off-by: Steven Rostedt 
> ---
>  arch/x86/kernel/mcount_64.S  | 2 +-
>  include/asm-generic/export.h | 5 +
>  2 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
> index efe73aacf966..ccd9d912af27 100644
> --- a/arch/x86/kernel/mcount_64.S
> +++ b/arch/x86/kernel/mcount_64.S
> @@ -295,7 +295,7 @@ trace:
>   jmp fgraph_trace
>  END(function_hook)
>  #endif /* CONFIG_DYNAMIC_FTRACE */
> -EXPORT_SYMBOL(function_hook)
> +EXPORT_MACRO_SYMBOL(function_hook)
>  #endif /* CONFIG_FUNCTION_TRACER */
>  
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
> diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h
> index 43199a049da5..cb86e746865e 100644
> --- a/include/asm-generic/export.h
> +++ b/include/asm-generic/export.h
> @@ -90,5 +90,10 @@
>   __EXPORT_SYMBOL(name, KSYM(name),)
>  #define EXPORT_DATA_SYMBOL_GPL(name) \
>   __EXPORT_SYMBOL(name, KSYM(name),_gpl)
> +/*
> + * If "name" is a macro of a function and not a function itself,
> + * it needs a second pass.
> + */
> +#define EXPORT_MACRO_SYMBOL(x) EXPORT_SYMBOL(x)

Seems okay, but what about just calling it EXPORT_SYMBOL?

Thanks,
Nick


Re: [PATCH] shmem: avoid huge pages for small files

2016-10-21 Thread Kirill A. Shutemov
On Sat, Oct 22, 2016 at 09:50:13AM +1100, Dave Chinner wrote:
> On Fri, Oct 21, 2016 at 06:00:07PM +0300, Kirill A. Shutemov wrote:
> > On Fri, Oct 21, 2016 at 04:01:18PM +1100, Dave Chinner wrote:
> > > On Thu, Oct 20, 2016 at 07:01:16PM -0700, Andi Kleen wrote:
> > > > > Ugh, no, please don't use mount options for file specific behaviours
> > > > > in filesystems like ext4 and XFS. This is exactly the sort of
> > > > > behaviour that should either just work automatically (i.e. be
> > > > > completely controlled by the filesystem) or only be applied to files
> > > > 
> > > > Can you explain what you mean? How would the file system control it?
> > > 
> > > There's no point in asking for huge pages when populating the page
> > > cache if the file is:
> > > 
> > >   - significantly smaller than the huge page size
> > >   - largely sparse
> > >   - being randomly accessed in small chunks
> > >   - badly fragmented and so takes hundreds of IO to read/write
> > > a huge page
> > >   - able to optimise delayed allocation to match huge page
> > > sizes and alignments
> > > 
> > > These are all constraints the filesystem knows about, but the
> > > application and user don't.
> > 
> > Really?
> > 
> > To me, most of things you're talking about is highly dependent on access
> > pattern generated by userspace:
> > 
> >   - we may want to allocate huge pages from byte 1 if we know that file
> > will grow;
> 
> delayed allocation takes care of that. We use a growing speculative
> delalloc size that kicks in at specific sizes and can be used
> directly to determine if a large page should be allocated. This code
> is aware of sparse files, sparse writes, etc.

I'm confused here. How can we delay allocation of page cache?

Delalloc is helpful to have reasonable on-disk layout, but my
understanding is that it uses page cache as buffering to postpone
block allocation. Later on writeback we see access pattern using pages
from page cache.

I'm likely missing something important here. Hm?

> >   - it will be beneficial to allocate huge page even for fragmented files,
> > if it's read-mostly;
> 
> No, no it won't. The IO latency impact here can be massive.
> read-ahead of single 4k pages hides most of this latency from the
> application, but with a 2MB page, we can't use readahead to hide this
> IO latency because the first access could stall for hundreds of
> small random read IOs to be completed instead of just 1.

I agree that it will lead to initial latency spike. But don't we have
workloads which would tolerate it to get faster hot-cache behaviour?

> > > Further, we are moving the IO path to a model where we use extents
> > > for mapping, not blocks.  We're optimising for the fact that modern
> > > filesystems use extents and so massively reduce the number of block
> > > mapping lookup calls we need to do for a given IO.
> > > 
> > > i.e. instead of doing "get page, map block to page" over and over
> > > again until we've walked over the entire IO range, we're doing
> > > "map extent for entire IO range" once, then iterating "get page"
> > > until we've mapped the entire range.
> > 
> > That's great, but it's not how IO path works *now*. And it will take a long
> > time (if ever) to flip it over to what you've described.
> 
> Wrong. fs/iomap.c. XFS already uses it, ext4 is being converted
> right now, GFS2 will use parts of it in the next release, DAX
> already uses it and PMD support in DAX is being built on top of it.

That's interesting. I've managed to miss whole fs/iomap.c thing...

> > > As such, there is no way we should be considering different
> > > interfaces and methods for configuring the /same functionality/ just
> > > because DAX is enabled or not. It's the /same decision/ that needs
> > > to be made, and the filesystem knows an awful lot more about whether
> > > huge pages can be used efficiently at the time of access than just
> > > about any other actor you can name
> > 
> > I'm not convinced that filesystem is in better position to see access
> > patterns than mm for page cache. It's not all about on-disk layout.
> 
> Spoken like a true mm developer.

Guilty.

-- 
 Kirill A. Shutemov


Re: [PATCH] shmem: avoid huge pages for small files

2016-10-21 Thread Kirill A. Shutemov
On Sat, Oct 22, 2016 at 09:50:13AM +1100, Dave Chinner wrote:
> On Fri, Oct 21, 2016 at 06:00:07PM +0300, Kirill A. Shutemov wrote:
> > On Fri, Oct 21, 2016 at 04:01:18PM +1100, Dave Chinner wrote:
> > > On Thu, Oct 20, 2016 at 07:01:16PM -0700, Andi Kleen wrote:
> > > > > Ugh, no, please don't use mount options for file specific behaviours
> > > > > in filesystems like ext4 and XFS. This is exactly the sort of
> > > > > behaviour that should either just work automatically (i.e. be
> > > > > completely controlled by the filesystem) or only be applied to files
> > > > 
> > > > Can you explain what you mean? How would the file system control it?
> > > 
> > > There's no point in asking for huge pages when populating the page
> > > cache if the file is:
> > > 
> > >   - significantly smaller than the huge page size
> > >   - largely sparse
> > >   - being randomly accessed in small chunks
> > >   - badly fragmented and so takes hundreds of IO to read/write
> > > a huge page
> > >   - able to optimise delayed allocation to match huge page
> > > sizes and alignments
> > > 
> > > These are all constraints the filesystem knows about, but the
> > > application and user don't.
> > 
> > Really?
> > 
> > To me, most of things you're talking about is highly dependent on access
> > pattern generated by userspace:
> > 
> >   - we may want to allocate huge pages from byte 1 if we know that file
> > will grow;
> 
> delayed allocation takes care of that. We use a growing speculative
> delalloc size that kicks in at specific sizes and can be used
> directly to determine if a large page should be allocated. This code
> is aware of sparse files, sparse writes, etc.

I'm confused here. How can we delay allocation of page cache?

Delalloc is helpful to have reasonable on-disk layout, but my
understanding is that it uses page cache as buffering to postpone
block allocation. Later on writeback we see access pattern using pages
from page cache.

I'm likely missing something important here. Hm?

> >   - it will be beneficial to allocate huge page even for fragmented files,
> > if it's read-mostly;
> 
> No, no it won't. The IO latency impact here can be massive.
> read-ahead of single 4k pages hides most of this latency from the
> application, but with a 2MB page, we can't use readahead to hide this
> IO latency because the first access could stall for hundreds of
> small random read IOs to be completed instead of just 1.

I agree that it will lead to initial latency spike. But don't we have
workloads which would tolerate it to get faster hot-cache behaviour?

> > > Further, we are moving the IO path to a model where we use extents
> > > for mapping, not blocks.  We're optimising for the fact that modern
> > > filesystems use extents and so massively reduce the number of block
> > > mapping lookup calls we need to do for a given IO.
> > > 
> > > i.e. instead of doing "get page, map block to page" over and over
> > > again until we've walked over the entire IO range, we're doing
> > > "map extent for entire IO range" once, then iterating "get page"
> > > until we've mapped the entire range.
> > 
> > That's great, but it's not how IO path works *now*. And it will take a long
> > time (if ever) to flip it over to what you've described.
> 
> Wrong. fs/iomap.c. XFS already uses it, ext4 is being converted
> right now, GFS2 will use parts of it in the next release, DAX
> already uses it and PMD support in DAX is being built on top of it.

That's interesting. I've managed to miss whole fs/iomap.c thing...

> > > As such, there is no way we should be considering different
> > > interfaces and methods for configuring the /same functionality/ just
> > > because DAX is enabled or not. It's the /same decision/ that needs
> > > to be made, and the filesystem knows an awful lot more about whether
> > > huge pages can be used efficiently at the time of access than just
> > > about any other actor you can name
> > 
> > I'm not convinced that filesystem is in better position to see access
> > patterns than mm for page cache. It's not all about on-disk layout.
> 
> Spoken like a true mm developer.

Guilty.

-- 
 Kirill A. Shutemov


Re: [PATCH v2] mailbox: PCC: Fix lockdep warning when request PCC channel

2016-10-21 Thread Hoan Tran
Hi Prashanth,

On Fri, Oct 21, 2016 at 9:13 AM, Prakash, Prashanth
 wrote:
> Hi Hoan,
>
> On 10/18/2016 1:00 AM, Hoan Tran wrote:
>> This patch fixes the lockdep warning below
>>
>> [7.229767] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
>> [7.229776] [ cut here ]
>> [7.229787] WARNING: CPU: 1 PID: 1 at 
>> linux-next/kernel/locking/lockdep.c:2876 lockdep_trace_alloc+0xe0/0xf0
>> [7.229790] Modules linked in:
>> [7.229793]
>> [7.229798] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 
>> 4.8.0-11756-g86c5152 #46
>> ...
>> [7.229900] Call trace:
>> [7.229903] Exception stack(0x8007da837890 to 0x8007da8379c0)
>> [7.229906] 7880:   8007da834000 
>> 0001
>> [7.229909] 78a0: 8007da837a70 08a0 60c5 
>> 003d
>> [7.229912] 78c0: 9374bc6a7f3c7832 00381878 09db7ab8 
>> 002f
>> [7.229915] 78e0: 0811aabc 08be2548 8007da837990 
>> 0811adf8
>> [7.229918] 7900: 8007da834000 024080c0 00c0 
>> 09021000
>> [7.229921] 7920:   08c8f7c8 
>> 8007da579810
>> [7.229923] 7940: 002f 8007da858000  
>> 0001
>> [7.229926] 7960: 0001  0811a468 
>> 0002
>> [7.229929] 7980: 656c62617369645f 00038187 00ee 
>> 8007da837850
>> [7.229932] 79a0: 09db50c0 09db569d 0006 
>> 89db568f
>> [7.229936] [] lockdep_trace_alloc+0xe0/0xf0
>> [7.229940] [] __kmalloc_track_caller+0x50/0x250
>> [7.229945] [] devres_alloc_node+0x28/0x60
>> [7.229949] [] devm_request_threaded_irq+0x50/0xe0
>> [7.229955] [] pcc_mbox_request_channel+0x110/0x170
>> [7.229960] [] acpi_cppc_processor_probe+0x264/0x414
>> [7.229963] [] __acpi_processor_start+0x28/0xa0
>> [7.229966] [] acpi_processor_start+0x44/0x54
>> [7.229970] [] driver_probe_device+0x1fc/0x2b0
>> [7.229974] [] __driver_attach+0xb4/0xc0
>> [7.229977] [] bus_for_each_dev+0x5c/0xa0
>> [7.229980] [] driver_attach+0x20/0x30
>> [7.229983] [] bus_add_driver+0x110/0x230
>> [7.229987] [] driver_register+0x60/0x100
>> [7.229991] [] acpi_processor_driver_init+0x2c/0xb0
>> [7.229996] [] do_one_initcall+0x38/0x130
>> [7.23] [] kernel_init_freeable+0x210/0x2b4
>> [7.230004] [] kernel_init+0x10/0x110
>> [7.230007] [] ret_from_fork+0x10/0x50
>>
>> It's because the spinlock inside pcc_mbox_request_channel() is
>> kept too long. This patch releases spinlock before request_irq()
>> and free_irq() to fix this issue  as spinlock is only needed to
>> protect the channel data.
>>
>> Signed-off-by: Hoan Tran 
>> ---
>> v2
>>  * Release spinlock before request_irq() and free_irq() instead of
>> using mutex
>>
>>  drivers/mailbox/pcc.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c
>> index 08c87fa..b5275a8 100644
>> --- a/drivers/mailbox/pcc.c
>> +++ b/drivers/mailbox/pcc.c
>> @@ -267,6 +267,8 @@ struct mbox_chan *pcc_mbox_request_channel(struct 
>> mbox_client *cl,
>>   if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone)
>>   chan->txdone_method |= TXDONE_BY_ACK;
>>
>> + spin_unlock_irqrestore(&chan->lock, flags);
>> +
>>   if (pcc_doorbell_irq[subspace_id] > 0) {
>>   int rc;
>>
>> @@ -279,8 +281,6 @@ struct mbox_chan *pcc_mbox_request_channel(struct 
>> mbox_client *cl,
>>   }
>>   }
>>
>> - spin_unlock_irqrestore(&chan->lock, flags);
>> -
>>   return chan;
>>  }
>>  EXPORT_SYMBOL_GPL(pcc_mbox_request_channel);
>> @@ -310,10 +310,10 @@ void pcc_mbox_free_channel(struct mbox_chan *chan)
>>   if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK))
>>   chan->txdone_method = TXDONE_BY_POLL;
>>
>> + spin_unlock_irqrestore(&chan->lock, flags);
>> +
>>   if (pcc_doorbell_irq[id] > 0)
>>   devm_free_irq(chan->mbox->dev, pcc_doorbell_irq[id], chan);
> Shouldn't we free the irq first and then reset the channel state?
> To avoid the scenario where an interrupt might get triggered after
> we reset the channel state but before we release the interrupt

Yes, and another change for the next version: if request_irq() fails,
call the free_channel() function.

Thanks
Hoan

>
> --
> Thanks,
> Prashanth


Re: [PATCH v2] mailbox: PCC: Fix lockdep warning when request PCC channel

2016-10-21 Thread Hoan Tran
Hi Prashanth,

On Fri, Oct 21, 2016 at 9:13 AM, Prakash, Prashanth
 wrote:
> Hi Hoan,
>
> On 10/18/2016 1:00 AM, Hoan Tran wrote:
>> This patch fixes the lockdep warning below
>>
>> [7.229767] DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))
>> [7.229776] [ cut here ]
>> [7.229787] WARNING: CPU: 1 PID: 1 at 
>> linux-next/kernel/locking/lockdep.c:2876 lockdep_trace_alloc+0xe0/0xf0
>> [7.229790] Modules linked in:
>> [7.229793]
>> [7.229798] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 
>> 4.8.0-11756-g86c5152 #46
>> ...
>> [7.229900] Call trace:
>> [7.229903] Exception stack(0x8007da837890 to 0x8007da8379c0)
>> [7.229906] 7880:   8007da834000 
>> 0001
>> [7.229909] 78a0: 8007da837a70 08a0 60c5 
>> 003d
>> [7.229912] 78c0: 9374bc6a7f3c7832 00381878 09db7ab8 
>> 002f
>> [7.229915] 78e0: 0811aabc 08be2548 8007da837990 
>> 0811adf8
>> [7.229918] 7900: 8007da834000 024080c0 00c0 
>> 09021000
>> [7.229921] 7920:   08c8f7c8 
>> 8007da579810
>> [7.229923] 7940: 002f 8007da858000  
>> 0001
>> [7.229926] 7960: 0001  0811a468 
>> 0002
>> [7.229929] 7980: 656c62617369645f 00038187 00ee 
>> 8007da837850
>> [7.229932] 79a0: 09db50c0 09db569d 0006 
>> 89db568f
>> [7.229936] [] lockdep_trace_alloc+0xe0/0xf0
>> [7.229940] [] __kmalloc_track_caller+0x50/0x250
>> [7.229945] [] devres_alloc_node+0x28/0x60
>> [7.229949] [] devm_request_threaded_irq+0x50/0xe0
>> [7.229955] [] pcc_mbox_request_channel+0x110/0x170
>> [7.229960] [] acpi_cppc_processor_probe+0x264/0x414
>> [7.229963] [] __acpi_processor_start+0x28/0xa0
>> [7.229966] [] acpi_processor_start+0x44/0x54
>> [7.229970] [] driver_probe_device+0x1fc/0x2b0
>> [7.229974] [] __driver_attach+0xb4/0xc0
>> [7.229977] [] bus_for_each_dev+0x5c/0xa0
>> [7.229980] [] driver_attach+0x20/0x30
>> [7.229983] [] bus_add_driver+0x110/0x230
>> [7.229987] [] driver_register+0x60/0x100
>> [7.229991] [] acpi_processor_driver_init+0x2c/0xb0
>> [7.229996] [] do_one_initcall+0x38/0x130
>> [7.23] [] kernel_init_freeable+0x210/0x2b4
>> [7.230004] [] kernel_init+0x10/0x110
>> [7.230007] [] ret_from_fork+0x10/0x50
>>
>> It's because the spinlock inside pcc_mbox_request_channel() is
>> kept too long. This patch releases spinlock before request_irq()
>> and free_irq() to fix this issue  as spinlock is only needed to
>> protect the channel data.
>>
>> Signed-off-by: Hoan Tran 
>> ---
>> v2
>>  * Release spinlock before request_irq() and free_irq() instead of
>> using mutex
>>
>>  drivers/mailbox/pcc.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c
>> index 08c87fa..b5275a8 100644
>> --- a/drivers/mailbox/pcc.c
>> +++ b/drivers/mailbox/pcc.c
>> @@ -267,6 +267,8 @@ struct mbox_chan *pcc_mbox_request_channel(struct 
>> mbox_client *cl,
>>   if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone)
>>   chan->txdone_method |= TXDONE_BY_ACK;
>>
>> + spin_unlock_irqrestore(&chan->lock, flags);
>> +
>>   if (pcc_doorbell_irq[subspace_id] > 0) {
>>   int rc;
>>
>> @@ -279,8 +281,6 @@ struct mbox_chan *pcc_mbox_request_channel(struct 
>> mbox_client *cl,
>>   }
>>   }
>>
>> - spin_unlock_irqrestore(&chan->lock, flags);
>> -
>>   return chan;
>>  }
>>  EXPORT_SYMBOL_GPL(pcc_mbox_request_channel);
>> @@ -310,10 +310,10 @@ void pcc_mbox_free_channel(struct mbox_chan *chan)
>>   if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK))
>>   chan->txdone_method = TXDONE_BY_POLL;
>>
>> + spin_unlock_irqrestore(&chan->lock, flags);
>> +
>>   if (pcc_doorbell_irq[id] > 0)
>>   devm_free_irq(chan->mbox->dev, pcc_doorbell_irq[id], chan);
> Shouldn't we free the irq first and then reset the channel state?
> To avoid the scenario where an interrupt might get triggered after
> we reset the channel state but before we release the interrupt

Yes, and another change for the next version: if request_irq() fails,
call the free_channel() function.

Thanks
Hoan

>
> --
> Thanks,
> Prashanth


[PATCH v3 3/4] arm64: dts: msm8996: Add SMEM DT nodes

2016-10-21 Thread Sarangdhar Joshi
From: Bjorn Andersson 

Add SMEM and TCSR DT nodes on MSM8996.

Signed-off-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 949b096..60d2d20c 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -164,17 +164,34 @@
 
};
 
+   tcsr_mutex: hwlock {
+   compatible = "qcom,tcsr-mutex";
+   syscon = <&tcsr_mutex_regs 0 0x1000>;
+   #hwlock-cells = <1>;
+   };
+
psci {
compatible = "arm,psci-1.0";
method = "smc";
};
 
+   smem {
+   compatible = "qcom,smem";
+   memory-region = <&smem_mem>;
+   hwlocks = <&tcsr_mutex 3>;
+   };
+
soc: soc {
#address-cells = <1>;
#size-cells = <1>;
ranges = <0 0 0 0x>;
compatible = "simple-bus";
 
+   tcsr_mutex_regs: syscon@74 {
+   compatible = "syscon";
+   reg = <0x74 0x2>;
+   };
+
intc: interrupt-controller@9bc {
compatible = "arm,gic-v3";
#interrupt-cells = <3>;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



Re: 4.9-rc1 boot regression, ambiguous bisect result

2016-10-21 Thread Dan Williams
On Fri, Oct 21, 2016 at 1:20 PM, Matt Fleming  wrote:
> On Fri, 21 Oct, at 04:41:29PM, Matt Fleming wrote:
>>
>> FYI, I've been able to reproduce some crash when using your EFI memory
>> map layout under Qemu and forcing the ESRT driver to reserve the space.
>
> Nope, that was a bug in my hack. I can't get Qemu to crash while using
> your memory map layout.
>
> Any chance you can insert "while(1)" loops into the EFI boot paths for
> a kernel that is known to reboot or trigger a triple fault in kernels
> that hang, so that we can narrow in on the issue. See,
>
>   
> http://www.codeblueprint.co.uk/2015/04/early-x86-linux-boot-debug-tricks.html

I can take a look, but it will not be until Monday when I have
physical access to the system again.


[PATCH v3 3/4] arm64: dts: msm8996: Add SMEM DT nodes

2016-10-21 Thread Sarangdhar Joshi
From: Bjorn Andersson 

Add SMEM and TCSR DT nodes on MSM8996.

Signed-off-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 949b096..60d2d20c 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -164,17 +164,34 @@
 
};
 
+   tcsr_mutex: hwlock {
+   compatible = "qcom,tcsr-mutex";
+   syscon = <&tcsr_mutex_regs 0 0x1000>;
+   #hwlock-cells = <1>;
+   };
+
psci {
compatible = "arm,psci-1.0";
method = "smc";
};
 
+   smem {
+   compatible = "qcom,smem";
+   memory-region = <&smem_mem>;
+   hwlocks = <&tcsr_mutex 3>;
+   };
+
soc: soc {
#address-cells = <1>;
#size-cells = <1>;
ranges = <0 0 0 0x>;
compatible = "simple-bus";
 
+   tcsr_mutex_regs: syscon@74 {
+   compatible = "syscon";
+   reg = <0x74 0x2>;
+   };
+
intc: interrupt-controller@9bc {
compatible = "arm,gic-v3";
#interrupt-cells = <3>;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



Re: 4.9-rc1 boot regression, ambiguous bisect result

2016-10-21 Thread Dan Williams
On Fri, Oct 21, 2016 at 1:20 PM, Matt Fleming  wrote:
> On Fri, 21 Oct, at 04:41:29PM, Matt Fleming wrote:
>>
>> FYI, I've been able to reproduce some crash when using your EFI memory
>> map layout under Qemu and forcing the ESRT driver to reserve the space.
>
> Nope, that was a bug in my hack. I can't get Qemu to crash while using
> your memory map layout.
>
> Any chance you can insert "while(1)" loops into the EFI boot paths for
> a kernel that is known to reboot or trigger a triple fault in kernels
> that hang, so that we can narrow in on the issue. See,
>
>   
> http://www.codeblueprint.co.uk/2015/04/early-x86-linux-boot-debug-tricks.html

I can take a look, but it will not be until Monday when I have
physical access to the system again.


[PATCH v3 2/4] arm64: dts: msm8996: Add reserve-memory nodes

2016-10-21 Thread Sarangdhar Joshi
Add reserve-memory nodes required for Qualcomm
Peripheral Image Loaders

Acked-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 36216ae..949b096 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -36,6 +36,31 @@
#size-cells = <2>;
ranges;
 
+   mba_region: mba@9150 {
+   reg = <0x0 0x9150 0x0 0x20>;
+   no-map;
+   };
+
+   slpi_region: slpi@90b0 {
+   reg = <0x0 0x90b0 0xa0>;
+   no-map;
+   };
+
+   venus_region: venus@9040 {
+   reg = <0x0 0x9040 0x0 0x70>;
+   no-map;
+   };
+
+   adsp_region: adsp@8ea0 {
+   reg = <0x0 0x8ea0 0x0 0x1a0>;
+   no-map;
+   };
+
+   mpss_region: mpss@8880 {
+   reg = <0x0 0x8880 0x0 0x620>;
+   no-map;
+   };
+
smem_mem: smem-mem@8600 {
reg = <0x0 0x8600 0x0 0x20>;
no-map;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 1/4] arm64: dts: msm8996: Add SMEM reserve-memory node

2016-10-21 Thread Sarangdhar Joshi
Add DT node to carveout memory for shared memory region.

Reviewed-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index d6da223..36216ae 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -31,6 +31,17 @@
reg = <0 0 0 0>;
};
 
+   reserved-memory {
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+
+   smem_mem: smem-mem@8600 {
+   reg = <0x0 0x8600 0x0 0x20>;
+   no-map;
+   };
+   };
+
cpus {
#address-cells = <2>;
#size-cells = <0>;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 2/4] arm64: dts: msm8996: Add reserve-memory nodes

2016-10-21 Thread Sarangdhar Joshi
Add reserve-memory nodes required for Qualcomm
Peripheral Image Loaders

Acked-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 36216ae..949b096 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -36,6 +36,31 @@
#size-cells = <2>;
ranges;
 
+   mba_region: mba@9150 {
+   reg = <0x0 0x9150 0x0 0x20>;
+   no-map;
+   };
+
+   slpi_region: slpi@90b0 {
+   reg = <0x0 0x90b0 0xa0>;
+   no-map;
+   };
+
+   venus_region: venus@9040 {
+   reg = <0x0 0x9040 0x0 0x70>;
+   no-map;
+   };
+
+   adsp_region: adsp@8ea0 {
+   reg = <0x0 0x8ea0 0x0 0x1a0>;
+   no-map;
+   };
+
+   mpss_region: mpss@8880 {
+   reg = <0x0 0x8880 0x0 0x620>;
+   no-map;
+   };
+
smem_mem: smem-mem@8600 {
reg = <0x0 0x8600 0x0 0x20>;
no-map;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 1/4] arm64: dts: msm8996: Add SMEM reserve-memory node

2016-10-21 Thread Sarangdhar Joshi
Add DT node to carveout memory for shared memory region.

Reviewed-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index d6da223..36216ae 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -31,6 +31,17 @@
reg = <0 0 0 0>;
};
 
+   reserved-memory {
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+
+   smem_mem: smem-mem@8600 {
+   reg = <0x0 0x8600 0x0 0x20>;
+   no-map;
+   };
+   };
+
cpus {
#address-cells = <2>;
#size-cells = <0>;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 4/4] arm64: dts: msm8996: Add SMP2P and APCS nodes

2016-10-21 Thread Sarangdhar Joshi
Add SMP2P and APCS DT nodes required for Qualcomm ADSP
Peripheral Image Loader.

Acked-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 60d2d20c..9e960c1 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -175,6 +175,30 @@
method = "smc";
};
 
+   adsp-smp2p {
+   compatible = "qcom,smp2p";
+   qcom,smem = <443>, <429>;
+
+   interrupts = <0 158 IRQ_TYPE_EDGE_RISING>;
+
+   qcom,ipc = <&apcs 16 10>;
+
+   qcom,local-pid = <0>;
+   qcom,remote-pid = <2>;
+
+   adsp_smp2p_out: master-kernel {
+   qcom,entry-name = "master-kernel";
+   #qcom,state-cells = <1>;
+   };
+
+   adsp_smp2p_in: slave-kernel {
+   qcom,entry-name = "slave-kernel";
+
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   };
+   };
+
smem {
compatible = "qcom,smem";
memory-region = <&smem_mem>;
@@ -203,6 +227,11 @@
interrupts = ;
};
 
+   apcs: syscon@982 {
+   compatible = "syscon";
+   reg = <0x982 0x1000>;
+   };
+
gcc: clock-controller@30 {
compatible = "qcom,gcc-msm8996";
#clock-cells = <1>;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 4/4] arm64: dts: msm8996: Add SMP2P and APCS nodes

2016-10-21 Thread Sarangdhar Joshi
Add SMP2P and APCS DT nodes required for Qualcomm ADSP
Peripheral Image Loader.

Acked-by: Bjorn Andersson 
Signed-off-by: Sarangdhar Joshi 
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi 
b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 60d2d20c..9e960c1 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -175,6 +175,30 @@
method = "smc";
};
 
+   adsp-smp2p {
+   compatible = "qcom,smp2p";
+   qcom,smem = <443>, <429>;
+
+   interrupts = <0 158 IRQ_TYPE_EDGE_RISING>;
+
+   qcom,ipc = <&apcs 16 10>;
+
+   qcom,local-pid = <0>;
+   qcom,remote-pid = <2>;
+
+   adsp_smp2p_out: master-kernel {
+   qcom,entry-name = "master-kernel";
+   #qcom,state-cells = <1>;
+   };
+
+   adsp_smp2p_in: slave-kernel {
+   qcom,entry-name = "slave-kernel";
+
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   };
+   };
+
smem {
compatible = "qcom,smem";
memory-region = <&smem_mem>;
@@ -203,6 +227,11 @@
interrupts = ;
};
 
+   apcs: syscon@982 {
+   compatible = "syscon";
+   reg = <0x982 0x1000>;
+   };
+
gcc: clock-controller@30 {
compatible = "qcom,gcc-msm8996";
#clock-cells = <1>;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 0/4] MSM8996 DT Support for ADSP PIL

2016-10-21 Thread Sarangdhar Joshi
Add various device tree nodes to lay the groundwork for
Qualcomm ADSP Peripheral Image Loader.

Sorry, I didn't send any cover letter for v1.

Changes since v2:
- Remove empty lines from smem DT node (Stephen)

Changes since v1:
- Move hwlock DT node under root (/) (Bjorn and Andy)
- Rename smp2p-adsp to adsp-smp2p (Bjorn)
- Remove interrupt-parent property from node (Bjorn)

Bjorn Andersson (1):
  arm64: dts: msm8996: Add SMEM DT nodes

Sarangdhar Joshi (3):
  arm64: dts: msm8996: Add SMEM reserve-memory node
  arm64: dts: msm8996: Add reserve-memory nodes
  arm64: dts: msm8996: Add SMP2P and APCS nodes

 arch/arm64/boot/dts/qcom/msm8996.dtsi | 82 +++
 1 file changed, 82 insertions(+)

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 0/4] MSM8996 DT Support for ADSP PIL

2016-10-21 Thread Sarangdhar Joshi
Add various device tree nodes to lay the groundwork for
Qualcomm ADSP Peripheral Image Loader.

Sorry, I didn't send any cover letter for v1.

Changes since v2:
- Remove empty lines from smem DT node (Stephen)

Changes since v1:
- Move hwlock DT node under root (/) (Bjorn and Andy)
- Rename smp2p-adsp to adsp-smp2p (Bjorn)
- Remove interrupt-parent property from node (Bjorn)

Bjorn Andersson (1):
  arm64: dts: msm8996: Add SMEM DT nodes

Sarangdhar Joshi (3):
  arm64: dts: msm8996: Add SMEM reserve-memory node
  arm64: dts: msm8996: Add reserve-memory nodes
  arm64: dts: msm8996: Add SMP2P and APCS nodes

 arch/arm64/boot/dts/qcom/msm8996.dtsi | 82 +++
 1 file changed, 82 insertions(+)

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



Re: [PATCH] remoteproc: Add support for xo clock

2016-10-21 Thread Sarangdhar Joshi

On 10/20/2016 04:54 PM, Stephen Boyd wrote:

On 10/20, Sarangdhar Joshi wrote:

Add xo clock support required for Qualcomm ADSP
Peripheral Image Loader.


Yes but why is xo needed?


It is required to boot up the ADSP processor. The remoteproc driver 
keeps the xo clock enabled until the driver receives a "handover" 
interrupt in order to allow remote processor to vote for xo clock with rpm.


I will update the commit text.




@@ -223,6 +232,17 @@ static irqreturn_t adsp_stop_ack_interrupt(int irq, void 
*dev)
return IRQ_HANDLED;
 }

+static int adsp_init_clock(struct qcom_adsp *adsp)
+{
+   adsp->xo = devm_clk_get(adsp->dev, "xo");
+   if (IS_ERR(adsp->xo)) {
+   dev_err(adsp->dev, "failed to get xo clock");


What if it's a probe defer error? Probably best to just be
silent/debug level, or we need a specific test for EPROBE_DEFER
and then silence in that case.


Sure, I will add a check for EPROBE_DEFER. Prefer driver to return
meaningful error message in case of failure.




+   return PTR_ERR(adsp->xo);
+   }




Thanks for reviewing the patch.

Regards,
Sarang

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project


Re: [PATCH] remoteproc: Add support for xo clock

2016-10-21 Thread Sarangdhar Joshi

On 10/20/2016 04:54 PM, Stephen Boyd wrote:

On 10/20, Sarangdhar Joshi wrote:

Add xo clock support required for Qualcomm ADSP
Peripheral Image Loader.


Yes but why is xo needed?


It is required to boot up the ADSP processor. The remoteproc driver 
keeps the xo clock enabled until the driver receives a "handover" 
interrupt in order to allow remote processor to vote for xo clock with rpm.


I will update the commit text.




@@ -223,6 +232,17 @@ static irqreturn_t adsp_stop_ack_interrupt(int irq, void 
*dev)
return IRQ_HANDLED;
 }

+static int adsp_init_clock(struct qcom_adsp *adsp)
+{
+   adsp->xo = devm_clk_get(adsp->dev, "xo");
+   if (IS_ERR(adsp->xo)) {
+   dev_err(adsp->dev, "failed to get xo clock");


What if it's a probe defer error? Probably best to just be
silent/debug level, or we need a specific test for EPROBE_DEFER
and then silence in that case.


Sure, I will add a check for EPROBE_DEFER. Prefer driver to return
meaningful error message in case of failure.




+   return PTR_ERR(adsp->xo);
+   }




Thanks for reviewing the patch.

Regards,
Sarang

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project


[PATCH v2] debug: More properly delay for secondary CPUs

2016-10-21 Thread Douglas Anderson
We've got a delay loop waiting for secondary CPUs.  That loop uses
loops_per_jiffy.  However, loops_per_jiffy doesn't actually mean how
many tight loops make up a jiffy on all architectures.  It is quite
common to see things like this in the boot log:
  Calibrating delay loop (skipped), value calculated using timer
  frequency.. 48.00 BogoMIPS (lpj=24000)

In my case I was seeing lots of cases where other CPUs timed out
entering the debugger only to print their stack crawls shortly after the
kdb> prompt was written.

Elsewhere in kgdb we already use udelay(), so that should be safe enough
to use to implement our timeout.  We'll delay 1 ms for 1000 times, which
should give us a full second of delay (just like the old code wanted)
but allow us to notice that we're done every 1 ms.

Signed-off-by: Douglas Anderson 
---
Changes in v2:
- Use udelay, not __delay

 kernel/debug/debug_core.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..85a246feb442 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -61,6 +61,8 @@
 
 #include "debug_core.h"
 
+#define WAIT_CPUS_STOP_MS  1000
+
 static int kgdb_break_asap;
 
 struct debuggerinfo_struct kgdb_info[NR_CPUS];
@@ -598,11 +600,11 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct 
pt_regs *regs,
/*
 * Wait for the other CPUs to be notified and be waiting for us:
 */
-   time_left = loops_per_jiffy * HZ;
+   time_left = WAIT_CPUS_STOP_MS;
while (kgdb_do_roundup && --time_left &&
   (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
   online_cpus)
-   cpu_relax();
+   udelay(1000);
if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n");
 
-- 
2.8.0.rc3.226.g39d4020



[PATCH v2] debug: More properly delay for secondary CPUs

2016-10-21 Thread Douglas Anderson
We've got a delay loop waiting for secondary CPUs.  That loop uses
loops_per_jiffy.  However, loops_per_jiffy doesn't actually mean how
many tight loops make up a jiffy on all architectures.  It is quite
common to see things like this in the boot log:
  Calibrating delay loop (skipped), value calculated using timer
  frequency.. 48.00 BogoMIPS (lpj=24000)

In my case I was seeing lots of cases where other CPUs timed out
entering the debugger only to print their stack crawls shortly after the
kdb> prompt was written.

Elsewhere in kgdb we already use udelay(), so that should be safe enough
to use to implement our timeout.  We'll delay 1 ms for 1000 times, which
should give us a full second of delay (just like the old code wanted)
but allow us to notice that we're done every 1 ms.

Signed-off-by: Douglas Anderson 
---
Changes in v2:
- Use udelay, not __delay

 kernel/debug/debug_core.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..85a246feb442 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -61,6 +61,8 @@
 
 #include "debug_core.h"
 
+#define WAIT_CPUS_STOP_MS  1000
+
 static int kgdb_break_asap;
 
 struct debuggerinfo_struct kgdb_info[NR_CPUS];
@@ -598,11 +600,11 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct 
pt_regs *regs,
/*
 * Wait for the other CPUs to be notified and be waiting for us:
 */
-   time_left = loops_per_jiffy * HZ;
+   time_left = WAIT_CPUS_STOP_MS;
while (kgdb_do_roundup && --time_left &&
   (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
   online_cpus)
-   cpu_relax();
+   udelay(1000);
if (!time_left)
pr_crit("Timed out waiting for secondary CPUs.\n");
 
-- 
2.8.0.rc3.226.g39d4020



Re: [PATCH] pinctrl: qcom: Add msm8994 pinctrl driver

2016-10-21 Thread Jeremy McNicoll
On Fri, Oct 21, 2016 at 03:42:50PM -0700, Michael Scott wrote:
> Initial pinctrl driver for QCOM msm8994 platforms.
> 
> In order to continue the initial board support for QCOM msm8994/msm8992
> presented in patches from Jeremy McNicoll , let's put
> a proper pinctrl driver in place.
> 
> Currently, the DT for these platforms uses the msm8x74 pinctrl driver to 
> enable
> basic UART.  Beyond the first few pins the rest are different enough to 
> justify
> its own driver.
> 
> Note: This driver is also used by QCOM's msm8992 platform as its TLM block
> is the same.
> 
> - Initial formatting and style was taken from the msm8x74 pinctrl driver added
>   by Björn Andersson 
> - Data was then adjusted per QCOM MSM8994 documentation for Top Level 
> Multiplexing
> - Bindings documentation was based on qcom,msm8996-pinctrl.txt by
>   Joonwoo Park  and then modified for msm8994 content
> 
> Signed-off-by: Michael Scott 
> ---
>  .../bindings/pinctrl/qcom,msm8994-pinctrl.txt  |  175 +++
>  drivers/pinctrl/qcom/Kconfig   |9 +
>  drivers/pinctrl/qcom/Makefile  |1 +
>  drivers/pinctrl/qcom/pinctrl-msm8994.c | 1402 
> 
>  4 files changed, 1587 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
>  create mode 100644 drivers/pinctrl/qcom/pinctrl-msm8994.c
>

This works fine on my msm8992 which currently only supports basic
serial.

Having already compared it to downstream it looks like things
line up pretty good. (from a msm8992 perspective)

Reviewed-by: Jeremy McNicoll 


-jeremy



Re: [PATCH] pinctrl: qcom: Add msm8994 pinctrl driver

2016-10-21 Thread Jeremy McNicoll
On Fri, Oct 21, 2016 at 03:42:50PM -0700, Michael Scott wrote:
> Initial pinctrl driver for QCOM msm8994 platforms.
> 
> In order to continue the initial board support for QCOM msm8994/msm8992
> presented in patches from Jeremy McNicoll , let's put
> a proper pinctrl driver in place.
> 
> Currently, the DT for these platforms uses the msm8x74 pinctrl driver to 
> enable
> basic UART.  Beyond the first few pins the rest are different enough to 
> justify
> its own driver.
> 
> Note: This driver is also used by QCOM's msm8992 platform as its TLM block
> is the same.
> 
> - Initial formatting and style was taken from the msm8x74 pinctrl driver added
>   by Björn Andersson 
> - Data was then adjusted per QCOM MSM8994 documentation for Top Level 
> Multiplexing
> - Bindings documentation was based on qcom,msm8996-pinctrl.txt by
>   Joonwoo Park  and then modified for msm8994 content
> 
> Signed-off-by: Michael Scott 
> ---
>  .../bindings/pinctrl/qcom,msm8994-pinctrl.txt  |  175 +++
>  drivers/pinctrl/qcom/Kconfig   |9 +
>  drivers/pinctrl/qcom/Makefile  |1 +
>  drivers/pinctrl/qcom/pinctrl-msm8994.c | 1402 
> 
>  4 files changed, 1587 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
>  create mode 100644 drivers/pinctrl/qcom/pinctrl-msm8994.c
>

This works fine on my msm8992 which currently only supports basic
serial.

Having already compared it to downstream it looks like things
line up pretty good. (from a msm8992 perspective)

Reviewed-by: Jeremy McNicoll 


-jeremy



[PATCH] MAINTAINERS: add drivers/pinctrl/qcom to ARM/QUALCOMM SUPPORT

2016-10-21 Thread Michael Scott
When running checkpatch.pl on a new pinctrl driver in
drivers/pinctrl/qcom, I noticed a warning about creating
a new maintainer.  The drivers/pinctrl/qcom folder doesn't have
anyone assigned in the MAINTAINERS file.

Let's assign it to ARM/QUALCOMM SUPPORT.

Signed-off-by: Michael Scott 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index b6c28e1..d0d6f92 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1600,6 +1600,7 @@ F:arch/arm/mach-qcom/
 F: arch/arm64/boot/dts/qcom/*
 F: drivers/i2c/busses/i2c-qup.c
 F: drivers/clk/qcom/
+F: drivers/pinctrl/qcom/
 F: drivers/soc/qcom/
 F: drivers/spi/spi-qup.c
 F: drivers/tty/serial/msm_serial.h
-- 
2.9.3



[PATCH] MAINTAINERS: add drivers/pinctrl/qcom to ARM/QUALCOMM SUPPORT

2016-10-21 Thread Michael Scott
When running checkpatch.pl on a new pinctrl driver in
drivers/pinctrl/qcom, I noticed a warning about creating
a new maintainer.  The drivers/pinctrl/qcom folder doesn't have
anyone assigned in the MAINTAINERS file.

Let's assign it to ARM/QUALCOMM SUPPORT.

Signed-off-by: Michael Scott 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index b6c28e1..d0d6f92 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1600,6 +1600,7 @@ F:arch/arm/mach-qcom/
 F: arch/arm64/boot/dts/qcom/*
 F: drivers/i2c/busses/i2c-qup.c
 F: drivers/clk/qcom/
+F: drivers/pinctrl/qcom/
 F: drivers/soc/qcom/
 F: drivers/spi/spi-qup.c
 F: drivers/tty/serial/msm_serial.h
-- 
2.9.3



Re: [PATCH] x86/AMD: Apply erratum 688 on machines without a BIOS fix

2016-10-21 Thread Borislav Petkov
On Sat, Oct 22, 2016 at 12:51:32AM +0300, sonofa...@openmailbox.org wrote:
> Thank you for your time! I have chosen reply to list and all recipients, it
> must work now.

Yes, exactly what I had in mind.

> My brother rejected the proposed patch because it does not provide
> equivalent functionality with the original.
> 
> Our initial patch would fix 3 broken models and 1 working model. Your patch
> will only work for 1 model. Only machines having our APU will be fixed. All
> B0 APUs will be unpatched. This is not right. Check the revision guide to
> verify that.

Right you are: I read too much into the description of bit 2 of
D18F4x164. Of course we want to apply that fix to ON-Bs too.

> To avoid unneeded complexity we propose this patch as V2, do you agree?
> 
> +#define MSR_AMD64_IC_CFG 0xC0011021
> +
> +static void init_amd_on(struct cpuinfo_x86 *c)
> +{
> + /*
> +  * Apply erratum 688 fix so machines without a BIOS
> +  * fix work.
> +  */
> +
> + u32 val = pci_read_config(0, 0x18, 0x4, 0x164);
> +
> + if (!(val & BIT(2))) {
> + msr_set_bit(MSR_AMD64_IC_CFG, 3);
> + msr_set_bit(MSR_AMD64_IC_CFG, 14);

Yes, that should work fine.

Btw, there's missing a closing } for the if-test here.

> +}
>  static void init_amd_bd(struct cpuinfo_x86 *c)
>  {
>   u64 value;
> @@ -738,6 +750,7 @@ static void init_amd(struct cpuinfo_x86
>   case 0xf:  init_amd_k8(c); break;
>   case 0x10: init_amd_gh(c); break;
>   case 0x12: init_amd_ln(c); break;
> + case 0x14: init_amd_on(c); break;
>   case 0x15: init_amd_bd(c); break;
>   }
> 
> Please advise to proceed!

Right, please send a tested version of the above with the explanation
text from your initial submission.

Thanks.

> erratum 721 :-(

Hmm, interesting.

Do you have a way to trigger that one?

-- 
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.


Re: [PATCH] x86/AMD: Apply erratum 688 on machines without a BIOS fix

2016-10-21 Thread Borislav Petkov
On Sat, Oct 22, 2016 at 12:51:32AM +0300, sonofa...@openmailbox.org wrote:
> Thank you for your time! I have chosen reply to list and all recipients, it
> must work now.

Yes, exactly what I had in mind.

> My brother rejected the proposed patch because it does not provide
> equivalent functionality with the original.
> 
> Our initial patch would fix 3 broken models and 1 working model. Your patch
> will only work for 1 model. Only machines having our APU will be fixed. All
> B0 APUs will be unpatched. This is not right. Check the revision guide to
> verify that.

Right you are: I read too much into the description of bit 2 of
D18F4x164. Of course we want to apply that fix to ON-Bs too.

> To avoid unneeded complexity we propose this patch as V2, do you agree?
> 
> +#define MSR_AMD64_IC_CFG 0xC0011021
> +
> +static void init_amd_on(struct cpuinfo_x86 *c)
> +{
> + /*
> +  * Apply erratum 688 fix so machines without a BIOS
> +  * fix work.
> +  */
> +
> + u32 val = pci_read_config(0, 0x18, 0x4, 0x164);
> +
> + if (!(val & BIT(2))) {
> + msr_set_bit(MSR_AMD64_IC_CFG, 3);
> + msr_set_bit(MSR_AMD64_IC_CFG, 14);

Yes, that should work fine.

Btw, there's missing a closing } for the if-test here.

> +}
>  static void init_amd_bd(struct cpuinfo_x86 *c)
>  {
>   u64 value;
> @@ -738,6 +750,7 @@ static void init_amd(struct cpuinfo_x86
>   case 0xf:  init_amd_k8(c); break;
>   case 0x10: init_amd_gh(c); break;
>   case 0x12: init_amd_ln(c); break;
> + case 0x14: init_amd_on(c); break;
>   case 0x15: init_amd_bd(c); break;
>   }
> 
> Please advise to proceed!

Right, please send a tested version of the above with the explanation
text from your initial submission.

Thanks.

> erratum 721 :-(

Hmm, interesting.

Do you have a way to trigger that one?

-- 
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.


Re: [RFC] put more pressure on proc/sysfs slab shrink

2016-10-21 Thread Dave Chinner
On Fri, Oct 21, 2016 at 01:35:14PM -0700, Shaohua Li wrote:
> In our systems, proc/sysfs inode/dentry cache use more than 1G memory
> even memory pressure is high sometimes. Since proc/sysfs is in-memory
> filesystem, rebuilding the cache is fast. There is no point proc/sysfs
> and disk fs have equal pressure for slab shrink.
> 
> One idea is directly discarding proc/sysfs inode/dentry cache rightly
> after the proc/sysfs file is closed. But the discarding will make
> proc/sysfs file open slower next time, which is 20x slower in my test if
> multiple applications are accessing proc files. This patch doesn't go
> that far. Instead, just put more pressure to shrink proc/sysfs slabs.
> 
> Signed-off-by: Shaohua Li 
> ---
>  fs/kernfs/mount.c | 2 ++
>  fs/proc/inode.c   | 2 ++
>  2 files changed, 4 insertions(+)
> 
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index d5b149a..5b4e747 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -161,6 +161,8 @@ static int kernfs_fill_super(struct super_block *sb, 
> unsigned long magic)
>   sb->s_xattr = kernfs_xattr_handlers;
>   sb->s_time_gran = 1;
>  
> + sb->s_shrink.seeks = 1;
> + sb->s_shrink.batch = 0;

This sort of thing needs comments as to why they are being changed.
Otherwise the next person who comes along to do shrinker
modifications won't have a clue about why this magic exists.

Also, I don't think s_shrink.batch = 0 does what you think it does.
The superblock batch size default of 1024 is more efficient than
setting sb->s_shrink.batch = 0 as that makes the shrinker use
SHRINK_BATCH:

#define SHRINK_BATCH 128

i.e. it does less work per batch so has more overhead

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [RFC] put more pressure on proc/sysfs slab shrink

2016-10-21 Thread Dave Chinner
On Fri, Oct 21, 2016 at 01:35:14PM -0700, Shaohua Li wrote:
> In our systems, proc/sysfs inode/dentry cache use more than 1G memory
> even memory pressure is high sometimes. Since proc/sysfs is in-memory
> filesystem, rebuilding the cache is fast. There is no point proc/sysfs
> and disk fs have equal pressure for slab shrink.
> 
> One idea is directly discarding proc/sysfs inode/dentry cache rightly
> after the proc/sysfs file is closed. But the discarding will make
> proc/sysfs file open slower next time, which is 20x slower in my test if
> multiple applications are accessing proc files. This patch doesn't go
> that far. Instead, just put more pressure to shrink proc/sysfs slabs.
> 
> Signed-off-by: Shaohua Li 
> ---
>  fs/kernfs/mount.c | 2 ++
>  fs/proc/inode.c   | 2 ++
>  2 files changed, 4 insertions(+)
> 
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index d5b149a..5b4e747 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -161,6 +161,8 @@ static int kernfs_fill_super(struct super_block *sb, 
> unsigned long magic)
>   sb->s_xattr = kernfs_xattr_handlers;
>   sb->s_time_gran = 1;
>  
> + sb->s_shrink.seeks = 1;
> + sb->s_shrink.batch = 0;

This sort of thing needs comments as to why they are being changed.
Otherwise the next person who comes along to do shrinker
modifications won't have a clue about why this magic exists.

Also, I don't think s_shrink.batch = 0 does what you think it does.
The superblock batch size default of 1024 is more efficient than
setting sb->s_shrink.batch = 0 as that makes the shrinker use
SHRINK_BATCH:

#define SHRINK_BATCH 128

i.e. it does less work per batch so has more overhead

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [PATCH v4 4/9] clk: sunxi-ng: Add minimums for all the relevant structures and clocks

2016-10-21 Thread André Przywara
Salut,

On 11/10/16 15:28, Maxime Ripard wrote:
> Modify the current clocks we have to be able to specify the minimum for
> each clocks we support, just like we support the max.
> 
> Signed-off-by: Maxime Ripard 
> ---
>  drivers/clk/sunxi-ng/ccu_mult.c |  7 ++-
>  drivers/clk/sunxi-ng/ccu_nk.c   | 12 
>  drivers/clk/sunxi-ng/ccu_nkm.c  | 18 --
>  drivers/clk/sunxi-ng/ccu_nkmp.c | 16 
>  drivers/clk/sunxi-ng/ccu_nm.c   | 12 
>  5 files changed, 46 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/clk/sunxi-ng/ccu_mult.c b/drivers/clk/sunxi-ng/ccu_mult.c
> index 32a1964439a2..6a02ffee5386 100644
> --- a/drivers/clk/sunxi-ng/ccu_mult.c
> +++ b/drivers/clk/sunxi-ng/ccu_mult.c
> @@ -14,7 +14,7 @@
>  #include "ccu_mult.h"
>  
>  struct _ccu_mult {
> - unsigned long   mult, max;
> + unsigned long   mult, min, max;
>  };
>  
>  static void ccu_mult_find_best(unsigned long parent, unsigned long rate,
> @@ -23,6 +23,9 @@ static void ccu_mult_find_best(unsigned long parent, 
> unsigned long rate,
>   int _mult;
>  
>   _mult = rate / parent;
> + if (_mult < mult->min)
> + _mult = mult->min;
> +
>   if (_mult > mult->max)
>   _mult = mult->max;
>  
> @@ -37,6 +40,7 @@ static unsigned long ccu_mult_round_rate(struct 
> ccu_mux_internal *mux,
>   struct ccu_mult *cm = data;
>   struct _ccu_mult _cm;
>  
> + _cm.min = 1;
>   _cm.max = 1 << cm->mult.width;
>   ccu_mult_find_best(parent_rate, rate, &_cm);
>  
> @@ -101,6 +105,7 @@ static int ccu_mult_set_rate(struct clk_hw *hw, unsigned 
> long rate,
>   ccu_mux_helper_adjust_parent_for_prediv(>common, >mux, -1,
>   _rate);
>  
> + _cm.min = 1;
>   _cm.max = 1 << cm->mult.width;
>   ccu_mult_find_best(parent_rate, rate, &_cm);
>  
> diff --git a/drivers/clk/sunxi-ng/ccu_nk.c b/drivers/clk/sunxi-ng/ccu_nk.c
> index e7e2e75618ef..a42d870ba0ef 100644
> --- a/drivers/clk/sunxi-ng/ccu_nk.c
> +++ b/drivers/clk/sunxi-ng/ccu_nk.c
> @@ -14,8 +14,8 @@
>  #include "ccu_nk.h"
>  
>  struct _ccu_nk {
> - unsigned long   n, max_n;
> - unsigned long   k, max_k;
> + unsigned long   n, min_n, max_n;
> + unsigned long   k, min_k, max_k;
>  };
>  
>  static void ccu_nk_find_best(unsigned long parent, unsigned long rate,
> @@ -25,8 +25,8 @@ static void ccu_nk_find_best(unsigned long parent, unsigned 
> long rate,
>   unsigned int best_k = 0, best_n = 0;
>   unsigned int _k, _n;
>  
> - for (_k = 1; _k <= nk->max_k; _k++) {
> - for (_n = 1; _n <= nk->max_n; _n++) {
> + for (_k = nk->min_k; _k <= nk->max_k; _k++) {
> + for (_n = nk->min_n; _n <= nk->max_n; _n++) {
>   unsigned long tmp_rate = parent * _n * _k;
>  
>   if (tmp_rate > rate)
> @@ -97,7 +97,9 @@ static long ccu_nk_round_rate(struct clk_hw *hw, unsigned 
> long rate,
>   if (nk->common.features & CCU_FEATURE_FIXED_POSTDIV)
>   rate *= nk->fixed_post_div;
>  
> + _nk.min_n = 1;
>   _nk.max_n = 1 << nk->n.width;
> + _nk.min_k = 1;
>   _nk.max_k = 1 << nk->k.width;
>  
>   ccu_nk_find_best(*parent_rate, rate, &_nk);
> @@ -120,7 +122,9 @@ static int ccu_nk_set_rate(struct clk_hw *hw, unsigned 
> long rate,
>   if (nk->common.features & CCU_FEATURE_FIXED_POSTDIV)
>   rate = rate * nk->fixed_post_div;
>  
> + _nk.min_n = 1;
>   _nk.max_n = 1 << nk->n.width;
> + _nk.min_k = 1;
>   _nk.max_k = 1 << nk->k.width;
>  
>   ccu_nk_find_best(parent_rate, rate, &_nk);
> diff --git a/drivers/clk/sunxi-ng/ccu_nkm.c b/drivers/clk/sunxi-ng/ccu_nkm.c
> index 0b08d000eb38..b2a5fccf2f8c 100644
> --- a/drivers/clk/sunxi-ng/ccu_nkm.c
> +++ b/drivers/clk/sunxi-ng/ccu_nkm.c
> @@ -14,9 +14,9 @@
>  #include "ccu_nkm.h"
>  
>  struct _ccu_nkm {
> - unsigned long   n, max_n;
> - unsigned long   k, max_k;
> - unsigned long   m, max_m;
> + unsigned long   n, min_n, max_n;
> + unsigned long   k, min_k, max_k;
> + unsigned long   m, min_m, max_m;
>  };
>  
>  static void ccu_nkm_find_best(unsigned long parent, unsigned long rate,
> @@ -26,9 +26,9 @@ static void ccu_nkm_find_best(unsigned long parent, 
> unsigned long rate,
>   unsigned long best_n = 0, best_k = 0, best_m = 0;
>   unsigned long _n, _k, _m;
>  
> - for (_k = 1; _k <= nkm->max_k; _k++) {
> - for (_n = 1; _n <= nkm->max_n; _n++) {
> - for (_m = 1; _n <= nkm->max_m; _m++) {
> + for (_k = nkm->min_k; _k <= nkm->max_k; _k++) {
> + for (_n = nkm->min_n; _n <= nkm->max_n; _n++) {
> + for (_m = nkm->min_m; _n <= nkm->max_m; _m++) {

should be _m in the condition

>   unsigned long tmp_rate;
>  
>   tmp_rate = parent * _n * _k / _m;
> @@ -100,8 +100,11 @@ static 

Re: [PATCH v4 4/9] clk: sunxi-ng: Add minimums for all the relevant structures and clocks

2016-10-21 Thread André Przywara
Salut,

On 11/10/16 15:28, Maxime Ripard wrote:
> Modify the current clocks we have to be able to specify the minimum for
> each clocks we support, just like we support the max.
> 
> Signed-off-by: Maxime Ripard 
> ---
>  drivers/clk/sunxi-ng/ccu_mult.c |  7 ++-
>  drivers/clk/sunxi-ng/ccu_nk.c   | 12 
>  drivers/clk/sunxi-ng/ccu_nkm.c  | 18 --
>  drivers/clk/sunxi-ng/ccu_nkmp.c | 16 
>  drivers/clk/sunxi-ng/ccu_nm.c   | 12 
>  5 files changed, 46 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/clk/sunxi-ng/ccu_mult.c b/drivers/clk/sunxi-ng/ccu_mult.c
> index 32a1964439a2..6a02ffee5386 100644
> --- a/drivers/clk/sunxi-ng/ccu_mult.c
> +++ b/drivers/clk/sunxi-ng/ccu_mult.c
> @@ -14,7 +14,7 @@
>  #include "ccu_mult.h"
>  
>  struct _ccu_mult {
> - unsigned long   mult, max;
> + unsigned long   mult, min, max;
>  };
>  
>  static void ccu_mult_find_best(unsigned long parent, unsigned long rate,
> @@ -23,6 +23,9 @@ static void ccu_mult_find_best(unsigned long parent, 
> unsigned long rate,
>   int _mult;
>  
>   _mult = rate / parent;
> + if (_mult < mult->min)
> + _mult = mult->min;
> +
>   if (_mult > mult->max)
>   _mult = mult->max;
>  
> @@ -37,6 +40,7 @@ static unsigned long ccu_mult_round_rate(struct 
> ccu_mux_internal *mux,
>   struct ccu_mult *cm = data;
>   struct _ccu_mult _cm;
>  
> + _cm.min = 1;
>   _cm.max = 1 << cm->mult.width;
>   ccu_mult_find_best(parent_rate, rate, &_cm);
>  
> @@ -101,6 +105,7 @@ static int ccu_mult_set_rate(struct clk_hw *hw, unsigned 
> long rate,
>   ccu_mux_helper_adjust_parent_for_prediv(>common, >mux, -1,
>   _rate);
>  
> + _cm.min = 1;
>   _cm.max = 1 << cm->mult.width;
>   ccu_mult_find_best(parent_rate, rate, &_cm);
>  
> diff --git a/drivers/clk/sunxi-ng/ccu_nk.c b/drivers/clk/sunxi-ng/ccu_nk.c
> index e7e2e75618ef..a42d870ba0ef 100644
> --- a/drivers/clk/sunxi-ng/ccu_nk.c
> +++ b/drivers/clk/sunxi-ng/ccu_nk.c
> @@ -14,8 +14,8 @@
>  #include "ccu_nk.h"
>  
>  struct _ccu_nk {
> - unsigned long   n, max_n;
> - unsigned long   k, max_k;
> + unsigned long   n, min_n, max_n;
> + unsigned long   k, min_k, max_k;
>  };
>  
>  static void ccu_nk_find_best(unsigned long parent, unsigned long rate,
> @@ -25,8 +25,8 @@ static void ccu_nk_find_best(unsigned long parent, unsigned 
> long rate,
>   unsigned int best_k = 0, best_n = 0;
>   unsigned int _k, _n;
>  
> - for (_k = 1; _k <= nk->max_k; _k++) {
> - for (_n = 1; _n <= nk->max_n; _n++) {
> + for (_k = nk->min_k; _k <= nk->max_k; _k++) {
> + for (_n = nk->min_n; _n <= nk->max_n; _n++) {
>   unsigned long tmp_rate = parent * _n * _k;
>  
>   if (tmp_rate > rate)
> @@ -97,7 +97,9 @@ static long ccu_nk_round_rate(struct clk_hw *hw, unsigned 
> long rate,
>   if (nk->common.features & CCU_FEATURE_FIXED_POSTDIV)
>   rate *= nk->fixed_post_div;
>  
> + _nk.min_n = 1;
>   _nk.max_n = 1 << nk->n.width;
> + _nk.min_k = 1;
>   _nk.max_k = 1 << nk->k.width;
>  
>   ccu_nk_find_best(*parent_rate, rate, &_nk);
> @@ -120,7 +122,9 @@ static int ccu_nk_set_rate(struct clk_hw *hw, unsigned 
> long rate,
>   if (nk->common.features & CCU_FEATURE_FIXED_POSTDIV)
>   rate = rate * nk->fixed_post_div;
>  
> + _nk.min_n = 1;
>   _nk.max_n = 1 << nk->n.width;
> + _nk.min_k = 1;
>   _nk.max_k = 1 << nk->k.width;
>  
>   ccu_nk_find_best(parent_rate, rate, &_nk);
> diff --git a/drivers/clk/sunxi-ng/ccu_nkm.c b/drivers/clk/sunxi-ng/ccu_nkm.c
> index 0b08d000eb38..b2a5fccf2f8c 100644
> --- a/drivers/clk/sunxi-ng/ccu_nkm.c
> +++ b/drivers/clk/sunxi-ng/ccu_nkm.c
> @@ -14,9 +14,9 @@
>  #include "ccu_nkm.h"
>  
>  struct _ccu_nkm {
> - unsigned long   n, max_n;
> - unsigned long   k, max_k;
> - unsigned long   m, max_m;
> + unsigned long   n, min_n, max_n;
> + unsigned long   k, min_k, max_k;
> + unsigned long   m, min_m, max_m;
>  };
>  
>  static void ccu_nkm_find_best(unsigned long parent, unsigned long rate,
> @@ -26,9 +26,9 @@ static void ccu_nkm_find_best(unsigned long parent, 
> unsigned long rate,
>   unsigned long best_n = 0, best_k = 0, best_m = 0;
>   unsigned long _n, _k, _m;
>  
> - for (_k = 1; _k <= nkm->max_k; _k++) {
> - for (_n = 1; _n <= nkm->max_n; _n++) {
> - for (_m = 1; _n <= nkm->max_m; _m++) {
> + for (_k = nkm->min_k; _k <= nkm->max_k; _k++) {
> + for (_n = nkm->min_n; _n <= nkm->max_n; _n++) {
> + for (_m = nkm->min_m; _n <= nkm->max_m; _m++) {

should be _m in the condition

>   unsigned long tmp_rate;
>  
>   tmp_rate = parent * _n * _k / _m;
> @@ -100,8 +100,11 @@ static unsigned long 

Re: [PATCH] uapi: linux: acct: Remove redundant type comp2_t from kernel

2016-10-21 Thread Chen Gang

On 10/21/16 11:41, Andrew Morton wrote:
> On Wed,  5 Oct 2016 21:40:10 +0800 cheng...@emindsoft.com.cn wrote:
> 
>> In api itself, kernel does not use it -- it is divided into ac_etime_hi
>> and ac_etime_lo. So kernel side only need generate the correct
>> ac_etime_hi and ac_etime_lo, but need not know about comp2_t.
>>
>> At present, kernel use normal u64 type for it, when kernel provides it to
>> outside, kernel can translate it into ac_etime_hi and ac_etime_lo,
>> directly, but need not notice about comp2_t, in fact.
> 
> hm.  Why is this an improvement?
> 

For me, it makes the code a little more understandable, a little simpler,
and a little more extensible (when kernel members really need comp2_t in
future, they will not have to treat it as __u32).

Only when comp2_t is really used in api header in future, kernel has to
know about it, but kernel still can keep original code no touch. So for
me, our changing is harmless.

Thanks.
-- 
Chen Gang (陈刚)

Managing Natural Environments is the Duty of Human Beings.


Re: [PATCH] uapi: linux: acct: Remove redundant type comp2_t from kernel

2016-10-21 Thread Chen Gang

On 10/21/16 11:41, Andrew Morton wrote:
> On Wed,  5 Oct 2016 21:40:10 +0800 cheng...@emindsoft.com.cn wrote:
> 
>> In api itself, kernel does not use it -- it is divided into ac_etime_hi
>> and ac_etime_lo. So kernel side only need generate the correct
>> ac_etime_hi and ac_etime_lo, but need not know about comp2_t.
>>
>> At present, kernel use normal u64 type for it, when kernel provides it to
>> outside, kernel can translate it into ac_etime_hi and ac_etime_lo,
>> directly, but need not notice about comp2_t, in fact.
> 
> hm.  Why is this an improvement?
> 

For me, it makes the code a little more understandable, a little simpler,
and a little more extensible (when kernel members really need comp2_t in
future, they will not have to treat it as __u32).

Only when comp2_t is really used in api header in future, kernel has to
know about it, but kernel still can keep original code no touch. So for
me, our changing is harmless.

Thanks.
-- 
Chen Gang (陈刚)

Managing Natural Environments is the Duty of Human Beings.


Re: [PATCH] shmem: avoid huge pages for small files

2016-10-21 Thread Dave Chinner
On Fri, Oct 21, 2016 at 06:00:07PM +0300, Kirill A. Shutemov wrote:
> On Fri, Oct 21, 2016 at 04:01:18PM +1100, Dave Chinner wrote:
> > On Thu, Oct 20, 2016 at 07:01:16PM -0700, Andi Kleen wrote:
> > > > Ugh, no, please don't use mount options for file specific behaviours
> > > > in filesystems like ext4 and XFS. This is exactly the sort of
> > > > behaviour that should either just work automatically (i.e. be
> > > > completely controlled by the filesystem) or only be applied to files
> > > 
> > > Can you explain what you mean? How would the file system control it?
> > 
> > There's no point in asking for huge pages when populating the page
> > cache if the file is:
> > 
> > - significantly smaller than the huge page size
> > - largely sparse
> > - being randomly accessed in small chunks
> > - badly fragmented and so takes hundreds of IO to read/write
> >   a huge page
> > - able to optimise delayed allocation to match huge page
> >   sizes and alignments
> > 
> > These are all constraints the filesystem knows about, but the
> > application and user don't.
> 
> Really?
> 
> To me, most of things you're talking about is highly dependent on access
> pattern generated by userspace:
> 
>   - we may want to allocate huge pages from byte 1 if we know that file
> will grow;

delayed allocation takes care of that. We use a growing speculative
delalloc size that kicks in at specific sizes and can be used
directly to determine if a large page should be allocated. This code
is aware of sparse files, sparse writes, etc.

>   - the same for sparse file that will be filled;

See above.

>   - it will be beneficial to allocate huge page even for fragmented files,
> if it's read-mostly;

No, no it won't. The IO latency impact here can be massive.
read-ahead of single 4k pages hides most of this latency from the
application, but with a 2MB page, we can't use readahead to hide this
IO latency because the first access could stall for hundreds of
small random read IOs to be completed instead of just 1.


> > Further, we are moving the IO path to a model where we use extents
> > for mapping, not blocks.  We're optimising for the fact that modern
> > filesystems use extents and so massively reduce the number of block
> > mapping lookup calls we need to do for a given IO.
> > 
> > i.e. instead of doing "get page, map block to page" over and over
> > again until we've walked over the entire IO range, we're doing
> > "map extent for entire IO range" once, then iterating "get page"
> > until we've mapped the entire range.
> 
> That's great, but it's not how IO path works *now*. And it will take a long
> time (if ever) to flip it over to what you've described.

Wrong. fs/iomap.c. XFS already uses it, ext4 is being converted
right now, GFS2 will use parts of it in the next release, DAX
already uses it and PMD support in DAX is being built on top of it.

> > As such, there is no way we should be considering different
> > interfaces and methods for configuring the /same functionality/ just
> > because DAX is enabled or not. It's the /same decision/ that needs
> > to be made, and the filesystem knows an awful lot more about whether
> > huge pages can be used efficiently at the time of access than just
> > about any other actor you can name
> 
> I'm not convinced that filesystem is in better position to see access
> patterns than mm for page cache. It's not all about on-disk layout.

Spoken like a true mm developer. IO performance is all about IO
patterns, and the primary contributor to bad IO patterns is bad
filesystem allocation patterns :P

We're rapidly moving away from the world where a page cache is
needed to give applications decent performance. DAX doesn't have a
page cache, applications wanting to use high IOPS (hundreds of
thousands to millions) storage are using direct IO, because the page
cache just introduces latency, memory usage issues and
non-deterministic IO behaviour.

If we try to make the page cache the "one true IO optimisation source"
then we're screwing ourselves because the incoming IO technologies
simply don't require it anymore. We need to be ahead of that curve,
not playing catchup, and that's why this sort of "what should the
page cache do" decisions really need to come from the IO path where
we see /all/ the IO, not just buffered IO

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [PATCH] shmem: avoid huge pages for small files

2016-10-21 Thread Dave Chinner
On Fri, Oct 21, 2016 at 06:00:07PM +0300, Kirill A. Shutemov wrote:
> On Fri, Oct 21, 2016 at 04:01:18PM +1100, Dave Chinner wrote:
> > On Thu, Oct 20, 2016 at 07:01:16PM -0700, Andi Kleen wrote:
> > > > Ugh, no, please don't use mount options for file specific behaviours
> > > > in filesystems like ext4 and XFS. This is exactly the sort of
> > > > behaviour that should either just work automatically (i.e. be
> > > > completely controlled by the filesystem) or only be applied to files
> > > 
> > > Can you explain what you mean? How would the file system control it?
> > 
> > There's no point in asking for huge pages when populating the page
> > cache if the file is:
> > 
> > - significantly smaller than the huge page size
> > - largely sparse
> > - being randomly accessed in small chunks
> > - badly fragmented and so takes hundreds of IO to read/write
> >   a huge page
> > - able to optimise delayed allocation to match huge page
> >   sizes and alignments
> > 
> > These are all constraints the filesystem knows about, but the
> > application and user don't.
> 
> Really?
> 
> To me, most of things you're talking about is highly dependent on access
> pattern generated by userspace:
> 
>   - we may want to allocate huge pages from byte 1 if we know that file
> will grow;

delayed allocation takes care of that. We use a growing speculative
delalloc size that kicks in at specific sizes and can be used
directly to determine if a large page should be allocated. This code
is aware of sparse files, sparse writes, etc.

>   - the same for sparse file that will be filled;

See above.

>   - it will be beneficial to allocate huge page even for fragmented files,
> if it's read-mostly;

No, no it won't. The IO latency impact here can be massive.
read-ahead of single 4k pages hides most of this latency from the
application, but with a 2MB page, we can't use readahead to hide this
IO latency because the first access could stall for hundreds of
small random read IOs to be completed instead of just 1.


> > Further, we are moving the IO path to a model where we use extents
> > for mapping, not blocks.  We're optimising for the fact that modern
> > filesystems use extents and so massively reduce the number of block
> > mapping lookup calls we need to do for a given IO.
> > 
> > i.e. instead of doing "get page, map block to page" over and over
> > again until we've walked over the entire IO range, we're doing
> > "map extent for entire IO range" once, then iterating "get page"
> > until we've mapped the entire range.
> 
> That's great, but it's not how IO path works *now*. And it will take a long
> time (if ever) to flip it over to what you've described.

Wrong. fs/iomap.c. XFS already uses it, ext4 is being converted
right now, GFS2 will use parts of it in the next release, DAX
already uses it and PMD support in DAX is being built on top of it.

> > As such, there is no way we should be considering different
> > interfaces and methods for configuring the /same functionality/ just
> > because DAX is enabled or not. It's the /same decision/ that needs
> > to be made, and the filesystem knows an awful lot more about whether
> > huge pages can be used efficiently at the time of access than just
> > about any other actor you can name
> 
> I'm not convinced that filesystem is in better position to see access
> patterns than mm for page cache. It's not all about on-disk layout.

Spoken like a true mm developer. IO performance is all about IO
patterns, and the primary contributor to bad IO patterns is bad
filesystem allocation patterns :P

We're rapidly moving away from the world where a page cache is
needed to give applications decent performance. DAX doesn't have a
page cache, applications wanting to use high IOPS (hundreds of
thousands to millions) storage are using direct IO, because the page
cache just introduces latency, memory usage issues and
non-deterministic IO behaviour.

If we try to make the page cache the "one true IO optimisation source"
then we're screwing ourselves because the incoming IO technologies
simply don't require it anymore. We need to be ahead of that curve,
not playing catchup, and that's why this sort of "what should the
page cache do" decisions really need to come from the IO path where
we see /all/ the IO, not just buffered IO

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [PATCHv4] shmem: avoid huge pages for small files

2016-10-21 Thread Kirill A. Shutemov
On Fri, Oct 21, 2016 at 09:51:03PM +0300, Kirill A. Shutemov wrote:
> + case SHEME_HUGE_ALWAYS:

Oops. Forgot to commit the fixup :-/

>From 79b0a3bf4503225d0e6ba553b8496f0c4d55514e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" 
Date: Mon, 17 Oct 2016 14:44:47 +0300
Subject: [PATCHv4] shmem: avoid huge pages for small files

Huge pages are detrimental for small files: they cause noticeable
overhead on both allocation performance and memory footprint.

This patch aims to address this issue by avoiding huge pages until the file
has grown to the size of a huge page. This would cover most of the cases
where huge pages cause regressions in performance.

Couple notes:

  - if shmem_enabled is set to 'force', the limit is ignored. We still
want to generate as many pages as possible for functional testing.

  - the limit doesn't affect khugepaged behaviour: it still can collapse
pages based on its settings;

Signed-off-by: Kirill A. Shutemov 
---
 Documentation/vm/transhuge.txt | 3 +++
 mm/shmem.c | 5 +
 2 files changed, 8 insertions(+)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 2ec6adb5a4ce..d1889c7c8c46 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -238,6 +238,9 @@ values:
   - "force":
 Force the huge option on for all - very useful for testing;
 
+To avoid overhead for small files, we don't allocate huge pages for a file
+until it grows to size of huge pages.
+
 == Need of application restart ==
 
 The transparent_hugepage/enabled values and tmpfs mount option only affect
diff --git a/mm/shmem.c b/mm/shmem.c
index ad7813d73ea7..49618d2d6330 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1692,6 +1692,11 @@ static int shmem_getpage_gfp(struct inode *inode, 
pgoff_t index,
goto alloc_huge;
/* TODO: implement fadvise() hints */
goto alloc_nohuge;
+   case SHMEM_HUGE_ALWAYS:
+   i_size = i_size_read(inode);
+   if (index < HPAGE_PMD_NR && i_size < HPAGE_PMD_SIZE)
+   goto alloc_nohuge;
+   break;
}
 
 alloc_huge:
-- 
 Kirill A. Shutemov


Re: [PATCHv4] shmem: avoid huge pages for small files

2016-10-21 Thread Kirill A. Shutemov
On Fri, Oct 21, 2016 at 09:51:03PM +0300, Kirill A. Shutemov wrote:
> + case SHEME_HUGE_ALWAYS:

Oops. Forgot to commit the fixup :-/

>From 79b0a3bf4503225d0e6ba553b8496f0c4d55514e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" 
Date: Mon, 17 Oct 2016 14:44:47 +0300
Subject: [PATCHv4] shmem: avoid huge pages for small files

Huge pages are detrimental for small files: they cause noticeable
overhead on both allocation performance and memory footprint.

This patch aims to address this issue by avoiding huge pages until the file
has grown to the size of a huge page. This would cover most of the cases
where huge pages cause regressions in performance.

Couple notes:

  - if shmem_enabled is set to 'force', the limit is ignored. We still
want to generate as many pages as possible for functional testing.

  - the limit doesn't affect khugepaged behaviour: it still can collapse
pages based on its settings;

Signed-off-by: Kirill A. Shutemov 
---
 Documentation/vm/transhuge.txt | 3 +++
 mm/shmem.c | 5 +
 2 files changed, 8 insertions(+)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 2ec6adb5a4ce..d1889c7c8c46 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -238,6 +238,9 @@ values:
   - "force":
 Force the huge option on for all - very useful for testing;
 
+To avoid overhead for small files, we don't allocate huge pages for a file
+until it grows to size of huge pages.
+
 == Need of application restart ==
 
 The transparent_hugepage/enabled values and tmpfs mount option only affect
diff --git a/mm/shmem.c b/mm/shmem.c
index ad7813d73ea7..49618d2d6330 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1692,6 +1692,11 @@ static int shmem_getpage_gfp(struct inode *inode, 
pgoff_t index,
goto alloc_huge;
/* TODO: implement fadvise() hints */
goto alloc_nohuge;
+   case SHMEM_HUGE_ALWAYS:
+   i_size = i_size_read(inode);
+   if (index < HPAGE_PMD_NR && i_size < HPAGE_PMD_SIZE)
+   goto alloc_nohuge;
+   break;
}
 
 alloc_huge:
-- 
 Kirill A. Shutemov


[PATCH] pinctrl: qcom: Add msm8994 pinctrl driver

2016-10-21 Thread Michael Scott
Initial pinctrl driver for QCOM msm8994 platforms.

In order to continue the initial board support for QCOM msm8994/msm8992
presented in patches from Jeremy McNicoll , let's put
a proper pinctrl driver in place.

Currently, the DT for these platforms uses the msm8x74 pinctrl driver to enable
basic UART.  Beyond the first few pins the rest are different enough to justify
its own driver.

Note: This driver is also used by QCOM's msm8992 platform as its TLM block
is the same.

- Initial formatting and style was taken from the msm8x74 pinctrl driver added
  by Björn Andersson 
- Data was then adjusted per QCOM MSM8994 documentation for Top Level 
Multiplexing
- Bindings documentation was based on qcom,msm8996-pinctrl.txt by
  Joonwoo Park  and then modified for msm8994 content

Signed-off-by: Michael Scott 
---
 .../bindings/pinctrl/qcom,msm8994-pinctrl.txt  |  175 +++
 drivers/pinctrl/qcom/Kconfig   |9 +
 drivers/pinctrl/qcom/Makefile  |1 +
 drivers/pinctrl/qcom/pinctrl-msm8994.c | 1402 
 4 files changed, 1587 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
 create mode 100644 drivers/pinctrl/qcom/pinctrl-msm8994.c

diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt 
b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
new file mode 100644
index 000..e390087b
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
@@ -0,0 +1,175 @@
+Qualcomm MSM8994 TLMM block
+
+This binding describes the Top Level Mode Multiplexer block found in the
+MSM8994 platform.
+
+- compatible:
+   Usage: required
+   Value type: 
+   Definition: must be "qcom,msm8994-pinctrl"
+
+- reg:
+   Usage: required
+   Value type: 
+   Definition: the base address and size of the TLMM register space.
+
+- interrupts:
+   Usage: required
+   Value type: 
+   Definition: should specify the TLMM summary IRQ.
+
+- interrupt-controller:
+   Usage: required
+   Value type: 
+   Definition: identifies this node as an interrupt controller
+
+- #interrupt-cells:
+   Usage: required
+   Value type: 
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in 
+
+- gpio-controller:
+   Usage: required
+   Value type: 
+   Definition: identifies this node as a gpio controller
+
+- #gpio-cells:
+   Usage: required
+   Value type: 
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in 
+
+Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
+a general description of GPIO and interrupt bindings.
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+The pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, drive strength, etc.
+
+
+PIN CONFIGURATION NODES:
+
+The name of each subnode is not important; all subnodes should be enumerated
+and processed purely based on their content.
+
+Each subnode only affects those parameters that are explicitly listed. In
+other words, a subnode that lists a mux function but no pin configuration
+parameters implies no information about any pin configuration parameters.
+Similarly, a pin subnode that describes a pullup parameter implies no
+information about e.g. the mux function.
+
+
+The following generic properties as defined in pinctrl-bindings.txt are valid
+to specify in a pin configuration subnode:
+
+- pins:
+   Usage: required
+   Value type: 
+   Definition: List of gpio pins affected by the properties specified in
+   this subnode.
+
+   Valid pins are:
+ gpio0-gpio145
+   Supports mux, bias and drive-strength
+
+ sdc1_clk, sdc1_cmd, sdc1_data sdc1_rclk, sdc2_clk,
+ sdc2_cmd, sdc2_data
+   Supports bias and drive-strength
+
+- function:
+   Usage: required
+   Value type: 
+   Definition: Specify the alternative function to be configured for the
+   specified pins. Functions are only valid for gpio pins.
+   Valid values are:
+
+   audio_ref_clk,  blsp_i2c1, blsp_i2c2, blsp_i2c3, blsp_i2c4, 
blsp_i2c5,
+   blsp_i2c6, blsp_i2c7, blsp_i2c8, blsp_i2c9, blsp_i2c10, 
blsp_i2c11,
+ 

[PATCH] pinctrl: qcom: Add msm8994 pinctrl driver

2016-10-21 Thread Michael Scott
Initial pinctrl driver for QCOM msm8994 platforms.

In order to continue the initial board support for QCOM msm8994/msm8992
presented in patches from Jeremy McNicoll , let's put
a proper pinctrl driver in place.

Currently, the DT for these platforms uses the msm8x74 pinctrl driver to enable
basic UART.  Beyond the first few pins the rest are different enough to justify
its own driver.

Note: This driver is also used by QCOM's msm8992 platform as its TLM block
is the same.

- Initial formatting and style was taken from the msm8x74 pinctrl driver added
  by Björn Andersson 
- Data was then adjusted per QCOM MSM8994 documentation for Top Level 
Multiplexing
- Bindings documentation was based on qcom,msm8996-pinctrl.txt by
  Joonwoo Park  and then modified for msm8994 content

Signed-off-by: Michael Scott 
---
 .../bindings/pinctrl/qcom,msm8994-pinctrl.txt  |  175 +++
 drivers/pinctrl/qcom/Kconfig   |9 +
 drivers/pinctrl/qcom/Makefile  |1 +
 drivers/pinctrl/qcom/pinctrl-msm8994.c | 1402 
 4 files changed, 1587 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
 create mode 100644 drivers/pinctrl/qcom/pinctrl-msm8994.c

diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt 
b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
new file mode 100644
index 000..e390087b
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
@@ -0,0 +1,175 @@
+Qualcomm MSM8994 TLMM block
+
+This binding describes the Top Level Mode Multiplexer block found in the
+MSM8994 platform.
+
+- compatible:
+   Usage: required
+   Value type: <string>
+   Definition: must be "qcom,msm8994-pinctrl"
+
+- reg:
+   Usage: required
+   Value type: <prop-encoded-array>
+   Definition: the base address and size of the TLMM register space.
+
+- interrupts:
+   Usage: required
+   Value type: <prop-encoded-array>
+   Definition: should specify the TLMM summary IRQ.
+
+- interrupt-controller:
+   Usage: required
+   Value type: <none>
+   Definition: identifies this node as an interrupt controller
+
+- #interrupt-cells:
+   Usage: required
+   Value type: <u32>
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in <dt-bindings/interrupt-controller/irq.h>
+
+- gpio-controller:
+   Usage: required
+   Value type: <none>
+   Definition: identifies this node as a gpio controller
+
+- #gpio-cells:
+   Usage: required
+   Value type: <u32>
+   Definition: must be 2. Specifying the pin number and flags, as defined
+   in <dt-bindings/gpio/gpio.h>
+
+Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
+a general description of GPIO and interrupt bindings.
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+The pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, drive strength, etc.
+
+
+PIN CONFIGURATION NODES:
+
+The name of each subnode is not important; all subnodes should be enumerated
+and processed purely based on their content.
+
+Each subnode only affects those parameters that are explicitly listed. In
+other words, a subnode that lists a mux function but no pin configuration
+parameters implies no information about any pin configuration parameters.
+Similarly, a pin subnode that describes a pullup parameter implies no
+information about e.g. the mux function.
+
+
+The following generic properties as defined in pinctrl-bindings.txt are valid
+to specify in a pin configuration subnode:
+
+- pins:
+   Usage: required
+   Value type: <string-array>
+   Definition: List of gpio pins affected by the properties specified in
+   this subnode.
+
+   Valid pins are:
+ gpio0-gpio145
+   Supports mux, bias and drive-strength
+
+ sdc1_clk, sdc1_cmd, sdc1_data, sdc1_rclk, sdc2_clk,
+ sdc2_cmd, sdc2_data
+   Supports bias and drive-strength
+
+- function:
+   Usage: required
+   Value type: <string>
+   Definition: Specify the alternative function to be configured for the
+   specified pins. Functions are only valid for gpio pins.
+   Valid values are:
+
+   audio_ref_clk, blsp_i2c1, blsp_i2c2, blsp_i2c3, blsp_i2c4, blsp_i2c5,
+   blsp_i2c6, blsp_i2c7, blsp_i2c8, blsp_i2c9, blsp_i2c10, blsp_i2c11,
+   blsp_i2c12, blsp_spi1, blsp_spi1_cs1, blsp_spi1_cs2, blsp_spi1_cs3,
+

Re: [PATCH 1/2] pinctrl: pm8994: add pad voltage regulator defines

2016-10-21 Thread Andy Gross
On Sun, Sep 18, 2016 at 01:38:30PM +0200, Linus Walleij wrote:
> On Fri, Sep 16, 2016 at 7:41 PM, Srinivas Kandagatla
>  wrote:
> 
> > This patch adds defines for internal voltage regulators used
> > to switch voltage levels on gpio/mpp pads.
> >
> > Signed-off-by: Srinivas Kandagatla 
> 
> Acked-by: Linus Walleij 
> 
> Andy can merge this with the rest of the stuff to the Qualcomm SoC tree.

Yup.  Picked it up.  Thanks!


Andy


Re: [PATCH 1/2] pinctrl: pm8994: add pad voltage regulator defines

2016-10-21 Thread Andy Gross
On Sun, Sep 18, 2016 at 01:38:30PM +0200, Linus Walleij wrote:
> On Fri, Sep 16, 2016 at 7:41 PM, Srinivas Kandagatla
>  wrote:
> 
> > This patch adds defines for internal voltage regulators used
> > to switch voltage levels on gpio/mpp pads.
> >
> > Signed-off-by: Srinivas Kandagatla 
> 
> Acked-by: Linus Walleij 
> 
> Andy can merge this with the rest of the stuff to the Qualcomm SoC tree.

Yup.  Picked it up.  Thanks!


Andy


[GIT PULL] Please pull powerpc/linux.git powerpc-4.9-3 tag

2016-10-21 Thread Michael Ellerman
Hi Linus,

Please pull some more powerpc fixes for 4.9:

The following changes since commit 1001354ca34179f3db924eb66672442a173147dc:

  Linux 4.9-rc1 (2016-10-15 12:17:50 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git tags/powerpc-4.9-3

for you to fetch changes up to 78914ff0843623ee6dbeae92fa0bb8761828684e:

  powerpc: Ignore the pkey system calls for now (2016-10-19 20:36:24 +1100)


powerpc fixes for 4.9 #3

Fixes marked for stable:
 - Prevent unlikely crash in copro_calculate_slb() (Frederic Barrat)
 - cxl: Prevent adapter reset if an active context exists (Vaibhav Jain)

Fixes for code merged this cycle:
 - Fix boot on systems with uncompressed kernel image (Heiner Kallweit)
 - Drop dump_numa_memory_topology() (Michael Ellerman)
 - Fix numa topology console print (Aneesh Kumar K.V)
 - Ignore the pkey system calls for now (Stephen Rothwell)


Aneesh Kumar K.V (1):
  powerpc: Fix numa topology console print

Frederic Barrat (1):
  powerpc/mm: Prevent unlikely crash in copro_calculate_slb()

Heiner Kallweit (1):
  powerpc/boot: Fix boot on systems with uncompressed kernel image

Michael Ellerman (1):
  powerpc/mm: Drop dump_numa_memory_topology()

Stephen Rothwell (1):
  powerpc: Ignore the pkey system calls for now

Vaibhav Jain (1):
  cxl: Prevent adapter reset if an active context exists

 Documentation/ABI/testing/sysfs-class-cxl |  7 +++--
 arch/powerpc/boot/main.c  | 18 ++--
 arch/powerpc/include/asm/unistd.h |  4 +++
 arch/powerpc/mm/copro_fault.c |  2 ++
 arch/powerpc/mm/numa.c| 46 ---
 drivers/misc/cxl/api.c|  9 ++
 drivers/misc/cxl/context.c|  3 ++
 drivers/misc/cxl/cxl.h| 24 
 drivers/misc/cxl/file.c   | 11 
 drivers/misc/cxl/guest.c  |  3 ++
 drivers/misc/cxl/main.c   | 42 +++-
 drivers/misc/cxl/pci.c|  2 ++
 drivers/misc/cxl/sysfs.c  | 27 +++---
 13 files changed, 148 insertions(+), 50 deletions(-)


signature.asc
Description: PGP signature


[GIT PULL] Please pull powerpc/linux.git powerpc-4.9-3 tag

2016-10-21 Thread Michael Ellerman
Hi Linus,

Please pull some more powerpc fixes for 4.9:

The following changes since commit 1001354ca34179f3db924eb66672442a173147dc:

  Linux 4.9-rc1 (2016-10-15 12:17:50 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git tags/powerpc-4.9-3

for you to fetch changes up to 78914ff0843623ee6dbeae92fa0bb8761828684e:

  powerpc: Ignore the pkey system calls for now (2016-10-19 20:36:24 +1100)


powerpc fixes for 4.9 #3

Fixes marked for stable:
 - Prevent unlikely crash in copro_calculate_slb() (Frederic Barrat)
 - cxl: Prevent adapter reset if an active context exists (Vaibhav Jain)

Fixes for code merged this cycle:
 - Fix boot on systems with uncompressed kernel image (Heiner Kallweit)
 - Drop dump_numa_memory_topology() (Michael Ellerman)
 - Fix numa topology console print (Aneesh Kumar K.V)
 - Ignore the pkey system calls for now (Stephen Rothwell)


Aneesh Kumar K.V (1):
  powerpc: Fix numa topology console print

Frederic Barrat (1):
  powerpc/mm: Prevent unlikely crash in copro_calculate_slb()

Heiner Kallweit (1):
  powerpc/boot: Fix boot on systems with uncompressed kernel image

Michael Ellerman (1):
  powerpc/mm: Drop dump_numa_memory_topology()

Stephen Rothwell (1):
  powerpc: Ignore the pkey system calls for now

Vaibhav Jain (1):
  cxl: Prevent adapter reset if an active context exists

 Documentation/ABI/testing/sysfs-class-cxl |  7 +++--
 arch/powerpc/boot/main.c  | 18 ++--
 arch/powerpc/include/asm/unistd.h |  4 +++
 arch/powerpc/mm/copro_fault.c |  2 ++
 arch/powerpc/mm/numa.c| 46 ---
 drivers/misc/cxl/api.c|  9 ++
 drivers/misc/cxl/context.c|  3 ++
 drivers/misc/cxl/cxl.h| 24 
 drivers/misc/cxl/file.c   | 11 
 drivers/misc/cxl/guest.c  |  3 ++
 drivers/misc/cxl/main.c   | 42 +++-
 drivers/misc/cxl/pci.c|  2 ++
 drivers/misc/cxl/sysfs.c  | 27 +++---
 13 files changed, 148 insertions(+), 50 deletions(-)


signature.asc
Description: PGP signature


Re: [PATCH 26/26] ubifs: Raise write version to 5

2016-10-21 Thread Theodore Ts'o
On Fri, Oct 21, 2016 at 11:19:31AM -0700, Eric Biggers wrote:
> 
> I don't think it's reasonable to require changes to filesystems whenever
> someone introduces a new encryption mode --- contents, filenames, or both.
> Filesystems need to be able to handle unsupported encryption modes in some way
> that makes sense.  Currently, when it sees an unsupported encryption mode
> fscrypto will behave as if the encryption key is not available and will also
> print a one-time warning to the kernel log.  This happens when a file is
> accessed, not when the filesystem is mounted.  As far as I can tell, ext4,
> f2fs, and ubifs would all behave this way because this code is shared.  I
> think this is probably the most realistic behavior.

I tend to agree, but file systems may choose some alternate approach
if they want to "fail fast" (e.g., at mount time).  I wouldn't want to
do that for ext4, but if ubifs (or some other file system) wants to do
something more draconian, they can feel free to do that.  Failing
that, some kind of one-time warning makes sense.

What I would like to do though is to have a callback so that
code in fs/crypto can call a file system specific notification
routine.  e.g., for ext4, we would probably want to be able to call
ext4_warning() and ext4_error() from fs/crypto, and other file systems
might want to have a different set of notification routines.

This way we can print a message like

kernel: EXT4-fs warning (device sdb1): fscrypto_xxx: foo bar baz

and if we later on have a way of sending file system specific warnings
or errors through some kind of IPC mechanism, such as netlink or some
future kdbus scheme, we can send the warning and error messages out
the same way we send other filesystem specific error messages.

  - Ted

P.S.  BTW, we actually _do_ have something hacked together inside the
Google production kernel which pipes ext4_error() messages to a
netlink socket, so that monitoring systems don't have to scrape dmesg or
/var/log/messages.

If anyone inside or outside google is interested in that
functionality, I can make the code available.  There's nothing
sensitive or Google specific in it; it's just that unfortunately,
getting that code cleaned up and upstreamed has just never made it
"above the fold" on the priority list, the engineer who originally
implemented it is no longer on the team --- and I never had the time
to do the cleanup work to get the code to upstream quality myself.


Re: [PATCH 26/26] ubifs: Raise write version to 5

2016-10-21 Thread Theodore Ts'o
On Fri, Oct 21, 2016 at 11:19:31AM -0700, Eric Biggers wrote:
> 
> I don't think it's reasonable to require changes to filesystems whenever
> someone introduces a new encryption mode --- contents, filenames, or both.
> Filesystems need to be able to handle unsupported encryption modes in some way
> that makes sense.  Currently, when it sees an unsupported encryption mode
> fscrypto will behave as if the encryption key is not available and will also
> print a one-time warning to the kernel log.  This happens when a file is
> accessed, not when the filesystem is mounted.  As far as I can tell, ext4,
> f2fs, and ubifs would all behave this way because this code is shared.  I
> think this is probably the most realistic behavior.

I tend to agree, but file systems may choose some alternate approach
if they want to "fail fast" (e.g., at mount time).  I wouldn't want to
do that for ext4, but if ubifs (or some other file system) wants to do
something more draconian, they can feel free to do that.  Failing
that, some kind of one-time warning makes sense.

What I would like to do though is to have a callback so that
code in fs/crypto can call a file system specific notification
routine.  e.g., for ext4, we would probably want to be able to call
ext4_warning() and ext4_error() from fs/crypto, and other file systems
might want to have a different set of notification routines.

This way we can print a message like

kernel: EXT4-fs warning (device sdb1): fscrypto_xxx: foo bar baz

and if we later on have a way of sending file system specific warnings
or errors through some kind of IPC mechanism, such as netlink or some
future kdbus scheme, we can send the warning and error messages out
the same way we send other filesystem specific error messages.

  - Ted

P.S.  BTW, we actually _do_ have something hacked together inside the
Google production kernel which pipes ext4_error() messages to a
netlink socket, so that monitoring systems don't have to scrape dmesg or
/var/log/messages.

If anyone inside or outside google is interested in that
functionality, I can make the code available.  There's nothing
sensitive or Google specific in it; it's just that unfortunately,
getting that code cleaned up and upstreamed has just never made it
"above the fold" on the priority list, the engineer who originally
implemented it is no longer on the team --- and I never had the time
to do the cleanup work to get the code to upstream quality myself.


  1   2   3   4   5   6   7   8   9   10   >