[PATCH v2] vhost: add vsock compat ioctl

2018-03-14 Thread Sonny Rao
This will allow usage of vsock from 32-bit binaries on a 64-bit
kernel.

Signed-off-by: Sonny Rao <sonny...@chromium.org>
---
 drivers/vhost/vsock.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 0d14e2ff19f16..ee0c385d9fe54 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -699,12 +699,23 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl,
}
 }
 
+#ifdef CONFIG_COMPAT
+static long vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl,
+unsigned long arg)
+{
+   return vhost_vsock_dev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
+}
+#endif
+
 static const struct file_operations vhost_vsock_fops = {
.owner  = THIS_MODULE,
.open   = vhost_vsock_dev_open,
.release= vhost_vsock_dev_release,
.llseek = noop_llseek,
.unlocked_ioctl = vhost_vsock_dev_ioctl,
+#ifdef CONFIG_COMPAT
+   .compat_ioctl   = vhost_vsock_dev_compat_ioctl,
+#endif
 };
 
 static struct miscdevice vhost_vsock_misc = {
-- 
2.13.5
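
As background on why the v2 patch routes the argument through compat_ptr(): when a 32-bit binary issues the ioctl on a 64-bit kernel, its pointer occupies only the low 32 bits of the argument register, and on most architectures compat_ptr() simply truncates the value to those bits before it is treated as a user pointer (s390 additionally masks bit 31). A minimal, hypothetical userspace-style illustration of that truncation (ordinary C, not the vhost code):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical illustration only: the 32-bit caller's pointer sits in the
 * low half of the 64-bit ioctl argument; compat_ptr() effectively keeps
 * just those low 32 bits. */
int main(void)
{
    uint64_t raw_arg = 0xdeadbeef80001000ULL;  /* upper half is garbage */
    uint32_t compat_arg = (uint32_t)raw_arg;   /* what compat_ptr() keeps */

    printf("raw 64-bit arg : %#llx\n", (unsigned long long)raw_arg);
    printf("32-bit pointer : %#x\n", compat_arg);
    return 0;
}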



Re: [PATCH] vhost: add vsock compat ioctl

2018-03-14 Thread Sonny Rao
On Wed, Mar 14, 2018 at 12:05 PM, Michael S. Tsirkin <m...@redhat.com> wrote:
> On Wed, Mar 14, 2018 at 10:26:05AM -0700, Sonny Rao wrote:
>> This will allow usage of vsock from 32-bit binaries on a 64-bit
>> kernel.
>>
>> Signed-off-by: Sonny Rao <sonny...@chromium.org>
>
> I think you need to convert the pointer argument though.
> Something along the lines of:
>
> #ifdef CONFIG_COMPAT
> static long vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl,
>  unsigned long arg)
> {
> return vhost_vsock_dev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
> }
> #endif

Ok, thanks for pointing that out -- it has worked for me so far, but
I'll re-spin as you suggested.

>
>
>
>> ---
>>  drivers/vhost/vsock.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>> index 0d14e2ff19f16..d0e65e92110e5 100644
>> --- a/drivers/vhost/vsock.c
>> +++ b/drivers/vhost/vsock.c
>> @@ -705,6 +705,7 @@ static const struct file_operations vhost_vsock_fops = {
>>   .release= vhost_vsock_dev_release,
>>   .llseek = noop_llseek,
>>   .unlocked_ioctl = vhost_vsock_dev_ioctl,
>> + .compat_ioctl   = vhost_vsock_dev_ioctl,
>>  };
>>
>>  static struct miscdevice vhost_vsock_misc = {
>> --
>> 2.13.5


[PATCH] vhost: add vsock compat ioctl

2018-03-14 Thread Sonny Rao
This will allow usage of vsock from 32-bit binaries on a 64-bit
kernel.

Signed-off-by: Sonny Rao <sonny...@chromium.org>
---
 drivers/vhost/vsock.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 0d14e2ff19f16..d0e65e92110e5 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -705,6 +705,7 @@ static const struct file_operations vhost_vsock_fops = {
.release= vhost_vsock_dev_release,
.llseek = noop_llseek,
.unlocked_ioctl = vhost_vsock_dev_ioctl,
+   .compat_ioctl   = vhost_vsock_dev_ioctl,
 };
 
 static struct miscdevice vhost_vsock_misc = {
-- 
2.13.5



[PATCH] vhost: fix vhost ioctl signature to build with clang

2018-03-14 Thread Sonny Rao
Clang is particularly strict about signed vs unsigned comparisons and
doesn't like the fact that some ioctl numbers set the MSB, so we get
this error when trying to build vhost on aarch64:

drivers/vhost/vhost.c:1400:7: error: overflow converting case value to
 switch condition type (3221794578 to 18446744072636378898)
 [-Werror, -Wswitch]
case VHOST_GET_VRING_BASE:

3221794578 is 0xC008AF12 in hex
18446744072636378898 is 0xFFFFFFFFC008AF12 in hex (the same value sign-extended to 64 bits)

Fix this by using unsigned ints in the function signature for
vhost_vring_ioctl().

Signed-off-by: Sonny Rao <sonny...@chromium.org>
---
 drivers/vhost/vhost.c | 2 +-
 drivers/vhost/vhost.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 1b3e8d2d5c8b4..5316319d84081 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1337,7 +1337,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
return -EFAULT;
 }
 
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
 {
struct file *eventfp, *filep = NULL;
bool pollstart = false, pollstop = false;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ac4b6056f19ae..d8ee85ae8fdcc 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -45,7 +45,7 @@ void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
 void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work);
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
 
 struct vhost_log {
u64 addr;
@@ -177,7 +177,7 @@ void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_umem *);
 void vhost_dev_cleanup(struct vhost_dev *);
 void vhost_dev_stop(struct vhost_dev *);
 long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
-long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
+long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp);
 int vhost_vq_access_ok(struct vhost_virtqueue *vq);
 int vhost_log_access_ok(struct vhost_dev *);
 
-- 
2.13.5
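
To make the clang diagnostic quoted above concrete, here is a small stand-alone illustration (plain userspace C on a typical two's-complement 64-bit target, not the vhost code) of how an ioctl number with the MSB set overflows a signed int and shows up as the huge sign-extended value in the error message:

#include <stdio.h>

/* 0xC008AF12 is the kind of _IOWR ioctl number the commit message refers
 * to: it does not fit in a signed 32-bit int, so forcing it through one
 * and widening it reproduces the 3221794578 -> 18446744072636378898
 * conversion that clang reports. */
int main(void)
{
    unsigned int cmd = 0xC008AF12u;       /* MSB set, fine as unsigned */
    int signed_cmd = (int)cmd;            /* becomes negative as signed int */
    unsigned long long widened = (long long)signed_cmd;  /* sign-extended */

    printf("as unsigned int : %u (0x%X)\n", cmd, cmd);
    printf("as signed int   : %d\n", signed_cmd);
    printf("sign-extended   : %llu (0x%llX)\n", widened, widened);
    return 0;
}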



Re: [PATCH RFC v2] Add /proc/pid/smaps_rollup

2017-08-10 Thread Sonny Rao
On Thu, Aug 10, 2017 at 3:58 AM, Michal Hocko  wrote:
> On Thu 10-08-17 03:23:23, Daniel Colascione wrote:
>> Thanks for taking a look at the patch!
>>
>> On Thu, Aug 10 2017, Michal Hocko wrote:
>> > [CC linux-api - the patch was posted here
>> > http://lkml.kernel.org/r/20170810001557.147285-1-dan...@google.com]
>> >
>> > On Thu 10-08-17 13:38:31, Minchan Kim wrote:
>> >> On Wed, Aug 09, 2017 at 05:15:57PM -0700, Daniel Colascione wrote:
>> >> > /proc/pid/smaps_rollup is a new proc file that improves the
>> >> > performance of user programs that determine aggregate memory
>> >> > statistics (e.g., total PSS) of a process.
>> >> >
>> >> > Android regularly "samples" the memory usage of various processes in
>> >> > order to balance its memory pool sizes. This sampling process involves
>> >> > opening /proc/pid/smaps and summing certain fields. For very large
>> >> > processes, sampling memory use this way can take several hundred
>> >> > milliseconds, due mostly to the overhead of the seq_printf calls in
>> >> > task_mmu.c.
>> >
>> > Have you tried to reduce that overhead? E.g. by replacing seq_printf by
>> > something more simple
>> > http://lkml.kernel.org/r/20160817130320.gc20...@dhcp22.suse.cz?
>>
>> I haven't tried that yet, but if I'm reading that thread correctly, it
>> looks like using more efficient printing primitives gives us a 7%
>> speedup. The smaps_rollup patch gives us a much bigger speedup while
>> reusing almost all the smaps code, so it seems easier and simpler than a
>> bunch of incremental improvements to smaps. And even an efficient smaps
>> would have to push 2MB through seq_file for the 3000-VMA process case.
>
> The thing is that more users would benefit from a more efficient
> /proc/pid/smaps call. Maybe we can use some caching tricks etc...  We
> should make sure that existing options should be attempted before a new
> user visible interface is added. It is kind of sad that the real work
> (pte walk) is less expensive than formatting the output and copying it to
> the userspace...
>
>> > How often do you need to read this information?
>>
>> It varies depending on how often processes change state.  We sample a
>> short time (tens of seconds) after processes change state (e.g., enters
>> foreground) and every few minutes thereafter. We're particularly
>> concerned from an energy perspective about needlessly burning CPU on
>> background samples.
>
> Please make sure this is documented in the patch along with some numbers
> ideally.
>
> [...]
>
>> >> FYI, there was a prior attempt but it failed at the time; see:
>> >> https://marc.info/?l=linux-kernel=147310650003277=2
>> >> http://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1229163.html
>> >
>> > Yes I really disliked the previous attempt and this one is not all that
>> > better. The primary unanswered question back then was a relevant
>> > usecase. Back then it was argued [1] that PSS was useful for userspace
>> > OOM handling but arguments were rather dubious. Follow up questions [2]
>> > showed that the usage of PSS was very workload specific. Minchan has
>> > noted some usecase as well but not very specific either.
>>
>> Anyway, I see what you mean about PSS being iffy for user-space OOM
>> processing (because PSS doesn't tell you how much memory you get back in
>> exchange for killing a given process at a particular moment). We're not
>> using it like that.
>>
>> Instead, we're using the PSS samples we collect asynchronously for
>> system-management tasks like fine-tuning oom_adj_score, memory use
>> tracking for debugging, application-level memory-use attribution, and
>> deciding whether we want to kill large processes during system idle
>> maintenance windows. Android has been using PSS for these purposes for a
>> long time; as the average process VMA count has increased and
>> devices become more efficiency-conscious, PSS-collection inefficiency
>> has started to matter more. IMHO, it'd be a lot safer to optimize the
>> existing PSS-collection model, which has been fine-tuned over the years,
>> instead of changing the memory tracking approach entirely to work around
>> smaps-generation inefficiency.
>
> This is really vague. Please be more specific.

I actually think this is really similar to the Chrome OS use case --
we need to do proper accounting of memory from user space, and we need
something more accurate than what we have now (usually RSS) to figure
it out.  I'm not sure what is vague about that statement?

PSS is not perfect but in closed systems where we have some knowledge
about what is being shared amongst processes, PSS is much better than
RSS and readily available.  So, I disagree that this is a dubious
usage -- if there's a better metric for making this kind of decision,
please share it.

Also I realized there's another argument for presenting this
information outside of smaps which is that we expose far less
information about a process and its address space via something like this.
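
For readers who want to see the sampling workload described in the quoted discussion, a minimal sketch of summing per-VMA PSS from /proc/<pid>/smaps might look like the following; this is an illustration with simplified error handling, not the actual Android or Chrome OS collector:

#include <stdio.h>

/* Sum the per-VMA "Pss:" fields of /proc/<pid>/smaps.  For a process with
 * thousands of VMAs this makes the kernel format and copy megabytes of
 * text, which is exactly the overhead smaps_rollup is meant to avoid. */
int main(int argc, char **argv)
{
    char path[64];
    char line[256];
    unsigned long long pss_kb = 0, val;
    FILE *f;

    snprintf(path, sizeof(path), "/proc/%s/smaps",
             argc > 1 ? argv[1] : "self");
    f = fopen(path, "r");
    if (!f) {
        perror("fopen");
        return 1;
    }

    while (fgets(line, sizeof(line), f))
        if (sscanf(line, "Pss: %llu kB", &val) == 1)
            pss_kb += val;

    fclose(f);
    printf("total Pss: %llu kB\n", pss_kb);
    return 0;
}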

Re: [PATCH v4 RESEND 1/2] Documentation: tpm: add powered-while-suspended binding documentation

2017-07-05 Thread Sonny Rao
On Mon, Jul 3, 2017 at 5:57 AM, Jarkko Sakkinen
<jarkko.sakki...@linux.intel.com> wrote:
> On Tue, Jun 27, 2017 at 12:27:23PM +0200, Enric Balletbo i Serra wrote:
>> Add a new powered-while-suspended property to control the behavior of the
>> TPM suspend/resume.
>>
>> Signed-off-by: Enric Balletbo i Serra <enric.balle...@collabora.com>
>> Signed-off-by: Sonny Rao <sonny...@chromium.org>
>> Reviewed-by: Jason Gunthorpe <jguntho...@obsidianresearch.com>
>> Reviewed-by: Jarkko Sakkinen <jarkko.sakki...@linux.intel.com>
>> Acked-by: Rob Herring <r...@kernel.org>
>> ---
>> Changes since v3.
>>   - Rebased on top of linux-next
>>  Rob Herring
>>   - Split DT binding from code patch as is preferred.
>>
>> Did not exist on previous versions.
>>
>>  Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt | 6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt 
>> b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt
>> index 8cb638b..85c8216 100644
>> --- a/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt
>> +++ b/Documentation/devicetree/bindings/security/tpm/tpm-i2c.txt
>> @@ -8,6 +8,12 @@ Required properties:
>> the firmware event log
>>  - linux,sml-size : size of the memory allocated for the firmware event log
>>
>> +Optional properties:
>> +
>> +- powered-while-suspended: present when the TPM is left powered on between
>> +   suspend and resume (makes the suspend/resume
>> +   callbacks do nothing).
>> +
>>  Example (for OpenPower Systems with Nuvoton TPM 2.0 on I2C)
>>  --
>>
>> --
>> 2.9.3
>>
>
> So... should I apply this?

Hi, since you applied the code part, it would make sense to apply the
documentation too.

>
> /Jarkko
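
As a rough sketch of how the core might consume this binding (the structure and field names here are assumptions for illustration, not the exact code that was merged): the boolean property is read once from the device node and remembered so the generic suspend/resume paths can return early, which also matches the suggestion further down that this belongs in common chip code rather than in a single interface driver.

#include <linux/of.h>
#include <linux/types.h>

/* Hypothetical holder for the parsed property; the merged code is
 * expected to keep an equivalent chip-wide flag instead. */
struct example_tpm_power_info {
        bool powered_while_suspended;
};

/* of_property_read_bool() returns true when the property is present in
 * the node, so presence alone enables the behaviour. */
static void example_parse_power_property(struct example_tpm_power_info *info,
                                         struct device_node *np)
{
        info->powered_while_suspended =
                of_property_read_bool(np, "powered-while-suspended");
}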


Re: [tpmdd-devel] [PATCH] tpm: do not suspend/resume if power stays on

2017-03-01 Thread Sonny Rao
On Wed, Mar 1, 2017 at 3:18 PM, Jason Gunthorpe
<jguntho...@obsidianresearch.com> wrote:
> On Wed, Mar 01, 2017 at 02:39:09PM -0800, Sonny Rao wrote:
>
>> > We recently added global suspend/resume callbacks to the TPM
>> > core. Those callbacks do not power off the TPM, they just prepare its
>> > internal state to lose power to the chip. Skipping that process on
>> > hardware that does not power-off the TPM makes sense to me.
>> >
>> > But, Sonny, perhaps this should be a global flag in tpm_chip, not a
>> > per-interface-driver override?
>>
>> It's a property of the board design not the chip -- maybe I'm
>> misunderstanding?
>
> I mean do not add the code to handle this to tpm_i2c_infineon.c but in
> the common chip code instead.
>
> tpm_i2c_infineon.c should only parse DT properties that are relevant
> to the bus that delivers commands to the TPM, things that apply to how
> a TPM chip operates should be handled in the core code because they
> apply to any command transport bus.

Oh right, sorry -- yes this makes perfect sense.

>
> Jason


Re: [tpmdd-devel] [PATCH] tpm: do not suspend/resume if power stays on

2017-03-01 Thread Sonny Rao
On Wed, Mar 1, 2017 at 10:43 AM, Jason Gunthorpe
 wrote:
>> > +Optional properties:
>> > +- powered-while-suspended: present when the TPM is left powered on between
>> > +  suspend and resume (makes the suspend/resume callbacks do nothing).
>>
>> This reads like configuration rather than a HW property.
>
> I read this to mean the HW does not cut power to the TPM when Linux
> does 'suspend'.

That's correct, it is a hardware property describing whether power is
removed during suspend.

>
> We recently added global suspend/resume callbacks to the TPM
> core. Those callbacks do not power off the TPM, they just prepare its
> internal state to lose power to the chip. Skipping that process on
> hardware that does not power-off the TPM makes sense to me.
>
> But, Sonny, perhaps this should be a global flag in tpm_chip, not a
> per-interface-driver override?

It's a property of the board design not the chip -- maybe I'm misunderstanding?

>
> Jason


Re: [PATCH v5 0/3] mm, proc: Implement /proc/<pid>/totmaps

2016-09-19 Thread Sonny Rao
On Mon, Sep 19, 2016 at 5:27 PM, Robert Foss <robert.f...@collabora.com> wrote:
>
>
> On 2016-09-19 03:32 PM, Michal Hocko wrote:
>>
>> On Mon 19-09-16 11:16:31, Robert Foss wrote:
>>>
>>> On 2016-09-14 05:12 AM, Michal Hocko wrote:
>>>>
>>>> On Tue 13-09-16 13:27:39, Sonny Rao wrote:
>>
>> [...]
>>>>>
>>>>> Given that smaps
>>>>> doesn't provide this in a straightforward way, what do you think is
>>>>> the right way to provide this information?
>>>>
>>>>
>>>> I would be tempted to sneak it into /proc/<pid>/statm because that looks
>>>> like a proper place but getting this information is not for free
>>>> performance wise so I am not really sure something that relies on this
>>>> file would see unexpected stalls. Maybe this could be worked around by
>>>> some caching... I would suggest to check who is actually using this file
>>>> (top/ps etc...)
>>>
>>>
>>> What would this caching look like? Can any information be re-used between
>>> vma walks?
>>
>>
>> yes basically return the same value if called within HZ or something
>> similar. But that assumes that statm latency really matters and it is
>> called often enough.
>
>
> Any single application querying more often than HZ, would presumably do so
> for accuracy reasons.
> However for multiple applications that combined query more often than HZ,
> this would most definitely be helpful in terms of performance.
>
> @Sonny, does chromiumos fall into the first or second category?

It's a single application -- and it definitely doesn't query at HZ --
especially given how long it takes to gather the data :-)


Re: [PATCH v5 0/3] mm, proc: Implement /proc/<pid>/totmaps

2016-09-19 Thread Sonny Rao
On Mon, Sep 19, 2016 at 12:56 PM, Jann Horn <j...@thejh.net> wrote:
> On Mon, Sep 19, 2016 at 09:51:13PM +0200, Michal Hocko wrote:
>> [not sure why the CC list was trimmed - do not do that please unless you
>>  have a strong reason for that - if this was not intentional please
>>  restore it]
>
> Ah, sorry, pressed the wrong key.
>
>
>> On Mon 19-09-16 21:40:01, Jann Horn wrote:
>> > On Mon, Sep 19, 2016 at 09:32:38PM +0200, Michal Hocko wrote:
>> > > On Mon 19-09-16 11:16:31, Robert Foss wrote:
>> > > > On 2016-09-14 05:12 AM, Michal Hocko wrote:
>> > > > > On Tue 13-09-16 13:27:39, Sonny Rao wrote:
>> > > [...]
>> > > > > > Given that smaps
>> > > > > > doesn't provide this in a straightforward way, what do you think is
>> > > > > > the right way to provide this information?
>> > > > >
>> > > > > I would be tempted to sneak it into /proc/<pid>/statm because that
>> > > > > looks
>> > > > > like a proper place but getting this information is not for free
>> > > > > performance wise so I am not really sure something that relies on 
>> > > > > this
>> > > > > file would see unexpected stalls. Maybe this could be worked around 
>> > > > > by
>> > > > > some caching... I would suggest to check who is actually using this 
>> > > > > file
>> > > > > (top/ps etc...)
>> > > >
>> > > > What would this caching look like? Can any information be re-used 
>> > > > between
>> > > > vma walks?
>> > >
>> > > yes basically return the same value if called within HZ or something
>> > > similar. But that assumes that statm latency really matters and it is
>> > > called often enough.
>> >
>> > That sounds horrible. If some application decides that they want to check
>> > statm directly after some action or so (like after program startup), this 
>> > is
>> > going to give them a very bad time. That probably doesn't happen
>> > often - but still.
>> >
>> > I can already imagine some developer going "yeah, that usleep()... that's
>> > because the kernel API returns stale information for a couple milliseconds
>> > after we do something *shrug*".
>> >
>> > What are you trying to optimize for? Ten users on the same machine, each of
>> > which is running "top" because it looks so great?
>>
>> Please try to read what I wrote again. I didn't say this would be
>> needed. The idea was that _if_ /proc/<pid>/statm is used very _often_
>> then some caching might help to reduce the overhead. Especially when you
>> consider that the information is not precise anyway. It can change
>> anytime while you are doing the address space walk.

Just thinking out loud here -- I haven't looked closely at the code so
please bear with me :-)

Instead of checking when the last read was and returning old data,
what about a scheme where we keep a timestamp of the last statm read,
and any changes to that address space invalidate the timestamp.

The invalidation could be racy because we're not too concerned about
immediate accuracy -- so just a write.  The main issue I could see
with this is that it could cause the cacheline holding this timestamp
to bounce around a lot?  Maybe there's an existing solution in the
page table locking that could be leveraged here to at least maintain
whatever scalability enhancements are present for this type of
situation where there are many updates happening in parallel.


Re: [PATCH v5 0/3] mm, proc: Implement /proc/<pid>/totmaps

2016-09-13 Thread Sonny Rao
On Tue, Sep 13, 2016 at 12:12 AM, Michal Hocko <mho...@kernel.org> wrote:
> On Mon 12-09-16 10:28:53, Sonny Rao wrote:
>> On Mon, Sep 12, 2016 at 10:15 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Mon 12-09-16 08:31:36, Sonny Rao wrote:
> [...]
>> >> but how about the other fields like Swap, Private_Dirty and
>> >> Private_Shared?
>> >
>> > Private_Shared can be pretty confusing as well without the whole context
>> > as well see my other emails in the original thread (just to remind
>> > shmem/tmpfs makes all this really confusing).
>>
>> But this is exactly the issue -- RSS can be just as confusing if
>> you don't know something about the application.
>
> I agree that rss can be confusing but we will not make the situation any
> better if we add yet another confusing metric.
>
>> I think the issue is
>> how common that situation is, and you seem to believe that it's so
>> uncommon that it's actually better to keep the information more
>> difficult to get for those of us who know something about our systems.
>>
>> That's fine, I guess we just have to disagree here, thanks for looking at this.
>
> I think you should just step back and think more about what exactly
> you expect from the counter(s). I believe what you want is an
> estimate of a freeable memory when the particular process dies or is
> killed. That would mean resident single mapped private anonymous memory
> + unlinked single mapped shareable mappings + single mapped swapped out
> memory. Maybe I've missed something but it should be something along
> those lines. Definitely something that the current smaps infrastructure
> doesn't give you, though.

Yes your description of what we want is pretty good.  Having a
reasonable lower bound on the estimate is fine, though we probably
want to break out swapped out memory separately.  Given that smaps
doesn't provide this in a straightforward way, what do you think is
the right way to provide this information?

> --
> Michal Hocko
> SUSE Labs


Re: [PATCH v5 0/3] mm, proc: Implement /proc/<pid>/totmaps

2016-09-12 Thread Sonny Rao
On Mon, Sep 12, 2016 at 10:15 AM, Michal Hocko <mho...@kernel.org> wrote:
> On Mon 12-09-16 08:31:36, Sonny Rao wrote:
>> On Mon, Sep 12, 2016 at 5:02 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Mon 05-09-16 16:14:06, robert.f...@collabora.com wrote:
>> >> From: Robert Foss <robert.f...@collabora.com>
>> >>
>> >> This series provides the /proc/PID/totmaps feature, which
>> >> summarizes the information provided by /proc/PID/smaps for
>> >> improved performance and usability reasons.
>> >>
>> >> A use case is to speed up monitoring of memory consumption in
>> >> environments where RSS isn't precise.
>> >>
>> >> For example Chrome tends to have many processes which have hundreds of VMAs
>> >> with a substantial amount of shared memory, and the error of using
>> >> RSS rather than PSS tends to be very large when looking at overall
>> >> memory consumption.  PSS isn't kept as a single number that's exported
>> >> like RSS, so to calculate PSS means having to parse a very large smaps
>> >> file.
>> >>
>> >> This process is slow and has to be repeated for many processes, and we
>> >> found that just the act of doing the parsing was taking up a
>> >> significant amount of CPU time, so this patch is an attempt to make
>> >> that process cheaper.
>> >
>> > I still maintain my concerns about a single pss value. It might work in
>> > a very specific situations where the consumer knows what is shared but
>> > other than that the value can be more misleading than helpful. So a NACK
>> > from me until I am shown that this is usable in general and still
>> > helpful.
>>
>> I know you think Pss isn't useful in general (though I'll point out
>> two other independent people said they found it useful)
>
> sure, and one of them admitted that the value is useful because they
> _know_ the resource. The other was quite vague for me to understand
> all the details. Please, try to understand that once you provide a user
> API then it will be carved in stone. If the interface is poor and ambiguous
> it will bite us later. One very specific usecase doesn't justify
> something that might be really misleading for 90% of cases.
>
>> but how about the other fields like Swap, Private_Dirty and
>> Private_Shared?
>
> Private_Shared can be pretty confusing as well without the whole context
> as well see my other emails in the original thread (just to remind
> shmem/tmpfs makes all this really confusing).

But this is exactly the issue -- RSS can be just as confusing if
you don't know something about the application.  I think the issue is
how common that situation is, and you seem to believe that it's so
uncommon that it's actually better to keep the information more
difficult to get for those of us who know something about our systems.

That's fine, I guess we just have to disagree here, thanks for looking at this.

>
> --
> Michal Hocko
> SUSE Labs


Re: [PATCH v5 0/3] mm, proc: Implement /proc/<pid>/totmaps

2016-09-12 Thread Sonny Rao
On Mon, Sep 12, 2016 at 5:02 AM, Michal Hocko  wrote:
> On Mon 05-09-16 16:14:06, robert.f...@collabora.com wrote:
>> From: Robert Foss 
>>
>> This series provides the /proc/PID/totmaps feature, which
>> summarizes the information provided by /proc/PID/smaps for
>> improved performance and usability reasons.
>>
>> A use case is to speed up monitoring of memory consumption in
>> environments where RSS isn't precise.
>>
>> For example Chrome tends to have many processes which have hundreds of VMAs
>> with a substantial amount of shared memory, and the error of using
>> RSS rather than PSS tends to be very large when looking at overall
>> memory consumption.  PSS isn't kept as a single number that's exported
>> like RSS, so to calculate PSS means having to parse a very large smaps
>> file.
>>
>> This process is slow and has to be repeated for many processes, and we
>> found that just the act of doing the parsing was taking up a
>> significant amount of CPU time, so this patch is an attempt to make
>> that process cheaper.
>
> I still maintain my concerns about a single pss value. It might work in
> a very specific situations where the consumer knows what is shared but
> other than that the value can be more misleading than helpful. So a NACK
> from me until I am shown that this is usable in general and still
> helpful.

I know you think Pss isn't useful in general (though I'll point out
two other independent people said they found it useful) but how about
the other fields like Swap, Private_Dirty and Private_Shared?

If we removed Pss would you still NACK it?

>
> --
> Michal Hocko
> SUSE Labs


Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-22 Thread Sonny Rao
On Mon, Aug 22, 2016 at 12:54 AM, Michal Hocko <mho...@kernel.org> wrote:
> On Fri 19-08-16 10:57:48, Sonny Rao wrote:
>> On Fri, Aug 19, 2016 at 12:59 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Thu 18-08-16 23:43:39, Sonny Rao wrote:
>> >> On Thu, Aug 18, 2016 at 11:01 AM, Michal Hocko <mho...@kernel.org> wrote:
>> >> > On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> >> >> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko <mho...@kernel.org> 
>> >> >> wrote:
>> >> >> > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> >> > [...]
>> >> >> >> 2) User space OOM handling -- we'd rather do a more graceful 
>> >> >> >> shutdown
>> >> >> >> than let the kernel's OOM killer activate and need to gather this
>> >> >> >> information and we'd like to be able to get this information to make
>> >> >> >> the decision much faster than 400ms
>> >> >> >
>> >> >> > Global OOM handling in userspace is really dubious if you ask me. I
>> >> >> > understand you want something better than SIGKILL and in fact this is
>> >> >> > already possible with memory cgroup controller (btw. memcg will give
>> >> >> > you a cheap access to rss, amount of shared, swapped out memory as
>> >> >> > well). Anyway if you are getting close to the OOM your system will 
>> >> >> > most
>> >> >> > probably be really busy and chances are that also reading your new 
>> >> >> > file
>> >> >> > will take much more time. I am also not quite sure how is pss useful 
>> >> >> > for
>> >> >> > oom decisions.
>> >> >>
>> >> >> I mentioned it before, but based on experience RSS just isn't good
>> >> >> enough -- there's too much sharing going on in our use case to make
>> >> >> the correct decision based on RSS.  If RSS were good enough, simply
>> >> >> put, this patch wouldn't exist.
>> >> >
>> >> > But that doesn't answer my question, I am afraid. So how exactly do you
>> >> > use pss for oom decisions?
>> >>
>> >> We use PSS to calculate the memory used by a process among all the
>> >> processes in the system, in the case of Chrome this tells us how much
>> >> each renderer process (which is roughly tied to a particular "tab" in
>> >> Chrome) is using and how much it has swapped out, so we know what the
>> >> worst offenders are -- I'm not sure what's unclear about that?
>> >
>> > So let me ask more specifically. How can you make any decision based on
>> > the pss when you do not know _what_ is the shared resource. In other
>> > words if you select a task to terminate based on the pss then you have to
>> > kill others who share the same resource otherwise you do not release
>> > that shared resource. Not to mention that such a shared resource might
>> > be on tmpfs/shmem and it won't get released even after all processes
>> > which map it are gone.
>>
>> Ok I see why you're confused now, sorry.
>>
>> In our case we do know what is being shared in general because
>> the sharing is mostly between those processes that we're looking at
>> and not other random processes or tmpfs, so PSS gives us useful data
>> in the context of these processes which are sharing the data
>> especially for monitoring between the set of these renderer processes.
>
> OK, I see and agree that pss might be useful when you _know_ what is
> shared. But this sounds quite specific to a particular workload. How
> many users are in a similar situation? In other words, if we present
> a single number without the context, how useful will it be in
> general? Is it possible that presenting such a number could be even
> misleading for somebody who doesn't have an idea which resources are
> shared? These are all questions which should be answered before we
> actually add this number (be it a new/existing proc file or a syscall).
> I still believe that the number without wider context is just not all
> that useful.


I see the specific point about  PSS -- because you need to know what
is being shared or otherwise use it in a whole system context, but I
still think the whole system context is a valid and generally useful
thing.  But what about the private_clean and private_dirty?  Surely
those are more generally useful for 

Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-19 Thread Sonny Rao
On Fri, Aug 19, 2016 at 1:05 AM, Michal Hocko <mho...@kernel.org> wrote:
> On Fri 19-08-16 11:26:34, Minchan Kim wrote:
>> Hi Michal,
>>
>> On Thu, Aug 18, 2016 at 08:01:04PM +0200, Michal Hocko wrote:
>> > On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> > > On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > > > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> > [...]
>> > > >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> > > >> than let the kernel's OOM killer activate and need to gather this
>> > > >> information and we'd like to be able to get this information to make
>> > > >> the decision much faster than 400ms
>> > > >
>> > > > Global OOM handling in userspace is really dubious if you ask me. I
>> > > > understand you want something better than SIGKILL and in fact this is
>> > > > already possible with memory cgroup controller (btw. memcg will give
>> > > > you a cheap access to rss, amount of shared, swapped out memory as
>> > > > well). Anyway if you are getting close to the OOM your system will most
>> > > > probably be really busy and chances are that also reading your new file
>> > > > will take much more time. I am also not quite sure how is pss useful 
>> > > > for
>> > > > oom decisions.
>> > >
>> > > I mentioned it before, but based on experience RSS just isn't good
>> > > enough -- there's too much sharing going on in our use case to make
>> > > the correct decision based on RSS.  If RSS were good enough, simply
>> > > put, this patch wouldn't exist.
>> >
>> > But that doesn't answer my question, I am afraid. So how exactly do you
>> > use pss for oom decisions?
>>
>> My case is not for OOM decision but I agree it would be great if we can get
>> *fast* smap summary information.
>>
>> PSS is really great tool to figure out how processes consume memory
>> more exactly rather than RSS. We have been used it for monitoring
>> of memory for per-process. Although it is not used for OOM decision,
>> it would be great if it is speed up because we don't want to spend
>> many CPU time for just monitoring.
>>
>> For our usecase, we don't need AnonHugePages, ShmemPmdMapped, Shared_Hugetlb,
>> Private_Hugetlb, KernelPageSize, MMUPageSize because we never enable THP and
>> hugetlb. Additionally, Locked can be known via vma flags so we don't need it,
>> either. Even, we don't need address range for just monitoring when we don't
>> investigate in detail.
>>
>> Although they are not severe overhead, why does it emit the useless
>> information? Even bloat day by day. :( With that, userspace tools should
>> spend more time to parse which is pointless.
>
> So far it doesn't really seem that the parsing is the biggest problem.
> The major cycles killer is the output formatting and that doesn't sound
> like a problem we are not able to address. And I would even argue that
> we want to address it in a generic way as much as possible.
>
>> Having said that, I'm not fan of creating new stat knob for that, either.
>> How about appending summary information in the end of smap?
>> So, monitoring users can just open the file and lseek to the (end - 1) and
>> read the summary only.
>
> That might confuse existing parsers. Besides that we already have
> /proc/<pid>/statm which gives cumulative numbers already. I am not sure
> how often it is used and whether the pte walk is too expensive for
> existing users but that should be explored and evaluated before a new
> file is created.
>
> The /proc became a dump of everything people found interesting just
> because we were to easy to allow those additions. Do not repeat those
> mistakes, please!

Another thing I noticed was that we lock down smaps on Chromium OS.  I
think this is to avoid exposing more information than necessary via
proc.  The totmaps file gives us just the information we need and
nothing else.   I certainly don't think we need a proc file for this
use case -- do you think a new system call is better or something
else?

> --
> Michal Hocko
> SUSE Labs


Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-19 Thread Sonny Rao
On Fri, Aug 19, 2016 at 12:59 AM, Michal Hocko <mho...@kernel.org> wrote:
> On Thu 18-08-16 23:43:39, Sonny Rao wrote:
>> On Thu, Aug 18, 2016 at 11:01 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> >> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko <mho...@kernel.org> wrote:
>> >> > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> > [...]
>> >> >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> >> >> than let the kernel's OOM killer activate and need to gather this
>> >> >> information and we'd like to be able to get this information to make
>> >> >> the decision much faster than 400ms
>> >> >
>> >> > Global OOM handling in userspace is really dubious if you ask me. I
>> >> > understand you want something better than SIGKILL and in fact this is
>> >> > already possible with memory cgroup controller (btw. memcg will give
>> >> > you a cheap access to rss, amount of shared, swapped out memory as
>> >> > well). Anyway if you are getting close to the OOM your system will most
>> >> > probably be really busy and chances are that also reading your new file
>> >> > will take much more time. I am also not quite sure how is pss useful for
>> >> > oom decisions.
>> >>
>> >> I mentioned it before, but based on experience RSS just isn't good
>> >> enough -- there's too much sharing going on in our use case to make
>> >> the correct decision based on RSS.  If RSS were good enough, simply
>> >> put, this patch wouldn't exist.
>> >
>> > But that doesn't answer my question, I am afraid. So how exactly do you
>> > use pss for oom decisions?
>>
>> We use PSS to calculate the memory used by a process among all the
>> processes in the system, in the case of Chrome this tells us how much
>> each renderer process (which is roughly tied to a particular "tab" in
>> Chrome) is using and how much it has swapped out, so we know what the
>> worst offenders are -- I'm not sure what's unclear about that?
>
> So let me ask more specifically. How can you make any decision based on
> the pss when you do not know _what_ is the shared resource. In other
> words if you select a task to terminate based on the pss then you have to
> kill others who share the same resource otherwise you do not release
> that shared resource. Not to mention that such a shared resource might
> be on tmpfs/shmem and it won't get released even after all processes
> which map it are gone.

Ok I see why you're confused now, sorry.

In our case we do know what is being shared in general, because
the sharing is mostly between those processes that we're looking at
and not other random processes or tmpfs, so PSS gives us useful data
in the context of these processes which are sharing the data
especially for monitoring between the set of these renderer processes.

We also use the private clean and private dirty and swap fields to
make a few metrics for the processes and charge each process for its
private, shared, and swap data. Private clean and dirty are used for
estimating a lower bound on how much memory would be freed.  Swap and
PSS also give us some indication of additional memory which might get
freed up.
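
To make that concrete, here's a minimal sketch (not the actual
Chromium OS code) of the kind of summing over /proc/<pid>/smaps this
implies; the two derived numbers at the end are only meant to
illustrate the "lower bound" and "might also get freed" ideas above:

#include <stdio.h>

int main(int argc, char **argv)
{
    char path[64], line[256];
    long pss = 0, priv_clean = 0, priv_dirty = 0, swap = 0, v;
    FILE *f;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <pid>\n", argv[0]);
        return 1;
    }
    snprintf(path, sizeof(path), "/proc/%s/smaps", argv[1]);
    f = fopen(path, "r");
    if (!f) {
        perror(path);
        return 1;
    }
    /* Sum the per-VMA records into per-process totals. */
    while (fgets(line, sizeof(line), f)) {
        if (sscanf(line, "Pss: %ld kB", &v) == 1)
            pss += v;
        else if (sscanf(line, "Private_Clean: %ld kB", &v) == 1)
            priv_clean += v;
        else if (sscanf(line, "Private_Dirty: %ld kB", &v) == 1)
            priv_dirty += v;
        else if (sscanf(line, "Swap: %ld kB", &v) == 1)
            swap += v;
    }
    fclose(f);

    /* Lower bound on what goes away if this process exits. */
    printf("private (clean+dirty): %ld kB\n", priv_clean + priv_dirty);
    /* Shared and swapped memory that might also come back. */
    printf("pss: %ld kB  swap: %ld kB\n", pss, swap);
    return 0;
}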

>
> I am sorry for being dense but it is still not clear to me how the
> single pss number can be used for oom or, in general, any serious
> decisions. The counter might be useful of course for debugging purposes
> or to have a general overview but then arguing about 40 vs 20ms sounds a
> bit strange to me.

Yeah, so it's more than just the single PSS number: PSS,
Private_Clean, Private_Dirty, and Swap are all interesting numbers for
making these decisions.

>
>> Chrome tends to use a lot of shared memory so we found PSS to be
>> better than RSS, and I can give you examples of the  RSS and PSS on
>> real systems to illustrate the magnitude of the difference between
>> those two numbers if that would be useful.
>>
>> >
>> >> So even with memcg I think we'd have the same problem?
>> >
>> > memcg will give you instant anon, shared counters for all processes in
>> > the memcg.
>> >
>>
>> We want to be able to get per-process granularity quickly.  I'm not
>> sure if memcg provides that exactly?
>
> I will give you that information if you do process-per-memcg but that
> doesn't sound ideal. I thought those 20-something processes you were
> talking about are treated together but it seems I misunderstood.
> --
> Michal Hocko
> SUSE Labs


Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-19 Thread Sonny Rao
On Thu, Aug 18, 2016 at 7:26 PM, Minchan Kim <minc...@kernel.org> wrote:
> Hi Michal,
>
> On Thu, Aug 18, 2016 at 08:01:04PM +0200, Michal Hocko wrote:
>> On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> > On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> [...]
>> > >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> > >> than let the kernel's OOM killer activate and need to gather this
>> > >> information and we'd like to be able to get this information to make
>> > >> the decision much faster than 400ms
>> > >
>> > > Global OOM handling in userspace is really dubious if you ask me. I
>> > > understand you want something better than SIGKILL and in fact this is
>> > > already possible with memory cgroup controller (btw. memcg will give
>> > > you a cheap access to rss, amount of shared, swapped out memory as
>> > > well). Anyway if you are getting close to the OOM your system will most
>> > > probably be really busy and chances are that also reading your new file
>> > > will take much more time. I am also not quite sure how is pss useful for
>> > > oom decisions.
>> >
>> > I mentioned it before, but based on experience RSS just isn't good
>> > enough -- there's too much sharing going on in our use case to make
>> > the correct decision based on RSS.  If RSS were good enough, simply
>> > put, this patch wouldn't exist.
>>
>> But that doesn't answer my question, I am afraid. So how exactly do you
>> use pss for oom decisions?
>
> My case is not for OOM decision but I agree it would be great if we can get
> *fast* smap summary information.
>
> PSS is really great tool to figure out how processes consume memory
> more exactly rather than RSS. We have been used it for monitoring
> of memory for per-process. Although it is not used for OOM decision,
> it would be great if it is speed up because we don't want to spend
> many CPU time for just monitoring.
>
> For our usecase, we don't need AnonHugePages, ShmemPmdMapped, Shared_Hugetlb,
> Private_Hugetlb, KernelPageSize, MMUPageSize because we never enable THP and
> hugetlb. Additionally, Locked can be known via vma flags so we don't need it,
> either. Even, we don't need address range for just monitoring when we don't
> investigate in detail.
>
> Although they are not severe overhead, why does it emit the useless
> information? Even bloat day by day. :( With that, userspace tools should
> spend more time to parse which is pointless.
>
> Having said that, I'm not fan of creating new stat knob for that, either.
> How about appending summary information in the end of smap?
> So, monitoring users can just open the file and lseek to the (end - 1) and
> read the summary only.
>

That would work fine for us as long as it's fast -- i.e. we don't
still have to do all the expensive per-VMA format conversion in the
kernel.
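
(Just to illustrate the consumer side of that idea, here's a rough
sketch.  It assumes the summary really were appended after the per-VMA
records and that smaps supported seeking relative to the end, neither
of which is true today, and the 512-byte bound on the summary block is
made up.)

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define SUMMARY_MAX 512   /* assumed upper bound on the summary block */

int main(int argc, char **argv)
{
    char buf[SUMMARY_MAX + 1];
    ssize_t n;
    int fd;

    if (argc != 2)
        return 1;
    fd = open(argv[1], O_RDONLY);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Seek near the end and read only the trailing summary. */
    if (lseek(fd, -SUMMARY_MAX, SEEK_END) == (off_t)-1)
        perror("lseek (not supported by smaps today)");
    n = read(fd, buf, SUMMARY_MAX);
    if (n > 0) {
        buf[n] = '\0';
        fputs(buf, stdout);
    }
    close(fd);
    return 0;
}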

> Thanks.


Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-19 Thread Sonny Rao
On Thu, Aug 18, 2016 at 11:01 AM, Michal Hocko <mho...@kernel.org> wrote:
> On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Wed 17-08-16 11:57:56, Sonny Rao wrote:
> [...]
>> >> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> >> than let the kernel's OOM killer activate and need to gather this
>> >> information and we'd like to be able to get this information to make
>> >> the decision much faster than 400ms
>> >
>> > Global OOM handling in userspace is really dubious if you ask me. I
>> > understand you want something better than SIGKILL and in fact this is
>> > already possible with memory cgroup controller (btw. memcg will give
>> > you a cheap access to rss, amount of shared, swapped out memory as
>> > well). Anyway if you are getting close to the OOM your system will most
>> > probably be really busy and chances are that also reading your new file
>> > will take much more time. I am also not quite sure how is pss useful for
>> > oom decisions.
>>
>> I mentioned it before, but based on experience RSS just isn't good
>> enough -- there's too much sharing going on in our use case to make
>> the correct decision based on RSS.  If RSS were good enough, simply
>> put, this patch wouldn't exist.
>
> But that doesn't answer my question, I am afraid. So how exactly do you
> use pss for oom decisions?

We use PSS to calculate the memory used by a process among all the
processes in the system, in the case of Chrome this tells us how much
each renderer process (which is roughly tied to a particular "tab" in
Chrome) is using and how much it has swapped out, so we know what the
worst offenders are -- I'm not sure what's unclear about that?

Chrome tends to use a lot of shared memory so we found PSS to be
better than RSS, and I can give you examples of the  RSS and PSS on
real systems to illustrate the magnitude of the difference between
those two numbers if that would be useful.
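
As a toy illustration -- made-up numbers, not measurements from a real
system -- of why the totals come out so differently when most of the
memory is shared between two renderers:

#include <stdio.h>

int main(void)
{
    long shared = 100 * 1024;   /* kB mapped by both renderers */
    long priv_a = 50 * 1024;    /* kB private to renderer A    */
    long priv_b = 30 * 1024;    /* kB private to renderer B    */

    /* RSS charges the shared pages to every process mapping them. */
    long rss_a = priv_a + shared;
    long rss_b = priv_b + shared;
    /* PSS splits the shared pages between the two sharers. */
    long pss_a = priv_a + shared / 2;
    long pss_b = priv_b + shared / 2;

    printf("RSS: A=%ld kB B=%ld kB sum=%ld kB (overstates)\n",
           rss_a, rss_b, rss_a + rss_b);
    printf("PSS: A=%ld kB B=%ld kB sum=%ld kB\n",
           pss_a, pss_b, pss_a + pss_b);
    printf("actually resident: %ld kB\n", priv_a + priv_b + shared);
    return 0;
}

Summing RSS over the two processes gives 280 MB for what is really
180 MB of resident memory, while the PSS sums add up to the real
total.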

>
>> So even with memcg I think we'd have the same problem?
>
> memcg will give you instant anon, shared counters for all processes in
> the memcg.
>

We want to be able to get per-process granularity quickly.  I'm not
sure if memcg provides that exactly?

>> > Don't take me wrong, /proc/<pid>/totmaps might be suitable for your
>> > specific usecase but so far I haven't heard any sound argument for it to
>> > be generally usable. It is true that smaps is unnecessarily costly but
>> > at least I can see some room for improvements. A simple patch I've
>> > posted cut the formatting overhead by 7%. Maybe we can do more.
>>
>> It seems like a general problem that if you want these values the
>> existing kernel interface can be very expensive, so it would be
>> generally usable by any application which wants a per process PSS,
>> private data, dirty data or swap value.
>
> yes this is really unfortunate. And if at all possible we should address
> that. Precise values require the expensive rmap walk. We can introduce
> some caching to help that. But so far it seems the biggest overhead is
> to simply format the output and that should be addressed before any new
> proc file is added.
>
>> I mentioned two use cases, but I guess I don't understand the comment
>> about why it's not usable by other use cases.
>
> I might be wrong here but a use of pss is quite limited and I do not
> remember anybody asking for large optimizations in that area. I still do
> not understand your use cases properly so I am quite skeptical about a
> general usefulness of a new file.

How do you know that usage of PSS is quite limited?  I can only say
that we've been using it on Chromium OS for at least four years and
have found it very valuable, and I think I've explained the use cases
in this thread. If you have more specific questions then I can try to
clarify.

>
> --
> Michal Hocko
> SUSE Labs


Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-19 Thread Sonny Rao
On Thu, Aug 18, 2016 at 2:05 PM, Robert Foss <robert.f...@collabora.com> wrote:
>
>
> On 2016-08-18 02:01 PM, Michal Hocko wrote:
>>
>> On Thu 18-08-16 10:47:57, Sonny Rao wrote:
>>>
>>> On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko <mho...@kernel.org> wrote:
>>>>
>>>> On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>>
>> [...]
>>>>>
>>>>> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>>>>> than let the kernel's OOM killer activate and need to gather this
>>>>> information and we'd like to be able to get this information to make
>>>>> the decision much faster than 400ms
>>>>
>>>>
>>>> Global OOM handling in userspace is really dubious if you ask me. I
>>>> understand you want something better than SIGKILL and in fact this is
>>>> already possible with memory cgroup controller (btw. memcg will give
>>>> you a cheap access to rss, amount of shared, swapped out memory as
>>>> well). Anyway if you are getting close to the OOM your system will most
>>>> probably be really busy and chances are that also reading your new file
>>>> will take much more time. I am also not quite sure how is pss useful for
>>>> oom decisions.
>>>
>>>
>>> I mentioned it before, but based on experience RSS just isn't good
>>> enough -- there's too much sharing going on in our use case to make
>>> the correct decision based on RSS.  If RSS were good enough, simply
>>> put, this patch wouldn't exist.
>>
>>
>> But that doesn't answer my question, I am afraid. So how exactly do you
>> use pss for oom decisions?
>>
>>> So even with memcg I think we'd have the same problem?
>>
>>
>> memcg will give you instant anon, shared counters for all processes in
>> the memcg.
>
>
> Is it technically feasible to add instant pss support to memcg?
>
> @Sonny Rao: Would using cgroups be acceptable for chromiumos?

It's possible, though I think we'd end up putting each renderer in
its own cgroup to get the PSS stat, so it seems a bit like overkill.
I think memcg also has some overhead that we'd need to quantify but I
could be mistaken about this.
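
Roughly, the per-renderer-memcg version would look like the sketch
below.  It assumes the conventional /sys/fs/cgroup/memory mount point
for the v1 memory controller and a made-up renderer pid, and it isn't
something we've actually implemented:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Create a group for one renderer and move the renderer into it. */
static void move_to_memcg(const char *group, pid_t pid)
{
    char path[256];
    FILE *f;

    snprintf(path, sizeof(path), "/sys/fs/cgroup/memory/%s", group);
    mkdir(path, 0755);

    snprintf(path, sizeof(path), "/sys/fs/cgroup/memory/%s/cgroup.procs",
             group);
    f = fopen(path, "w");
    if (f) {
        fprintf(f, "%d\n", (int)pid);
        fclose(f);
    }
}

/* memory.stat has cheap per-group rss/cache/swap counters. */
static void dump_counters(const char *group)
{
    char path[256], line[128];
    FILE *f;

    snprintf(path, sizeof(path), "/sys/fs/cgroup/memory/%s/memory.stat",
             group);
    f = fopen(path, "r");
    if (!f)
        return;
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
}

int main(void)
{
    move_to_memcg("renderer-1234", 1234);   /* hypothetical pid */
    dump_counters("renderer-1234");
    return 0;
}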

>
>
>>
>>>> Don't take me wrong, /proc/<pid>/totmaps might be suitable for your
>>>> specific usecase but so far I haven't heard any sound argument for it to
>>>> be generally usable. It is true that smaps is unnecessarily costly but
>>>> at least I can see some room for improvements. A simple patch I've
>>>> posted cut the formatting overhead by 7%. Maybe we can do more.
>>>
>>>
>>> It seems like a general problem that if you want these values the
>>> existing kernel interface can be very expensive, so it would be
>>> generally usable by any application which wants a per process PSS,
>>> private data, dirty data or swap value.
>>
>>
>> yes this is really unfortunate. And if at all possible we should address
>> that. Precise values require the expensive rmap walk. We can introduce
>> some caching to help that. But so far it seems the biggest overhead is
>> to simply format the output and that should be addressed before any new
>> proc file is added.
>>
>>> I mentioned two use cases, but I guess I don't understand the comment
>>> about why it's not usable by other use cases.
>>
>>
>> I might be wrong here but a use of pss is quite limited and I do not
>> remember anybody asking for large optimizations in that area. I still do
>> not understand your use cases properly so I am quite skeptical about a
>> general usefulness of a new file.
>>
>


Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-18 Thread Sonny Rao
On Thu, Aug 18, 2016 at 12:44 AM, Michal Hocko <mho...@kernel.org> wrote:
> On Wed 17-08-16 11:57:56, Sonny Rao wrote:
>> On Wed, Aug 17, 2016 at 6:03 AM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Wed 17-08-16 11:31:25, Jann Horn wrote:
> [...]
>> >> That's at least 30.43% + 9.12% + 7.66% = 47.21% of the task's kernel
>> >> time spent on evaluating format strings. The new interface
>> >> wouldn't have to spend that much time on format strings because there
>> >> isn't so much text to format.
>> >
>> > well, this is true of course but I would much rather try to reduce the
>> > overhead of smaps file than add a new file. The following should help
>> > already. I've measured ~7% systime cut down. I guess there is still some
>> > room for improvements but I have to say I'm far from being convinced about
>> > a new proc file just because we suck at dumping information to the
>> > userspace.
>> > If this was something like /proc/<pid>/stat which is
>> > essentially read all the time then it would be a different question but
>> > is the rss, pss going to be all that often? If yes why?
>>
>> If the question is why do we need to read RSS, PSS, Private_*, Swap
>> and the other fields so often?
>>
>> I have two use cases so far involving monitoring per-process memory
>> usage, and we usually need to read stats for about 25 processes.
>>
>> Here's a timing example on an fairly recent ARM system 4 core RK3288
>> running at 1.8Ghz
>>
>> localhost ~ # time cat /proc/25946/smaps > /dev/null
>>
>> real0m0.036s
>> user0m0.020s
>> sys 0m0.020s
>>
>> localhost ~ # time cat /proc/25946/totmaps > /dev/null
>>
>> real0m0.027s
>> user0m0.010s
>> sys 0m0.010s
>> localhost ~ #
>>
>> I'll ignore the user time for now, and we see about 20 ms of system
>> time with smaps and 10 ms with totmaps, with 20 similar processes it
>> would be 400 milliseconds of cpu time for the kernel to get this
>> information from smaps vs 200 milliseconds with totmaps.  Even totmaps
>> is still pretty slow, but much better than smaps.
>>
>> Use cases:
>> 1) Basic task monitoring -- like "top" that shows memory consumption
>> including PSS, Private, Swap
>> 1 second update means about 40% of one CPU is spent in the kernel
>> gathering the data with smaps
>
> I would argue that even 20% is way too much for such a monitoring. What
> is the value to do it so often tha 20 vs 40ms really matters?

Yeah it is too much (I believe I said that) but it's significantly better.

>> 2) User space OOM handling -- we'd rather do a more graceful shutdown
>> than let the kernel's OOM killer activate and need to gather this
>> information and we'd like to be able to get this information to make
>> the decision much faster than 400ms
>
> Global OOM handling in userspace is really dubious if you ask me. I
> understand you want something better than SIGKILL and in fact this is
> already possible with memory cgroup controller (btw. memcg will give
> you a cheap access to rss, amount of shared, swapped out memory as
> well). Anyway if you are getting close to the OOM your system will most
> probably be really busy and chances are that also reading your new file
> will take much more time. I am also not quite sure how is pss useful for
> oom decisions.

I mentioned it before, but based on experience RSS just isn't good
enough -- there's too much sharing going on in our use case to make
the correct decision based on RSS.  If RSS were good enough, simply
put, this patch wouldn't exist.  So even with memcg I think we'd have
the same problem?

>
> Don't take me wrong, /proc/<pid>/totmaps might be suitable for your
> specific usecase but so far I haven't heard any sound argument for it to
> be generally usable. It is true that smaps is unnecessarily costly but
> at least I can see some room for improvements. A simple patch I've
> posted cut the formatting overhead by 7%. Maybe we can do more.

It seems like a general problem that if you want these values the
existing kernel interface can be very expensive, so it would be
generally usable by any application which wants a per process PSS,
private data, dirty data or swap value.   I mentioned two use cases,
but I guess I don't understand the comment about why it's not usable
by other use cases.

> --
> Michal Hocko
> SUSE Labs


Re: [PACTH v2 0/3] Implement /proc/<pid>/totmaps

2016-08-17 Thread Sonny Rao
On Wed, Aug 17, 2016 at 6:03 AM, Michal Hocko  wrote:
> On Wed 17-08-16 11:31:25, Jann Horn wrote:
>> On Wed, Aug 17, 2016 at 10:22:00AM +0200, Michal Hocko wrote:
>> > On Tue 16-08-16 12:46:51, Robert Foss wrote:
>> > [...]
>> > > $ /usr/bin/time -v -p zsh -c "repeat 25 { awk '/^Rss/{rss+=\$2}
>> > > /^Pss/{pss+=\$2} END {printf \"rss:%d pss:%d\n\", rss, pss}\'
>> > > /proc/5025/smaps }"
>> > > [...]
>> > >   Command being timed: "zsh -c repeat 25 { awk '/^Rss/{rss+=$2}
>> > > /^Pss/{pss+=$2} END {printf "rss:%d pss:%d\n", rss, pss}\' 
>> > > /proc/5025/smaps
>> > > }"
>> > >   User time (seconds): 0.37
>> > >   System time (seconds): 0.45
>> > >   Percent of CPU this job got: 92%
>> > >   Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.89
>> >
>> > This is really unexpected. Where is the user time spent? Anyway, rather
>> > than measuring some random processes I've tried to measure something
>> > resembling the worst case. So I've created a simple program to mmap as
>> > much as possible:
>> >
>> > #include 
>> > #include 
>> > #include 
>> > #include 
>> > int main()
>> > {
>> > while (mmap(NULL, 4096, PROT_READ|PROT_WRITE, 
>> > MAP_ANON|MAP_SHARED|MAP_POPULATE, -1, 0) != MAP_FAILED)
>> > ;
>> >
>> > printf("pid:%d\n", getpid());
>> > pause();
>> > return 0;
>> > }
>>
>> Ah, nice, that's a reasonable test program. :)
>>
>>
>> > So with a reasonable user space the parsing is really not all that time
>> > consuming wrt. smaps handling. That being said I am still very skeptical
>> > about a dedicated proc file which accomplishes what userspace can done
>> > in a trivial way.
>>
>> Now, since your numbers showed that all the time is spent in the kernel,
>> also create this test program to just read that file over and over again:
>>
>> $ cat justreadloop.c
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>>
>> char buf[100];
>>
>> int main(int argc, char **argv) {
>>   printf("pid:%d\n", getpid());
>>   while (1) {
>> int fd = open(argv[1], O_RDONLY);
>> if (fd < 0) continue;
>> if (read(fd, buf, sizeof(buf)) < 0)
>>   err(1, "read");
>> close(fd);
>>   }
>> }
>> $ gcc -Wall -o justreadloop justreadloop.c
>> $
>>
>> Now launch your test:
>>
>> $ ./mapstuff
>> pid:29397
>>
>> point justreadloop at it:
>>
>> $ ./justreadloop /proc/29397/smaps
>> pid:32567
>>
>> ... and then check the performance stats of justreadloop:
>>
>> # perf top -p 32567
>>
>> This is what I see:
>>
>> Samples: 232K of event 'cycles:ppp', Event count (approx.): 60448424325
>> Overhead  Shared Object Symbol
>>   30,43%  [kernel]  [k] format_decode
>>9,12%  [kernel]  [k] number
>>7,66%  [kernel]  [k] vsnprintf
>>7,06%  [kernel]  [k] __lock_acquire
>>3,23%  [kernel]  [k] lock_release
>>2,85%  [kernel]  [k] debug_lockdep_rcu_enabled
>>2,25%  [kernel]  [k] skip_atoi
>>2,13%  [kernel]  [k] lock_acquire
>>2,05%  [kernel]  [k] show_smap
>
> This is a lot! I would expect the rmap walk to consume more but it even
> doesn't show up in the top consumers.
>
>> That's at least 30.43% + 9.12% + 7.66% = 47.21% of the task's kernel
>> time spent on evaluating format strings. The new interface
>> wouldn't have to spend that much time on format strings because there
>> isn't so much text to format.
>
> well, this is true of course but I would much rather try to reduce the
> overhead of smaps file than add a new file. The following should help
> already. I've measured ~7% systime cut down. I guess there is still some
> room for improvements but I have to say I'm far from being convinced about
> a new proc file just because we suck at dumping information to the
> userspace.
> If this was something like /proc/<pid>/stat which is
> essentially read all the time then it would be a different question but
> is the rss, pss going to be all that often? If yes why?

If the question is why do we need to read RSS, PSS, Private_*, Swap
and the other fields so often?

I have two use cases so far involving monitoring per-process memory
usage, and we usually need to read stats for about 25 processes.

Here's a timing example on a fairly recent ARM system, a 4-core RK3288
running at 1.8GHz

localhost ~ # time cat /proc/25946/smaps > /dev/null

real0m0.036s
user0m0.020s
sys 0m0.020s

localhost ~ # time cat /proc/25946/totmaps > /dev/null

real0m0.027s
user0m0.010s
sys 0m0.010s
localhost ~ #

I'll ignore the user time for now, and we see about 20 ms of system
time with smaps and 10 ms with totmaps, with 20 similar processes it
would be 400 milliseconds of cpu time for the kernel to get this
information from smaps vs 200 milliseconds with totmaps.  Even totmaps
is still pretty slow, but much better than smaps.

Use cases:
1) Basic task monitoring -- like "top" that shows memory consumption
including PSS, Private, Swap
1 second update means about 40% of one CPU is spent in the kernel
gathering the data with smaps
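
To make the arithmetic concrete: at roughly 20 ms of system time per
smaps read, ~20 renderer processes polled once a second is ~400 ms,
i.e. about 40% of one CPU spent in the kernel.  The loop itself is
essentially the sketch below; the pids are made up and
read_smaps_totals() is a stand-in for whatever parser is used:

#include <stdio.h>
#include <unistd.h>

/* Placeholder: parse /proc/<pid>/smaps and record the totals. */
static void read_smaps_totals(pid_t pid)
{
    printf("would parse /proc/%d/smaps here\n", (int)pid);
}

int main(void)
{
    pid_t renderers[] = { 1001, 1002, 1003 };   /* made-up pids */
    const int n = sizeof(renderers) / sizeof(renderers[0]);

    for (;;) {
        for (int i = 0; i < n; i++)
            read_smaps_totals(renderers[i]);
        sleep(1);   /* 1 second update interval */
    }
    return 0;
}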

Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps

2016-08-10 Thread Sonny Rao
On Wed, Aug 10, 2016 at 10:37 AM, Jann Horn <j...@thejh.net> wrote:
> On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
>> On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.f...@collabora.com> 
>> wrote:
>> >
>> >
>> > On 2016-08-09 03:24 PM, Jann Horn wrote:
>> >>
>> >> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
>> >>>
>> >>> From: Sonny Rao <sonny...@chromium.org>
>> >>>
>> >>> This is based on earlier work by Thiago Goncales. It implements a new
>> >>> per process proc file which summarizes the contents of the smaps file
>> >>> but doesn't display any addresses.  It gives more detailed information
>> >>> than statm like the PSS (proportional set size).  It differs from the
>> >>> original implementation in that it doesn't use the full blown set of
>> >>> seq operations, uses a different termination condition, and doesn't
>> >>> display "Locked" as that was broken on the original implementation.
>> >>>
>> >>> This new proc file provides information faster than parsing the
>> >>> potentially
>> >>> huge smaps file.
>> >>>
>> >>> Signed-off-by: Sonny Rao <sonny...@chromium.org>
>> >>>
>> >>> Tested-by: Robert Foss <robert.f...@collabora.com>
>> >>> Signed-off-by: Robert Foss <robert.f...@collabora.com>
>> >>
>> >>
>> >>
>> >>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> >>> +{
>> >>> +   struct proc_maps_private *priv = m->private;
>> >>> +   struct mm_struct *mm;
>> >>> +   struct vm_area_struct *vma;
>> >>> +   struct mem_size_stats *mss_sum = priv->mss;
>> >>> +
>> >>> +   /* reference to priv->task already taken */
>> >>> +   /* but need to get the mm here because */
>> >>> +   /* task could be in the process of exiting */
>> >>
>> >>
>> >> Can you please elaborate on this? My understanding here is that you
>> >> intend for the caller to be able to repeatedly read the same totmaps
>> >> file with pread() and still see updated information after the target
>> >> process has called execve() and be able to detect process death
>> >> (instead of simply seeing stale values). Is that accurate?
>> >>
>> >> I would prefer it if you could grab a reference to the mm_struct
>> >> directly at open time.
>> >
>> >
>> > Sonny, do you know more about the above comment?
>>
>> I think right now the file gets re-opened every time, but the mode
>> where the file is opened once and repeatedly read is interesting
>> because it avoids having to open the file again and again.
>>
I guess you could end up with a weird situation where you don't read
the entire contents of the file in one call to read() and you might
get inconsistent data across the different statistics?
>
> If the file is read in two chunks, totmaps_proc_show is only called
> once. The patch specifies seq_read as read handler. Have a look at its
> definition. As long as you don't read from the same seq file in
> parallel or seek around in it, simple sequential reads will not
> re-invoke the show() method for data that has already been formatted.
> For partially consumed data, the kernel buffers the rest until someone
> reads it or seeks to another offset.

Ok that's good.  If the consumer were using pread() though, would that
look like a seek?
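
As an aside, here is a minimal C sketch of the "open once, read repeatedly"
consumer being discussed.  Whether a pread() at offset 0 counts as a seek
and therefore re-runs the show() method is exactly the open question above,
so treat this as illustrative only; the path and PID are assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int i, fd = open("/proc/25946/totmaps", O_RDONLY);

        if (fd < 0)
                return 1;
        for (i = 0; i < 10; i++) {
                n = pread(fd, buf, sizeof(buf) - 1, 0);  /* always from offset 0 */
                if (n <= 0)
                        break;                           /* target exited or error */
                buf[n] = '\0';
                fputs(buf, stdout);
                sleep(1);
        }
        close(fd);
        return 0;
}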


Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps

2016-08-10 Thread Sonny Rao
On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.f...@collabora.com> wrote:
>
>
> On 2016-08-09 03:24 PM, Jann Horn wrote:
>>
>> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
>>>
>>> From: Sonny Rao <sonny...@chromium.org>
>>>
>>> This is based on earlier work by Thiago Goncales. It implements a new
>>> per process proc file which summarizes the contents of the smaps file
>>> but doesn't display any addresses.  It gives more detailed information
>>> than statm like the PSS (proportional set size).  It differs from the
>>> original implementation in that it doesn't use the full blown set of
>>> seq operations, uses a different termination condition, and doesn't
>>> display "Locked" as that was broken on the original implementation.
>>>
>>> This new proc file provides information faster than parsing the
>>> potentially
>>> huge smaps file.
>>>
>>> Signed-off-by: Sonny Rao <sonny...@chromium.org>
>>>
>>> Tested-by: Robert Foss <robert.f...@collabora.com>
>>> Signed-off-by: Robert Foss <robert.f...@collabora.com>
>>
>>
>>
>>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>>> +{
>>> +   struct proc_maps_private *priv = m->private;
>>> +   struct mm_struct *mm;
>>> +   struct vm_area_struct *vma;
>>> +   struct mem_size_stats *mss_sum = priv->mss;
>>> +
>>> +   /* reference to priv->task already taken */
>>> +   /* but need to get the mm here because */
>>> +   /* task could be in the process of exiting */
>>
>>
>> Can you please elaborate on this? My understanding here is that you
>> intend for the caller to be able to repeatedly read the same totmaps
>> file with pread() and still see updated information after the target
>> process has called execve() and be able to detect process death
>> (instead of simply seeing stale values). Is that accurate?
>>
>> I would prefer it if you could grab a reference to the mm_struct
>> directly at open time.
>
>
> Sonny, do you know more about the above comment?

I think right now the file gets re-opened every time, but the mode
where the file is opened once and repeatedly read is interesting
because it avoids having to open the file again and again.

I guess you could end up with a weird situation where you don't read
the entire contents of the file in one call to read() and you might
get inconsistent data across the different statistics?

>
>>
>>
>>> +   mm = get_task_mm(priv->task);
>>> +   if (!mm || IS_ERR(mm))
>>> +   return -EINVAL;
>>
>>
>> get_task_mm() doesn't return error codes, and all other callers just
>> check whether the return value is NULL.
>>
>
> I'll have that fixed in v2, thanks for spotting it!
>
>
>>
>>> +   down_read(&mm->mmap_sem);
>>> +   hold_task_mempolicy(priv);
>>> +
>>> +   for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
>>> +   struct mem_size_stats mss;
>>> +   struct mm_walk smaps_walk = {
>>> +   .pmd_entry = smaps_pte_range,
>>> +   .mm = vma->vm_mm,
>>> +   .private = &mss,
>>> +   };
>>> +
>>> +   if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
>>> +   memset(&mss, 0, sizeof(mss));
>>> +   walk_page_vma(vma, &smaps_walk);
>>> +   add_smaps_sum(&mss, mss_sum);
>>> +   }
>>> +   }
>>
>>
>> E... what? You accumulate values from mem_size_stats items into a
>> struct mss_sum that is associated with the struct file? So when you
>> read the file the second time, you get the old values plus the new ones?
>> And when you read the file in parallel, you get inconsistent values?
>>
>> For most files in procfs, the behavior is that you can just call
>> pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
>> you the current values every time, without mutating state. I strongly
>> recommend that you get rid of priv->mss and just accumulate the state
>> in a local variable (maybe one on the stack).
>
>
> So a simple "static struct mem_size_stats" in totmaps_proc_show() would be a
> better solution?
>
>>
>>
>>> @@ -836,6 +911,50 @@ static int tid
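
To make the reviewer's local-accumulator suggestion above concrete, here is
a rough sketch of totmaps_proc_show() summing into a stack-local struct
instead of priv->mss, so every read produces fresh, consistent totals.  It
reuses the helper names from the quoted patch but is illustrative only, not
the actual v2 code; locking and output are abbreviated.

static int totmaps_proc_show(struct seq_file *m, void *data)
{
        struct proc_maps_private *priv = m->private;
        struct mem_size_stats total = {};       /* local sum, zeroed on every call */
        struct vm_area_struct *vma;
        struct mm_struct *mm = get_task_mm(priv->task);

        if (!mm)
                return -ESRCH;

        down_read(&mm->mmap_sem);
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
                struct mem_size_stats mss = {};
                struct mm_walk smaps_walk = {
                        .pmd_entry = smaps_pte_range,
                        .mm = vma->vm_mm,
                        .private = &mss,
                };

                if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
                        walk_page_vma(vma, &smaps_walk);
                        add_smaps_sum(&mss, &total);
                }
        }
        up_read(&mm->mmap_sem);
        mmput(mm);

        seq_printf(m, "Rss:            %8lu kB\nPss:            %8lu kB\n",
                   total.resident >> 10,
                   (unsigned long)(total.pss >> (10 + PSS_SHIFT)));
        return 0;
}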


Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps

2016-08-09 Thread Sonny Rao
On Tue, Aug 9, 2016 at 12:16 PM, Konstantin Khlebnikov <koc...@gmail.com> wrote:
>
> On Tue, Aug 9, 2016 at 7:05 PM,  <robert.f...@collabora.com> wrote:
> > From: Sonny Rao <sonny...@chromium.org>
> >
> > This is based on earlier work by Thiago Goncales. It implements a new
> > per process proc file which summarizes the contents of the smaps file
> > but doesn't display any addresses.  It gives more detailed information
> > than statm like the PSS (proportional set size).  It differs from the
> > original implementation in that it doesn't use the full blown set of
> > seq operations, uses a different termination condition, and doesn't
> > display "Locked" as that was broken on the original implementation.
> >
> > This new proc file provides information faster than parsing the potentially
> > huge smaps file.
>
> What statistics do you really need?

PSS (Proportional Set Size) and related accounting of shared pages
(swap could be shared) is where the existing summaries of memory usage
are cumbersome.

>
>
> I think, performance and flexibility issues could be really solved only by new
> syscall for querying memory statistics for address range in any process:
> process_vm_stat() or some kind of pumped fincore() for /proc/$pid/mem


That would be a good long term solution if people want similarly
complicated statistics without having to iterate through current
interfaces.
I mentioned monitoring before, but I'll add that per-process
Proportional Set Size, Unique Set Size, and Swap are also useful
because they help us make better decisions about which processes need
to be throttled or gracefully killed.

>
> >
> > Signed-off-by: Sonny Rao <sonny...@chromium.org>
> >
> > Tested-by: Robert Foss <robert.f...@collabora.com>
> > Signed-off-by: Robert Foss <robert.f...@collabora.com>
> >
> > ---
> >  fs/proc/base.c |   1 +
> >  fs/proc/internal.h |   4 ++
> >  fs/proc/task_mmu.c | 126 
> > +
> >  3 files changed, 131 insertions(+)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index a11eb71..de3acdf 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -2855,6 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> > REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
> > REG("smaps",  S_IRUGO, proc_pid_smaps_operations),
> > REG("pagemap",S_IRUSR, proc_pagemap_operations),
> > +   REG("totmaps",S_IRUGO, proc_totmaps_operations),
> >  #endif
> >  #ifdef CONFIG_SECURITY
> > DIR("attr",   S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, 
> > proc_attr_dir_operations),
> > diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> > index aa27810..6f3540f 100644
> > --- a/fs/proc/internal.h
> > +++ b/fs/proc/internal.h
> > @@ -58,6 +58,9 @@ union proc_op {
> > struct task_struct *task);
> >  };
> >
> > +
> > +extern const struct file_operations proc_totmaps_operations;
> > +
> >  struct proc_inode {
> > struct pid *pid;
> > int fd;
> > @@ -281,6 +284,7 @@ struct proc_maps_private {
> > struct mm_struct *mm;
> >  #ifdef CONFIG_MMU
> > struct vm_area_struct *tail_vma;
> > +   struct mem_size_stats *mss;
> >  #endif
> >  #ifdef CONFIG_NUMA
> > struct mempolicy *task_mempolicy;
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 4648c7f..b61873e 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -802,6 +802,81 @@ static int show_smap(struct seq_file *m, void *v, int 
> > is_pid)
> > return 0;
> >  }
> >
> > +static void add_smaps_sum(struct mem_size_stats *mss,
> > +   struct mem_size_stats *mss_sum)
> > +{
> > +   mss_sum->resident += mss->resident;
> > +   mss_sum->pss += mss->pss;
> > +   mss_sum->shared_clean += mss->shared_clean;
> > +   mss_sum->shared_dirty += mss->shared_dirty;
> > +   mss_sum->private_clean += mss->private_clean;
> > +   mss_sum->private_dirty += mss->private_dirty;
> > +   mss_sum->referenced += mss->referenced;
> > +   mss_sum->anonymous += mss->anonymous;
> > +   mss_sum->anonymous_thp += mss->anonymous_thp;
> > +   mss_sum->swap += mss->swap;
> > +}
> > +
> > +static int totmaps_proc_show(struct seq_file *m, void 


Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps

2016-08-09 Thread Sonny Rao
On Tue, Aug 9, 2016 at 9:58 AM, Alexey Dobriyan <adobri...@gmail.com> wrote:
>
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
> > From: Sonny Rao <sonny...@chromium.org>
> >
> > This is based on earlier work by Thiago Goncales. It implements a new
> > per process proc file which summarizes the contents of the smaps file
> > but doesn't display any addresses.  It gives more detailed information
> > than statm like the PSS (proportional set size).  It differs from the
> > original implementation in that it doesn't use the full blown set of
> > seq operations, uses a different termination condition, and doesn't
> > display "Locked" as that was broken on the original implementation.
> >
> > This new proc file provides information faster than parsing the potentially
> > huge smaps file.
>
> You can "parse" /proc/*/pagemap . RSS, swap are there.


/proc/*/pagemap is generally restricted, and I don't believe it would
quickly give PSS.

>
> So which ones do you really need?

PSS and Swap are the most important.  RSS isn't precise enough because
it counts shared pages fully, and there tends to be a lot of sharing.

> Why the separate anon hugepages and anon regular pages?

I'm not sure if it's necessary, but that's how it's broken out in smaps.

>
> > + seq_printf(m,
> > +"Rss:%8lu kB\n"
> > +"Pss:%8lu kB\n"
> > +"Shared_Clean:   %8lu kB\n"
> > +"Shared_Dirty:   %8lu kB\n"
> > +"Private_Clean:  %8lu kB\n"
> > +"Private_Dirty:  %8lu kB\n"
> > +"Referenced: %8lu kB\n"
> > +"Anonymous:  %8lu kB\n"
> > +"AnonHugePages:  %8lu kB\n"
> > +"Swap:   %8lu kB\n",
> > +mss_sum->resident >> 10,
> > +(unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
> > +mss_sum->shared_clean  >> 10,
> > +mss_sum->shared_dirty  >> 10,
> > +mss_sum->private_clean >> 10,
> > +mss_sum->private_dirty >> 10,
> > +mss_sum->referenced >> 10,
> > +mss_sum->anonymous >> 10,
> > +mss_sum->anonymous_thp >> 10,
> > +mss_sum->swap >> 10);


Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps

2016-08-09 Thread Sonny Rao
On Tue, Aug 9, 2016 at 9:29 AM, Mateusz Guzik <mgu...@redhat.com> wrote:
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.f...@collabora.com wrote:
>> From: Sonny Rao <sonny...@chromium.org>
>>
>> This is based on earlier work by Thiago Goncales. It implements a new
>> per process proc file which summarizes the contents of the smaps file
>> but doesn't display any addresses.  It gives more detailed information
>> than statm like the PSS (proportional set size).  It differs from the
>> original implementation in that it doesn't use the full blown set of
>> seq operations, uses a different termination condition, and doesn't
>> display "Locked" as that was broken on the original implementation.
>>
>> This new proc file provides information faster than parsing the potentially
>> huge smaps file.
>
> I have no idea about usefulness of this.

I can comment about this.  The use case is to speed up monitoring of
memory consumption in environments where RSS isn't precise.

For example, Chrome tends to have many processes with hundreds of VMAs
and a substantial amount of shared memory, and the error from using
RSS rather than PSS tends to be very large when looking at overall
memory consumption.  PSS isn't kept as a single number that's exported
like RSS, so calculating PSS means parsing a very large smaps file.

This process is slow and has to be repeated for many processes, and we
found that just the act of doing the parsing was taking up a
significant amount of CPU time, so this patch is an attempt to make
that process cheaper.
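
For context, this is roughly the userspace work being described: a minimal
C sketch that sums the Pss: lines of one smaps file.  The PID is
illustrative and error handling is trimmed.

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        unsigned long kb, pss_kb = 0;
        FILE *f = fopen("/proc/25946/smaps", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                /* one "Pss:" entry per VMA; hundreds of VMAs per process */
                if (strncmp(line, "Pss:", 4) == 0 &&
                    sscanf(line + 4, "%lu", &kb) == 1)
                        pss_kb += kb;
        }
        fclose(f);
        printf("Pss total: %lu kB\n", pss_kb);
        return 0;
}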

>
> The patch is definitely buggy with respect to how it implements actual
> access to mm.
>
>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> +{
>> + struct proc_maps_private *priv = m->private;
>> + struct mm_struct *mm;
>> + struct vm_area_struct *vma;
>> + struct mem_size_stats *mss_sum = priv->mss;
>> +
>> + /* reference to priv->task already taken */
>> + /* but need to get the mm here because */
>> + /* task could be in the process of exiting */
>> + mm = get_task_mm(priv->task);
>> + if (!mm || IS_ERR(mm))
>> + return -EINVAL;
>> +
>
> That's not how it's done in smaps.
>
>> +static int totmaps_open(struct inode *inode, struct file *file)
>> +{
>> + struct proc_maps_private *priv;
>> + int ret = -ENOMEM;
>> + priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>> + if (priv) {
>> + priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
>> + if (!priv->mss)
>> + return -ENOMEM;
>
> Cases below explicitly kfree(priv). I can't remember whether the close
> routine gets called if this one fails. Either way, something is wrong
> here.
>
>> +
>> + /* we need to grab references to the task_struct */
>> + /* at open time, because there's a potential information */
>> + /* leak where the totmaps file is opened and held open */
>> + /* while the underlying pid to task mapping changes */
>> + /* underneath it */
>> + priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
>
> This performs no permission checks that I would see. If you take a look
> at smaps you will see the user ends up in proc_maps_open which performs
> proc_mem_open(inode, PTRACE_MODE_READ) and gets a mm from there.
>
>
>> + if (!priv->task) {
>> + kfree(priv->mss);
>> + kfree(priv);
>> + return -ESRCH;
>> + }
>> +
>> + ret = single_open(file, totmaps_proc_show, priv);
>> + if (ret) {
>> + put_task_struct(priv->task);
>> + kfree(priv->mss);
>> + kfree(priv);
>> + }
>> + }
>> + return ret;
>> +}
>> +
>
> --
> Mateusz Guzik


Re: [PATCH v2 3/5] Documentation: arm-pl330: add description of arm,pl330-broken-no-flushp

2015-08-31 Thread Sonny Rao
On Thu, Aug 27, 2015 at 5:38 PM, Shawn Lin  wrote:
> Signed-off-by: Shawn Lin 
> Reviewed-by: Doug Anderson 
> ---
>
> Changes in v2:
> - add Reviewed-by: Doug Anderson 
> - reorder this patch before the usage of the quirk
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
>
>  Documentation/devicetree/bindings/dma/arm-pl330.txt | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/Documentation/devicetree/bindings/dma/arm-pl330.txt 
> b/Documentation/devicetree/bindings/dma/arm-pl330.txt
> index 2675658..db7e226 100644
> --- a/Documentation/devicetree/bindings/dma/arm-pl330.txt
> +++ b/Documentation/devicetree/bindings/dma/arm-pl330.txt
> @@ -15,6 +15,7 @@ Optional properties:
>  cells in the dmas property of client device.
>- dma-channels: contains the total number of DMA channels supported by the 
> DMAC
>- dma-requests: contains the total number of DMA requests supported by the 
> DMAC
> +  - arm,pl330-broken-no-flushp: quirk for avoiding to execute DMAFLUSHP
>
>  Example:
>
> --
> 2.3.7

Reviewed-by: Sonny Rao 



Re: [PATCH v2 1/5] DMA: pl330: support burst mode for dev-to-mem and mem-to-dev transmit

2015-08-31 Thread Sonny Rao
On Thu, Aug 27, 2015 at 5:37 PM, Shawn Lin  wrote:
> From: Boojin Kim 
>
> This patch adds to support burst mode for dev-to-mem and
> mem-to-dev transmit.
>
> Signed-off-by: Boojin Kim 
> Signed-off-by: Addy Ke 
> Signed-off-by: Shawn Lin 
> cc: Heiko Stuebner 
> cc: Doug Anderson 
> cc: Olof Johansson 
>
> ---
>
> Changes in v2:
> - amend the author
> - amend Olof's mail address
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
> - add From original author.
> - remove Sunny's tag
>
>  drivers/dma/pl330.c | 18 --
>  1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
> index ecab4ea0..0d544d2 100644
> --- a/drivers/dma/pl330.c
> +++ b/drivers/dma/pl330.c
> @@ -1141,10 +1141,13 @@ static inline int _ldst_devtomem(unsigned dry_run, u8 
> buf[],
> const struct _xfer_spec *pxs, int cyc)
>  {
> int off = 0;
> +   enum pl330_cond cond;
> +
> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>
> while (cyc--) {
> -   off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> -   off += _emit_LDP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_WFP(dry_run, &buf[off], cond, pxs->desc->peri);
> +   off += _emit_LDP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_ST(dry_run, &buf[off], ALWAYS);
> off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
> }
> @@ -1156,11 +1159,14 @@ static inline int _ldst_memtodev(unsigned dry_run, u8 
> buf[],
> const struct _xfer_spec *pxs, int cyc)
>  {
> int off = 0;
> +   enum pl330_cond cond;
> +
> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>
> while (cyc--) {
> -   off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_WFP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_LD(dry_run, &buf[off], ALWAYS);
> -   off += _emit_STP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_STP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
> }
>
> @@ -2557,7 +2563,7 @@ static struct dma_async_tx_descriptor 
> *pl330_prep_dma_cyclic(
>
> desc->rqtype = direction;
> desc->rqcfg.brst_size = pch->burst_sz;
> -   desc->rqcfg.brst_len = 1;
> +   desc->rqcfg.brst_len = pch->burst_len;
> desc->bytes_requested = period_len;
> fill_px(&desc->px, dst, src, period_len);
>
> @@ -2702,7 +2708,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct 
> scatterlist *sgl,
> }
>
> desc->rqcfg.brst_size = pch->burst_sz;
> -   desc->rqcfg.brst_len = 1;
> +   desc->rqcfg.brst_len = pch->burst_len;
> desc->rqtype = direction;
> desc->bytes_requested = sg_dma_len(sg);
> }
> --
> 2.3.7

Reviewed-by: Sonny Rao 



Re: [PATCH v2 2/5] DMA: pl330: add quirk for broken no flushp

2015-08-31 Thread Sonny Rao
l330, dry_run, &buf[off], pxs, cyc);
> break;
> case DMA_DEV_TO_MEM:
> -   off += _ldst_devtomem(dry_run, &buf[off], pxs, cyc);
> +   off += _ldst_devtomem(pl330, dry_run, &buf[off], pxs, cyc);
> break;
> case DMA_MEM_TO_MEM:
> off += _ldst_memtomem(dry_run, &buf[off], pxs, cyc);
> @@ -1197,7 +1225,7 @@ static int _bursts(unsigned dry_run, u8 buf[],
>  }
>
>  /* Returns bytes consumed and updates bursts */
> -static inline int _loop(unsigned dry_run, u8 buf[],
> +static inline int _loop(struct pl330_dmac *pl330, unsigned dry_run, u8 buf[],
> unsigned long *bursts, const struct _xfer_spec *pxs)
>  {
> int cyc, cycmax, szlp, szlpend, szbrst, off;
> @@ -1220,7 +1248,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> }
>
> szlp = _emit_LP(1, buf, 0, 0);
> -   szbrst = _bursts(1, buf, pxs, 1);
> +   szbrst = _bursts(pl330, 1, buf, pxs, 1);
>
> lpend.cond = ALWAYS;
> lpend.forever = false;
> @@ -1252,7 +1280,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> off += _emit_LP(dry_run, &buf[off], 1, lcnt1);
> ljmp1 = off;
>
> -   off += _bursts(dry_run, &buf[off], pxs, cyc);
> +   off += _bursts(pl330, dry_run, &buf[off], pxs, cyc);
>
> lpend.cond = ALWAYS;
> lpend.forever = false;
> @@ -1275,8 +1303,9 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> return off;
>  }
>
> -static inline int _setup_loops(unsigned dry_run, u8 buf[],
> -   const struct _xfer_spec *pxs)
> +static inline int _setup_loops(struct pl330_dmac *pl330,
> +  unsigned dry_run, u8 buf[],
> +  const struct _xfer_spec *pxs)
>  {
> struct pl330_xfer *x = &pxs->desc->px;
> u32 ccr = pxs->ccr;
> @@ -1285,15 +1314,16 @@ static inline int _setup_loops(unsigned dry_run, u8 
> buf[],
>
> while (bursts) {
> c = bursts;
> -   off += _loop(dry_run, &buf[off], &c, pxs);
> +   off += _loop(pl330, dry_run, &buf[off], &c, pxs);
> bursts -= c;
> }
>
> return off;
>  }
>
> -static inline int _setup_xfer(unsigned dry_run, u8 buf[],
> -   const struct _xfer_spec *pxs)
> +static inline int _setup_xfer(struct pl330_dmac *pl330,
> + unsigned dry_run, u8 buf[],
> + const struct _xfer_spec *pxs)
>  {
> struct pl330_xfer *x = &pxs->desc->px;
> int off = 0;
> @@ -1304,7 +1334,7 @@ static inline int _setup_xfer(unsigned dry_run, u8 
> buf[],
> off += _emit_MOV(dry_run, &buf[off], DAR, x->dst_addr);
>
> /* Setup Loop(s) */
> -   off += _setup_loops(dry_run, &buf[off], pxs);
> +   off += _setup_loops(pl330, dry_run, &buf[off], pxs);
>
> return off;
>  }
> @@ -1313,8 +1343,9 @@ static inline int _setup_xfer(unsigned dry_run, u8 
> buf[],
>   * A req is a sequence of one or more xfer units.
>   * Returns the number of bytes taken to setup the MC for the req.
>   */
> -static int _setup_req(unsigned dry_run, struct pl330_thread *thrd,
> -   unsigned index, struct _xfer_spec *pxs)
> +static int _setup_req(struct pl330_dmac *pl330, unsigned dry_run,
> + struct pl330_thread *thrd, unsigned index,
> + struct _xfer_spec *pxs)
>  {
> struct _pl330_req *req = &thrd->req[index];
> struct pl330_xfer *x;
> @@ -1331,7 +1362,7 @@ static int _setup_req(unsigned dry_run, struct 
> pl330_thread *thrd,
> if (x->bytes % (BRST_SIZE(pxs->ccr) * BRST_LEN(pxs->ccr)))
> return -EINVAL;
>
> -   off += _setup_xfer(dry_run, &buf[off], pxs);
> +   off += _setup_xfer(pl330, dry_run, &buf[off], pxs);
>
> /* DMASEV peripheral/event */
> off += _emit_SEV(dry_run, &buf[off], thrd->ev);
> @@ -1425,7 +1456,7 @@ static int pl330_submit_req(struct pl330_thread *thrd,
> xs.desc = desc;
>
> /* First dry run to check if req is acceptable */
> -   ret = _setup_req(1, thrd, idx, &xs);
> +   ret = _setup_req(pl330, 1, thrd, idx, &xs);
> if (ret < 0)
> goto xfer_exit;
>
> @@ -1439,7 +1470,7 @@ static int pl330_submit_req(struct pl330_thread *thrd,
> /* Hook the request */
> thrd->lstenq = idx;
> thrd->req[idx].desc = desc;
> -   _setup_req(0, thrd, idx, &xs);
> +   _setup_req(pl330, 0, thrd, idx, &xs);
>
> ret = 0;
>
> @@ -2784,6 +2815,7 @@ pl330_probe(struct amba_device *adev, const struct 
> amba_id *id)
> 

Re: [PATCH v2 0/5] Fix broken DMAFLUSHP on Rockchips platform

2015-08-31 Thread Sonny Rao
On Thu, Aug 27, 2015 at 5:36 PM, Shawn Lin  wrote:
>
> The purpose of the DMAFLUSHP instruction:
> - Tell the peripheral to clear its status and control registers.
> - Send a message to the peripheral to resend its level status.
>
> There are 3 timings described in PL330 Technical Reference Manual:
> - Timing 1: Burst request, can work well without DMAFLUSHP.
> - Timing 2: Single and burst request, DMAC will ignore the single
> transfer request. This timing happens if there are single
> and burst request.
> - Timing 3: Single transfers for a burst request, DMAC should signals
> datype to request the peripheral to flush the contents of
> any control registers. This timing happens if there is
> not enough MFIFO to places the burst data.
>
> A peripheral may signal a DMA request during the execution of
> DMAFLUSHP instruction, that cause DMA request being ignored by DMAC.
>
> But DMAC and all peripherals on RK3X SoCs DO NOT support DMAFLUSHP.
> It can't send a message to the peripheral to resend DMA request,
> and the peripheral can't acknowledge a flush request from DMAC.
> So all DMA requests should NOT be ignored by DMAC, and DMAC will not
> notify the peripheral to flush.
>
> To fix this problem, we need:
> - Do NOT execute DMAFLUSHP instruction.
> - Timing 2 and timing 3 should not happen.
>
> Because on RK3X SoCs, there are 6 or fewer channels and 32 MFIFO depth
> for DMAC_BUS, and 8 channels and 64 MFIFO depth for DMAC_PERI, it is
> impossible to hit the timing 3 if burst length is equal or less than 4.

Fixing this issue also requires changes to drivers, so it would be
nice if you put those changes into the same patchset.
Otherwise someone may apply this series and expect things to work but
they will still be broken. Specifically the peripherals should be
setting their burst sizes for their DMA requests low enough to avoid
needing the working DMAFLUSHP instruction.

Also, I remember we ran into an issue when we tried using a burst
length of 4 with the i2s device on RK3288: we could get requests that
either weren't aligned or weren't a multiple of 4 in size, and some
transfers would just fail, so we ended up using a burst size of 1.  I
recommend that if we aren't sure about the size or alignment for a
particular peripheral, a burst size of 1 is safest.  For something
like a block device, I think we can use the larger burst sizes.
That's another reason to include the driver fixes in the series, just
so we get it right, thanks.
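
To illustrate the driver-side half of this: a peripheral driver caps its
burst length through its dmaengine slave config, along these lines.  This
is a generic sketch, not code from this series; the FIFO address, bus
width, and direction are assumptions for an i2s-style device.

#include <linux/dmaengine.h>

static int example_cap_burst(struct dma_chan *chan, dma_addr_t fifo_addr)
{
        struct dma_slave_config cfg = {
                .direction      = DMA_MEM_TO_DEV,
                .dst_addr       = fifo_addr,
                .dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES,
                .dst_maxburst   = 1,    /* safe when size/alignment is uncertain */
        };

        return dmaengine_slave_config(chan, &cfg);
}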

>
> Since the request type signal by the peripheral can only be set by
> software. We can set Rockchip Soc's GRF_PERIDMAC_CON0[2:1] to select single
> or burst request, if it is set b01,  all of the peripherals will signal a 
> burst
> request. So the timing 2 will not happen, too.
>
> So DMAC on RK3X can support single or burst transfer, but can't support
> mixed transfer.
>
> Because burst transfer is more efficient than single transfer, this is
> confirmed by our ASIC team, who strongly suggest to use burst transfer.
> And this is confirmed by Addy's test on RK3288-Pink2 board, the speed of
> spi flash burst transfer will increase about two times than single transfer.
> Also, I have tested dw_mmc with pl330 on RK3188 platform to double confirm
> the result. That means burst transfer is reasonable.
>
> So we need a quirk not to execute DMAFLUSHP instruction and to use burst
> transfer.
>
> Note:
> - The Rockchip Soc default value of GRF_PERIDMAC_CON0[2:1] is b01. To
>   support burst transfer, these bits should not be changed in bootloader.
>
>
> Changes in v2:
> - amend the author
> - reorder the patches suggested by Doug
> - add Reviewed-by: Doug Anderson  for
>   rk3288.dtsi patch and arm-pl330.txt patch
> - amend Olof's mail address
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
> - add From original author.
> - remove Sunny's tag
>
> Addy Ke (2):
>   DMA: pl330: add quirk for broken no flushp
>   ARM: dts: Add arm,pl330-broken-no-flushp quirk for rk3288 platform
>
> Boojin Kim (1):
>   DMA: pl330: support burst mode for dev-to-mem and mem-to-dev transmit
>
> Shawn Lin (2):
>   Documentation: arm-pl330: add description of
> arm,pl330-broken-no-flushp
>   ARM: dts: Add arm,pl330-broken-no-flushp quirk for rk3xxx platform
>
>  .../devicetree/bindings/dma/arm-pl330.txt  |   1 +
>  arch/arm/boot/dts/rk3288.dtsi  |   3 +
>  arch/arm/boot/dts/rk3xxx.dtsi  |   3 +
>  drivers/dma/pl330.c| 101 
> +++--
>  4 files changed, 79 insertions(+), 29 deletions(-)
>
> --
> 2.3.7

Re: [PATCH v2 2/5] DMA: pl330: add quirk for broken no flushp

2015-08-31 Thread Sonny Rao
;
> switch (pxs->desc->rqtype) {
> case DMA_MEM_TO_DEV:
> -   off += _ldst_memtodev(dry_run, [off], pxs, cyc);
> +   off += _ldst_memtodev(pl330, dry_run, [off], pxs, cyc);
> break;
> case DMA_DEV_TO_MEM:
> -   off += _ldst_devtomem(dry_run, [off], pxs, cyc);
> +   off += _ldst_devtomem(pl330, dry_run, [off], pxs, cyc);
> break;
> case DMA_MEM_TO_MEM:
> off += _ldst_memtomem(dry_run, [off], pxs, cyc);
> @@ -1197,7 +1225,7 @@ static int _bursts(unsigned dry_run, u8 buf[],
>  }
>
>  /* Returns bytes consumed and updates bursts */
> -static inline int _loop(unsigned dry_run, u8 buf[],
> +static inline int _loop(struct pl330_dmac *pl330, unsigned dry_run, u8 buf[],
> unsigned long *bursts, const struct _xfer_spec *pxs)
>  {
> int cyc, cycmax, szlp, szlpend, szbrst, off;
> @@ -1220,7 +1248,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> }
>
> szlp = _emit_LP(1, buf, 0, 0);
> -   szbrst = _bursts(1, buf, pxs, 1);
> +   szbrst = _bursts(pl330, 1, buf, pxs, 1);
>
> lpend.cond = ALWAYS;
> lpend.forever = false;
> @@ -1252,7 +1280,7 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> off += _emit_LP(dry_run, &buf[off], 1, lcnt1);
> ljmp1 = off;
>
> -   off += _bursts(dry_run, &buf[off], pxs, cyc);
> +   off += _bursts(pl330, dry_run, &buf[off], pxs, cyc);
>
> lpend.cond = ALWAYS;
> lpend.forever = false;
> @@ -1275,8 +1303,9 @@ static inline int _loop(unsigned dry_run, u8 buf[],
> return off;
>  }
>
> -static inline int _setup_loops(unsigned dry_run, u8 buf[],
> -   const struct _xfer_spec *pxs)
> +static inline int _setup_loops(struct pl330_dmac *pl330,
> +  unsigned dry_run, u8 buf[],
> +  const struct _xfer_spec *pxs)
>  {
> struct pl330_xfer *x = &pxs->desc->px;
> u32 ccr = pxs->ccr;
> @@ -1285,15 +1314,16 @@ static inline int _setup_loops(unsigned dry_run, u8 
> buf[],
>
> while (bursts) {
> c = bursts;
> -   off += _loop(dry_run, &buf[off], &c, pxs);
> +   off += _loop(pl330, dry_run, &buf[off], &c, pxs);
> bursts -= c;
> }
>
> return off;
>  }
>
> -static inline int _setup_xfer(unsigned dry_run, u8 buf[],
> -   const struct _xfer_spec *pxs)
> +static inline int _setup_xfer(struct pl330_dmac *pl330,
> + unsigned dry_run, u8 buf[],
> + const struct _xfer_spec *pxs)
>  {
> struct pl330_xfer *x = &pxs->desc->px;
> int off = 0;
> @@ -1304,7 +1334,7 @@ static inline int _setup_xfer(unsigned dry_run, u8 
> buf[],
> off += _emit_MOV(dry_run, &buf[off], DAR, x->dst_addr);
>
> /* Setup Loop(s) */
> -   off += _setup_loops(dry_run, &buf[off], pxs);
> +   off += _setup_loops(pl330, dry_run, &buf[off], pxs);
>
> return off;
>  }
> @@ -1313,8 +1343,9 @@ static inline int _setup_xfer(unsigned dry_run, u8 
> buf[],
>   * A req is a sequence of one or more xfer units.
>   * Returns the number of bytes taken to setup the MC for the req.
>   */
> -static int _setup_req(unsigned dry_run, struct pl330_thread *thrd,
> -   unsigned index, struct _xfer_spec *pxs)
> +static int _setup_req(struct pl330_dmac *pl330, unsigned dry_run,
> + struct pl330_thread *thrd, unsigned index,
> + struct _xfer_spec *pxs)
>  {
> struct _pl330_req *req = &thrd->req[index];
> struct pl330_xfer *x;
> @@ -1331,7 +1362,7 @@ static int _setup_req(unsigned dry_run, struct 
> pl330_thread *thrd,
> if (x->bytes % (BRST_SIZE(pxs->ccr) * BRST_LEN(pxs->ccr)))
> return -EINVAL;
>
> -   off += _setup_xfer(dry_run, &buf[off], pxs);
> +   off += _setup_xfer(pl330, dry_run, &buf[off], pxs);
>
> /* DMASEV peripheral/event */
> off += _emit_SEV(dry_run, &buf[off], thrd->ev);
> @@ -1425,7 +1456,7 @@ static int pl330_submit_req(struct pl330_thread *thrd,
> xs.desc = desc;
>
> /* First dry run to check if req is acceptable */
> -   ret = _setup_req(1, thrd, idx, &xs);
> +   ret = _setup_req(pl330, 1, thrd, idx, &xs);
> if (ret < 0)
> goto xfer_exit;
>
> @@ -1439,7 +1470,7 @@ static int pl330_submit_req(struct pl330_thread *thrd,
> /* Hook the request */
> thrd->lstenq = idx;
> thrd->req[idx].desc = desc;
> -   _setup_req(0, thrd

Re: [PATCH v2 1/5] DMA: pl330: support burst mode for dev-to-mem and mem-to-dev transmit

2015-08-31 Thread Sonny Rao
On Thu, Aug 27, 2015 at 5:37 PM, Shawn Lin <shawn@rock-chips.com> wrote:
> From: Boojin Kim <boojin@samsung.com>
>
> This patch adds to support burst mode for dev-to-mem and
> mem-to-dev transmit.
>
> Signed-off-by: Boojin Kim <boojin@samsung.com>
> Signed-off-by: Addy Ke <addy...@rock-chips.com>
> Signed-off-by: Shawn Lin <shawn@rock-chips.com>
> cc: Heiko Stuebner <he...@sntech.de>
> cc: Doug Anderson <diand...@chromium.org>
> cc: Olof Johansson <o...@lixom.net>
>
> ---
>
> Changes in v2:
> - amend the author
> - amend Olof's mail address
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
> - add From original author.
> - remove Sunny's tag
>
>  drivers/dma/pl330.c | 18 --
>  1 file changed, 12 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
> index ecab4ea0..0d544d2 100644
> --- a/drivers/dma/pl330.c
> +++ b/drivers/dma/pl330.c
> @@ -1141,10 +1141,13 @@ static inline int _ldst_devtomem(unsigned dry_run, u8 
> buf[],
> const struct _xfer_spec *pxs, int cyc)
>  {
> int off = 0;
> +   enum pl330_cond cond;
> +
> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>
> while (cyc--) {
> -   off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> -   off += _emit_LDP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_WFP(dry_run, &buf[off], cond, pxs->desc->peri);
> +   off += _emit_LDP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_ST(dry_run, &buf[off], ALWAYS);
> off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
> }
> @@ -1156,11 +1159,14 @@ static inline int _ldst_memtodev(unsigned dry_run, u8 
> buf[],
> const struct _xfer_spec *pxs, int cyc)
>  {
> int off = 0;
> +   enum pl330_cond cond;
> +
> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>
> while (cyc--) {
> -   off += _emit_WFP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_WFP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_LD(dry_run, &buf[off], ALWAYS);
> -   off += _emit_STP(dry_run, &buf[off], SINGLE, pxs->desc->peri);
> +   off += _emit_STP(dry_run, &buf[off], cond, pxs->desc->peri);
> off += _emit_FLUSHP(dry_run, &buf[off], pxs->desc->peri);
> }
>
> @@ -2557,7 +2563,7 @@ static struct dma_async_tx_descriptor 
> *pl330_prep_dma_cyclic(
>
> desc->rqtype = direction;
> desc->rqcfg.brst_size = pch->burst_sz;
> -   desc->rqcfg.brst_len = 1;
> +   desc->rqcfg.brst_len = pch->burst_len;
> desc->bytes_requested = period_len;
> fill_px(&desc->px, dst, src, period_len);
>
> @@ -2702,7 +2708,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct 
> scatterlist *sgl,
> }
>
> desc->rqcfg.brst_size = pch->burst_sz;
> -   desc->rqcfg.brst_len = 1;
> +   desc->rqcfg.brst_len = pch->burst_len;
> desc->rqtype = direction;
> desc->bytes_requested = sg_dma_len(sg);
> }
> --
> 2.3.7

Reviewed-by: Sonny Rao <sonny...@chromium.org>

>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 3/5] Documentation: arm-pl330: add description of arm,pl330-broken-no-flushp

2015-08-31 Thread Sonny Rao
On Thu, Aug 27, 2015 at 5:38 PM, Shawn Lin <shawn@rock-chips.com> wrote:
> Signed-off-by: Shawn Lin <shawn@rock-chips.com>
> Reviewed-by: Doug Anderson <diand...@chromium.org>
> ---
>
> Changes in v2:
> - add Reviewed-by: Doug Anderson <diand...@chromium.org>
> - reorder this patch before the usage of the quirk
>
> Changes in v1:
> - rename broken-no-flushp to "arm,pl330-broken-no-flushp" suggested
>   by Krzysztof.
>
>  Documentation/devicetree/bindings/dma/arm-pl330.txt | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/Documentation/devicetree/bindings/dma/arm-pl330.txt 
> b/Documentation/devicetree/bindings/dma/arm-pl330.txt
> index 2675658..db7e226 100644
> --- a/Documentation/devicetree/bindings/dma/arm-pl330.txt
> +++ b/Documentation/devicetree/bindings/dma/arm-pl330.txt
> @@ -15,6 +15,7 @@ Optional properties:
>  cells in the dmas property of client device.
>- dma-channels: contains the total number of DMA channels supported by the 
> DMAC
>- dma-requests: contains the total number of DMA requests supported by the 
> DMAC
> +  - arm,pl330-broken-no-flushp: quirk for avoiding to execute DMAFLUSHP
>
>  Example:
>
> --
> 2.3.7

Reviewed-by: Sonny Rao <sonny...@chromium.org>

>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/5] DMA: pl330: support burst mode for dev-to-mem and mem-to-dev transmit

2015-08-28 Thread Sonny Rao
On Thu, Aug 27, 2015 at 6:28 AM, Shawn Lin  wrote:
> On 2015/8/27 20:57, Krzysztof Kozlowski wrote:
>>
>> 2015-08-27 17:48 GMT+09:00 Shawn Lin :
>>>
>>>
>>> This patch adds to support burst mode for dev-to-mem and
>>> mem-to-dev transmit.
>>>
>>> Signed-off-by: Boojin Kim 
>>> Signed-off-by: Addy Ke 
>>> Signed-off-by: Shawn Lin 
>>> cc: Heiko Stuebner 
>>> cc: Doug Anderson 
>>> cc: Olof Johansson 
>>> Reviewed-and-tested-by: Sonny Rao 
>>
>>
>> For the entire patchset: I would prefer to see someone's
>> reviewed/tested tag in his response. Sending a version 1 of patchset
>> (regardless of Boojin Kim's work two years ago) with such tag could
>> mean anything. I cannot verify it easily (unless digging somewhere...
>> or asking people). You could add for example: Reviewed-by Santa Claus.
>> Should I send a letter to him asking for confirmation? :)
>>
>
> :) yes, you are right. I should comply with the rule, even if the patchset
> had been reviewed or tested by someone on another tree.

Hi, yeah I reviewed on a different tree, so you shouldn't put that tag
here, thanks for removing it.
I can re-review if you'd like.

>
>
>> More seriously - reviewed-by is a statement (please look at
>> Documentation/SubmittingPatches) and you cannot force someone to make
>> that statement. He must make such statement on his own.
>>
>> That's all from my side since I don't feel skilled enough to review the
>> code.
>>
>> Best regards,
>> Krzysztof
>>
>>>
>>> ---
>>>
>>>   drivers/dma/pl330.c | 18 --
>>>   1 file changed, 12 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
>>> index ecab4ea0..0d544d2 100644
>>> --- a/drivers/dma/pl330.c
>>> +++ b/drivers/dma/pl330.c
>>> @@ -1141,10 +1141,13 @@ static inline int _ldst_devtomem(unsigned
>>> dry_run, u8 buf[],
>>>  const struct _xfer_spec *pxs, int cyc)
>>>   {
>>>  int off = 0;
>>> +   enum pl330_cond cond;
>>> +
>>> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>>>
>>>  while (cyc--) {
>>> -   off += _emit_WFP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> -   off += _emit_LDP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> +   off += _emit_WFP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>> +   off += _emit_LDP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>>  off += _emit_ST(dry_run, &buf[off], ALWAYS);
>>>  off += _emit_FLUSHP(dry_run, &buf[off],
>>> pxs->desc->peri);
>>>  }
>>> @@ -1156,11 +1159,14 @@ static inline int _ldst_memtodev(unsigned
>>> dry_run, u8 buf[],
>>>  const struct _xfer_spec *pxs, int cyc)
>>>   {
>>>  int off = 0;
>>> +   enum pl330_cond cond;
>>> +
>>> +   cond = (pxs->desc->rqcfg.brst_len == 1) ? SINGLE : BURST;
>>>
>>>  while (cyc--) {
>>> -   off += _emit_WFP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> +   off += _emit_WFP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>>  off += _emit_LD(dry_run, &buf[off], ALWAYS);
>>> -   off += _emit_STP(dry_run, &buf[off], SINGLE,
>>> pxs->desc->peri);
>>> +   off += _emit_STP(dry_run, &buf[off], cond,
>>> pxs->desc->peri);
>>>  off += _emit_FLUSHP(dry_run, &buf[off],
>>> pxs->desc->peri);
>>>  }
>>>
>>> @@ -2557,7 +2563,7 @@ static struct dma_async_tx_descriptor
>>> *pl330_prep_dma_cyclic(
>>>
>>>  desc->rqtype = direction;
>>>  desc->rqcfg.brst_size = pch->burst_sz;
>>> -   desc->rqcfg.brst_len = 1;
>>> +   desc->rqcfg.brst_len = pch->burst_len;
>>>  desc->bytes_requested = period_len;
>>>  fill_px(&desc->px, dst, src, period_len);
>>>
>>> @@ -2702,7 +2708,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct
>>> scatterlist *sgl,
>>>  }
>>>
>>>  desc->rqcfg.brst_size = pch->burst_sz;
>>> -   desc->rqcfg.brst_len = 1;
>>> +   desc->rqcfg.brst_len = pch->burst_len;
>>>  desc->rqtype = direction;
>>>  desc->bytes_requested = sg_dma_len(sg);
>>>  }
>>> --
>>> 2.3.7
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe dmaengine" in
>>> the body of a message to majord...@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>>
>
>
> --
> Best Regards
> Shawn Lin
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] perf/x86/intel/uncore: add Broadwell-U uncore IMC PMU support

2015-04-23 Thread Sonny Rao
On Wed, Apr 22, 2015 at 11:56 PM, Stephane Eranian  wrote:
>
> This patch enables the uncore Memory Controller (IMC) PMU support
> for Intel Broadwell-U (Model 61) mobile processors.
> The IMC PMU enables measuring memory bandwidth.
>
> To use with perf:
> $ perf stat -a -I 1000 -e uncore_imc/data_reads/,uncore_imc/data_writes/ 
> sleep 10
>
> Signed-off-by: Stephane Eranian 

Tested-by: Sonny Rao 
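
For anyone turning those counts into an actual bandwidth figure, here's
the back-of-the-envelope helper I use -- assuming (as I read the
driver) that each data_reads/data_writes count corresponds to one
64-byte cache line; double-check against the event scale the driver
exposes in sysfs:

/*
 * Rough bandwidth estimate from the IMC counts above.  Assumes each
 * data_reads/data_writes event counts one 64-byte line; interval_s is
 * the "perf stat -I" interval in seconds.  Illustrative only.
 */
static double imc_bandwidth_mib_s(unsigned long long reads,
				  unsigned long long writes,
				  double interval_s)
{
	return (double)(reads + writes) * 64.0 / (1024.0 * 1024.0) / interval_s;
}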

> ---
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> index c635b8b..a03f964 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
> @@ -922,6 +922,9 @@ static int __init uncore_pci_init(void)
> case 69: /* Haswell Celeron */
> ret = hsw_uncore_pci_init();
> break;
> +   case 61: /* Broadwell */
> +   ret = bdw_uncore_pci_init();
> +   break;
> default:
> return 0;
> }
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> index 6c8c1e7..06b0793 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
> @@ -326,6 +326,7 @@ extern struct event_constraint uncore_constraint_empty;
>  int snb_uncore_pci_init(void);
>  int ivb_uncore_pci_init(void);
>  int hsw_uncore_pci_init(void);
> +int bdw_uncore_pci_init(void);
>  void snb_uncore_cpu_init(void);
>  void nhm_uncore_cpu_init(void);
>
> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
> b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> index 0333d0b..0f768bf 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
> @@ -7,6 +7,7 @@
>  #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>  #define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>  #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
> +#define PCI_DEVICE_ID_INTEL_BDW_IMC0x1604
>
>  /* SNB event control */
>  #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
> @@ -488,6 +489,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = 
> {
> { /* end: all zeroes */ },
>  };
>
> +static const struct pci_device_id bdw_uncore_pci_ids[] = {
> +   { /* IMC */
> +   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
> +   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
> +   },
> +   { /* end: all zeroes */ },
> +};
> +
>  static struct pci_driver snb_uncore_pci_driver = {
> .name   = "snb_uncore",
> .id_table   = snb_uncore_pci_ids,
> @@ -503,6 +512,11 @@ static struct pci_driver hsw_uncore_pci_driver = {
> .id_table   = hsw_uncore_pci_ids,
>  };
>
> +static struct pci_driver bdw_uncore_pci_driver = {
> +   .name   = "bdw_uncore",
> +   .id_table   = bdw_uncore_pci_ids,
> +};
> +
>  struct imc_uncore_pci_dev {
> __u32 pci_id;
> struct pci_driver *driver;
> @@ -516,6 +530,7 @@ static const struct imc_uncore_pci_dev 
> desktop_imc_pci_ids[] = {
> IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd 
> Gen Core processor */
> IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core 
> Processor */
> IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT 
> Mobile Processor */
> +   IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),/* 5th Gen Core U */
> {  /* end marker */ }
>  };
>
> @@ -563,6 +578,11 @@ int hsw_uncore_pci_init(void)
> return imc_uncore_pci_init();
>  }
>
> +int bdw_uncore_pci_init(void)
> +{
> +   return imc_uncore_pci_init();
> +}
> +
>  /* end of Sandy Bridge uncore support */
>
>  /* Nehalem uncore support */
> --
> 2.1.0
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:perf/urgent] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-22 Thread tip-bot for Sonny Rao
Commit-ID:  0140e6141e4f1d4b15fb469e6912b0e71b7d1cc2
Gitweb: http://git.kernel.org/tip/0140e6141e4f1d4b15fb469e6912b0e71b7d1cc2
Author: Sonny Rao 
AuthorDate: Tue, 21 Apr 2015 12:33:11 -0700
Committer:  Ingo Molnar 
CommitDate: Wed, 22 Apr 2015 08:29:19 +0200

perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

This keeps all the related PCI IDs together in the driver where
they are used.

Signed-off-by: Sonny Rao 
Acked-by: Bjorn Helgaas 
Cc: Arnaldo Carvalho de Melo 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: 
http://lkml.kernel.org/r/1429644791-25724-1-git-send-email-sonny...@chromium.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
 include/linux/pci_ids.h   | 4 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index ca75e70..4562e9e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,7 +1,11 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
-/* Uncore IMC PCI Id */
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 
 /* SNB event control */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index e63c02a..a593858 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2539,10 +2539,6 @@
 
 #define PCI_VENDOR_ID_INTEL0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
-#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:perf/urgent] perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile Processor) IMC uncore PMUs

2015-04-22 Thread tip-bot for Sonny Rao
Commit-ID:  80bcffb376a6890dd7452b12c1ba032f8f24fef6
Gitweb: http://git.kernel.org/tip/80bcffb376a6890dd7452b12c1ba032f8f24fef6
Author: Sonny Rao 
AuthorDate: Mon, 20 Apr 2015 15:34:07 -0700
Committer:  Ingo Molnar 
CommitDate: Wed, 22 Apr 2015 08:27:43 +0200

perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile 
Processor) IMC uncore PMUs

This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao 
Cc: Arnaldo Carvalho de Melo 
Cc: Bjorn Helgaas 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: 
http://lkml.kernel.org/r/1429569247-16697-1-git-send-email-sonny...@chromium.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..ca75e70 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,9 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
+/* Uncore IMC PCI Id */
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
 #define SNB_UNC_CTL_UMASK_MASK 0xff00
@@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core processor 
*/
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen 
Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core Processor 
*/
+   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile 
Processor */
{  /* end marker */ }
 };
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-21 Thread Sonny Rao
This keeps all the related PCI IDs together in the driver where they
are used.

Signed-off-by: Sonny Rao 
Acked-by: Bjorn Helgaas 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
 include/linux/pci_ids.h   | 4 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index ca75e70..4562e9e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,7 +1,11 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
-/* Uncore IMC PCI Id */
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 
 /* SNB event control */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 38cff8f..2f7b9a4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2541,10 +2541,6 @@
 
 #define PCI_VENDOR_ID_INTEL0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
-#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-21 Thread Sonny Rao
On Tue, Apr 21, 2015 at 12:21 PM, Bjorn Helgaas  wrote:
> On Tue, Apr 21, 2015 at 2:09 PM, Sonny Rao  wrote:
>> This keeps all the related PCI IDs together in the driver where they
>> are used.
>>
>> Signed-off-by: Sonny Rao 
>
> Acked-by: Bjorn Helgaas 
>
>> ---
>>  arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
>>  include/linux/pci_ids.h   | 4 
>>  2 files changed, 5 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
>> b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>> index ca75e70..02c1a13 100644
>> --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>> +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>> @@ -1,7 +1,11 @@
>>  /* Nehalem/SandBridge/Haswell uncore support */
>>  #include "perf_event_intel_uncore.h"
>>
>> -/* Uncore IMC PCI Id */
>> +/* Uncore IMC PCI Ids */
>
> "IDs" would be more consistent.
>

Oops, will fix.

>> +#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
>> +#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
>> +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>> +#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>>  #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
>>
>>  /* SNB event control */
>> diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
>> index 38cff8f..2f7b9a4 100644
>> --- a/include/linux/pci_ids.h
>> +++ b/include/linux/pci_ids.h
>> @@ -2541,10 +2541,6 @@
>>
>>  #define PCI_VENDOR_ID_INTEL0x8086
>>  #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
>> -#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
>> -#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
>> -#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>> -#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>>  #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
>>  #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
>>  #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
>> --
>> 2.1.2
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] perf/x86/intel/uncore: Move PCI IDs for IMC to uncore driver

2015-04-21 Thread Sonny Rao
This keeps all the related PCI IDs together in the driver where they
are used.

Signed-off-by: Sonny Rao 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 6 +-
 include/linux/pci_ids.h   | 4 
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index ca75e70..02c1a13 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,7 +1,11 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
-/* Uncore IMC PCI Id */
+/* Uncore IMC PCI Ids */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 
 /* SNB event control */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 38cff8f..2f7b9a4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2541,10 +2541,6 @@
 
 #define PCI_VENDOR_ID_INTEL0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC  0x0008
-#define PCI_DEVICE_ID_INTEL_SNB_IMC0x0100
-#define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
-#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
-#define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:x86/urgent] perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile Processor) IMC uncore PMUs

2015-04-21 Thread tip-bot for Sonny Rao
Commit-ID:  5324e72e00012126101aee6f3e62977055a3b5ee
Gitweb: http://git.kernel.org/tip/5324e72e00012126101aee6f3e62977055a3b5ee
Author: Sonny Rao 
AuthorDate: Mon, 20 Apr 2015 15:34:07 -0700
Committer:  Ingo Molnar 
CommitDate: Tue, 21 Apr 2015 09:31:17 +0200

perf/x86/intel/uncore: Add support for Intel Haswell ULT (lower power Mobile 
Processor) IMC uncore PMUs

This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao 
Cc: Arnaldo Carvalho de Melo 
Cc: Bjorn Helgaas 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Link: 
http://lkml.kernel.org/r/1429569247-16697-1-git-send-email-sonny...@chromium.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..ca75e70 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,9 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
+/* Uncore IMC PCI Id */
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
 #define SNB_UNC_CTL_UMASK_MASK 0xff00
@@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core processor 
*/
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen 
Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core Processor 
*/
+   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile 
Processor */
{  /* end marker */ }
 };
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2] perf/x86/intel/uncore: add support for Haswell ULT IMC uncore

2015-04-20 Thread Sonny Rao
This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..ca75e70 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,9 @@
 /* Nehalem/SandBridge/Haswell uncore support */
 #include "perf_event_intel_uncore.h"
 
+/* Uncore IMC PCI Id */
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
+
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK0x00ff
 #define SNB_UNC_CTL_UMASK_MASK 0xff00
@@ -472,6 +475,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +509,7 @@ static const struct imc_uncore_pci_dev 
desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core processor 
*/
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen 
Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core Processor 
*/
+   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile 
Processor */
{  /* end marker */ }
 };
 
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] perf/x86/intel/uncore: add support for Haswell ULT IMC uncore

2015-04-20 Thread Sonny Rao
On Mon, Apr 20, 2015 at 12:34 PM, Bjorn Helgaas  wrote:
> On Mon, Apr 20, 2015 at 1:58 PM, Stephane Eranian  wrote:
>> On Mon, Apr 20, 2015 at 11:56 AM, Bjorn Helgaas  wrote:
>>>
>>> On Mon, Apr 20, 2015 at 1:42 PM, Sonny Rao  wrote:
>>> > This uncore is the same as the Haswell desktop part but uses a
>>> > different PCI ID.
>>> >
>>> > Signed-off-by: Sonny Rao 
>>> > ---
>>> >  arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 5 +
>>> >  include/linux/pci_ids.h   | 1 +
>>> >  2 files changed, 6 insertions(+)
>>> >
>>> > diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
>>> > b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>>> > index 3001015..0bda6fc 100644
>>> > --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>>> > +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
>>> > @@ -472,6 +472,10 @@ static const struct pci_device_id 
>>> > hsw_uncore_pci_ids[] = {
>>> > PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
>>> > PCI_DEVICE_ID_INTEL_HSW_IMC),
>>> > .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
>>> > },
>>> > +   { /* IMC */
>>> > +   PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
>>> > PCI_DEVICE_ID_INTEL_HSW_U_IMC),
>>> > +   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
>>> > +   },
>>> > { /* end: all zeroes */ },
>>> >  };
>>> >
>>> > @@ -502,6 +506,7 @@ static const struct imc_uncore_pci_dev 
>>> > desktop_imc_pci_ids[] = {
> >>> > IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),/* 3rd Gen Core 
> >>> > processor */
> >>> > IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 
> >>> > v2/3rd Gen Core processor */
> >>> > IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),/* 4th Gen Core 
> >>> > Processor */
> >>> > +   IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT 
>>> > Mobile Processor */
>>> > {  /* end marker */ }
>>> >  };
>>> >
>>> > diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
>>> > index 38cff8f..e5ae042 100644
>>> > --- a/include/linux/pci_ids.h
>>> > +++ b/include/linux/pci_ids.h
>>> > @@ -2545,6 +2545,7 @@
>>> >  #define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
>>> >  #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
>>> >  #define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
>>> > +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
>>>
>>> Please either use the 0x0a04 constant directly in
>>> perf_event_intel_uncore_snb.c, or explain why the #define should be
>>> here, e.g., maybe it will be used in multiple places.  See the comment
>>> at the top of pci_ids.h.
>>>
>> But then, the same reasoning would apply to the other 3 IMC defines,
>> wouldn't it?
>
> Yes.  But if we made a mistake in the past, that doesn't mean we
> should repeat it today.

Shall I post a patch moving the others as well?

>
>>> >  #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
>>> >  #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
>>> >  #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
>>> > --
>>> > 2.1.2
>>> >


[PATCH] perf/x86/intel/uncore: add support for Haswell ULT IMC uncore

2015-04-20 Thread Sonny Rao
This uncore is the same as the Haswell desktop part but uses a
different PCI ID.

Signed-off-by: Sonny Rao <sonny...@chromium.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c | 5 +
 include/linux/pci_ids.h   | 1 +
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015..0bda6fc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -472,6 +472,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+   { /* IMC */
+   PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+   .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+   },
{ /* end: all zeroes */ },
 };
 
@@ -502,6 +506,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
	IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver),    /* 3rd Gen Core processor */
	IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
	IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
+	IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
{  /* end marker */ }
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 38cff8f..e5ae042 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2545,6 +2545,7 @@
 #define PCI_DEVICE_ID_INTEL_IVB_IMC0x0154
 #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
 #define PCI_DEVICE_ID_INTEL_HSW_IMC0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC  0x0a04
 #define PCI_DEVICE_ID_INTEL_PXHD_0 0x0320
 #define PCI_DEVICE_ID_INTEL_PXHD_1 0x0321
 #define PCI_DEVICE_ID_INTEL_PXH_0  0x0329
-- 
2.1.2



[PATCH] arm: dts: rk3288: Enable Cortex-A12 HW PMU events

2015-04-07 Thread Sonny Rao
This adds the dts node for the PMU with the correct PMUIRQ interrupts
for each core.

Signed-off-by: Sonny Rao <sonny...@chromium.org>
---
 arch/arm/boot/dts/rk3288.dtsi | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index 165968d..8253abb 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -44,6 +44,14 @@
		spi2 = &spi2;
	};

+	arm-pmu {
+		compatible = "arm,cortex-a12-pmu";
+		interrupts = <GIC_SPI 151 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 152 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 153 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 154 IRQ_TYPE_LEVEL_HIGH>;
+	};
+
cpus {
#address-cells = <1>;
#size-cells = <0>;
-- 
2.2.0.rc0.207.ga3a616c
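
As a quick usage note (not part of the patch): once this node is probed, the
Cortex-A12 counters become reachable through the standard perf_event_open()
interface. A minimal user-space sketch counting CPU cycles:

/* Minimal user-space sketch: count CPU cycles for a short workload.
 * Illustrative only; error handling kept to a minimum.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;

	/* pid = 0 (this task), cpu = -1 (any cpu), no group, no flags */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload to measure ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles: %lld\n", count);
	close(fd);
	return 0;
}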



Re: [PATCH] RTC: RK808: fix the rtc time reading issue

2015-01-14 Thread Sonny Rao
On Wed, Jan 14, 2015 at 10:36 AM, Doug Anderson <diand...@chromium.org> wrote:
> Sonny,
>
>> Chris, it looks like you swapped the set and the clear of this bit,
>> and you're relying on the fact that the i2c transaction takes a
>> certain amount of time after the RTC_GET_TIME BIT is set.   I'm not
>> sure how long it actually takes, but why not just put in a usleep()
>> for the minimum wait time?
>
> I think we are safe.
>
> At 400kHz (the max speed of this part) each bit can be transferred no
> faster than 2.5us.  In order to do a valid i2c transaction we need to
> _at least_ write the address of the device and the data onto the bus,
> which is 16 bits.  16 * 2.5us = 40us.  That's above the 31.25us
>
> Personally I think what Chris has is fine, with the comment.

Ok, I'm fine with that if we're sure it's slow enough.  Comment
explaining would certainly help.

>
> -Doug
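
For reference, a minimal sketch of the explicit-delay alternative discussed
above, kept to the driver's existing regmap fields; the helper name is made
up, and the applied fix relies on the duration of the following i2c transfer
instead of a sleep:

/* Sketch of the explicit-wait alternative (not the fix that was applied).
 * At 400 kHz i2c each bit takes 2.5 us, so even a minimal 16-bit transfer
 * already takes ~40 us, which is why the applied patch can lean on the
 * second transfer rather than an explicit delay.
 */
#include <linux/delay.h>
#include <linux/mfd/rk808.h>
#include <linux/regmap.h>

static int rk808_rtc_latch_time(struct rk808 *rk808)
{
	int ret;

	/* Ask the PMIC to latch the running time into the shadow registers */
	ret = regmap_update_bits(rk808->regmap, RK808_RTC_CTRL_REG,
				 BIT_RTC_CTRL_REG_RTC_GET_TIME,
				 BIT_RTC_CTRL_REG_RTC_GET_TIME);
	if (ret)
		return ret;

	/* Explicitly wait at least one 32 kHz cycle (~31.25 us) */
	usleep_range(32, 100);

	return regmap_update_bits(rk808->regmap, RK808_RTC_CTRL_REG,
				  BIT_RTC_CTRL_REG_RTC_GET_TIME, 0);
}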


Re: [PATCH] RTC: RK808: fix the rtc time reading issue

2015-01-14 Thread Sonny Rao
On Tue, Jan 13, 2015 at 6:43 PM, Chris Zhong <z...@rock-chips.com> wrote:
> After we set the GET_TIME bit, the rtc time couldn't be read immediately,
> we should wait up to 31.25 us, about one cycle of 32khz. Otherwise reading
> RTC time will return a old time. If clear the GET_TIME bit after setting,
> the time of i2c transfer certainly more than 31.25us.
>
> Signed-off-by: Chris Zhong <z...@rock-chips.com>
>
> ---
>
>  drivers/rtc/rtc-rk808.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/rtc/rtc-rk808.c b/drivers/rtc/rtc-rk808.c
> index df42257..8dae322 100644
> --- a/drivers/rtc/rtc-rk808.c
> +++ b/drivers/rtc/rtc-rk808.c
> @@ -67,15 +67,20 @@ static int rk808_rtc_readtime(struct device *dev, struct 
> rtc_time *tm)
> /* Force an update of the shadowed registers right now */
> ret = regmap_update_bits(rk808->regmap, RK808_RTC_CTRL_REG,
>  BIT_RTC_CTRL_REG_RTC_GET_TIME,
> -0);
> +BIT_RTC_CTRL_REG_RTC_GET_TIME);
> if (ret) {
> dev_err(dev, "Failed to update bits rtc_ctrl: %d\n", ret);
> return ret;
> }
>
> +   /* After we set the GET_TIME bit, the rtc time couldn't be read
> +* immediately, we should wait up to 31.25 us, about one cycle of
> +* 32khz. If we clear the GET_TIME bit here, the time of i2c transfer
> +* certainly more than 31.25us.
> +*/

Chris, it looks like you swapped the set and the clear of this bit,
and you're relying on the fact that the i2c transaction takes a
certain amount of time after the RTC_GET_TIME BIT is set.   I'm not
sure how long it actually takes, but why not just put in a usleep()
for the minimum wait time?

> ret = regmap_update_bits(rk808->regmap, RK808_RTC_CTRL_REG,
>  BIT_RTC_CTRL_REG_RTC_GET_TIME,
> -BIT_RTC_CTRL_REG_RTC_GET_TIME);
> +0);
> if (ret) {
> dev_err(dev, "Failed to update bits rtc_ctrl: %d\n", ret);
> return ret;
> --
> 1.9.1


Re: arm64 hitting BUG in arch_timer.h

2014-12-10 Thread Sonny Rao
On Wed, Dec 10, 2014 at 12:56 PM, Mark Salter <msal...@redhat.com> wrote:
> Using Linus' tree from this morning, I am hitting:
>
>[0.00] BUG: failure at 
> ./arch/arm64/include/asm/arch_timer.h:112/arch_counter_get_cntpct!
>
> This is triggered by commit 0b46b8a718 ("clocksource: arch_timer: Fix
> code to use physical timers when requested") which addresses an armv7
> problem. Arm64 wants to always use a virtual timer. I used this to avoid
> the BUG and get a booting kernel:
>
> diff --git a/drivers/clocksource/arm_arch_timer.c 
> b/drivers/clocksource/arm_arch
> index 71846f9..4d8a01e 100644
> --- a/drivers/clocksource/arm_arch_timer.c
> +++ b/drivers/clocksource/arm_arch_timer.c
> @@ -468,7 +468,7 @@ static void __init arch_counter_register(unsigned type)
>
> /* Register the CP15 based counter if we have one */
> if (type & ARCH_CP15_TIMER) {
> -   if (arch_timer_use_virtual)
> +   if (IS_ENABLED(CONFIG_ARM64) || arch_timer_use_virtual)
> arch_timer_read_counter = arch_counter_get_cntvct;
> else
> arch_timer_read_counter = arch_counter_get_cntpct;
>
>

Yes Catalin has prepared a similar patch:
https://patchwork.kernel.org/patch/5468031/


Re: [alsa-devel] [PATCH v2 2/2] ASoC: rockchip: i2s: add support for grabbing output clock to codec

2014-12-03 Thread Sonny Rao
On Wed, Dec 3, 2014 at 3:22 PM, Dylan Reid <dgr...@chromium.org> wrote:
> On Wed, Dec 3, 2014 at 3:03 PM, Sonny Rao <sonny...@chromium.org> wrote:
>> On Wed, Dec 3, 2014 at 12:03 PM, Mark Brown <broo...@kernel.org> wrote:
>>> On Wed, Dec 03, 2014 at 11:38:13AM -0800, Sonny Rao wrote:
>>>> On Wed, Dec 3, 2014 at 11:20 AM, Mark Brown <broo...@kernel.org> wrote:
>>>
>>>> > I would expect that the clock for the CODEC should be managed by the
>>>> > CODEC if at all possible - that seems more logical than having the CPU
>>>> > I2S controller request and manage it if it's a separate clock.  Why add
>>>> > this to the CPU side driver?
>>>
>>>> This output clock has a mux and can either be a fixed 12Mhz output or
>>>> can be derived from the same fractional divider which drives the i2s
>>>> block.   I thought it was simpler to keep them all the same, but need
>>>> to put ownership in the i2s in anticipation of the i2s driver setting
>>>> it's own clock rate.
>>>
>>>> If you think this is an implementation detail and this output clock
>>>> should just be owned by the codec driver, even though I'm guessing it
>>>> will just have to be the same as i2s, then I think we can drop this
>>>> and make sure simple card (or whatever other codec driver) claims this
>>>> clock.
>>>
>>> simple-card obviously isn't a CODEC driver...
>>
>> Yeah, sorry.
>>
>>> For generality I think
>>> the clock does need to be exposed to the CODEC driver, otherwise this
>>> will work differently to how other systems are working and we can't
>>> substitute in a different clock on the CODEC side so easily if it
>>> doesn't happen to use the output from the I2S block.
>>
>> Ok, then I think what we will do is abandon this patch and I will send
>> something that adds this functionality to the particular codec that
>> I'm interested in -- max98090.
>
> Sorry I didn't read this earlier.  I don't think that this belongs in
> the max98090.  The original patch description is a bit confusing.  The
> clock being grabbed here is actually i2s mclk.  My understanding is
> that, on this SoC, the mclk is driven from a different IP block than
> the rest of the i2s signals.  The i2s driver needs to be told about
> the clock and enable/disable it at the appropriate times.  I'm
> assuming it's optional because there are boards using this SoC with
> i2s slave mode that don't drive mclk at all.
>
> Please correct me if I'm wrong on any of the above.

I don't think you're wrong, and I'm an audio/i2s neophyte so I think
you're probably right and hopefully Mark can confirm that this is how
we want it.

One important thing to point out, which might be causing confusion, is
that this driver is claiming a clock which it internally calls "mclk"
but the way it's specified for rk3288 in the DT, that one is just the
one which drives the internal logic and has a gate.

This clock I'm adding is the actual mclk which is being driven to the
i2s slave device, and it has its own gate and also has a mux, and we
need to claim both to be able to enable proper clock gating.
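
For illustration, a sketch of how such an optional output clock could be
claimed and gated from the i2s driver; the clock name "i2s_clk_out" and the
mclk_out field are assumptions here, not the final patch:

/* Probe-time fragment: the output clock is optional, so a missing clock
 * (-ENOENT) is not treated as an error.
 */
i2s->mclk_out = devm_clk_get(&pdev->dev, "i2s_clk_out");
if (IS_ERR(i2s->mclk_out)) {
	if (PTR_ERR(i2s->mclk_out) != -ENOENT)
		return PTR_ERR(i2s->mclk_out);
	i2s->mclk_out = NULL;		/* board does not drive mclk */
}

/* Stream start/stop: a NULL clk is a no-op for the clk API, so the gating
 * calls can stay unconditional.
 */
static int rockchip_i2s_startup(struct snd_pcm_substream *substream,
				struct snd_soc_dai *dai)
{
	struct rk_i2s_dev *i2s = snd_soc_dai_get_drvdata(dai);

	return clk_prepare_enable(i2s->mclk_out);
}

static void rockchip_i2s_shutdown(struct snd_pcm_substream *substream,
				  struct snd_soc_dai *dai)
{
	struct rk_i2s_dev *i2s = snd_soc_dai_get_drvdata(dai);

	clk_disable_unprepare(i2s->mclk_out);
}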


Re: [PATCH v2 2/2] ASoC: rockchip: i2s: add support for grabbing output clock to codec

2014-12-03 Thread Sonny Rao
On Wed, Dec 3, 2014 at 12:03 PM, Mark Brown <broo...@kernel.org> wrote:
> On Wed, Dec 03, 2014 at 11:38:13AM -0800, Sonny Rao wrote:
>> On Wed, Dec 3, 2014 at 11:20 AM, Mark Brown <broo...@kernel.org> wrote:
>
>> > I would expect that the clock for the CODEC should be managed by the
>> > CODEC if at all possible - that seems more logical than having the CPU
>> > I2S controller request and manage it if it's a separate clock.  Why add
>> > this to the CPU side driver?
>
>> This output clock has a mux and can either be a fixed 12Mhz output or
>> can be derived from the same fractional divider which drives the i2s
>> block.   I thought it was simpler to keep them all the same, but need
>> to put ownership in the i2s in anticipation of the i2s driver setting
>> it's own clock rate.
>
>> If you think this is an implementation detail and this output clock
>> should just be owned by the codec driver, even though I'm guessing it
>> will just have to be the same as i2s, then I think we can drop this
>> and make sure simple card (or whatever other codec driver) claims this
>> clock.
>
> simple-card obviously isn't a CODEC driver...

Yeah, sorry.

> For generality I think
> the clock does need to be exposed to the CODEC driver, otherwise this
> will work differently to how other systems are working and we can't
> substitute in a different clock on the CODEC side so easily if it
> doesn't happen to use the output from the I2S block.

Ok, then I think what we will do is abandon this patch and I will send
something that adds this functionality to the particular codec that
I'm interested in -- max98090.

I'm a little tied up at the moment so I'm not going to send that for a
little while, but will come eventually.

Thanks for the advice!


Re: [PATCH v2 2/2] ASoC: rockchip: i2s: add support for grabbing output clock to codec

2014-12-03 Thread Sonny Rao
On Wed, Dec 3, 2014 at 11:20 AM, Mark Brown <broo...@kernel.org> wrote:
> On Wed, Dec 03, 2014 at 03:18:38PM +0800, Jianqun Xu wrote:
>> From: Sonny Rao <sonny...@chromium.org>
>>
>> We need to claim the clock which is driving the codec so that when we enable
>> clock gating, we continue to clock the codec when needed.  I make this an
>> optional clock since there might be some applications where we don't need it
>> but can still use the I2S block.
>
> I would expect that the clock for the CODEC should be managed by the
> CODEC if at all possible - that seems more logical than having the CPU
> I2S controller request and manage it if it's a separate clock.  Why add
> this to the CPU side driver?

It's a good question.  Right now the way I'm running this stuff we're
mostly setting all the i2s the clock rates from the codec driver, but
I think this isn't the correct way to go, and the i2s driver needs to
set its rate based on the hw params, but that isn't happening (yet).

This output clock has a mux and can either be a fixed 12Mhz output or
can be derived from the same fractional divider which drives the i2s
block.   I thought it was simpler to keep them all the same, but need
to put ownership in the i2s in anticipation of the i2s driver setting
it's own clock rate.

If you think this is an implementation detail and this output clock
should just be owned by the codec driver, even though I'm guessing it
will just have to be the same as i2s, then I think we can drop this
and make sure simple card (or whatever other codec driver) claims this
clock.


>
> We've not always done this for older systems due to the lack of a usable
> clock API but that's starting to be addressed.
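
As a rough sketch of the "set its rate based on the hw params" idea mentioned
above (the 256*fs ratio and the mclk_out field are assumptions, not an agreed
design):

/* Sketch only: derive the clocks from the stream rate inside hw_params. */
static int rockchip_i2s_hw_params(struct snd_pcm_substream *substream,
				  struct snd_pcm_hw_params *params,
				  struct snd_soc_dai *dai)
{
	struct rk_i2s_dev *i2s = snd_soc_dai_get_drvdata(dai);
	unsigned long mclk_rate = params_rate(params) * 256UL;
	int ret;

	/* Drive the internal i2s clock from the stream rate... */
	ret = clk_set_rate(i2s->mclk, mclk_rate);
	if (ret)
		return ret;

	/* ...and keep the codec-facing output clock in step with it. */
	return clk_set_rate(i2s->mclk_out, mclk_rate);
}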


Re: [PATCH] clk: rockchip: rk3288 export i2s0_clkout for use in DT

2014-11-26 Thread Sonny Rao
On Wed, Nov 26, 2014 at 3:32 PM, Heiko Stübner <he...@sntech.de> wrote:
> Am Dienstag, 18. November 2014, 23:15:19 schrieb Sonny Rao:
>> This exposes the clock that comes out of the i2s block which generally
>> goes to the audio codec.
>>
>> Signed-off-by: Sonny Rao <sonny...@chromium.org>
>
> applied to my clk branch after removing the CLK_SET_RATE_PARENT

Hi, sorry for the delay, and thanks for fixing it.  I think when I
applied the patch to next-20141118 that had a CLK_SET_RATE_PARENT in
it from this patch:

commit fc69ed70c16a31d6a77ec47a30a9fe941f763f1e
Author: Jianqun <jay...@rock-chips.com>
Date:   Tue Sep 30 11:12:04 2014 +0800

clk: rockchip: rk3288: i2s_frac adds flag to set parent's rate


I agree that it is not necessary and maybe not desirable.  Thanks again!


> Heiko


[PATCH] ARM: dts: rk3288: add arm,cpu-registers-not-fw-configured

2014-11-25 Thread Sonny Rao
This will enable use of physical arch timers on rk3288, where each
core comes out of reset with a different virtual offset.  Using
physical timers will help with SMP booting on coreboot and older
u-boot and should also allow suspend-resume and cpu-hotplug to work on
all firmwares.

Firmware which does initialize the cpu registers properly at boot and
cpu-hotplug can remove this property from the device tree.

Signed-off-by: Sonny Rao <sonny...@chromium.org>
---
 arch/arm/boot/dts/rk3288.dtsi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi
index 0f50d5d..c861f52 100644
--- a/arch/arm/boot/dts/rk3288.dtsi
+++ b/arch/arm/boot/dts/rk3288.dtsi
@@ -139,6 +139,7 @@
 
timer {
compatible = "arm,armv7-timer";
+   arm,cpu-registers-not-fw-configured;
		interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>,
			     <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>,
			     <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>,
-- 
2.1.2
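
To make the commit message concrete: the ARMv7 virtual counter is the physical
counter minus a per-CPU offset (CNTVCT = CNTPCT - CNTVOFF), so cores left with
different CNTVOFF values disagree about virtual time. An illustrative sketch
using the standard ARMv7 counter accessors; the helper names are made up:

/* Illustration only; the register reads mirror the ARMv7 definitions. */
static inline u64 read_cntpct(void)
{
	u64 cval;

	asm volatile("mrrc p15, 0, %Q0, %R0, c14" : "=r" (cval));
	return cval;
}

static inline u64 read_cntvct(void)
{
	u64 cval;

	asm volatile("mrrc p15, 1, %Q0, %R0, c14" : "=r" (cval));
	return cval;
}

/* If this differs from core to core, virtual time jumps when a task
 * migrates; the DT property above sidesteps that by making the kernel
 * read the physical counter instead.
 */
static inline u64 effective_cntvoff(void)
{
	return read_cntpct() - read_cntvct();
}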


