[RFC 4/6] memtrack: Adds the accounting to keep track of all mmaped/unmapped pages.
Since mmaped pages will be accounted by the PSS, memtrack needs a way to differentiate the total memory that hasn't been accounted for. Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> Signed-off-by: Greg Hackmann <ghackm...@google.com> --- drivers/misc/memtrack.c | 175 -- drivers/staging/android/ion/ion.c | 5 +- include/linux/memtrack.h | 29 +++ 3 files changed, 180 insertions(+), 29 deletions(-) diff --git a/drivers/misc/memtrack.c b/drivers/misc/memtrack.c index e5c7e03..4b2d17f 100644 --- a/drivers/misc/memtrack.c +++ b/drivers/misc/memtrack.c @@ -22,12 +22,19 @@ #include #include #include +#include + +struct memtrack_vma_list { + struct hlist_node node; + const struct vm_area_struct *vma; +}; struct memtrack_handle { struct memtrack_buffer *buffer; struct rb_node node; struct rb_root *root; struct kref refcount; + struct hlist_head vma_list; }; static struct kmem_cache *memtrack_handle_cache; @@ -40,8 +47,8 @@ static DEFINE_IDR(mem_idr); static DEFINE_IDA(mem_ida); #endif -static void memtrack_buffer_install_locked(struct rb_root *root, - struct memtrack_buffer *buffer) +static struct memtrack_handle *memtrack_handle_find_locked(struct rb_root *root, + struct memtrack_buffer *buffer, bool alloc) { struct rb_node **new = >rb_node, *parent = NULL; struct memtrack_handle *handle; @@ -56,22 +63,38 @@ static void memtrack_buffer_install_locked(struct rb_root *root, } else if (handle->buffer->id < buffer->id) { new = >rb_right; } else { - kref_get(>refcount); - return; + return handle; } } - handle = kmem_cache_alloc(memtrack_handle_cache, GFP_KERNEL); - if (!handle) - return; + if (alloc) { + handle = kmem_cache_alloc(memtrack_handle_cache, GFP_KERNEL); + if (!handle) + return NULL; - handle->buffer = buffer; - handle->root = root; - kref_init(>refcount); + handle->buffer = buffer; + handle->root = root; + kref_init(>refcount); + INIT_HLIST_HEAD(>vma_list); - rb_link_node(>node, parent, new); - rb_insert_color(>node, root); - 
atomic_inc(>buffer->userspace_handles); + rb_link_node(>node, parent, new); + rb_insert_color(>node, root); + atomic_inc(>buffer->userspace_handles); + } + + return NULL; +} + +static void memtrack_buffer_install_locked(struct rb_root *root, + struct memtrack_buffer *buffer) +{ + struct memtrack_handle *handle; + + handle = memtrack_handle_find_locked(root, buffer, true); + if (handle) { + kref_get(>refcount); + return; + } } /** @@ -112,19 +135,41 @@ static void memtrack_handle_destroy(struct kref *ref) static void memtrack_buffer_uninstall_locked(struct rb_root *root, struct memtrack_buffer *buffer) { - struct rb_node *node = root->rb_node; + struct memtrack_handle *handle; - while (node) { - struct memtrack_handle *handle = rb_entry(node, - struct memtrack_handle, node); + handle = memtrack_handle_find_locked(root, buffer, false); - if (handle->buffer->id > buffer->id) { - node = node->rb_left; - } else if (handle->buffer->id < buffer->id) { - node = node->rb_right; - } else { - kref_put(>refcount, memtrack_handle_destroy); - return; + if (handle) + kref_put(>refcount, memtrack_handle_destroy); +} + +static void memtrack_buffer_vm_open_locked(struct rb_root *root, + struct memtrack_buffer *buffer, + struct memtrack_vma_list *vma_list) +{ + struct memtrack_handle *handle; + + handle = memtrack_handle_find_locked(root, buffer, false); + if (handle) + hlist_add_head(_list->node, >vma_list); +} + +static void memtrack_buffer_vm_close_locked(struct rb_root *root, + struct memtrack_buffer *buffer, + const struct vm_area_struct *vma) +{ + struct memtrack_handle *handle; + + handle = memtrack_handle_find_locked(root, buffer, false); + if (handle) { + struct memtrack_vma_list *vma_list; + + hlist_for_each_entry(vma_list, >vma_list, node) { + if (vma_list->vma == vma) { + hlist_del(_list->node); +
[RFC 4/6] memtrack: Adds the accounting to keep track of all mmaped/unmapped pages.
Since mmaped pages will be accounted by the PSS, memtrack needs a way to differentiate the total memory that hasn't been accounted for. Signed-off-by: Ruchi Kandoi Signed-off-by: Greg Hackmann --- drivers/misc/memtrack.c | 175 -- drivers/staging/android/ion/ion.c | 5 +- include/linux/memtrack.h | 29 +++ 3 files changed, 180 insertions(+), 29 deletions(-) diff --git a/drivers/misc/memtrack.c b/drivers/misc/memtrack.c index e5c7e03..4b2d17f 100644 --- a/drivers/misc/memtrack.c +++ b/drivers/misc/memtrack.c @@ -22,12 +22,19 @@ #include #include #include +#include + +struct memtrack_vma_list { + struct hlist_node node; + const struct vm_area_struct *vma; +}; struct memtrack_handle { struct memtrack_buffer *buffer; struct rb_node node; struct rb_root *root; struct kref refcount; + struct hlist_head vma_list; }; static struct kmem_cache *memtrack_handle_cache; @@ -40,8 +47,8 @@ static DEFINE_IDR(mem_idr); static DEFINE_IDA(mem_ida); #endif -static void memtrack_buffer_install_locked(struct rb_root *root, - struct memtrack_buffer *buffer) +static struct memtrack_handle *memtrack_handle_find_locked(struct rb_root *root, + struct memtrack_buffer *buffer, bool alloc) { struct rb_node **new = >rb_node, *parent = NULL; struct memtrack_handle *handle; @@ -56,22 +63,38 @@ static void memtrack_buffer_install_locked(struct rb_root *root, } else if (handle->buffer->id < buffer->id) { new = >rb_right; } else { - kref_get(>refcount); - return; + return handle; } } - handle = kmem_cache_alloc(memtrack_handle_cache, GFP_KERNEL); - if (!handle) - return; + if (alloc) { + handle = kmem_cache_alloc(memtrack_handle_cache, GFP_KERNEL); + if (!handle) + return NULL; - handle->buffer = buffer; - handle->root = root; - kref_init(>refcount); + handle->buffer = buffer; + handle->root = root; + kref_init(>refcount); + INIT_HLIST_HEAD(>vma_list); - rb_link_node(>node, parent, new); - rb_insert_color(>node, root); - atomic_inc(>buffer->userspace_handles); + rb_link_node(>node, parent, new); + 
rb_insert_color(>node, root); + atomic_inc(>buffer->userspace_handles); + } + + return NULL; +} + +static void memtrack_buffer_install_locked(struct rb_root *root, + struct memtrack_buffer *buffer) +{ + struct memtrack_handle *handle; + + handle = memtrack_handle_find_locked(root, buffer, true); + if (handle) { + kref_get(>refcount); + return; + } } /** @@ -112,19 +135,41 @@ static void memtrack_handle_destroy(struct kref *ref) static void memtrack_buffer_uninstall_locked(struct rb_root *root, struct memtrack_buffer *buffer) { - struct rb_node *node = root->rb_node; + struct memtrack_handle *handle; - while (node) { - struct memtrack_handle *handle = rb_entry(node, - struct memtrack_handle, node); + handle = memtrack_handle_find_locked(root, buffer, false); - if (handle->buffer->id > buffer->id) { - node = node->rb_left; - } else if (handle->buffer->id < buffer->id) { - node = node->rb_right; - } else { - kref_put(>refcount, memtrack_handle_destroy); - return; + if (handle) + kref_put(>refcount, memtrack_handle_destroy); +} + +static void memtrack_buffer_vm_open_locked(struct rb_root *root, + struct memtrack_buffer *buffer, + struct memtrack_vma_list *vma_list) +{ + struct memtrack_handle *handle; + + handle = memtrack_handle_find_locked(root, buffer, false); + if (handle) + hlist_add_head(_list->node, >vma_list); +} + +static void memtrack_buffer_vm_close_locked(struct rb_root *root, + struct memtrack_buffer *buffer, + const struct vm_area_struct *vma) +{ + struct memtrack_handle *handle; + + handle = memtrack_handle_find_locked(root, buffer, false); + if (handle) { + struct memtrack_vma_list *vma_list; + + hlist_for_each_entry(vma_list, >vma_list, node) { + if (vma_list->vma == vma) { + hlist_del(_list->node); + kfree(vma_list); +
[RFC 3/6] dma-buf: add memtrack support
Signed-off-by: Greg Hackmann <ghackm...@google.com> Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- drivers/dma-buf/dma-buf.c | 37 ++ drivers/staging/android/ion/ion.c | 14 + drivers/staging/android/ion/ion_priv.h | 2 ++ include/linux/dma-buf.h| 5 + 4 files changed, 58 insertions(+) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index ddaee60..f632c2b 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -297,12 +297,32 @@ static long dma_buf_ioctl(struct file *file, } } +static void dma_buf_installed(struct file *file, struct task_struct *task) +{ + struct memtrack_buffer *memtrack = + dma_buf_memtrack_buffer(file->private_data); + + if (memtrack) + memtrack_buffer_install(memtrack, task); +} + +static void dma_buf_uninstalled(struct file *file, struct task_struct *task) +{ + struct memtrack_buffer *memtrack = + dma_buf_memtrack_buffer(file->private_data); + + if (memtrack) + memtrack_buffer_uninstall(memtrack, task); +} + static const struct file_operations dma_buf_fops = { .release= dma_buf_release, .mmap = dma_buf_mmap_internal, .llseek = dma_buf_llseek, .poll = dma_buf_poll, .unlocked_ioctl = dma_buf_ioctl, + .installed = dma_buf_installed, + .uninstalled= dma_buf_uninstalled, }; /* @@ -830,6 +850,23 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) } EXPORT_SYMBOL_GPL(dma_buf_vunmap); +/** + * dma_buf_memtrack_buffer - returns a memtrack entry associated with dma_buf + * + * @dmabuf:[in]pointer to dma_buf + * + * Returns the struct memtrack_buffer associated with this dma_buf's + * backing pages. If memtrack isn't enabled in the kernel, or the dma_buf + * exporter doesn't have memtrack support, returns NULL. 
+ */ +struct memtrack_buffer *dma_buf_memtrack_buffer(struct dma_buf *dmabuf) +{ + if (!dmabuf->ops->memtrack_buffer) + return NULL; + return dmabuf->ops->memtrack_buffer(dmabuf); +} +EXPORT_SYMBOL_GPL(dma_buf_memtrack_buffer); + #ifdef CONFIG_DEBUG_FS static int dma_buf_debug_show(struct seq_file *s, void *unused) { diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 396ded5..1c2df54 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -196,6 +196,7 @@ void ion_buffer_destroy(struct ion_buffer *buffer) buffer->heap->ops->unmap_kernel(buffer->heap, buffer); buffer->heap->ops->free(buffer); vfree(buffer->pages); + memtrack_buffer_remove(>memtrack_buffer); kfree(buffer); } @@ -458,6 +459,8 @@ struct ion_handle *ion_alloc(struct ion_client *client, size_t len, handle = ERR_PTR(ret); } + memtrack_buffer_init(>memtrack_buffer, len); + return handle; } EXPORT_SYMBOL(ion_alloc); @@ -1013,6 +1016,16 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf, return 0; } +static struct memtrack_buffer *ion_memtrack_buffer(struct dma_buf *buffer) +{ + if (IS_ENABLED(CONFIG_MEMTRACK) && buffer && buffer->priv) { + struct ion_buffer *ion_buffer = buffer->priv; + + return _buffer->memtrack_buffer; + } + return NULL; +} + static struct dma_buf_ops dma_buf_ops = { .map_dma_buf = ion_map_dma_buf, .unmap_dma_buf = ion_unmap_dma_buf, @@ -1024,6 +1037,7 @@ static struct dma_buf_ops dma_buf_ops = { .kunmap_atomic = ion_dma_buf_kunmap, .kmap = ion_dma_buf_kmap, .kunmap = ion_dma_buf_kunmap, + .memtrack_buffer = ion_memtrack_buffer, }; struct dma_buf *ion_share_dma_buf(struct ion_client *client, diff --git a/drivers/staging/android/ion/ion_priv.h b/drivers/staging/android/ion/ion_priv.h index 3c3b324..74c38eb 100644 --- a/drivers/staging/android/ion/ion_priv.h +++ b/drivers/staging/android/ion/ion_priv.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "ion.h" @@ -78,6 +79,7 @@ struct 
ion_buffer { int handle_count; char task_comm[TASK_COMM_LEN]; pid_t pid; + struct memtrack_buffer memtrack_buffer; }; void ion_buffer_destroy(struct ion_buffer *buffer); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e0b0741..dfcc2d0 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -32,6 +32,7 @@ #include #include #include +#include struct device; struct dma_buf; @@ -70,6 +71,8 @@ struct dma_buf_attachment; * @vmap: [optional] creates a virtual mapping for the buffer into kernel * address space. Same restrictions as for vmap and friends apply.
[RFC 2/6] drivers: misc: add memtrack
Shared-buffer allocators like ion or GEM traditionally call into CMA or alloc_pages() to get backing memory, meaning these allocations will not show up in any process's mm counters. But since these allocations are often used for things like graphics buffers that can be extremely large, the user just sees a bunch of pages vanishing from the system without an explanation. CONFIG_MEMTRACK adds infrastructure for "blaming" these allocations back to the processes currently holding a reference to the shared buffer. This information is exposed to userspace through /proc/[pid]/memtrack. To use memtrack, the shared memory allocator should: (1) Embed a struct memtrack_buffer somewhere in the underlying buffer's metadata, and initialize it with memtrack_buffer_init() (2) Call memtrack_buffer_{install,uninstall} each time a task takes or drops a reference to the shared buffer (3) Call memtrack_buffer_remove() before destroying a tracked buffer CONFIG_MEMTRACK_DEBUG adds a global list of all buffers tracked by memtrack, accessible through /sys/kernel/debug/memtrack. This involves maintaining a global idr of buffers. Due to the extra overhead, CONFIG_MEMTRACK_DEBUG is intended for debugging memory leaks rather than production use. Signed-off-by: Greg Hackmann <ghackm...@google.com> Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- drivers/misc/Kconfig | 16 +++ drivers/misc/Makefile| 1 + drivers/misc/memtrack.c | 360 +++ fs/proc/base.c | 4 + include/linux/memtrack.h | 94 + include/linux/sched.h| 3 + kernel/fork.c| 4 + 7 files changed, 482 insertions(+) create mode 100644 drivers/misc/memtrack.c create mode 100644 include/linux/memtrack.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 64971ba..7557fb1 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -766,6 +766,22 @@ config PANEL_BOOT_MESSAGE An empty message will only clear the display at driver init time. Any other printf()-formatted message is valid with newline and escape codes. 
+config MEMTRACK + tristate "Per-pid memory statistics" + default n + ---help--- + Keeps track of shared buffers allocated by the process and + exports them via /proc/<pid>/memtrack. + +config MEMTRACK_DEBUG + tristate "Per-pid memory statistics debug option" + depends on MEMTRACK && DEBUG_FS + default n + ---help--- + Keeps track of all shared buffers allocated and exports the list + via /sys/kernel/debug/memtrack. + + source "drivers/misc/c2port/Kconfig" source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 3198336..1fbb084 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -68,3 +68,4 @@ OBJCOPYFLAGS_lkdtm_rodata_objcopy.o := \ targets += lkdtm_rodata.o lkdtm_rodata_objcopy.o $(obj)/lkdtm_rodata_objcopy.o: $(obj)/lkdtm_rodata.o FORCE $(call if_changed,objcopy) +obj-$(CONFIG_MEMTRACK) += memtrack.o diff --git a/drivers/misc/memtrack.c b/drivers/misc/memtrack.c new file mode 100644 index 000..e5c7e03 --- /dev/null +++ b/drivers/misc/memtrack.c @@ -0,0 +1,360 @@ +/* drivers/misc/memtrack.c + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct memtrack_handle { + struct memtrack_buffer *buffer; + struct rb_node node; + struct rb_root *root; + struct kref refcount; +}; + +static struct kmem_cache *memtrack_handle_cache; + +static DEFINE_MUTEX(memtrack_id_lock); +#if IS_ENABLED(CONFIG_MEMTRACK_DEBUG) +static struct dentry *debugfs_file; +static DEFINE_IDR(mem_idr); +#else +static DEFINE_IDA(mem_ida); +#endif + +static void memtrack_buffer_install_locked(struct rb_root *root, + struct memtrack_buffer *buffer) +{ + struct rb_node **new = >rb_node, *parent = NULL; + struct memtrack_handle *handle; + + while (*new) { + struct rb_node *node = *new; + + handle = rb_entry(node, struct memtrack_handle, node); + parent = node; + if (handle->buffer->id > buffer->id) { + new = >rb
[RFC 3/6] dma-buf: add memtrack support
Signed-off-by: Greg Hackmann Signed-off-by: Ruchi Kandoi --- drivers/dma-buf/dma-buf.c | 37 ++ drivers/staging/android/ion/ion.c | 14 + drivers/staging/android/ion/ion_priv.h | 2 ++ include/linux/dma-buf.h| 5 + 4 files changed, 58 insertions(+) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index ddaee60..f632c2b 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -297,12 +297,32 @@ static long dma_buf_ioctl(struct file *file, } } +static void dma_buf_installed(struct file *file, struct task_struct *task) +{ + struct memtrack_buffer *memtrack = + dma_buf_memtrack_buffer(file->private_data); + + if (memtrack) + memtrack_buffer_install(memtrack, task); +} + +static void dma_buf_uninstalled(struct file *file, struct task_struct *task) +{ + struct memtrack_buffer *memtrack = + dma_buf_memtrack_buffer(file->private_data); + + if (memtrack) + memtrack_buffer_uninstall(memtrack, task); +} + static const struct file_operations dma_buf_fops = { .release= dma_buf_release, .mmap = dma_buf_mmap_internal, .llseek = dma_buf_llseek, .poll = dma_buf_poll, .unlocked_ioctl = dma_buf_ioctl, + .installed = dma_buf_installed, + .uninstalled= dma_buf_uninstalled, }; /* @@ -830,6 +850,23 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr) } EXPORT_SYMBOL_GPL(dma_buf_vunmap); +/** + * dma_buf_memtrack_buffer - returns a memtrack entry associated with dma_buf + * + * @dmabuf:[in]pointer to dma_buf + * + * Returns the struct memtrack_buffer associated with this dma_buf's + * backing pages. If memtrack isn't enabled in the kernel, or the dma_buf + * exporter doesn't have memtrack support, returns NULL. 
+ */ +struct memtrack_buffer *dma_buf_memtrack_buffer(struct dma_buf *dmabuf) +{ + if (!dmabuf->ops->memtrack_buffer) + return NULL; + return dmabuf->ops->memtrack_buffer(dmabuf); +} +EXPORT_SYMBOL_GPL(dma_buf_memtrack_buffer); + #ifdef CONFIG_DEBUG_FS static int dma_buf_debug_show(struct seq_file *s, void *unused) { diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 396ded5..1c2df54 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -196,6 +196,7 @@ void ion_buffer_destroy(struct ion_buffer *buffer) buffer->heap->ops->unmap_kernel(buffer->heap, buffer); buffer->heap->ops->free(buffer); vfree(buffer->pages); + memtrack_buffer_remove(>memtrack_buffer); kfree(buffer); } @@ -458,6 +459,8 @@ struct ion_handle *ion_alloc(struct ion_client *client, size_t len, handle = ERR_PTR(ret); } + memtrack_buffer_init(>memtrack_buffer, len); + return handle; } EXPORT_SYMBOL(ion_alloc); @@ -1013,6 +1016,16 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf, return 0; } +static struct memtrack_buffer *ion_memtrack_buffer(struct dma_buf *buffer) +{ + if (IS_ENABLED(CONFIG_MEMTRACK) && buffer && buffer->priv) { + struct ion_buffer *ion_buffer = buffer->priv; + + return _buffer->memtrack_buffer; + } + return NULL; +} + static struct dma_buf_ops dma_buf_ops = { .map_dma_buf = ion_map_dma_buf, .unmap_dma_buf = ion_unmap_dma_buf, @@ -1024,6 +1037,7 @@ static struct dma_buf_ops dma_buf_ops = { .kunmap_atomic = ion_dma_buf_kunmap, .kmap = ion_dma_buf_kmap, .kunmap = ion_dma_buf_kunmap, + .memtrack_buffer = ion_memtrack_buffer, }; struct dma_buf *ion_share_dma_buf(struct ion_client *client, diff --git a/drivers/staging/android/ion/ion_priv.h b/drivers/staging/android/ion/ion_priv.h index 3c3b324..74c38eb 100644 --- a/drivers/staging/android/ion/ion_priv.h +++ b/drivers/staging/android/ion/ion_priv.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "ion.h" @@ -78,6 +79,7 @@ struct 
ion_buffer { int handle_count; char task_comm[TASK_COMM_LEN]; pid_t pid; + struct memtrack_buffer memtrack_buffer; }; void ion_buffer_destroy(struct ion_buffer *buffer); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e0b0741..dfcc2d0 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -32,6 +32,7 @@ #include #include #include +#include struct device; struct dma_buf; @@ -70,6 +71,8 @@ struct dma_buf_attachment; * @vmap: [optional] creates a virtual mapping for the buffer into kernel * address space. Same restrictions as for vmap and friends apply. * @vunmap: [optional] unmaps a vmap from the b
[RFC 2/6] drivers: misc: add memtrack
Shared-buffer allocators like ion or GEM traditionally call into CMA or alloc_pages() to get backing memory, meaning these allocations will not show up in any process's mm counters. But since these allocations are often used for things like graphics buffers that can be extremely large, the user just sees a bunch of pages vanishing from the system without an explanation. CONFIG_MEMTRACK adds infrastructure for "blaming" these allocations back to the processes currently holding a reference to the shared buffer. This information is exposed to userspace through /proc/[pid]/memtrack. To use memtrack, the shared memory allocator should: (1) Embed a struct memtrack_buffer somewhere in the underlying buffer's metadata, and initialize it with memtrack_buffer_init() (2) Call memtrack_buffer_{install,uninstall} each time a task takes or drops a reference to the shared buffer (3) Call memtrack_buffer_remove() before destroying a tracked buffer CONFIG_MEMTRACK_DEBUG adds a global list of all buffers tracked by memtrack, accessible through /sys/kernel/debug/memtrack. This involves maintaining a global idr of buffers. Due to the extra overhead, CONFIG_MEMTRACK_DEBUG is intended for debugging memory leaks rather than production use. Signed-off-by: Greg Hackmann Signed-off-by: Ruchi Kandoi --- drivers/misc/Kconfig | 16 +++ drivers/misc/Makefile| 1 + drivers/misc/memtrack.c | 360 +++ fs/proc/base.c | 4 + include/linux/memtrack.h | 94 + include/linux/sched.h| 3 + kernel/fork.c| 4 + 7 files changed, 482 insertions(+) create mode 100644 drivers/misc/memtrack.c create mode 100644 include/linux/memtrack.h diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 64971ba..7557fb1 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -766,6 +766,22 @@ config PANEL_BOOT_MESSAGE An empty message will only clear the display at driver init time. Any other printf()-formatted message is valid with newline and escape codes. 
+config MEMTRACK + tristate "Per-pid memory statistics" + default n + ---help--- + Keeps track of shared buffers allocated by the process and + exports them via /proc/<pid>/memtrack. + +config MEMTRACK_DEBUG + tristate "Per-pid memory statistics debug option" + depends on MEMTRACK && DEBUG_FS + default n + ---help--- + Keeps track of all shared buffers allocated and exports the list + via /sys/kernel/debug/memtrack. + + source "drivers/misc/c2port/Kconfig" source "drivers/misc/c2port/Kconfig" source "drivers/misc/eeprom/Kconfig" source "drivers/misc/cb710/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 3198336..1fbb084 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -68,3 +68,4 @@ OBJCOPYFLAGS_lkdtm_rodata_objcopy.o := \ targets += lkdtm_rodata.o lkdtm_rodata_objcopy.o $(obj)/lkdtm_rodata_objcopy.o: $(obj)/lkdtm_rodata.o FORCE $(call if_changed,objcopy) +obj-$(CONFIG_MEMTRACK) += memtrack.o diff --git a/drivers/misc/memtrack.c b/drivers/misc/memtrack.c new file mode 100644 index 000..e5c7e03 --- /dev/null +++ b/drivers/misc/memtrack.c @@ -0,0 +1,360 @@ +/* drivers/misc/memtrack.c + * + * Copyright (C) 2016 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct memtrack_handle { + struct memtrack_buffer *buffer; + struct rb_node node; + struct rb_root *root; + struct kref refcount; +}; + +static struct kmem_cache *memtrack_handle_cache; + +static DEFINE_MUTEX(memtrack_id_lock); +#if IS_ENABLED(CONFIG_MEMTRACK_DEBUG) +static struct dentry *debugfs_file; +static DEFINE_IDR(mem_idr); +#else +static DEFINE_IDA(mem_ida); +#endif + +static void memtrack_buffer_install_locked(struct rb_root *root, + struct memtrack_buffer *buffer) +{ + struct rb_node **new = >rb_node, *parent = NULL; + struct memtrack_handle *handle; + + while (*new) { + struct rb_node *node = *new; + + handle = rb_entry(node, struct memtrack_handle, node); + parent = node; + if (handle->buffer->id > buffer->id) { + new = >rb_left; + } else if (ha
[RFC 5/6] memtrack: Add memtrack accounting for forked processes.
When a process is forked, all the buffers are shared with the forked process too. Adds the functionality to add memtrack accounting for the forked processes. Forked process gets a copy of the mapped pages of the parent process. This patch makes sure that the new mapped pages are attributed to the child process instead of the parent. Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- drivers/misc/memtrack.c | 45 +++ drivers/staging/android/ion/ion.c | 45 +-- include/linux/memtrack.h | 19 +++-- include/linux/mm.h| 3 +++ kernel/fork.c | 19 +++-- 5 files changed, 117 insertions(+), 14 deletions(-) diff --git a/drivers/misc/memtrack.c b/drivers/misc/memtrack.c index 4b2d17f..fa2601a 100644 --- a/drivers/misc/memtrack.c +++ b/drivers/misc/memtrack.c @@ -204,12 +204,13 @@ EXPORT_SYMBOL(memtrack_buffer_uninstall); * @buffer: the buffer's memtrack entry * * @vma: vma being opened + * @task: task which mapped the pages */ void memtrack_buffer_vm_open(struct memtrack_buffer *buffer, - const struct vm_area_struct *vma) + const struct vm_area_struct *vma, struct task_struct *task) { unsigned long flags; - struct task_struct *leader = current->group_leader; + struct task_struct *leader = task->group_leader; struct memtrack_vma_list *vma_list; vma_list = kmalloc(sizeof(*vma_list), GFP_KERNEL); @@ -228,12 +229,13 @@ EXPORT_SYMBOL(memtrack_buffer_vm_open); * * @buffer: the buffer's memtrack entry * @vma: the vma being closed + * @task: task that mmaped the pages */ void memtrack_buffer_vm_close(struct memtrack_buffer *buffer, - const struct vm_area_struct *vma) + const struct vm_area_struct *vma, struct task_struct *task) { unsigned long flags; - struct task_struct *leader = current->group_leader; + struct task_struct *leader = task->group_leader; write_lock_irqsave(>memtrack_lock, flags); memtrack_buffer_vm_close_locked(>memtrack_rb, buffer, vma); @@ -241,6 +243,41 @@ void memtrack_buffer_vm_close(struct memtrack_buffer *buffer, } EXPORT_SYMBOL(memtrack_buffer_vm_close); 
+/** + * memtrack_buffer_install_fork - Install all parent's handles into + * child. + * + * @parent: parent task + * @child: child task + */ +void memtrack_buffer_install_fork(struct task_struct *parent, + struct task_struct *child) +{ + struct task_struct *leader, *leader_child; + struct rb_root *root; + struct rb_node *node; + unsigned long flags; + + if (!child || !parent) + return; + + leader = parent->group_leader; + leader_child = child->group_leader; + write_lock_irqsave(>memtrack_lock, flags); + root = >memtrack_rb; + node = rb_first(root); + while (node) { + struct memtrack_handle *handle; + + handle = rb_entry(node, struct memtrack_handle, node); + memtrack_buffer_install_locked(_child->memtrack_rb, + handle->buffer); + node = rb_next(node); + } + write_unlock_irqrestore(>memtrack_lock, flags); +} +EXPORT_SYMBOL(memtrack_buffer_install_fork); + static int memtrack_id_alloc(struct memtrack_buffer *buffer) { int ret; diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index c32d520..451aa0f 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -906,7 +906,7 @@ static void ion_vm_open(struct vm_area_struct *vma) list_add(_list->list, >vmas); mutex_unlock(>lock); pr_debug("%s: adding %p\n", __func__, vma); - memtrack_buffer_vm_open(>memtrack_buffer, vma); + memtrack_buffer_vm_open(>memtrack_buffer, vma, current); } static void ion_vm_close(struct vm_area_struct *vma) @@ -925,13 +925,51 @@ static void ion_vm_close(struct vm_area_struct *vma) break; } mutex_unlock(>lock); - memtrack_buffer_vm_close(>memtrack_buffer, vma); + memtrack_buffer_vm_close(>memtrack_buffer, vma, current); +} + +void vm_track(struct vm_area_struct *vma, struct task_struct *task) +{ + struct ion_buffer *buffer = vma->vm_private_data; + + memtrack_buffer_vm_open(>memtrack_buffer, vma, task); +} + +void vm_untrack(struct vm_area_struct *vma, struct task_struct *task) +{ + struct ion_buffer *buffer = vma->vm_private_data; + 
+ memtrack_buffer_vm_close(>memtrack_buffer, vma, task); } static const struct vm_operations_struct ion_vma_ops = { .open = ion_vm_open, .close = ion_vm_close, .fault = ion_vm_fault, + .track = vm_track, + .untrack = vm_untrack, +}; + +sta
[RFC 5/6] memtrack: Add memtrack accounting for forked processes.
When a process is forked, all the buffers are shared with the forked process too. Adds the functionality to add memtrack accounting for the forked processes. Forked process gets a copy of the mapped pages of the parent process. This patch makes sure that the new mapped pages are attributed to the child process instead of the parent. Signed-off-by: Ruchi Kandoi --- drivers/misc/memtrack.c | 45 +++ drivers/staging/android/ion/ion.c | 45 +-- include/linux/memtrack.h | 19 +++-- include/linux/mm.h| 3 +++ kernel/fork.c | 19 +++-- 5 files changed, 117 insertions(+), 14 deletions(-) diff --git a/drivers/misc/memtrack.c b/drivers/misc/memtrack.c index 4b2d17f..fa2601a 100644 --- a/drivers/misc/memtrack.c +++ b/drivers/misc/memtrack.c @@ -204,12 +204,13 @@ EXPORT_SYMBOL(memtrack_buffer_uninstall); * @buffer: the buffer's memtrack entry * * @vma: vma being opened + * @task: task which mapped the pages */ void memtrack_buffer_vm_open(struct memtrack_buffer *buffer, - const struct vm_area_struct *vma) + const struct vm_area_struct *vma, struct task_struct *task) { unsigned long flags; - struct task_struct *leader = current->group_leader; + struct task_struct *leader = task->group_leader; struct memtrack_vma_list *vma_list; vma_list = kmalloc(sizeof(*vma_list), GFP_KERNEL); @@ -228,12 +229,13 @@ EXPORT_SYMBOL(memtrack_buffer_vm_open); * * @buffer: the buffer's memtrack entry * @vma: the vma being closed + * @task: task that mmaped the pages */ void memtrack_buffer_vm_close(struct memtrack_buffer *buffer, - const struct vm_area_struct *vma) + const struct vm_area_struct *vma, struct task_struct *task) { unsigned long flags; - struct task_struct *leader = current->group_leader; + struct task_struct *leader = task->group_leader; write_lock_irqsave(>memtrack_lock, flags); memtrack_buffer_vm_close_locked(>memtrack_rb, buffer, vma); @@ -241,6 +243,41 @@ void memtrack_buffer_vm_close(struct memtrack_buffer *buffer, } EXPORT_SYMBOL(memtrack_buffer_vm_close); +/** + * 
memtrack_buffer_install_fork - Install all parent's handles into + * child. + * + * @parent: parent task + * @child: child task + */ +void memtrack_buffer_install_fork(struct task_struct *parent, + struct task_struct *child) +{ + struct task_struct *leader, *leader_child; + struct rb_root *root; + struct rb_node *node; + unsigned long flags; + + if (!child || !parent) + return; + + leader = parent->group_leader; + leader_child = child->group_leader; + write_lock_irqsave(>memtrack_lock, flags); + root = >memtrack_rb; + node = rb_first(root); + while (node) { + struct memtrack_handle *handle; + + handle = rb_entry(node, struct memtrack_handle, node); + memtrack_buffer_install_locked(_child->memtrack_rb, + handle->buffer); + node = rb_next(node); + } + write_unlock_irqrestore(>memtrack_lock, flags); +} +EXPORT_SYMBOL(memtrack_buffer_install_fork); + static int memtrack_id_alloc(struct memtrack_buffer *buffer) { int ret; diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index c32d520..451aa0f 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -906,7 +906,7 @@ static void ion_vm_open(struct vm_area_struct *vma) list_add(_list->list, >vmas); mutex_unlock(>lock); pr_debug("%s: adding %p\n", __func__, vma); - memtrack_buffer_vm_open(>memtrack_buffer, vma); + memtrack_buffer_vm_open(>memtrack_buffer, vma, current); } static void ion_vm_close(struct vm_area_struct *vma) @@ -925,13 +925,51 @@ static void ion_vm_close(struct vm_area_struct *vma) break; } mutex_unlock(>lock); - memtrack_buffer_vm_close(>memtrack_buffer, vma); + memtrack_buffer_vm_close(>memtrack_buffer, vma, current); +} + +void vm_track(struct vm_area_struct *vma, struct task_struct *task) +{ + struct ion_buffer *buffer = vma->vm_private_data; + + memtrack_buffer_vm_open(>memtrack_buffer, vma, task); +} + +void vm_untrack(struct vm_area_struct *vma, struct task_struct *task) +{ + struct ion_buffer *buffer = vma->vm_private_data; + + 
memtrack_buffer_vm_close(>memtrack_buffer, vma, task); } static const struct vm_operations_struct ion_vma_ops = { .open = ion_vm_open, .close = ion_vm_close, .fault = ion_vm_fault, + .track = vm_track, + .untrack = vm_untrack, +}; + +static void memtrack_vm_close(st
[RFC 6/6] drivers: staging: ion: add ION_IOC_TAG ioctl
From: Greg Hackmann <ghackm...@google.com> ION_IOC_TAG provides a userspace interface for tagging buffers with their memtrack usage after allocation. Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- drivers/staging/android/ion/ion-ioctl.c | 17 + drivers/staging/android/uapi/ion.h | 25 + 2 files changed, 42 insertions(+) diff --git a/drivers/staging/android/ion/ion-ioctl.c b/drivers/staging/android/ion/ion-ioctl.c index 7e7431d..8745a85 100644 --- a/drivers/staging/android/ion/ion-ioctl.c +++ b/drivers/staging/android/ion/ion-ioctl.c @@ -28,6 +28,7 @@ union ion_ioctl_arg { struct ion_handle_data handle; struct ion_custom_data custom; struct ion_heap_query query; + struct ion_tag_data tag; }; static int validate_ioctl_arg(unsigned int cmd, union ion_ioctl_arg *arg) @@ -162,6 +163,22 @@ long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case ION_IOC_HEAP_QUERY: ret = ion_query_heaps(client, &data.query); break; + case ION_IOC_TAG: + { +#ifdef CONFIG_MEMTRACK + struct ion_handle *handle; + + handle = ion_handle_get_by_id(client, data.tag.handle); + if (IS_ERR(handle)) + return PTR_ERR(handle); + data.tag.tag[sizeof(data.tag.tag) - 1] = 0; + memtrack_buffer_set_tag(&handle->buffer->memtrack_buffer, + data.tag.tag); +#else + ret = -ENOTTY; +#endif + break; + } default: return -ENOTTY; } diff --git a/drivers/staging/android/uapi/ion.h b/drivers/staging/android/uapi/ion.h index 14cd873..4c26196 100644 --- a/drivers/staging/android/uapi/ion.h +++ b/drivers/staging/android/uapi/ion.h @@ -115,6 +115,22 @@ struct ion_handle_data { ion_user_handle_t handle; }; +#define ION_MAX_TAG_LEN 32 + +/** + * struct ion_fd_data - metadata passed from userspace for a handle + * @handle: a handle + * @tag: a string describing the buffer + * + * For ION_IOC_TAG userspace populates the handle field with + * the handle returned from ion alloc and type contains the memtrack_type which + * accurately describes the usage for the memory. 
+ */ +struct ion_tag_data { + ion_user_handle_t handle; + char tag[ION_MAX_TAG_LEN]; +}; + /** * struct ion_custom_data - metadata passed to/from userspace for a custom ioctl * @cmd: the custom ioctl function to call @@ -217,6 +233,15 @@ struct ion_heap_query { #define ION_IOC_SYNC _IOWR(ION_IOC_MAGIC, 7, struct ion_fd_data) /** + * DOC: ION_IOC_TAG - adds a memtrack descriptor tag to memory + * + * Takes an ion_tag_data struct with the type field populated with a + * memtrack_type and handle populated with a valid opaque handle. The + * memtrack_type should accurately define the usage for the memory. + */ +#define ION_IOC_TAG _IOWR(ION_IOC_MAGIC, 8, struct ion_tag_data) + +/** * DOC: ION_IOC_CUSTOM - call architecture specific ion ioctl * * Takes the argument of the architecture specific ioctl to call and -- 2.8.0.rc3.226.g39d4020
[RFC 6/6] drivers: staging: ion: add ION_IOC_TAG ioctl
From: Greg Hackmann ION_IOC_TAG provides a userspace interface for tagging buffers with their memtrack usage after allocation. Signed-off-by: Ruchi Kandoi --- drivers/staging/android/ion/ion-ioctl.c | 17 + drivers/staging/android/uapi/ion.h | 25 + 2 files changed, 42 insertions(+) diff --git a/drivers/staging/android/ion/ion-ioctl.c b/drivers/staging/android/ion/ion-ioctl.c index 7e7431d..8745a85 100644 --- a/drivers/staging/android/ion/ion-ioctl.c +++ b/drivers/staging/android/ion/ion-ioctl.c @@ -28,6 +28,7 @@ union ion_ioctl_arg { struct ion_handle_data handle; struct ion_custom_data custom; struct ion_heap_query query; + struct ion_tag_data tag; }; static int validate_ioctl_arg(unsigned int cmd, union ion_ioctl_arg *arg) @@ -162,6 +163,22 @@ long ion_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case ION_IOC_HEAP_QUERY: ret = ion_query_heaps(client, &data.query); break; + case ION_IOC_TAG: + { +#ifdef CONFIG_MEMTRACK + struct ion_handle *handle; + + handle = ion_handle_get_by_id(client, data.tag.handle); + if (IS_ERR(handle)) + return PTR_ERR(handle); + data.tag.tag[sizeof(data.tag.tag) - 1] = 0; + memtrack_buffer_set_tag(&handle->buffer->memtrack_buffer, + data.tag.tag); +#else + ret = -ENOTTY; +#endif + break; + } default: return -ENOTTY; } diff --git a/drivers/staging/android/uapi/ion.h b/drivers/staging/android/uapi/ion.h index 14cd873..4c26196 100644 --- a/drivers/staging/android/uapi/ion.h +++ b/drivers/staging/android/uapi/ion.h @@ -115,6 +115,22 @@ struct ion_handle_data { ion_user_handle_t handle; }; +#define ION_MAX_TAG_LEN 32 + +/** + * struct ion_fd_data - metadata passed from userspace for a handle + * @handle: a handle + * @tag: a string describing the buffer + * + * For ION_IOC_TAG userspace populates the handle field with + * the handle returned from ion alloc and type contains the memtrack_type which + * accurately describes the usage for the memory. 
+ */ +struct ion_tag_data { + ion_user_handle_t handle; + char tag[ION_MAX_TAG_LEN]; +}; + /** * struct ion_custom_data - metadata passed to/from userspace for a custom ioctl * @cmd: the custom ioctl function to call @@ -217,6 +233,15 @@ struct ion_heap_query { #define ION_IOC_SYNC _IOWR(ION_IOC_MAGIC, 7, struct ion_fd_data) /** + * DOC: ION_IOC_TAG - adds a memtrack descriptor tag to memory + * + * Takes an ion_tag_data struct with the type field populated with a + * memtrack_type and handle populated with a valid opaque handle. The + * memtrack_type should accurately define the usage for the memory. + */ +#define ION_IOC_TAG _IOWR(ION_IOC_MAGIC, 8, struct ion_tag_data) + +/** * DOC: ION_IOC_CUSTOM - call architecture specific ion ioctl * * Takes the argument of the architecture specific ioctl to call and -- 2.8.0.rc3.226.g39d4020
[RFC 0/6] Module for tracking/accounting shared memory buffers
This patchstack introduces a new "memtrack" module for tracking and accounting memory exported to userspace as shared buffers, like dma-buf fds or GEM handles. Any process holding a reference to these buffers will keep the kernel from reclaiming its backing pages. mm counters don't provide a complete picture of these allocations, since they only account for pages that are mapped into a process's address space. This problem is especially bad for systems like Android that use dma-buf fds to share graphics and multimedia buffers between processes: these allocations are often large, have complex sharing patterns, and are rarely mapped into every process that holds a reference to them. memtrack maintains a per-process list of shared buffer references, which is exported to userspace as /proc/[pid]/memtrack. Buffers can be optionally "tagged" with a short string: for example, Android userspace would use this tag to identify whether buffers were allocated on behalf of the camera stack, GL, etc. memtrack also exports the VMAs associated with these buffers so that pages already included in the process's mm counters aren't double-counted. Shared-buffer allocators can hook into memtrack by embedding struct memtrack_buffer in their buffer metadata, calling memtrack_buffer_{init,remove} at buffer allocation and free time, and memtrack_buffer_{install,uninstall} when a userspace process takes or drops a reference to the buffer. For fd-backed buffers like dma-bufs, hooks in fdtable.c and fork.c automatically notify memtrack when references are added or removed from a process's fd table. This patchstack adds memtrack hooks into dma-buf and ion. If there's upstream interest in memtrack, it can be extended to other memory allocators as well, such as GEM implementations. 
Greg Hackmann (1): drivers: staging: ion: add ION_IOC_TAG ioctl Ruchi Kandoi (5): fs: add installed and uninstalled file_operations drivers: misc: add memtrack dma-buf: add memtrack support memtrack: Adds the accounting to keep track of all mmaped/unmapped pages. memtrack: Add memtrack accounting for forked processes. drivers/android/binder.c| 4 +- drivers/dma-buf/dma-buf.c | 37 +++ drivers/misc/Kconfig| 16 + drivers/misc/Makefile | 1 + drivers/misc/memtrack.c | 516 drivers/staging/android/ion/ion-ioctl.c | 17 ++ drivers/staging/android/ion/ion.c | 60 +++- drivers/staging/android/ion/ion_priv.h | 2 + drivers/staging/android/uapi/ion.h | 25 ++ fs/file.c | 38 ++- fs/open.c | 2 +- fs/proc/base.c | 4 + include/linux/dma-buf.h | 5 + include/linux/fdtable.h | 4 +- include/linux/fs.h | 2 + include/linux/memtrack.h| 130 include/linux/mm.h | 3 + include/linux/sched.h | 3 + kernel/fork.c | 23 +- 19 files changed, 875 insertions(+), 17 deletions(-) create mode 100644 drivers/misc/memtrack.c create mode 100644 include/linux/memtrack.h -- 2.8.0.rc3.226.g39d4020
[RFC 1/6] fs: add installed and uninstalled file_operations
These optional file_operations notify a file implementation when it is installed or uninstalled from a task's fd table. This can be used for accounting of file-backed shared resources like dma-buf. This involves some changes to the __fd_install() and __close_fd() APIs to actually pass along the responsible task_struct. These are low-level APIs with only two in-tree callers, both adjusted in this patch. Signed-off-by: Greg Hackmann <ghackm...@google.com> Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- drivers/android/binder.c | 4 ++-- fs/file.c| 38 +- fs/open.c| 2 +- include/linux/fdtable.h | 4 ++-- include/linux/fs.h | 2 ++ 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 562af94..0bb174e 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -398,7 +398,7 @@ static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { if (proc->files) - __fd_install(proc->files, fd, file); + __fd_install(proc->tsk, fd, file); } /* @@ -411,7 +411,7 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) if (proc->files == NULL) return -ESRCH; - retval = __close_fd(proc->files, fd); + retval = __close_fd(proc->tsk, fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || diff --git a/fs/file.c b/fs/file.c index 69d6990..19c5fad 100644 --- a/fs/file.c +++ b/fs/file.c @@ -282,6 +282,24 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +static inline void fdt_install(struct fdtable *fdt, int fd, struct file *file, + struct task_struct *task) +{ + if (file->f_op->installed) + file->f_op->installed(file, task); + rcu_assign_pointer(fdt->fd[fd], file); +} + +static inline void fdt_uninstall(struct fdtable *fdt, int fd, + struct task_struct *task) +{ + struct file *old_file = fdt->fd[fd]; + + if (old_file->f_op->uninstalled) + 
old_file->f_op->uninstalled(old_file, task); + rcu_assign_pointer(fdt->fd[fd], NULL); +} + /* * Allocate a new files structure and copy contents from the * passed in files structure. @@ -543,7 +561,7 @@ int __alloc_fd(struct files_struct *files, /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); + fdt_uninstall(fdt, fd, current); } #endif @@ -601,10 +619,11 @@ EXPORT_SYMBOL(put_unused_fd); * fd_install() instead. */ -void __fd_install(struct files_struct *files, unsigned int fd, +void __fd_install(struct task_struct *task, unsigned int fd, struct file *file) { struct fdtable *fdt; + struct files_struct *files = task->files; might_sleep(); rcu_read_lock_sched(); @@ -618,13 +637,13 @@ void __fd_install(struct files_struct *files, unsigned int fd, smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); + fdt_install(fdt, fd, file, task); rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) { - __fd_install(current->files, fd, file); + __fd_install(current, fd, file); } EXPORT_SYMBOL(fd_install); @@ -632,10 +651,11 @@ EXPORT_SYMBOL(fd_install); /* * The same warnings as for __alloc_fd()/__fd_install() apply here... 
*/ -int __close_fd(struct files_struct *files, unsigned fd) +int __close_fd(struct task_struct *task, unsigned fd) { struct file *file; struct fdtable *fdt; + struct files_struct *files = task->files; spin_lock(>file_lock); fdt = files_fdtable(files); @@ -644,7 +664,7 @@ int __close_fd(struct files_struct *files, unsigned fd) file = fdt->fd[fd]; if (!file) goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); + fdt_uninstall(fdt, fd, task); __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(>file_lock); @@ -679,7 +699,7 @@ void do_close_on_exec(struct files_struct *files) file = fdt->fd[fd]; if (!file) continue; - rcu_assign_pointer(fdt->fd[fd], NULL); + fdt_uninstall(fdt, fd, current); __put_unused_fd(files, fd);
[RFC 0/6] Module for tracking/accounting shared memory buffers
This patchstack introduces a new "memtrack" module for tracking and accounting memory exported to userspace as shared buffers, like dma-buf fds or GEM handles. Any process holding a reference to these buffers will keep the kernel from reclaiming its backing pages. mm counters don't provide a complete picture of these allocations, since they only account for pages that are mapped into a process's address space. This problem is especially bad for systems like Android that use dma-buf fds to share graphics and multimedia buffers between processes: these allocations are often large, have complex sharing patterns, and are rarely mapped into every process that holds a reference to them. memtrack maintains a per-process list of shared buffer references, which is exported to userspace as /proc/[pid]/memtrack. Buffers can be optionally "tagged" with a short string: for example, Android userspace would use this tag to identify whether buffers were allocated on behalf of the camera stack, GL, etc. memtrack also exports the VMAs associated with these buffers so that pages already included in the process's mm counters aren't double-counted. Shared-buffer allocators can hook into memtrack by embedding struct memtrack_buffer in their buffer metadata, calling memtrack_buffer_{init,remove} at buffer allocation and free time, and memtrack_buffer_{install,uninstall} when a userspace process takes or drops a reference to the buffer. For fd-backed buffers like dma-bufs, hooks in fdtable.c and fork.c automatically notify memtrack when references are added or removed from a process's fd table. This patchstack adds memtrack hooks into dma-buf and ion. If there's upstream interest in memtrack, it can be extended to other memory allocators as well, such as GEM implementations. 
Greg Hackmann (1): drivers: staging: ion: add ION_IOC_TAG ioctl Ruchi Kandoi (5): fs: add installed and uninstalled file_operations drivers: misc: add memtrack dma-buf: add memtrack support memtrack: Adds the accounting to keep track of all mmaped/unmapped pages. memtrack: Add memtrack accounting for forked processes. drivers/android/binder.c| 4 +- drivers/dma-buf/dma-buf.c | 37 +++ drivers/misc/Kconfig| 16 + drivers/misc/Makefile | 1 + drivers/misc/memtrack.c | 516 drivers/staging/android/ion/ion-ioctl.c | 17 ++ drivers/staging/android/ion/ion.c | 60 +++- drivers/staging/android/ion/ion_priv.h | 2 + drivers/staging/android/uapi/ion.h | 25 ++ fs/file.c | 38 ++- fs/open.c | 2 +- fs/proc/base.c | 4 + include/linux/dma-buf.h | 5 + include/linux/fdtable.h | 4 +- include/linux/fs.h | 2 + include/linux/memtrack.h| 130 include/linux/mm.h | 3 + include/linux/sched.h | 3 + kernel/fork.c | 23 +- 19 files changed, 875 insertions(+), 17 deletions(-) create mode 100644 drivers/misc/memtrack.c create mode 100644 include/linux/memtrack.h -- 2.8.0.rc3.226.g39d4020
[RFC 1/6] fs: add installed and uninstalled file_operations
These optional file_operations notify a file implementation when it is installed or uninstalled from a task's fd table. This can be used for accounting of file-backed shared resources like dma-buf. This involves some changes to the __fd_install() and __close_fd() APIs to actually pass along the responsible task_struct. These are low-level APIs with only two in-tree callers, both adjusted in this patch. Signed-off-by: Greg Hackmann Signed-off-by: Ruchi Kandoi --- drivers/android/binder.c | 4 ++-- fs/file.c| 38 +- fs/open.c| 2 +- include/linux/fdtable.h | 4 ++-- include/linux/fs.h | 2 ++ 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 562af94..0bb174e 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -398,7 +398,7 @@ static void task_fd_install( struct binder_proc *proc, unsigned int fd, struct file *file) { if (proc->files) - __fd_install(proc->files, fd, file); + __fd_install(proc->tsk, fd, file); } /* @@ -411,7 +411,7 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) if (proc->files == NULL) return -ESRCH; - retval = __close_fd(proc->files, fd); + retval = __close_fd(proc->tsk, fd); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || retval == -ERESTARTNOINTR || diff --git a/fs/file.c b/fs/file.c index 69d6990..19c5fad 100644 --- a/fs/file.c +++ b/fs/file.c @@ -282,6 +282,24 @@ static unsigned int count_open_files(struct fdtable *fdt) return i; } +static inline void fdt_install(struct fdtable *fdt, int fd, struct file *file, + struct task_struct *task) +{ + if (file->f_op->installed) + file->f_op->installed(file, task); + rcu_assign_pointer(fdt->fd[fd], file); +} + +static inline void fdt_uninstall(struct fdtable *fdt, int fd, + struct task_struct *task) +{ + struct file *old_file = fdt->fd[fd]; + + if (old_file->f_op->uninstalled) + old_file->f_op->uninstalled(old_file, task); + 
rcu_assign_pointer(fdt->fd[fd], NULL); +} + /* * Allocate a new files structure and copy contents from the * passed in files structure. @@ -543,7 +561,7 @@ int __alloc_fd(struct files_struct *files, /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); - rcu_assign_pointer(fdt->fd[fd], NULL); + fdt_uninstall(fdt, fd, current); } #endif @@ -601,10 +619,11 @@ EXPORT_SYMBOL(put_unused_fd); * fd_install() instead. */ -void __fd_install(struct files_struct *files, unsigned int fd, +void __fd_install(struct task_struct *task, unsigned int fd, struct file *file) { struct fdtable *fdt; + struct files_struct *files = task->files; might_sleep(); rcu_read_lock_sched(); @@ -618,13 +637,13 @@ void __fd_install(struct files_struct *files, unsigned int fd, smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); - rcu_assign_pointer(fdt->fd[fd], file); + fdt_install(fdt, fd, file, task); rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) { - __fd_install(current->files, fd, file); + __fd_install(current, fd, file); } EXPORT_SYMBOL(fd_install); @@ -632,10 +651,11 @@ EXPORT_SYMBOL(fd_install); /* * The same warnings as for __alloc_fd()/__fd_install() apply here... 
*/ -int __close_fd(struct files_struct *files, unsigned fd) +int __close_fd(struct task_struct *task, unsigned fd) { struct file *file; struct fdtable *fdt; + struct files_struct *files = task->files; spin_lock(>file_lock); fdt = files_fdtable(files); @@ -644,7 +664,7 @@ int __close_fd(struct files_struct *files, unsigned fd) file = fdt->fd[fd]; if (!file) goto out_unlock; - rcu_assign_pointer(fdt->fd[fd], NULL); + fdt_uninstall(fdt, fd, task); __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(>file_lock); @@ -679,7 +699,7 @@ void do_close_on_exec(struct files_struct *files) file = fdt->fd[fd]; if (!file) continue; - rcu_assign_pointer(fdt->fd[fd], NULL); + fdt_uninstall(fdt, fd, current); __put_unused_fd(files, fd); spin_unlock(>file_lock); fil
[PATCH v2] timekeeping: Prints the amounts of time spent during suspend
This helps to keep track of real time while debugging using kernel logs. Cc: John Stultz <john.stu...@linaro.org> Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- Changelog since v1: - removed cross platform warnings. kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd652..7fdb34f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -70,5 +70,7 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; + pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } -- 2.8.0.rc3.226.g39d4020
[PATCH v2] timekeeping: Prints the amounts of time spent during suspend
This helps to keep track of real time while debugging using kernel logs. Cc: John Stultz Signed-off-by: Ruchi Kandoi --- Changelog since v1: - removed cross platform warnings. kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd652..7fdb34f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -70,5 +70,7 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; + pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } -- 2.8.0.rc3.226.g39d4020
[PATCH] timekeeping: Prints the amounts of time spent during suspend
This helps to keep track of real time while debugging using kernel logs. Cc: John Stultz <john.stu...@linaro.org> Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd652..b67abe8 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -70,5 +70,7 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; + pr_info("Suspended for %lu.%03lu seconds\n", t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } -- 2.8.0.rc3.226.g39d4020
[PATCH] timekeeping: Prints the amounts of time spent during suspend
This helps to keep track of real time while debugging using kernel logs. Cc: John Stultz Signed-off-by: Ruchi Kandoi --- kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd652..b67abe8 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -70,5 +70,7 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { sleep_time_bin[fls(t->tv_sec)]++; + pr_info("Suspended for %lu.%03lu seconds\n", t->tv_sec, + t->tv_nsec / NSEC_PER_MSEC); } -- 2.8.0.rc3.226.g39d4020
Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
On Thu, May 21, 2015 at 7:34 AM, Daniel Lezcano wrote: > Hi Ruchi, > > On 05/15/2015 02:12 AM, Ruchi Kandoi wrote: >> >> These patches add a mechanism which will accurately caculate the CPU power >> used by all the processes in the system. In order to account for the power >> used by all the processes a data field "cpu_power" has been added in the >> task_struct. > > > The term 'energy' makes more sense than 'power'. >> This field adds power for both the system as well as user >> time. cpu_power contains the total amount of charge(in uAmsec units) used > > > Why not use the Joules unit ? > Because most of the devices working on battery have their capacity defined in mAh (to avoid floating point and to prevent losing precision, uAmsec is used). It will be easier to keep it in that unit so that it can be aggregated when we are trying to find the total capacity which was used by a process (which will be combined for a particular application). >> by the process. This model takes into account the frequency at which the >> process was running(i.e higher power for processes running at higher >> frequencies). It requires the cpufreq_stats module to be initialized with >> the current numbers for each of the CPU core at each frequency. This will >> be initialized during init time. > > > The energy task accounting is an interesting feature in my opinion. But your > patchset does not deal with the power management hardware complexity. > > If we reduce the scope of the task energy accounting to the cpu, we are > facing several issues: > > * A cpu may be supposed to run at a specific OPP but it could share a clock > line with another cpu which is in a higher frequency. So the frequency is > actually at a higher rate than what is assumed > > * The firmware may override the cpufreq decisions > > * A process may be idle but its behavior forces the cpuidle governor to > choose shallow states (that won't occur without the process). 
For example, > the process is using very short timers, does a small processing and then go > to sleep again waiting for the next timer expiration. The result will be a > process having a low energy consumption but actually because of these > timers, it will prevent the cpu to enter deep idle state > > Beside that, the process may be soliciting a subsystem (another process or > hardware) which consumes a lot of energy. That won't be accounted even if > the process is responsible of this extra consumption. > True, there will be cases where the accounting for the energy/power will be deceptive, because we are not taking into consideration the idle time and time intervals between which the process is running. This was aimed to be a simplistic model where only the active time for the process were taken into account and the processes were blamed for the active power that they are consuming. There are similar efforts for other subsystem too which will be keeping track of the subsystem power used by a particular pid/uid. > And the last point is: how do you expect to have the energy numbers as > nobody is willing to share them for their platform ? > This is a tough question. Yes it is difficult to get these numbers, but I don't think it is unfeasible. We get some numbers from SoC vendors for the CPUs, trying to drive it to a point where we can get more accurate numbers. 
>> Ruchi Kandoi (2): >>cpufreq_stats: Adds sysfs file >> /sys/devices/system/cpu/cpufreq/current_in_state >>sched: cpufreq: Adds a field cpu_power in the task_struct >> >> drivers/cpufreq/cpufreq_stats.c | 191 >> +++- >> include/linux/cpufreq.h | 8 ++ >> include/linux/sched.h | 2 + >> kernel/fork.c | 1 + >> kernel/sched/cputime.c | 7 ++ >> 5 files changed, 207 insertions(+), 2 deletions(-) > > > > > -- > <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs > > Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook | > <http://twitter.com/#!/linaroorg> Twitter | > <http://www.linaro.org/linaro-blog/> Blog > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
On Thu, May 21, 2015 at 7:34 AM, Daniel Lezcano daniel.lezc...@linaro.org wrote: Hi Ruchi, On 05/15/2015 02:12 AM, Ruchi Kandoi wrote: These patches add a mechanism which will accurately caculate the CPU power used by all the processes in the system. In order to account for the power used by all the processes a data field cpu_power has been added in the task_struct. The term 'energy' makes more sense than 'power'. This field adds power for both the system as well as user time. cpu_power contains the total amount of charge(in uAmsec units) used Why not use the Joules unit ? Because most of the devices working on battery have their capacity defined in mAh (to avoid floating point and to prevent losing precision, uAmsec is used). It will be easier to keep it in that unit so that it can be aggregated when we are trying to find the total capacity which was used by a process (which will be combined for a particular application). by the process. This model takes into account the frequency at which the process was running(i.e higher power for processes running at higher frequencies). It requires the cpufreq_stats module to be initialized with the current numbers for each of the CPU core at each frequency. This will be initialized during init time. The energy task accounting is an interesting feature in my opinion. But your patchset does not deal with the power management hardware complexity. If we reduce the scope of the task energy accounting to the cpu, we are facing several issues: * A cpu may be supposed to run at a specific OPP but it could share a clock line with another cpu which is in a higher frequency. So the frequency is actually at a higher rate than what is assumed * The firmware may override the cpufreq decisions * A process may be idle but its behavior forces the cpuidle governor to choose shallow states (that won't occur without the process). 
For example, the process is using very short timers, does a small processing and then go to sleep again waiting for the next timer expiration. The result will be a process having a low energy consumption but actually because of these timers, it will prevent the cpu to enter deep idle state Beside that, the process may be soliciting a subsystem (another process or hardware) which consumes a lot of energy. That won't be accounted even if the process is responsible of this extra consumption. True, there will be cases where the accounting for the energy/power will be deceptive, because we are not taking into consideration the idle time and time intervals between which the process is running. This was aimed to be a simplistic model where only the active time for the process were taken into account and the processes were blamed for the active power that they are consuming. There are similar efforts for other subsystem too which will be keeping track of the subsystem power used by a particular pid/uid. And the last point is: how do you expect to have the energy numbers as nobody is willing to share them for their platform ? This is a tough question. Yes it is difficult to get these numbers, but I don't think it is unfeasible. We get some numbers from SoC vendors for the CPUs, trying to drive it to a point where we can get more accurate numbers. 
Ruchi Kandoi (2): cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state sched: cpufreq: Adds a field cpu_power in the task_struct drivers/cpufreq/cpufreq_stats.c | 191 +++- include/linux/cpufreq.h | 8 ++ include/linux/sched.h | 2 + kernel/fork.c | 1 + kernel/sched/cputime.c | 7 ++ 5 files changed, 207 insertions(+), 2 deletions(-) -- http://www.linaro.org/ Linaro.org │ Open source software for ARM SoCs Follow Linaro: http://www.facebook.com/pages/Linaro Facebook | http://twitter.com/#!/linaroorg Twitter | http://www.linaro.org/linaro-blog/ Blog -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct
On Fri, May 15, 2015 at 9:07 AM, Peter Zijlstra wrote: > > On Thu, May 14, 2015 at 04:55:48PM -0700, Ruchi Kandoi wrote: > > cpu_power has been added to keep track of amount of power each task is > > consuming. cpu_power is updated whenever stime and utime are updated for > > a task. power is computed by taking into account the frequency at which > > the current core was running and the current for cpu actively > > running at hat frequency. > > > > Both you patches completely lack any reason for me to even start > considering this. > > _WHY_ and _what_ are you doing? We need a mechanism in which we can get information about how much cpu power each process (which is then aggregated for each uid/application) is consuming. In the current architecture, it is based on the amount of the time the process ran. This brings in inaccuracy because running x seconds at low frequency will have different power consumption as compared to running at the higher frequency. With these changes we have the information about the power which is not only dependent on the time it was running but also takes into account the frequency it was running at as well as the CPU # it was running on, because the cost of running on different CPUs at the same frequency is different. This gives a better overview of the current power state of the system wrt cpu power. Please let me know if more information is required for the same. Thanks, Ruchi Kandoi -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
On Thu, May 14, 2015 at 11:34 PM, Heinrich Schuchardt wrote: > > On 15.05.2015 02:12, Ruchi Kandoi wrote: > > These patches add a mechanism which will accurately caculate the CPU power > > used by all the processes in the system. In order to account for the power > > used by all the processes a data field "cpu_power" has been added in the > > task_struct. > > Hello Ruchi, > > could you, please, explain why the CPU power consumption per task > information is needed. Please, consider that the CPU causes only part of > the total system power consumption which also comprises GPU, cooling, > RAM, etc. In order to accurately account for the battery used by each of the process, keeping a track of how long the process ran is not sufficient. Since running at different frequency has varying power consumption, we want to track a power number which takes into consideration the frequency as well as the core on which it was running. There are similar efforts for other subsystems too to account for the power used by each process which can then accurately be aggregated for an application. > > The patch series increases the memory size of the kernel, the memory > consumption per thread and the thread switching time. So, please, > introduce a configuration switch to enable/disable the function. > Yes, configuration can be added. Will update that in the next patch. > > This field adds power for both the system as well as user > > time. cpu_power contains the total amount of charge(in uAmsec units) used > > by the process. > > Is there any reasonable way to assign the power consumption to a single > task if multiple tasks are executed on the same core at the same time > (e.g. using hyperthreading)? > I think the power will be accounted for both the processes on their respective cores. With hyperthreading, as far as kernel is concerned they are running on different cores and the time for all the tasks will be accounted appropriately and hence power. Correct me if I am wrong. 
> > This model takes into account the frequency at which the > > process was running(i.e higher power for processes running at higher > > frequencies). It requires the cpufreq_stats module to be initialized with > > the current numbers for each of the CPU core at each frequency. This will > > be initialized during init time. > > This does not account for power consumption depending on anything else > but frequency, e.g. floating point commands consuming more power than NOPs. Currently we have been able to get power numbers for a core when they are active and running at a particular frequency. Agreed that will be a better and more accurate mode.But getting the power numbers for the type of instruction and keeping track of number of such instructions will be cumbersome. > > > Best regards > > Heinrich Schuchardt > > > > Ruchi Kandoi (2): > > cpufreq_stats: Adds sysfs file > > /sys/devices/system/cpu/cpufreq/current_in_state > > sched: cpufreq: Adds a field cpu_power in the task_struct > > > > drivers/cpufreq/cpufreq_stats.c | 191 > > +++- > > include/linux/cpufreq.h | 8 ++ > > include/linux/sched.h | 2 + > > kernel/fork.c | 1 + > > kernel/sched/cputime.c | 7 ++ > > 5 files changed, 207 insertions(+), 2 deletions(-) > > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 0/2] Adds cpu power accounting per-pid basis.
On Thu, May 14, 2015 at 11:34 PM, Heinrich Schuchardt xypron.g...@gmx.de wrote: On 15.05.2015 02:12, Ruchi Kandoi wrote: These patches add a mechanism which will accurately caculate the CPU power used by all the processes in the system. In order to account for the power used by all the processes a data field cpu_power has been added in the task_struct. Hello Ruchi, could you, please, explain why the CPU power consumption per task information is needed. Please, consider that the CPU causes only part of the total system power consumption which also comprises GPU, cooling, RAM, etc. In order to accurately account for the battery used by each of the process, keeping a track of how long the process ran is not sufficient. Since running at different frequency has varying power consumption, we want to track a power number which takes into consideration the frequency as well as the core on which it was running. There are similar efforts for other subsystems too to account for the power used by each process which can then accurately be aggregated for an application. The patch series increases the memory size of the kernel, the memory consumption per thread and the thread switching time. So, please, introduce a configuration switch to enable/disable the function. Yes, configuration can be added. Will update that in the next patch. This field adds power for both the system as well as user time. cpu_power contains the total amount of charge(in uAmsec units) used by the process. Is there any reasonable way to assign the power consumption to a single task if multiple tasks are executed on the same core at the same time (e.g. using hyperthreading)? I think the power will be accounted for both the processes on their respective cores. With hyperthreading, as far as kernel is concerned they are running on different cores and the time for all the tasks will be accounted appropriately and hence power. Correct me if I am wrong. 
This model takes into account the frequency at which the process was running(i.e higher power for processes running at higher frequencies). It requires the cpufreq_stats module to be initialized with the current numbers for each of the CPU core at each frequency. This will be initialized during init time. This does not account for power consumption depending on anything else but frequency, e.g. floating point commands consuming more power than NOPs. Currently we have been able to get power numbers for a core when they are active and running at a particular frequency. Agreed that will be a better and more accurate mode.But getting the power numbers for the type of instruction and keeping track of number of such instructions will be cumbersome. Best regards Heinrich Schuchardt Ruchi Kandoi (2): cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state sched: cpufreq: Adds a field cpu_power in the task_struct drivers/cpufreq/cpufreq_stats.c | 191 +++- include/linux/cpufreq.h | 8 ++ include/linux/sched.h | 2 + kernel/fork.c | 1 + kernel/sched/cputime.c | 7 ++ 5 files changed, 207 insertions(+), 2 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct
On Fri, May 15, 2015 at 9:07 AM, Peter Zijlstra pet...@infradead.org wrote: On Thu, May 14, 2015 at 04:55:48PM -0700, Ruchi Kandoi wrote: cpu_power has been added to keep track of amount of power each task is consuming. cpu_power is updated whenever stime and utime are updated for a task. power is computed by taking into account the frequency at which the current core was running and the current for cpu actively running at hat frequency. Both you patches completely lack any reason for me to even start considering this. _WHY_ and _what_ are you doing? We need a mechanism in which we can get information about how much cpu power each of the process(which is then aggregated fro each uid/application) is consuming. In the current architecture, it is based on the amount of the time the process ran. This brings in inaccuracy because running x seconds at low frequency will have different power consumption as compared to running at the higher frequency. With these changes we have the information about the power which is not only dependent on the time it was running but also takes into account the frequency it was running at as well as the CPU # it was running at. Because the cost of running at different CPUs at the same frequency is different. This gives a better overview of the current power state of the system wrt cpu power. Please let me know if more information is required for the same. Thanks, Ruchi Kandoi -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
On Thu, May 14, 2015 at 7:48 PM, Viresh Kumar wrote: > I am not replying for concept here, as sched maintainers are in a > better position for that, but a nit below.. > > On 14-05-15, 17:12, Ruchi Kandoi wrote: >> Adds the sysfs file for userspace to initialize the active current >> values for all the cores at each of the frequencies. >> >> The format for storing the values is as follows: >> echo "CPU<cpu#>:<freq1>=<current in uA> <freq2>=<current>,CPU<cpu#>: >> ..." > /sys/devices/system/cpu/cpufreq/current_in_state > > Why this file? And not > /sys/devices/system/cpu/cpuX/cpufreq/stats/current_in_state ? That way > you don't have to replicate the same information for all CPUs, as the > stats folder can be shared by multiple CPUs (which share their > clock/voltage rails).. Some of the hand-held devices support hot-plugging of the cpus and when the core is hot-plugged out the /sys/devices/system/cpu/cpuX/cpufreq directory is removed too. So it won't be possible to share folders by multiple CPUs. > > -- > viresh -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
On Thu, May 14, 2015 at 7:48 PM, Viresh Kumar viresh.ku...@linaro.org wrote: I am not replying for concept here, as sched maintainers are in a better position for that, but a nit below.. On 14-05-15, 17:12, Ruchi Kandoi wrote: Adds the sysfs file for userspace to initialize the active current values for all the cores at each of the frequencies. The format for storing the values is as follows: echo CPUcpu#:freq1=current in uA freq2=current,CPUcpu#: ... /sys/devices/system/cpu/cpufreq/current_in_state Why this file? And not /sys/devices/system/cpu/cpuX/cpufreq/stats/current_in_state ? That way you don't have to replicate the same information for all CPUs, as the stats folder can be shared by multiple CPUs (which share their clock/voltage rails).. Some of the hand-held devices support hot-plugging of the cpus and when the core is hot-plugged out the /sys/devices/system/cpu/cpuX/cpufreq directory is removed too. So it won't be possible to share folders by multiple CPUs. -- viresh -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
Adds the sysfs file for userspace to initialize the active current values for all the cores at each of the frequencies. The format for storing the values is as follows: echo "CPU:= =,CPU: ..." > /sys/devices/system/cpu/cpufreq/current_in_state Signed-off-by: Ruchi Kandoi --- drivers/cpufreq/cpufreq_stats.c | 163 +++- 1 file changed, 161 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 5e370a3..6f0b562 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -30,6 +30,14 @@ struct cpufreq_stats { #endif }; +struct cpufreq_power_stats { + unsigned int state_num; + unsigned int *curr; + unsigned int *freq_table; +}; + +static DEFINE_PER_CPU(struct cpufreq_power_stats *, cpufreq_power_stats); + static int cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); @@ -61,6 +69,87 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) return len; } +static void store_current_value(struct cpufreq_power_stats *powerstats, + int freq, int curr) +{ + int i; + + /* freq_table doesn't contain any CPU_FREQ_INVALID */ + for (i = 0; i < powerstats->state_num; i++) { + if (powerstats->freq_table[i] == freq) { + powerstats->curr[i] = curr; + break; + } + } +} + +static ssize_t store_current_in_state(struct cpufreq_policy *policy, + const char *buf, size_t len) +{ + char *cp, *cp2, *start, *buffer; + unsigned int cpu_num, ret, curr, freq; + struct cpufreq_power_stats *powerstats; + + if (!buf || len < 0) + return len; + + buffer = kzalloc(len + 1, GFP_KERNEL); + if (!buffer) + return len; + + strncpy(buffer, buf, len); + buffer[len] = '\0'; + cp = buffer; + spin_lock(_stats_lock); + while ((start = strsep(, ","))) { + ret = sscanf(start, "CPU%u:", _num); + if (ret != 1 || cpu_num > (num_possible_cpus() - 1)) { + ret = -EINVAL; + goto error; + } + powerstats = per_cpu(cpufreq_power_stats, cpu_num); + if (!powerstats) + 
continue; + + /* sscanf makes sure that strchr doesn't return a NULL */ + cp2 = strchr(start, ':') + 1; + while ((start = strsep(, " "))) { + if (sscanf(start, "%u=%u", , ) != 2) { + ret = -EINVAL; + goto error; + } + store_current_value(powerstats, freq, curr); + } + } + ret = len; +error: + spin_unlock(_stats_lock); + kfree(buffer); + return ret; +} + +static ssize_t show_current_in_state(struct cpufreq_policy *policy, char *buf) +{ + ssize_t len = 0; + unsigned int i, cpu; + struct cpufreq_power_stats *powerstats; + + spin_lock(_stats_lock); + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + len += scnprintf(buf + len, PAGE_SIZE - len, "CPU%d:", cpu); + for (i = 0; i < powerstats->state_num; i++) + len += scnprintf(buf + len, PAGE_SIZE - len, + "%d=%d ", powerstats->freq_table[i], + powerstats->curr[i]); + len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); + } + spin_unlock(_stats_lock); + return len; +} + #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -107,6 +196,7 @@ cpufreq_freq_attr_ro(trans_table); cpufreq_freq_attr_ro(total_trans); cpufreq_freq_attr_ro(time_in_state); +cpufreq_freq_attr_rw(current_in_state); static struct attribute *default_attrs[] = { _trans.attr, @@ -159,6 +249,67 @@ static void cpufreq_stats_free_table(unsigned int cpu) cpufreq_cpu_put(policy); } +static void cpufreq_powerstats_free(void) +{ + int cpu; + struct cpufreq_power_stats *powerstats; + + sysfs_remove_file(cpufreq_global_kobject, _in_state.attr); + + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + kfree(powerstats->curr); + kfree(powerstats); + per_cpu(cpufreq_power_stats, cpu) = NULL; + } +} + +static void cpufreq_powerstats_crea
[PATCH v2 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct
cpu_power has been added to keep track of amount of power each task is consuming. cpu_power is updated whenever stime and utime are updated for a task. power is computed by taking into account the frequency at which the current core was running and the current for cpu actively running at hat frequency. Signed-off-by: Ruchi Kandoi --- drivers/cpufreq/cpufreq_stats.c | 28 include/linux/cpufreq.h | 8 include/linux/sched.h | 2 ++ kernel/fork.c | 1 + kernel/sched/cputime.c | 7 +++ 5 files changed, 46 insertions(+) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 6f0b562..682ed898 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,6 +14,7 @@ #include #include #include +#include static spinlock_t cpufreq_stats_lock; @@ -83,6 +84,33 @@ static void store_current_value(struct cpufreq_power_stats *powerstats, } } +void acct_update_power(struct task_struct *task, cputime_t cputime) +{ + struct cpufreq_power_stats *powerstats; + struct cpufreq_stats *stats; + struct cpufreq_policy *policy; + unsigned int cpu_num, curr; + + if (!task) + return; + cpu_num = task_cpu(task); + powerstats = per_cpu(cpufreq_power_stats, cpu_num); + policy = cpufreq_cpu_get(cpu_num); + if (!policy) + return; + + if (!powerstats || !(policy->stats)) { + cpufreq_cpu_put(policy); + return; + } + + stats = policy->stats; + curr = powerstats->curr[stats->last_index]; + task->cpu_power += curr * cputime_to_usecs(cputime); + cpufreq_cpu_put(policy); +} +EXPORT_SYMBOL_GPL(acct_update_power); + static ssize_t store_current_in_state(struct cpufreq_policy *policy, const char *buf, size_t len) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..86826c8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -18,6 +18,7 @@ #include #include #include +#include /* *CPUFREQ INTERFACE * @@ -601,4 +602,11 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct 
cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +/* + * CPUFREQ STATS * + */ + +void acct_update_power(struct task_struct *p, cputime_t cputime); + #endif /* _LINUX_CPUFREQ_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e61..1f2400a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1429,6 +1429,7 @@ struct task_struct { int __user *clear_child_tid;/* CLONE_CHILD_CLEARTID */ cputime_t utime, stime, utimescaled, stimescaled; + unsigned long long cpu_power; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct cputime prev_cputime; @@ -1441,6 +1442,7 @@ struct task_struct { VTIME_USER, VTIME_SYS, } vtime_snap_whence; + #endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ diff --git a/kernel/fork.c b/kernel/fork.c index 03c1eaa..2ca0e9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1341,6 +1341,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; + p->cpu_power = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1e..53a79d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "sched.h" @@ -149,6 +150,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Account for user time used */ acct_account_cputime(p); + + /* Account power usage for user time */ + acct_update_power(p, cputime); } /* @@ -199,6 +203,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, /* Account for system time used */ acct_account_cputime(p); + + /* Account power usage for system time */ + acct_update_power(p, cputime); } /* -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line 
"unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More ma
[PATCH v2 0/2] Adds cpu power accounting per-pid basis.
These patches add a mechanism which will accurately calculate the CPU power used by all the processes in the system. In order to account for the power used by all the processes a data field "cpu_power" has been added in the task_struct. This field adds power for both the system as well as user time. cpu_power contains the total amount of charge (in uAmsec units) used by the process. This model takes into account the frequency at which the process was running (i.e., higher power for processes running at higher frequencies). It requires the cpufreq_stats module to be initialized with the current numbers for each CPU core at each frequency. This will be initialized during init time. Ruchi Kandoi (2): cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state sched: cpufreq: Adds a field cpu_power in the task_struct drivers/cpufreq/cpufreq_stats.c | 191 +++- include/linux/cpufreq.h | 8 ++ include/linux/sched.h | 2 + kernel/fork.c | 1 + kernel/sched/cputime.c | 7 ++ 5 files changed, 207 insertions(+), 2 deletions(-) -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct
cpu_power has been added to keep track of amount of power each task is consuming. cpu_power is updated whenever stime and utime are updated for a task. power is computed by taking into account the frequency at which the current core was running and the current for cpu actively running at hat frequency. Signed-off-by: Ruchi Kandoi --- drivers/cpufreq/cpufreq_stats.c | 23 +++ include/linux/cpufreq.h | 8 include/linux/sched.h | 2 ++ kernel/fork.c | 1 + kernel/sched/cputime.c | 7 +++ 5 files changed, 41 insertions(+) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 6f0b562..4a0bd9a 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,6 +14,7 @@ #include #include #include +#include static spinlock_t cpufreq_stats_lock; @@ -83,6 +84,28 @@ static void store_current_value(struct cpufreq_power_stats *powerstats, } } +void acct_update_power(struct task_struct *task, cputime_t cputime) +{ + struct cpufreq_power_stats *powerstats; + struct cpufreq_stats *stats; + struct cpufreq_policy *policy; + unsigned int cpu_num, curr; + + if (!task) + return; + cpu_num = task_cpu(task); + powerstats = per_cpu(cpufreq_power_stats, cpu_num); + policy = cpufreq_cpu_get(cpu_num); + if (!powerstats || !policy || !(policy->stats)) + return; + + stats = policy->stats; + curr = powerstats->curr[stats->last_index]; + task->cpu_power += curr * cputime_to_usecs(cputime); + cpufreq_cpu_put(cpu_num); +} +EXPORT_SYMBOL_GPL(acct_update_power); + static ssize_t store_current_in_state(struct cpufreq_policy *policy, const char *buf, size_t len) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..86826c8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -18,6 +18,7 @@ #include #include #include +#include /* *CPUFREQ INTERFACE * @@ -601,4 +602,11 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table 
*table, unsigned int transition_latency); + +/* + * CPUFREQ STATS * + */ + +void acct_update_power(struct task_struct *p, cputime_t cputime); + #endif /* _LINUX_CPUFREQ_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e61..1f2400a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1429,6 +1429,7 @@ struct task_struct { int __user *clear_child_tid;/* CLONE_CHILD_CLEARTID */ cputime_t utime, stime, utimescaled, stimescaled; + unsigned long long cpu_power; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct cputime prev_cputime; @@ -1441,6 +1442,7 @@ struct task_struct { VTIME_USER, VTIME_SYS, } vtime_snap_whence; + #endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ diff --git a/kernel/fork.c b/kernel/fork.c index 03c1eaa..2ca0e9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1341,6 +1341,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; + p->cpu_power = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1e..53a79d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "sched.h" @@ -149,6 +150,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Account for user time used */ acct_account_cputime(p); + + /* Account power usage for user time */ + acct_update_power(p, cputime); } /* @@ -199,6 +203,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, /* Account for system time used */ acct_account_cputime(p); + + /* Account power usage for system time */ + acct_update_power(p, cputime); } /* -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
Adds the sysfs file for userspace to initialize the active current values for all the cores at each of the frequencies. The format for storing the values is as follows: echo "CPU:= =,CPU: ..." > /sys/devices/system/cpu/cpufreq/current_in_state Signed-off-by: Ruchi Kandoi --- drivers/cpufreq/cpufreq_stats.c | 163 +++- 1 file changed, 161 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 5e370a3..6f0b562 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -30,6 +30,14 @@ struct cpufreq_stats { #endif }; +struct cpufreq_power_stats { + unsigned int state_num; + unsigned int *curr; + unsigned int *freq_table; +}; + +static DEFINE_PER_CPU(struct cpufreq_power_stats *, cpufreq_power_stats); + static int cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); @@ -61,6 +69,87 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) return len; } +static void store_current_value(struct cpufreq_power_stats *powerstats, + int freq, int curr) +{ + int i; + + /* freq_table doesn't contain any CPU_FREQ_INVALID */ + for (i = 0; i < powerstats->state_num; i++) { + if (powerstats->freq_table[i] == freq) { + powerstats->curr[i] = curr; + break; + } + } +} + +static ssize_t store_current_in_state(struct cpufreq_policy *policy, + const char *buf, size_t len) +{ + char *cp, *cp2, *start, *buffer; + unsigned int cpu_num, ret, curr, freq; + struct cpufreq_power_stats *powerstats; + + if (!buf || len < 0) + return len; + + buffer = kzalloc(len + 1, GFP_KERNEL); + if (!buffer) + return len; + + strncpy(buffer, buf, len); + buffer[len] = '\0'; + cp = buffer; + spin_lock(_stats_lock); + while ((start = strsep(, ","))) { + ret = sscanf(start, "CPU%u:", _num); + if (ret != 1 || cpu_num > (num_possible_cpus() - 1)) { + ret = -EINVAL; + goto error; + } + powerstats = per_cpu(cpufreq_power_stats, cpu_num); + if (!powerstats) + 
continue; + + /* sscanf makes sure that strchr doesn't return a NULL */ + cp2 = strchr(start, ':') + 1; + while ((start = strsep(, " "))) { + if (sscanf(start, "%u=%u", , ) != 2) { + ret = -EINVAL; + goto error; + } + store_current_value(powerstats, freq, curr); + } + } + ret = len; +error: + spin_unlock(_stats_lock); + kfree(buffer); + return ret; +} + +static ssize_t show_current_in_state(struct cpufreq_policy *policy, char *buf) +{ + ssize_t len = 0; + unsigned int i, cpu; + struct cpufreq_power_stats *powerstats; + + spin_lock(_stats_lock); + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + len += scnprintf(buf + len, PAGE_SIZE - len, "CPU%d:", cpu); + for (i = 0; i < powerstats->state_num; i++) + len += scnprintf(buf + len, PAGE_SIZE - len, + "%d=%d ", powerstats->freq_table[i], + powerstats->curr[i]); + len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); + } + spin_unlock(_stats_lock); + return len; +} + #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -107,6 +196,7 @@ cpufreq_freq_attr_ro(trans_table); cpufreq_freq_attr_ro(total_trans); cpufreq_freq_attr_ro(time_in_state); +cpufreq_freq_attr_rw(current_in_state); static struct attribute *default_attrs[] = { _trans.attr, @@ -159,6 +249,67 @@ static void cpufreq_stats_free_table(unsigned int cpu) cpufreq_cpu_put(policy); } +static void cpufreq_powerstats_free(void) +{ + int cpu; + struct cpufreq_power_stats *powerstats; + + sysfs_remove_file(cpufreq_global_kobject, _in_state.attr); + + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + kfree(powerstats->curr); + kfree(powerstats); + per_cpu(cpufreq_power_stats, cpu) = NULL; + } +} + +static void cpufreq_powerstats_crea
[PATCH 0/2] Adds cpu power accounting per-pid basis.
These patches add a mechanism which will accurately calculate the CPU power used by all the processes in the system. In order to account for the power used by all the processes a data field "cpu_power" has been added in the task_struct. This field adds power for both the system as well as user time. cpu_power contains the total amount of charge (in uAmsec units) used by the process. This model takes into account the frequency at which the process was running (i.e., higher power for processes running at higher frequencies). It requires the cpufreq_stats module to be initialized with the current numbers for each CPU core at each frequency. This will be initialized during init time. Ruchi Kandoi (2): cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state sched: cpufreq: Adds a field cpu_power in the task_struct drivers/cpufreq/cpufreq_stats.c | 186 +++- include/linux/cpufreq.h | 8 ++ include/linux/sched.h | 2 + kernel/fork.c | 1 + kernel/sched/cputime.c | 7 ++ 5 files changed, 202 insertions(+), 2 deletions(-) -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
Adds the sysfs file for userspace to initialize the active current values for all the cores at each of the frequencies. The format for storing the values is as follows: echo CPUcpu#:freq1=current in uA freq2=current,CPUcpu#: ... /sys/devices/system/cpu/cpufreq/current_in_state Signed-off-by: Ruchi Kandoi kandoiru...@google.com --- drivers/cpufreq/cpufreq_stats.c | 163 +++- 1 file changed, 161 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 5e370a3..6f0b562 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -30,6 +30,14 @@ struct cpufreq_stats { #endif }; +struct cpufreq_power_stats { + unsigned int state_num; + unsigned int *curr; + unsigned int *freq_table; +}; + +static DEFINE_PER_CPU(struct cpufreq_power_stats *, cpufreq_power_stats); + static int cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); @@ -61,6 +69,87 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) return len; } +static void store_current_value(struct cpufreq_power_stats *powerstats, + int freq, int curr) +{ + int i; + + /* freq_table doesn't contain any CPU_FREQ_INVALID */ + for (i = 0; i powerstats-state_num; i++) { + if (powerstats-freq_table[i] == freq) { + powerstats-curr[i] = curr; + break; + } + } +} + +static ssize_t store_current_in_state(struct cpufreq_policy *policy, + const char *buf, size_t len) +{ + char *cp, *cp2, *start, *buffer; + unsigned int cpu_num, ret, curr, freq; + struct cpufreq_power_stats *powerstats; + + if (!buf || len 0) + return len; + + buffer = kzalloc(len + 1, GFP_KERNEL); + if (!buffer) + return len; + + strncpy(buffer, buf, len); + buffer[len] = '\0'; + cp = buffer; + spin_lock(cpufreq_stats_lock); + while ((start = strsep(cp, ,))) { + ret = sscanf(start, CPU%u:, cpu_num); + if (ret != 1 || cpu_num (num_possible_cpus() - 1)) { + ret = -EINVAL; + goto error; + } + powerstats = 
per_cpu(cpufreq_power_stats, cpu_num); + if (!powerstats) + continue; + + /* sscanf makes sure that strchr doesn't return a NULL */ + cp2 = strchr(start, ':') + 1; + while ((start = strsep(cp2, ))) { + if (sscanf(start, %u=%u, freq, curr) != 2) { + ret = -EINVAL; + goto error; + } + store_current_value(powerstats, freq, curr); + } + } + ret = len; +error: + spin_unlock(cpufreq_stats_lock); + kfree(buffer); + return ret; +} + +static ssize_t show_current_in_state(struct cpufreq_policy *policy, char *buf) +{ + ssize_t len = 0; + unsigned int i, cpu; + struct cpufreq_power_stats *powerstats; + + spin_lock(cpufreq_stats_lock); + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + len += scnprintf(buf + len, PAGE_SIZE - len, CPU%d:, cpu); + for (i = 0; i powerstats-state_num; i++) + len += scnprintf(buf + len, PAGE_SIZE - len, + %d=%d , powerstats-freq_table[i], + powerstats-curr[i]); + len += scnprintf(buf + len, PAGE_SIZE - len, \n); + } + spin_unlock(cpufreq_stats_lock); + return len; +} + #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -107,6 +196,7 @@ cpufreq_freq_attr_ro(trans_table); cpufreq_freq_attr_ro(total_trans); cpufreq_freq_attr_ro(time_in_state); +cpufreq_freq_attr_rw(current_in_state); static struct attribute *default_attrs[] = { total_trans.attr, @@ -159,6 +249,67 @@ static void cpufreq_stats_free_table(unsigned int cpu) cpufreq_cpu_put(policy); } +static void cpufreq_powerstats_free(void) +{ + int cpu; + struct cpufreq_power_stats *powerstats; + + sysfs_remove_file(cpufreq_global_kobject, current_in_state.attr); + + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + kfree(powerstats-curr); + kfree(powerstats); + per_cpu(cpufreq_power_stats, cpu) = NULL; + } +} + +static void cpufreq_powerstats_create(unsigned int cpu
[PATCH 0/2] Adds cpu power accounting per-pid basis.
These patches add a mechanism which will accurately calculate the CPU power used by all the processes in the system. In order to account for the power used by all the processes, a data field cpu_power has been added to the task_struct. This field accumulates power for both system and user time. cpu_power contains the total amount of charge (in uAmsec units) used by the process. This model takes into account the frequency at which the process was running (i.e., higher power for processes running at higher frequencies). It requires the cpufreq_stats module to be initialized with the current numbers for each CPU core at each frequency. This will be initialized during init time. Ruchi Kandoi (2): cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state sched: cpufreq: Adds a field cpu_power in the task_struct drivers/cpufreq/cpufreq_stats.c | 186 +++- include/linux/cpufreq.h | 8 ++ include/linux/sched.h | 2 + kernel/fork.c | 1 + kernel/sched/cputime.c | 7 ++ 5 files changed, 202 insertions(+), 2 deletions(-) -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct
cpu_power has been added to keep track of the amount of power each task is consuming. cpu_power is updated whenever stime and utime are updated for a task. Power is computed by taking into account the frequency at which the current core was running and the current drawn by the CPU while actively running at that frequency. Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- drivers/cpufreq/cpufreq_stats.c | 28 include/linux/cpufreq.h | 8 include/linux/sched.h | 2 ++ kernel/fork.c | 1 + kernel/sched/cputime.c | 7 +++ 5 files changed, 46 insertions(+) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 6f0b562..682ed898 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,6 +14,7 @@ #include linux/module.h #include linux/slab.h #include linux/cputime.h +#include linux/sched.h static spinlock_t cpufreq_stats_lock; @@ -83,6 +84,33 @@ static void store_current_value(struct cpufreq_power_stats *powerstats, } } +void acct_update_power(struct task_struct *task, cputime_t cputime) +{ + struct cpufreq_power_stats *powerstats; + struct cpufreq_stats *stats; + struct cpufreq_policy *policy; + unsigned int cpu_num, curr; + + if (!task) + return; + cpu_num = task_cpu(task); + powerstats = per_cpu(cpufreq_power_stats, cpu_num); + policy = cpufreq_cpu_get(cpu_num); + if (!policy) + return; + + if (!powerstats || !(policy-stats)) { + cpufreq_cpu_put(policy); + return; + } + + stats = policy-stats; + curr = powerstats-curr[stats-last_index]; + task-cpu_power += curr * cputime_to_usecs(cputime); + cpufreq_cpu_put(policy); +} +EXPORT_SYMBOL_GPL(acct_update_power); + static ssize_t store_current_in_state(struct cpufreq_policy *policy, const char *buf, size_t len) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..86826c8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -18,6 +18,7 @@ #include linux/notifier.h #include linux/spinlock.h #include linux/sysfs.h +#include asm/cputime.h /* 
*CPUFREQ INTERFACE * @@ -601,4 +602,11 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +/* + * CPUFREQ STATS * + */ + +void acct_update_power(struct task_struct *p, cputime_t cputime); + #endif /* _LINUX_CPUFREQ_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e61..1f2400a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1429,6 +1429,7 @@ struct task_struct { int __user *clear_child_tid;/* CLONE_CHILD_CLEARTID */ cputime_t utime, stime, utimescaled, stimescaled; + unsigned long long cpu_power; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct cputime prev_cputime; @@ -1441,6 +1442,7 @@ struct task_struct { VTIME_USER, VTIME_SYS, } vtime_snap_whence; + #endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ diff --git a/kernel/fork.c b/kernel/fork.c index 03c1eaa..2ca0e9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1341,6 +1341,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p-utime = p-stime = p-gtime = 0; p-utimescaled = p-stimescaled = 0; + p-cpu_power = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p-prev_cputime.utime = p-prev_cputime.stime = 0; #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1e..53a79d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,6 +4,7 @@ #include linux/kernel_stat.h #include linux/static_key.h #include linux/context_tracking.h +#include linux/cpufreq.h #include sched.h @@ -149,6 +150,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Account for user time used */ acct_account_cputime(p); + + /* Account power usage for user time */ + acct_update_power(p, cputime); } /* @@ -199,6 +203,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, /* Account for system time used 
*/ acct_account_cputime(p); + + /* Account power usage for system time */ + acct_update_power(p, cputime); } /* -- 2.2.0.rc0.207.ga3a616c
[PATCH v2 0/2] Adds cpu power accounting per-pid basis.
These patches add a mechanism which will accurately calculate the CPU power used by all the processes in the system. In order to account for the power used by all the processes, a data field cpu_power has been added to the task_struct. This field accumulates power for both system and user time. cpu_power contains the total amount of charge (in uAmsec units) used by the process. This model takes into account the frequency at which the process was running (i.e., higher power for processes running at higher frequencies). It requires the cpufreq_stats module to be initialized with the current numbers for each CPU core at each frequency. This will be initialized during init time. Ruchi Kandoi (2): cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state sched: cpufreq: Adds a field cpu_power in the task_struct drivers/cpufreq/cpufreq_stats.c | 191 +++- include/linux/cpufreq.h | 8 ++ include/linux/sched.h | 2 + kernel/fork.c | 1 + kernel/sched/cputime.c | 7 ++ 5 files changed, 207 insertions(+), 2 deletions(-) -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] sched: cpufreq: Adds a field cpu_power in the task_struct
cpu_power has been added to keep track of the amount of power each task is consuming. cpu_power is updated whenever stime and utime are updated for a task. Power is computed by taking into account the frequency at which the current core was running and the current drawn by the CPU while actively running at that frequency. Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- drivers/cpufreq/cpufreq_stats.c | 23 +++ include/linux/cpufreq.h | 8 include/linux/sched.h | 2 ++ kernel/fork.c | 1 + kernel/sched/cputime.c | 7 +++ 5 files changed, 41 insertions(+) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 6f0b562..4a0bd9a 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,6 +14,7 @@ #include linux/module.h #include linux/slab.h #include linux/cputime.h +#include linux/sched.h static spinlock_t cpufreq_stats_lock; @@ -83,6 +84,28 @@ static void store_current_value(struct cpufreq_power_stats *powerstats, } } +void acct_update_power(struct task_struct *task, cputime_t cputime) +{ + struct cpufreq_power_stats *powerstats; + struct cpufreq_stats *stats; + struct cpufreq_policy *policy; + unsigned int cpu_num, curr; + + if (!task) + return; + cpu_num = task_cpu(task); + powerstats = per_cpu(cpufreq_power_stats, cpu_num); + policy = cpufreq_cpu_get(cpu_num); + if (!powerstats || !policy || !(policy-stats)) + return; + + stats = policy-stats; + curr = powerstats-curr[stats-last_index]; + task-cpu_power += curr * cputime_to_usecs(cputime); + cpufreq_cpu_put(cpu_num); +} +EXPORT_SYMBOL_GPL(acct_update_power); + static ssize_t store_current_in_state(struct cpufreq_policy *policy, const char *buf, size_t len) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 2ee4888..86826c8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -18,6 +18,7 @@ #include linux/notifier.h #include linux/spinlock.h #include linux/sysfs.h +#include asm/cputime.h /* *CPUFREQ INTERFACE * @@ -601,4 +602,11 @@ 
unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +/* + * CPUFREQ STATS * + */ + +void acct_update_power(struct task_struct *p, cputime_t cputime); + #endif /* _LINUX_CPUFREQ_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 26a2e61..1f2400a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1429,6 +1429,7 @@ struct task_struct { int __user *clear_child_tid;/* CLONE_CHILD_CLEARTID */ cputime_t utime, stime, utimescaled, stimescaled; + unsigned long long cpu_power; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct cputime prev_cputime; @@ -1441,6 +1442,7 @@ struct task_struct { VTIME_USER, VTIME_SYS, } vtime_snap_whence; + #endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ diff --git a/kernel/fork.c b/kernel/fork.c index 03c1eaa..2ca0e9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1341,6 +1341,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p-utime = p-stime = p-gtime = 0; p-utimescaled = p-stimescaled = 0; + p-cpu_power = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p-prev_cputime.utime = p-prev_cputime.stime = 0; #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8394b1e..53a79d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,6 +4,7 @@ #include linux/kernel_stat.h #include linux/static_key.h #include linux/context_tracking.h +#include linux/cpufreq.h #include sched.h @@ -149,6 +150,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime, /* Account for user time used */ acct_account_cputime(p); + + /* Account power usage for user time */ + acct_update_power(p, cputime); } /* @@ -199,6 +203,9 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, /* Account for system time used */ acct_account_cputime(p); + + /* Account 
power usage for system time */ + acct_update_power(p, cputime); } /* -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body
[PATCH v2 1/2] cpufreq_stats: Adds sysfs file /sys/devices/system/cpu/cpufreq/current_in_state
Adds the sysfs file for userspace to initialize the active current values for all the cores at each of the frequencies. The format for storing the values is as follows: echo CPUcpu#:freq1=current in uA freq2=current,CPUcpu#: ... /sys/devices/system/cpu/cpufreq/current_in_state Signed-off-by: Ruchi Kandoi kandoiru...@google.com --- drivers/cpufreq/cpufreq_stats.c | 163 +++- 1 file changed, 161 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 5e370a3..6f0b562 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -30,6 +30,14 @@ struct cpufreq_stats { #endif }; +struct cpufreq_power_stats { + unsigned int state_num; + unsigned int *curr; + unsigned int *freq_table; +}; + +static DEFINE_PER_CPU(struct cpufreq_power_stats *, cpufreq_power_stats); + static int cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); @@ -61,6 +69,87 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) return len; } +static void store_current_value(struct cpufreq_power_stats *powerstats, + int freq, int curr) +{ + int i; + + /* freq_table doesn't contain any CPU_FREQ_INVALID */ + for (i = 0; i powerstats-state_num; i++) { + if (powerstats-freq_table[i] == freq) { + powerstats-curr[i] = curr; + break; + } + } +} + +static ssize_t store_current_in_state(struct cpufreq_policy *policy, + const char *buf, size_t len) +{ + char *cp, *cp2, *start, *buffer; + unsigned int cpu_num, ret, curr, freq; + struct cpufreq_power_stats *powerstats; + + if (!buf || len 0) + return len; + + buffer = kzalloc(len + 1, GFP_KERNEL); + if (!buffer) + return len; + + strncpy(buffer, buf, len); + buffer[len] = '\0'; + cp = buffer; + spin_lock(cpufreq_stats_lock); + while ((start = strsep(cp, ,))) { + ret = sscanf(start, CPU%u:, cpu_num); + if (ret != 1 || cpu_num (num_possible_cpus() - 1)) { + ret = -EINVAL; + goto error; + } + powerstats = 
per_cpu(cpufreq_power_stats, cpu_num); + if (!powerstats) + continue; + + /* sscanf makes sure that strchr doesn't return a NULL */ + cp2 = strchr(start, ':') + 1; + while ((start = strsep(cp2, ))) { + if (sscanf(start, %u=%u, freq, curr) != 2) { + ret = -EINVAL; + goto error; + } + store_current_value(powerstats, freq, curr); + } + } + ret = len; +error: + spin_unlock(cpufreq_stats_lock); + kfree(buffer); + return ret; +} + +static ssize_t show_current_in_state(struct cpufreq_policy *policy, char *buf) +{ + ssize_t len = 0; + unsigned int i, cpu; + struct cpufreq_power_stats *powerstats; + + spin_lock(cpufreq_stats_lock); + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + len += scnprintf(buf + len, PAGE_SIZE - len, CPU%d:, cpu); + for (i = 0; i powerstats-state_num; i++) + len += scnprintf(buf + len, PAGE_SIZE - len, + %d=%d , powerstats-freq_table[i], + powerstats-curr[i]); + len += scnprintf(buf + len, PAGE_SIZE - len, \n); + } + spin_unlock(cpufreq_stats_lock); + return len; +} + #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -107,6 +196,7 @@ cpufreq_freq_attr_ro(trans_table); cpufreq_freq_attr_ro(total_trans); cpufreq_freq_attr_ro(time_in_state); +cpufreq_freq_attr_rw(current_in_state); static struct attribute *default_attrs[] = { total_trans.attr, @@ -159,6 +249,67 @@ static void cpufreq_stats_free_table(unsigned int cpu) cpufreq_cpu_put(policy); } +static void cpufreq_powerstats_free(void) +{ + int cpu; + struct cpufreq_power_stats *powerstats; + + sysfs_remove_file(cpufreq_global_kobject, current_in_state.attr); + + for_each_possible_cpu(cpu) { + powerstats = per_cpu(cpufreq_power_stats, cpu); + if (!powerstats) + continue; + kfree(powerstats-curr); + kfree(powerstats); + per_cpu(cpufreq_power_stats, cpu) = NULL; + } +} + +static void cpufreq_powerstats_create(unsigned int cpu
[PATCH] suspend: Return error when pending wakeup source is found.
If a wakeup source is found to be pending in the last stage of suspend, after syscore suspend, the device does not suspend, but the error is not propagated. This causes an error in the accounting of the number of suspend aborts and successful suspends. Signed-off-by: Ruchi Kandoi --- kernel/power/suspend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8d7a1ef..343b4e4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -366,6 +366,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) trace_suspend_resume(TPS("machine_suspend"), state, false); events_check_enabled = false; + } else if (*wakeup) { + error = -EBUSY; } syscore_resume(); } -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] suspend: Return error when pending wakeup source is found.
If a wakeup source is found to be pending in the last stage of suspend, after syscore suspend, the device does not suspend, but the error is not propagated. This causes an error in the accounting of the number of suspend aborts and successful suspends. Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> --- kernel/power/suspend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8d7a1ef..343b4e4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -366,6 +366,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) trace_suspend_resume(TPS("machine_suspend"), state, false); events_check_enabled = false; + } else if (*wakeup) { + error = -EBUSY; } syscore_resume(); } -- 2.2.0.rc0.207.ga3a616c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3] power: add an API to log wakeup reasons
True, we could create new wakeup sources specifically to track this information, perhaps as needed once an IRQ is first observed to trigger a wakeup. We would want to know which wakeup sources were responsible for the most recent wakeup, since we keep a timeline of suspend/resume events with wakeup reasons and durations. It may require some extra work to keep track of this information. Apart from Google, other vendors like Qualcomm and Nvidia have already introduced similar kinds of logging in their respective interrupt controller drivers. We would really like to make this a standardized logging for debugging purposes. On Thu, Mar 13, 2014 at 6:06 PM, Rafael J. Wysocki wrote: > On Thursday, March 13, 2014 05:43:20 PM Ruchi Kandoi wrote: >> This should be true most of the times. >> >> But there might be cases otherwise too. >> >> For instance, there was a bug earlier with wi-fi which would cause the >> system to wake up but not get hold of a wakeup source because there >> wasn't any work for it to do. In that case, the wakeup sources would >> not log such an event. >> >> Additionally, there could be a situation where an IRQ caused the >> system to resume from suspend. And since the system was up, a driver >> could take a wakeup source. In this case we would assume that the >> driver would have woken the system, but in reality the driver held the >> wakeup source only because the system was up and did not cause the >> wake up to happen. > > But you can create special wakeup sources associated with interrupts (in > addition to the existing ones) and use the statistics for those. > > It is possible to define wakeup sources that don't correspond to any > devices. > > Rafael > > >> On Thu, Mar 13, 2014 at 3:18 PM, Rafael J. Wysocki >> wrote: >> > Hi, >> > >> > I saw the v4, but I don't have it handy, so replying here. 
>> > >> > On Wednesday, March 12, 2014 12:46:38 PM Ruchi Kandoi wrote: >> >> For power management diagnostic purposes, it is often useful to know >> >> what interrupts are frequently waking the system from low power >> >> suspend mode, especially on battery-powered consumer electronics >> >> devices that are expected to spend much of their time in low-power >> >> suspend while not in active use. For example, reduced battery life on >> >> a mobile phone may be caused in part by frequent wakeups by broadcast >> >> traffic on a busy wireless LAN even while the screen is off and the >> >> phone not in active use. >> >> >> >> Add API log_wakeup_reason() exposes it to userspace via the sysfs path >> >> /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called >> >> from the paltform specific, or from the driver for the interrupt >> >> controller, >> >> when the system resumes because of an IRQ. It logs the reasons which >> >> caused >> >> the system to wakeup from the low-power mode. >> > >> > So what exactly is wrong with using wakeup sources for this purpose? >> > >> > -- >> > I speak only for myself. >> > Rafael J. Wysocki, Intel Open Source Technology Center. >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in >> the body of a message to majord...@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> Please read the FAQ at http://www.tux.org/lkml/ > > -- > I speak only for myself. > Rafael J. Wysocki, Intel Open Source Technology Center. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3] power: add an API to log wakeup reasons
This should be true most of the time. But there might be cases where it is not. For instance, there was a bug earlier with wi-fi which would cause the system to wake up but not get hold of a wakeup source because there wasn't any work for it to do. In that case, the wakeup sources would not log such an event. Additionally, there could be a situation where an IRQ caused the system to resume from suspend. And since the system was up, a driver could take a wakeup source. In this case we would assume that the driver had woken the system, but in reality the driver held the wakeup source only because the system was up and did not cause the wakeup to happen. Regards, Ruchi Kandoi On Thu, Mar 13, 2014 at 3:18 PM, Rafael J. Wysocki wrote: > Hi, > > I saw the v4, but I don't have it handy, so replying here. > > On Wednesday, March 12, 2014 12:46:38 PM Ruchi Kandoi wrote: >> For power management diagnostic purposes, it is often useful to know >> what interrupts are frequently waking the system from low power >> suspend mode, especially on battery-powered consumer electronics >> devices that are expected to spend much of their time in low-power >> suspend while not in active use. For example, reduced battery life on >> a mobile phone may be caused in part by frequent wakeups by broadcast >> traffic on a busy wireless LAN even while the screen is off and the >> phone not in active use. >> >> Add API log_wakeup_reason() exposes it to userspace via the sysfs path >> /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called >> from the paltform specific, or from the driver for the interrupt controller, >> when the system resumes because of an IRQ. It logs the reasons which caused >> the system to wakeup from the low-power mode. > > So what exactly is wrong with using wakeup sources for this purpose? > > -- > I speak only for myself. > Rafael J. Wysocki, Intel Open Source Technology Center. 
-- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v4] power: add an API to log wakeup reasons
For power management diagnostic purposes, it is often useful to know what interrupts are frequently waking the system from low power suspend mode, especially on battery-powered consumer electronics devices that are expected to spend much of their time in low-power suspend while not in active use. For example, reduced battery life on a mobile phone may be caused in part by frequent wakeups by broadcast traffic on a busy wireless LAN even while the screen is off and the phone not in active use. Add the API log_wakeup_reason(), which logs the wakeup reason and exposes it to userspace via the sysfs path /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called from the platform-specific code, or from the driver for the interrupt controller, when the system resumes because of an IRQ. It logs the reasons which caused the system to wake up from the low-power mode. This information can be collected by userspace as part of the accounting kept on power consumption. Signed-off-by: Ruchi Kandoi Signed-off-by: Greg Hackmann --- space added after prefix in the pr_fmt() --- include/linux/wakeup_reason.h | 23 +++ kernel/power/Makefile | 2 +- kernel/power/wakeup_reason.c | 141 ++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/wakeup_reason.h create mode 100644 kernel/power/wakeup_reason.c diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000..7ce50f0 --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,23 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +void log_wakeup_reason(int irq); + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..f98f021 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o -obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_SUSPEND) += suspend.o wakeup_reason.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000..c4ab205 --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,141 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#define pr_fmt(fmt) "wakeup_reason: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static struct kobject *wakeup_reason; +static spinlock_t resume_reason_lock; + +static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(_reason_lock); + for (irq_no = 0; irq_no < irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc && desc->action && desc->action->name) + buf_offset += sprintf(buf + buf_offset, "%d %s\n", + irq_list[irq_no], desc->action->name); + else + buf_offset += sprintf(buf + buf_offset, "%d\n", + irq_list[irq_no]); + } + spin_unlock(_reason_lock); + return buf_offset; +} + +static struct kobj_attribute resume_reason = __ATTR(last_re
[PATCH v4] power: add an API to log wakeup reasons
For power management diagnostic purposes, it is often useful to know what interrupts are frequently waking the system from low power suspend mode, especially on battery-powered consumer electronics devices that are expected to spend much of their time in low-power suspend while not in active use. For example, reduced battery life on a mobile phone may be caused in part by frequent wakeups by broadcast traffic on a busy wireless LAN even while the screen is off and the phone not in active use. Add the API log_wakeup_reason(), which logs the wakeup reason and exposes it to userspace via the sysfs path /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called from the platform-specific code, or from the driver for the interrupt controller, when the system resumes because of an IRQ. It logs the reasons which caused the system to wake up from the low-power mode. This information can be collected by userspace as part of the accounting kept on power consumption. Signed-off-by: Ruchi Kandoi <kandoiru...@google.com> Signed-off-by: Greg Hackmann <ghackm...@google.com> --- space added after prefix in the pr_fmt() --- include/linux/wakeup_reason.h | 23 +++ kernel/power/Makefile | 2 +- kernel/power/wakeup_reason.c | 141 ++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/wakeup_reason.h create mode 100644 kernel/power/wakeup_reason.c diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000..7ce50f0 --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,23 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +void log_wakeup_reason(int irq); + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..f98f021 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o -obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_SUSPEND) += suspend.o wakeup_reason.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000..c4ab205 --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,141 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#define pr_fmt(fmt) wakeup_reason: fmt + +#include linux/wakeup_reason.h +#include linux/kernel.h +#include linux/irq.h +#include linux/interrupt.h +#include linux/io.h +#include linux/kobject.h +#include linux/sysfs.h +#include linux/init.h +#include linux/spinlock.h +#include linux/notifier.h +#include linux/suspend.h + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static struct kobject *wakeup_reason; +static spinlock_t resume_reason_lock; + +static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(resume_reason_lock); + for (irq_no = 0; irq_no irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc desc-action desc-action-name) + buf_offset += sprintf(buf + buf_offset, %d %s\n, + irq_list[irq_no], desc-action-name); + else + buf_offset += sprintf(buf + buf_offset, %d\n, + irq_list[irq_no
Re: [PATCH v3] power: add an API to log wakeup reasons
This should be true most of the time. But there might be cases where it is not. For instance, there was a bug earlier with wi-fi which would cause the system to wake up but not get hold of a wakeup source because there wasn't any work for it to do. In that case, the wakeup sources would not log such an event. Additionally, there could be a situation where an IRQ caused the system to resume from suspend. And since the system was up, a driver could take a wakeup source. In this case we would assume that the driver had woken the system, but in reality the driver held the wakeup source only because the system was up and did not cause the wakeup to happen. Regards, Ruchi Kandoi On Thu, Mar 13, 2014 at 3:18 PM, Rafael J. Wysocki r...@rjwysocki.net wrote: Hi, I saw the v4, but I don't have it handy, so replying here. On Wednesday, March 12, 2014 12:46:38 PM Ruchi Kandoi wrote: For power management diagnostic purposes, it is often useful to know what interrupts are frequently waking the system from low power suspend mode, especially on battery-powered consumer electronics devices that are expected to spend much of their time in low-power suspend while not in active use. For example, reduced battery life on a mobile phone may be caused in part by frequent wakeups by broadcast traffic on a busy wireless LAN even while the screen is off and the phone is not in active use. Add the API log_wakeup_reason() and expose the logged reasons to userspace via the sysfs path /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called from the platform-specific code, or from the driver for the interrupt controller, when the system resumes because of an IRQ. It logs the reasons which caused the system to wake up from the low-power mode. So what exactly is wrong with using wakeup sources for this purpose? -- I speak only for myself. Rafael J. Wysocki, Intel Open Source Technology Center. 
-- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v3] power: add an API to log wakeup reasons
True, we could create new wakeup sources specifically to track this information, perhaps as needed once an IRQ is first observed to trigger a wakeup. We would want to know which wakeup sources were responsible for the most recent wakeup, since we keep a timeline of suspend/resume events with wakeup reasons and durations. It may require some extra work to keep track of this information. Apart from Google, other vendors like Qualcomm and Nvidia have already introduced similar kinds of logging in their respective interrupt controller drivers. We would really like to make this a standardized form of logging for debugging purposes. On Thu, Mar 13, 2014 at 6:06 PM, Rafael J. Wysocki r...@rjwysocki.net wrote: On Thursday, March 13, 2014 05:43:20 PM Ruchi Kandoi wrote: This should be true most of the time. But there might be cases where it is not. For instance, there was a bug earlier with wi-fi which would cause the system to wake up but not get hold of a wakeup source because there wasn't any work for it to do. In that case, the wakeup sources would not log such an event. Additionally, there could be a situation where an IRQ caused the system to resume from suspend. And since the system was up, a driver could take a wakeup source. In this case we would assume that the driver had woken the system, but in reality the driver held the wakeup source only because the system was up and did not cause the wakeup to happen. But you can create special wakeup sources associated with interrupts (in addition to the existing ones) and use the statistics for those. It is possible to define wakeup sources that don't correspond to any devices. Rafael On Thu, Mar 13, 2014 at 3:18 PM, Rafael J. Wysocki r...@rjwysocki.net wrote: Hi, I saw the v4, but I don't have it handy, so replying here. 
On Wednesday, March 12, 2014 12:46:38 PM Ruchi Kandoi wrote: For power management diagnostic purposes, it is often useful to know what interrupts are frequently waking the system from low power suspend mode, especially on battery-powered consumer electronics devices that are expected to spend much of their time in low-power suspend while not in active use. For example, reduced battery life on a mobile phone may be caused in part by frequent wakeups by broadcast traffic on a busy wireless LAN even while the screen is off and the phone is not in active use. Add the API log_wakeup_reason() and expose the logged reasons to userspace via the sysfs path /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called from the platform-specific code, or from the driver for the interrupt controller, when the system resumes because of an IRQ. It logs the reasons which caused the system to wake up from the low-power mode. So what exactly is wrong with using wakeup sources for this purpose? -- I speak only for myself. Rafael J. Wysocki, Intel Open Source Technology Center. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- I speak only for myself. Rafael J. Wysocki, Intel Open Source Technology Center. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3] power: add an API to log wakeup reasons
For power management diagnostic purposes, it is often useful to know what interrupts are frequently waking the system from low power suspend mode, especially on battery-powered consumer electronics devices that are expected to spend much of their time in low-power suspend while not in active use. For example, reduced battery life on a mobile phone may be caused in part by frequent wakeups by broadcast traffic on a busy wireless LAN even while the screen is off and the phone not in active use. Add API log_wakeup_reason() exposes it to userspace via the sysfs path /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called from the paltform specific, or from the driver for the interrupt controller, when the system resumes because of an IRQ. It logs the reasons which caused the system to wakeup from the low-power mode. This information can be collected by userspace as part of the accounting kept on power consumption. Signed-off-by: Ruchi Kandoi Signed-off-by: Greg Hackmann --- commit message changed for clarity --- include/linux/wakeup_reason.h | 23 +++ kernel/power/Makefile | 2 +- kernel/power/wakeup_reason.c | 141 ++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/wakeup_reason.h create mode 100644 kernel/power/wakeup_reason.c diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000..7ce50f0 --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,23 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +void log_wakeup_reason(int irq); + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..f98f021 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o -obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_SUSPEND) += suspend.o wakeup_reason.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000..a21c592 --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,141 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#define pr_fmt(fmt) "wakeup_reason:" fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static struct kobject *wakeup_reason; +static spinlock_t resume_reason_lock; + +static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(_reason_lock); + for (irq_no = 0; irq_no < irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc && desc->action && desc->action->name) + buf_offset += sprintf(buf + buf_offset, "%d %s\n", + irq_list[irq_no], desc->action->name); + else + buf_offset += sprintf(buf + buf_offset, "%d\n", + irq_list[irq_no]); + } + spin_unlock(_reason_lock); + return buf_offset; +} + +static struct kobj_attribute resume_reason = __ATTR(last_resume_re
[PATCH v3] power: add an API to log wakeup reasons
For power management diagnostic purposes, it is often useful to know what interrupts are frequently waking the system from low power suspend mode, especially on battery-powered consumer electronics devices that are expected to spend much of their time in low-power suspend while not in active use. For example, reduced battery life on a mobile phone may be caused in part by frequent wakeups by broadcast traffic on a busy wireless LAN even while the screen is off and the phone not in active use. Add API log_wakeup_reason() exposes it to userspace via the sysfs path /sys/kernel/wakeup_reasons/last_resume_reason. This API would be called from the paltform specific, or from the driver for the interrupt controller, when the system resumes because of an IRQ. It logs the reasons which caused the system to wakeup from the low-power mode. This information can be collected by userspace as part of the accounting kept on power consumption. Signed-off-by: Ruchi Kandoi kandoiru...@google.com Signed-off-by: Greg Hackmann ghackm...@google.com --- commit message changed for clarity --- include/linux/wakeup_reason.h | 23 +++ kernel/power/Makefile | 2 +- kernel/power/wakeup_reason.c | 141 ++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 include/linux/wakeup_reason.h create mode 100644 kernel/power/wakeup_reason.c diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000..7ce50f0 --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,23 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +void log_wakeup_reason(int irq); + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..f98f021 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o -obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_SUSPEND) += suspend.o wakeup_reason.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000..a21c592 --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,141 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ +#define pr_fmt(fmt) wakeup_reason: fmt + +#include linux/wakeup_reason.h +#include linux/kernel.h +#include linux/irq.h +#include linux/interrupt.h +#include linux/io.h +#include linux/kobject.h +#include linux/sysfs.h +#include linux/init.h +#include linux/spinlock.h +#include linux/notifier.h +#include linux/suspend.h + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static struct kobject *wakeup_reason; +static spinlock_t resume_reason_lock; + +static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(resume_reason_lock); + for (irq_no = 0; irq_no irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc desc-action desc-action-name) + buf_offset += sprintf(buf + buf_offset, %d %s\n, + irq_list[irq_no], desc-action-name); + else + buf_offset += sprintf(buf + buf_offset, %d\n, + irq_list[irq_no
Re: [PATCH] power: add an API to log wakeup reasons
This API would be called from the platform specific code, or the driver for the interrupt controller, when the system resumes from the suspend because of an IRQ. We track the reasons for which systems wake up from the low power suspend mode. This is especially important on battery-powered consumer electronic devices. Analyzing the data helps us figure what caused the maximum wake ups and if something can be done about the same to improve the battery life. For instances, if the wi-fi network traffic or the radio traffic causes the system to frequently wakeup from the low-power mode. This is already in use on some Android devices. We are trying to make this a generic API which could be called by other platforms as well, standardizing the format in which the info is logged in dmesg and the format of the info exported to userspace for collecting power management statistics. Thanking you, Ruchi Kandoi On Tue, Mar 11, 2014 at 12:32 PM, Rafael J. Wysocki wrote: > On Monday, March 10, 2014 07:02:02 PM Ruchi Kandoi wrote: >> Add API log_wakeup_reason() and expose it to userspace via sysfs path >> /sys/kernel/wakeup_reasons/last_resume_reason >> This is useful for power management diagnostic purposes. > > What's the use case and how is it supposed to work? > >> Signed-off-by: Ruchi Kandoi >> Signed-off-by: Greg Hackmann >> --- >> include/linux/wakeup_reason.h | 23 +++ >> kernel/power/Makefile | 2 +- >> kernel/power/wakeup_reason.c | 140 >> ++ >> 3 files changed, 164 insertions(+), 1 deletion(-) >> create mode 100644 include/linux/wakeup_reason.h >> create mode 100644 kernel/power/wakeup_reason.c >> >> diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h >> new file mode 100644 >> index 000..7ce50f0 >> --- /dev/null >> +++ b/include/linux/wakeup_reason.h >> @@ -0,0 +1,23 @@ >> +/* >> + * include/linux/wakeup_reason.h >> + * >> + * Logs the reason which caused the kernel to resume >> + * from the suspend mode. >> + * >> + * Copyright (C) 2014 Google, Inc. 
>> + * This software is licensed under the terms of the GNU General Public >> + * License version 2, as published by the Free Software Foundation, and >> + * may be copied, distributed, and modified under those terms. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + */ >> + >> +#ifndef _LINUX_WAKEUP_REASON_H >> +#define _LINUX_WAKEUP_REASON_H >> + >> +void log_wakeup_reason(int irq); >> + >> +#endif /* _LINUX_WAKEUP_REASON_H */ >> diff --git a/kernel/power/Makefile b/kernel/power/Makefile >> index 29472bf..f98f021 100644 >> --- a/kernel/power/Makefile >> +++ b/kernel/power/Makefile >> @@ -5,7 +5,7 @@ obj-y += qos.o >> obj-$(CONFIG_PM) += main.o >> obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o >> obj-$(CONFIG_FREEZER)+= process.o >> -obj-$(CONFIG_SUSPEND)+= suspend.o >> +obj-$(CONFIG_SUSPEND)+= suspend.o wakeup_reason.o >> obj-$(CONFIG_PM_TEST_SUSPEND)+= suspend_test.o >> obj-$(CONFIG_HIBERNATION)+= hibernate.o snapshot.o swap.o user.o \ >> block_io.o >> diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c >> new file mode 100644 >> index 000..188a6bf >> --- /dev/null >> +++ b/kernel/power/wakeup_reason.c >> @@ -0,0 +1,140 @@ >> +/* >> + * kernel/power/wakeup_reason.c >> + * >> + * Logs the reasons which caused the kernel to resume from >> + * the suspend mode. >> + * >> + * Copyright (C) 2014 Google, Inc. >> + * This software is licensed under the terms of the GNU General Public >> + * License version 2, as published by the Free Software Foundation, and >> + * may be copied, distributed, and modified under those terms. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the >> + * GNU General Public License for more details. >> + */ >> + >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +#include >> +
Re: [PATCH] power: add an API to log wakeup reasons
This API would be called from the platform specific code, or the driver for the interrupt controller, when the system resumes from the suspend because of an IRQ. We track the reasons for which systems wake up from the low power suspend mode. This is especially important on battery-powered consumer electronic devices. Analyzing the data helps us figure what caused the maximum wake ups and if something can be done about the same to improve the battery life. For instances, if the wi-fi network traffic or the radio traffic causes the system to frequently wakeup from the low-power mode. This is already in use on some Android devices. We are trying to make this a generic API which could be called by other platforms as well, standardizing the format in which the info is logged in dmesg and the format of the info exported to userspace for collecting power management statistics. Thanking you, Ruchi Kandoi On Tue, Mar 11, 2014 at 12:32 PM, Rafael J. Wysocki r...@rjwysocki.net wrote: On Monday, March 10, 2014 07:02:02 PM Ruchi Kandoi wrote: Add API log_wakeup_reason() and expose it to userspace via sysfs path /sys/kernel/wakeup_reasons/last_resume_reason This is useful for power management diagnostic purposes. What's the use case and how is it supposed to work? Signed-off-by: Ruchi Kandoi kandoiru...@google.com Signed-off-by: Greg Hackmann ghackm...@google.com --- include/linux/wakeup_reason.h | 23 +++ kernel/power/Makefile | 2 +- kernel/power/wakeup_reason.c | 140 ++ 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 include/linux/wakeup_reason.h create mode 100644 kernel/power/wakeup_reason.c diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000..7ce50f0 --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,23 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. 
+ * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +void log_wakeup_reason(int irq); + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..f98f021 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER)+= process.o -obj-$(CONFIG_SUSPEND)+= suspend.o +obj-$(CONFIG_SUSPEND)+= suspend.o wakeup_reason.o obj-$(CONFIG_PM_TEST_SUSPEND)+= suspend_test.o obj-$(CONFIG_HIBERNATION)+= hibernate.o snapshot.o swap.o user.o \ block_io.o diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000..188a6bf --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,140 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include linux/wakeup_reason.h +#include linux/kernel.h +#include linux/irq.h +#include linux/interrupt.h +#include linux/io.h +#include linux/kobject.h +#include linux/sysfs.h +#include linux/init.h +#include linux/spinlock.h +#include linux/notifier.h +#include linux/suspend.h + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static struct kobject *wakeup_reason; +static spinlock_t resume_reason_lock; + +static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(resume_reason_lock); + for (irq_no = 0; irq_no irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no
[PATCH] power: add an API to log wakeup reasons
Add API log_wakeup_reason() and expose it to userspace via sysfs path /sys/kernel/wakeup_reasons/last_resume_reason This is useful for power management diagnostic purposes. Signed-off-by: Ruchi Kandoi Signed-off-by: Greg Hackmann --- include/linux/wakeup_reason.h | 23 +++ kernel/power/Makefile | 2 +- kernel/power/wakeup_reason.c | 140 ++ 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 include/linux/wakeup_reason.h create mode 100644 kernel/power/wakeup_reason.c diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000..7ce50f0 --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,23 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +void log_wakeup_reason(int irq); + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..f98f021 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o -obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_SUSPEND) += suspend.o wakeup_reason.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000..188a6bf --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,140 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static struct kobject *wakeup_reason; +static spinlock_t resume_reason_lock; + +static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(_reason_lock); + for (irq_no = 0; irq_no < irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc && desc->action && desc->action->name) + buf_offset += sprintf(buf + buf_offset, "%d %s\n", + irq_list[irq_no], desc->action->name); + else + buf_offset += sprintf(buf + buf_offset, "%d\n", + irq_list[irq_no]); + } + spin_unlock(_reason_lock); + return buf_offset; +} + +static struct kobj_attribute resume_reason = __ATTR(last_resume_reason, 0666, + reason_show, NULL); + +static struct attribute *attrs[] = { + _reason.attr, + NULL, +}; +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +/* + * logs all the wake up reasons to the kernel + * stores the irqs to expose them to the userspace via sysfs + */ +void log_wakeup_reason(int irq) +{ + struct irq_desc *desc; + desc = irq_to_desc(irq); + if (desc && desc->action && desc->action->name) + printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq, + desc->action->name); + else + printk(KERN_INFO "Resume caused by IRQ %d\n", irq); + + spin_lock(_reason_lock); + if (irqcount == MAX_WAKEUP_REASON_IRQS) { +
[PATCH] power: add an API to log wakeup reasons
Add API log_wakeup_reason() and expose it to userspace via sysfs path /sys/kernel/wakeup_reasons/last_resume_reason This is useful for power management diagnostic purposes. Signed-off-by: Ruchi Kandoi kandoiru...@google.com Signed-off-by: Greg Hackmann ghackm...@google.com --- include/linux/wakeup_reason.h | 23 +++ kernel/power/Makefile | 2 +- kernel/power/wakeup_reason.c | 140 ++ 3 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 include/linux/wakeup_reason.h create mode 100644 kernel/power/wakeup_reason.c diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h new file mode 100644 index 000..7ce50f0 --- /dev/null +++ b/include/linux/wakeup_reason.h @@ -0,0 +1,23 @@ +/* + * include/linux/wakeup_reason.h + * + * Logs the reason which caused the kernel to resume + * from the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef _LINUX_WAKEUP_REASON_H +#define _LINUX_WAKEUP_REASON_H + +void log_wakeup_reason(int irq); + +#endif /* _LINUX_WAKEUP_REASON_H */ diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 29472bf..f98f021 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -5,7 +5,7 @@ obj-y += qos.o obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o -obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_SUSPEND) += suspend.o wakeup_reason.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000..188a6bf --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,140 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include linux/wakeup_reason.h +#include linux/kernel.h +#include linux/irq.h +#include linux/interrupt.h +#include linux/io.h +#include linux/kobject.h +#include linux/sysfs.h +#include linux/init.h +#include linux/spinlock.h +#include linux/notifier.h +#include linux/suspend.h + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static struct kobject *wakeup_reason; +static spinlock_t resume_reason_lock; + +static ssize_t reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(resume_reason_lock); + for (irq_no = 0; irq_no irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc desc-action desc-action-name) + buf_offset += sprintf(buf + buf_offset, %d %s\n, + irq_list[irq_no], desc-action-name); + else + buf_offset += sprintf(buf + buf_offset, %d\n, + irq_list[irq_no]); + } + spin_unlock(resume_reason_lock); + return buf_offset; +} + +static struct kobj_attribute resume_reason = __ATTR(last_resume_reason, 0666, + reason_show, NULL); + +static struct attribute *attrs[] = { + resume_reason.attr, + NULL, +}; +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +/* + * logs all the wake up reasons to the kernel + * stores the irqs to expose them to the userspace via sysfs + */ +void log_wakeup_reason(int irq) +{ + struct irq_desc *desc; + desc = irq_to_desc(irq); + if (desc desc-action desc-action-name) + printk(KERN_INFO Resume caused by IRQ %d, %s\n, irq, + desc-action-name); + else + printk(KERN_INFO Resume caused by IRQ %d\n, irq); + + spin_lock(resume_reason_lock