Re: [PATCH v5 02/20] perf: Add AUX area to ring buffer for raw data streams

2014-10-22 Thread Frederic Weisbecker
On Wed, Oct 22, 2014 at 02:35:47PM +0200, Peter Zijlstra wrote:
> On Mon, Oct 13, 2014 at 04:45:30PM +0300, Alexander Shishkin wrote:
> > +   struct kref aux_refcount;
> 
> I'm not a fan of kref, pointless obfuscation that.

It has a good potential for debugging though. Sure right now
the get/put simple APIs only performs counting sanity checks
but I've seen patches that extend it to object debugging.

Sounds quite valuable on complicated object lifecycles like
perf events.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 02/20] perf: Add AUX area to ring buffer for raw data streams

2014-10-22 Thread Peter Zijlstra
On Mon, Oct 13, 2014 at 04:45:30PM +0300, Alexander Shishkin wrote:
> + struct kref aux_refcount;

I'm not a fan of kref, pointless obfuscation that.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 02/20] perf: Add AUX area to ring buffer for raw data streams

2014-10-22 Thread Frederic Weisbecker
On Wed, Oct 22, 2014 at 02:35:47PM +0200, Peter Zijlstra wrote:
> On Mon, Oct 13, 2014 at 04:45:30PM +0300, Alexander Shishkin wrote:
> > +   struct kref aux_refcount;
> 
> I'm not a fan of kref, pointless obfuscation that.

It has a good potential for debugging though. Sure right now
the get/put simple APIs only performs counting sanity checks
but I've seen patches that extend it to object debugging.

Sounds quite valuable on complicated object lifecycles like
perf events.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 02/20] perf: Add AUX area to ring buffer for raw data streams

2014-10-22 Thread Peter Zijlstra
On Mon, Oct 13, 2014 at 04:45:30PM +0300, Alexander Shishkin wrote:
> + struct kref aux_refcount;

I'm not a fan of kref, pointless obfuscation that.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v5 02/20] perf: Add AUX area to ring buffer for raw data streams

2014-10-13 Thread Alexander Shishkin
From: Peter Zijlstra <pet...@infradead.org>

This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.

AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.

In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.

Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.

Signed-off-by: Alexander Shishkin <alexander.shish...@linux.intel.com>
---
 include/linux/perf_event.h  |  17 +
 include/uapi/linux/perf_event.h |  16 +
 kernel/events/core.c| 140 +---
 kernel/events/internal.h|  23 +++
 kernel/events/ring_buffer.c |  97 ++--
 5 files changed, 264 insertions(+), 29 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 893a0d0798..344058c71d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -263,6 +263,18 @@ struct pmu {
 * flush branch stack on context-switches (needed in cpu-wide mode)
 */
void (*flush_branch_stack)  (void);
+
+   /*
+* Set up pmu-private data structures for an AUX area
+*/
+   void *(*setup_aux)  (int cpu, void **pages,
+int nr_pages, bool overwrite);
+   /* optional */
+
+   /*
+* Free pmu-private AUX data structures
+*/
+   void (*free_aux)(void *aux); /* optional */
 };
 
 /**
@@ -782,6 +794,11 @@ static inline bool has_branch_stack(struct perf_event 
*event)
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
 }
 
+static inline bool has_aux(struct perf_event *event)
+{
+   return event->pmu->setup_aux;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index f7d18c2cb7..7e0967c0f5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -497,6 +497,22 @@ struct perf_event_mmap_page {
__u64   data_tail;  /* user-space written tail */
__u64   data_offset;/* where the buffer starts */
__u64   data_size;  /* data buffer size */
+
+   /*
+* AUX area is defined by aux_{offset,size} fields that should be set
+* by the userspace, so that
+*
+*   aux_offset >= data_offset + data_size
+*
+* prior to mmap()ing it. Size of the mmap()ed area should be aux_size.
+*
+* Ring buffer pointers aux_{head,tail} have the same semantics as
+* data_{head,tail} and same ordering rules apply.
+*/
+   __u64   aux_head;
+   __u64   aux_tail;
+   __u64   aux_offset;
+   __u64   aux_size;
 };
 
 #define PERF_RECORD_MISC_CPUMODE_MASK  (7 << 0)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 23bacb8682..86b0577229 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4116,6 +4116,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+   if (vma->vm_pgoff)
+   atomic_inc(&event->rb->aux_mmap_count);
 }
 
 /*
@@ -4135,6 +4137,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
 
+   /*
+* rb->aux_mmap_count will always drop before rb->mmap_count and
+* event->mmap_count, so it is ok to use event->mmap_mutex to
+* serialize with perf_mmap here.
+*/
+   if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+   atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) 
{
+   atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+   vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+   rb_free_aux(rb);
+   mutex_unlock(&event->mmap_mutex);
+   }
+
atomic_dec(&rb->mmap_count);
 
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4208,7 +4224,7 @@ out_put:
 
 static const struct vm_operations_struct perf_mmap_vmops = {
.open   = perf_mmap_open,
-   .close  = perf_mmap_close,
+   .close  = perf_mmap_close, /* non mergable */
.fault 

[PATCH v5 02/20] perf: Add AUX area to ring buffer for raw data streams

2014-10-13 Thread Alexander Shishkin
From: Peter Zijlstra <pet...@infradead.org>

This patch introduces "AUX space" in the perf mmap buffer, intended for
exporting high bandwidth data streams to userspace, such as instruction
flow traces.

AUX space is a ring buffer, defined by aux_{offset,size} fields in the
user_page structure, and read/write pointers aux_{head,tail}, which abide
by the same rules as data_* counterparts of the main perf buffer.

In order to allocate/mmap AUX, userspace needs to set up aux_offset to
such an offset that will be greater than data_offset+data_size and
aux_size to be the desired buffer size. Both need to be page aligned.
Then, same aux_offset and aux_size should be passed to mmap() call and
if everything adds up, you should have an AUX buffer as a result.

Pages that are mapped into this buffer also come out of user's mlock
rlimit plus perf_event_mlock_kb allowance.

Signed-off-by: Alexander Shishkin <alexander.shish...@linux.intel.com>
---
 include/linux/perf_event.h  |  17 +
 include/uapi/linux/perf_event.h |  16 +
 kernel/events/core.c| 140 +---
 kernel/events/internal.h|  23 +++
 kernel/events/ring_buffer.c |  97 ++--
 5 files changed, 264 insertions(+), 29 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 893a0d0798..344058c71d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -263,6 +263,18 @@ struct pmu {
 * flush branch stack on context-switches (needed in cpu-wide mode)
 */
void (*flush_branch_stack)  (void);
+
+   /*
+* Set up pmu-private data structures for an AUX area
+*/
+   void *(*setup_aux)  (int cpu, void **pages,
+int nr_pages, bool overwrite);
+   /* optional */
+
+   /*
+* Free pmu-private AUX data structures
+*/
+   void (*free_aux)(void *aux); /* optional */
 };
 
 /**
@@ -782,6 +794,11 @@ static inline bool has_branch_stack(struct perf_event 
*event)
return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
 }
 
+static inline bool has_aux(struct perf_event *event)
+{
+   return event->pmu->setup_aux;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index f7d18c2cb7..7e0967c0f5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -497,6 +497,22 @@ struct perf_event_mmap_page {
__u64   data_tail;  /* user-space written tail */
__u64   data_offset;/* where the buffer starts */
__u64   data_size;  /* data buffer size */
+
+   /*
+* AUX area is defined by aux_{offset,size} fields that should be set
+* by the userspace, so that
+*
+*   aux_offset >= data_offset + data_size
+*
+* prior to mmap()ing it. Size of the mmap()ed area should be aux_size.
+*
+* Ring buffer pointers aux_{head,tail} have the same semantics as
+* data_{head,tail} and same ordering rules apply.
+*/
+   __u64   aux_head;
+   __u64   aux_tail;
+   __u64   aux_offset;
+   __u64   aux_size;
 };
 
 #define PERF_RECORD_MISC_CPUMODE_MASK  (7 << 0)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 23bacb8682..86b0577229 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4116,6 +4116,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
+   if (vma->vm_pgoff)
+   atomic_inc(&event->rb->aux_mmap_count);
 }
 
 /*
@@ -4135,6 +4137,20 @@ static void perf_mmap_close(struct vm_area_struct *vma)
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
 
+   /*
+* rb->aux_mmap_count will always drop before rb->mmap_count and
+* event->mmap_count, so it is ok to use event->mmap_mutex to
+* serialize with perf_mmap here.
+*/
+   if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
+   atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) 
{
+   atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
+   vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+
+   rb_free_aux(rb);
+   mutex_unlock(&event->mmap_mutex);
+   }
+
atomic_dec(&rb->mmap_count);
 
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
@@ -4208,7 +4224,7 @@ out_put:
 
 static const struct vm_operations_struct perf_mmap_vmops = {
.open   = perf_mmap_open,
-   .close  = perf_mmap_close,
+   .close