Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Alexey Budankov
Hi,
On 05.10.2018 14:50, Alexey Budankov wrote:
> 
> According to the docs [1], ftruncate() does not advance the file position,
> and advancing it is essential here.
> 

Ideally, both lseek() syscalls in every loop iteration could be replaced by
just two syscalls, one before and one after the loop, with the in-flight off
file position value advanced in perf_mmap__aio_push() after every successful
record__aio_pushfn().
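
A standalone illustration of that pattern (not perf code: plain pwrite()
stands in for the enqueued aio write, since both take an explicit offset,
and the string chunks stand in for the per-mmap records):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *chunks[] = { "rec0", "rec1", "rec2" };
	int fd = open("trace.tmp", O_CREAT | O_RDWR | O_TRUNC, 0644);
	off_t off;

	if (fd < 0)
		return 1;

	off = lseek(fd, 0, SEEK_CUR);		/* syscall #1: before the loop */
	for (int i = 0; i < 3; i++) {
		size_t size = strlen(chunks[i]);

		if (pwrite(fd, chunks[i], size, off) != (ssize_t)size)
			break;
		off += size;			/* advance in-flight, no lseek() */
	}
	lseek(fd, off, SEEK_SET);		/* syscall #2: after the loop */

	close(fd);
	return 0;
}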

Thanks,
Alexey


[PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Alexey Budankov


Trace file offset is calculated and updated linearly after
enqueuing aio write at record__aio_pushfn().

record__aio_sync() blocks until the started AIO operation completes
and then proceeds.

record__mmap_read_sync() implements a barrier for all incomplete
aio write requests.

Signed-off-by: Alexey Budankov 
---
 Changes in v10:
 - avoided lseek() setting file pos back in case of record__aio_write() failure
 - compacted code selecting between serial and AIO streaming
 - optimized call places of record__mmap_read_sync()
 Changes in v9:
 - enable AIO streaming only when --aio-cblocks option is specified explicitly
 Changes in v8:
 - split AIO completion check into separate record__aio_complete()
 Changes in v6:
 - handled errno == EAGAIN case from aio_write();
 Changes in v5:
 - data loss metrics decreased from 25% to 2x in trialed configuration;
 - avoided nanosleep() prior to calling aio_suspend();
 - switched to per cpu multi record__aio_sync() aio
 - record_mmap_read_sync() now does global barrier just before 
   switching trace file or collection stop;
 - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=10
 Changes in v4:
 - converted void *bf to struct perf_mmap *md in signatures
 - written comment in perf_mmap__push() just before perf_mmap__get();
 - written comment in record__mmap_read_sync() on possible restarting 
   of aio_write() operation and releasing perf_mmap object after all;
 - added perf_mmap__put() for the cases of failed aio_write();
 Changes in v3:
 - written comments about nanosleep(0.5ms) call prior to aio_suspend()
   to cope with intrusiveness of its implementation in glibc;
 - written comments about rationale behind copying profiling data 
   into mmap->data buffer;
---
 tools/perf/builtin-record.c | 160 ++--
 tools/perf/perf.h   |   3 +
 tools/perf/util/mmap.c  |  73 
 tools/perf/util/mmap.h  |   4 ++
 4 files changed, 236 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0980dfe3396b..b2d38cb52650 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -124,6 +124,117 @@ static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
return 0;
 }
 
+#ifdef HAVE_AIO_SUPPORT
+static int record__aio_write(struct aiocb *cblock, int trace_fd,
+   void *buf, size_t size, off_t off)
+{
+   int rc;
+
+   cblock->aio_fildes = trace_fd;
+   cblock->aio_buf = buf;
+   cblock->aio_nbytes = size;
+   cblock->aio_offset = off;
+   cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+   do {
+   rc = aio_write(cblock);
+   if (rc == 0) {
+   break;
+   } else if (errno != EAGAIN) {
+   cblock->aio_fildes = -1;
+   pr_err("failed to queue perf data, error: %m\n");
+   break;
+   }
+   } while (1);
+
+   return rc;
+}
+
+static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
+{
+   void *rem_buf;
+   off_t rem_off;
+   size_t rem_size;
+   int rc, aio_errno;
+   ssize_t aio_ret, written;
+
+   aio_errno = aio_error(cblock);
+   if (aio_errno == EINPROGRESS)
+   return 0;
+
+   written = aio_ret = aio_return(cblock);
+   if (aio_ret < 0) {
+   if (aio_errno != EINTR)
+   pr_err("failed to write perf data, error: %m\n");
+   written = 0;
+   }
+
+   rem_size = cblock->aio_nbytes - written;
+
+   if (rem_size == 0) {
+   cblock->aio_fildes = -1;
+   /*
+* md->refcount is incremented in perf_mmap__push() for
+* every enqueued aio write request so decrement it because
+* the request is now complete.
+*/
+   perf_mmap__put(md);
+   rc = 1;
+   } else {
+   /*
+* aio write request may require restart with the
+* remainder if the kernel didn't write the whole
+* chunk at once.
+*/
+   rem_off = cblock->aio_offset + written;
+   rem_buf = (void *)(cblock->aio_buf + written);
+   record__aio_write(cblock, cblock->aio_fildes,
+   rem_buf, rem_size, rem_off);
+   rc = 0;
+   }
+
+   return rc;
+}
+
+static void record__aio_sync(struct perf_mmap *md)
+{
+   struct aiocb *cblock = &md->cblock;
+   struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
+
+   do {
+   if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
+   return;
+
+   while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) {
+   if (!(errno == EAGAIN || errno == EINTR))
+ 


Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Alexey Budankov
Hi,

On 05.10.2018 13:55, Namhyung Kim wrote:

> On Fri, Oct 05, 2018 at 12:39:10PM +0300, Alexey Budankov wrote:

>>
>> It still has to adjust the file pos through lseek() prior to leaving 
>> record__aio_pushfn() so that space in the trace file is pre-allocated for 
>> the enqueued record and the file pos is moved beyond the record data, 
>> possibly for the next record.
> 
> For that purpose, isn't it better to call ftruncate() with a
> reasonable batch size to reduce the number of syscalls?
> 

According to the docs [1], ftruncate() does not advance the file position,
and advancing it is essential here.
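
For illustration, a tiny standalone check of that behavior (not perf code):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("pos.tmp", O_CREAT | O_RDWR | O_TRUNC, 0644);

	if (fd < 0)
		return 1;

	ftruncate(fd, 4096);				/* file size is now 4096...   */
	printf("pos after ftruncate: %ld\n",
	       (long)lseek(fd, 0, SEEK_CUR));		/* ...but the pos is still 0  */

	lseek(fd, 4096, SEEK_SET);			/* this is what moves the pos */
	printf("pos after lseek: %ld\n",
	       (long)lseek(fd, 0, SEEK_CUR));

	close(fd);
	return 0;
}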

> 

>> Well, if the AIO symbols plus opts.nr_cblocks are exposed unconditionally of 
>> HAVE_AIO_SUPPORT, but the symbols' implementation is kept under the define, 
>> then, as long as the aio-cblocks option is not exposed through the command 
>> line, we end up with a whole bunch of symbols referenced under the else 
>> branch that, after all, can increase the Perf binary size, which is probably 
>> worth avoiding.
> 
> I think it's ok as long as they're empty.

Well, both designs are possible and acceptable.
The only complication here is the contradictory requests from other reviewing 
folks. 
Let me share the version without dummy functions so 
we can jointly decide how to proceed from there. 
Ok?

Thanks,
Alexey

[1] http://man7.org/linux/man-pages/man2/ftruncate.2.html


Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Namhyung Kim
Hi,

On Fri, Oct 05, 2018 at 12:39:10PM +0300, Alexey Budankov wrote:
> Hi,
> 
> On 05.10.2018 11:48, Namhyung Kim wrote:
> > On Fri, Oct 05, 2018 at 11:31:11AM +0300, Alexey Budankov wrote:
> 
> >>
> >> Well, this could be implemented like this avoiding lseek() in else branch:
> >>
> >>off = lseek(trace_fd, 0, SEEK_CUR);
> >>ret = record__aio_write(cblock, trace_fd, bf, size, off);
> >>if (!ret) {
> >>lseek(trace_fd, off + size, SEEK_SET);
> >>rec->bytes_written += size;
> >>
> >>if (switch_output_size(rec))
> >>trigger_hit(&switch_output_trigger);
> >>}
> > 
> > Oh I meant the both like:
> > 
> > off = rec->bytes_written;
> > ret = record__aio_write(cblock, trace_fd, bf, size, off);
> > if (!ret) {
> > rec->bytes_written += size;
> > 
> > ...
> > 
> 
> It still has to adjust the file pos through lseek() prior to leaving 
> record__aio_pushfn() so that space in the trace file is pre-allocated for 
> the enqueued record and the file pos is moved beyond the record data, 
> possibly for the next record.

For that purpose, isn't it better to call ftruncate() with a
reasonable batch size to reduce the number of syscalls?
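
Something along these lines, say (a rough sketch; BATCH, alloc_end and
reserve() are made-up names for illustration, not from the patch):

#include <sys/types.h>
#include <unistd.h>

#define BATCH	(16 * 1024 * 1024)

static off_t alloc_end;		/* how far the file has been extended so far */

static int reserve(int fd, off_t off, size_t size)
{
	if (off + (off_t)size <= alloc_end)
		return 0;				/* already covered, no syscall */

	/* round the new end up to the next BATCH boundary */
	alloc_end = (off + size + BATCH - 1) / BATCH * BATCH;
	return ftruncate(fd, alloc_end);		/* one syscall per BATCH bytes */
}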


> 
> > 
> 
> > 
> > Why not expose opts.nr_cblocks regardless of the #ifdef?  It'll be
> > 0 when it's not compiled in.  Then it could be like below (assuming
> > you have all the dummy aio functions):
> > 
> > 
> >>
> >>if (map->base) {
> >>if (!rec->opts.nr_cblocks) {
> >>if (perf_mmap__push(map, rec, record__pushfn) 
> >> != 0) {
> >>rc = -1;
> >>goto out;
> >>}
> >>} else {
> >>int idx;
> >>/*
> >> * Call record__aio_sync() to wait till 
> >> map->data buffer
> >> * becomes available after previous aio write 
> >> request.
> >> */
> >>idx = record__aio_sync(map, false);
> >>if (perf_mmap__aio_push(map, rec, idx, 
> >> record__aio_pushfn) != 0) {
> >>rc = -1;
> >>goto out;
> >>}
> >>}
> >>}
> >>
> 
> Well, if the AIO symbols plus opts.nr_cblocks are exposed unconditionally of 
> HAVE_AIO_SUPPORT, but the symbols' implementation is kept under the define, 
> then, as long as the aio-cblocks option is not exposed through the command 
> line, we end up with a whole bunch of symbols referenced under the else 
> branch that, after all, can increase the Perf binary size, which is probably 
> worth avoiding.

I think it's ok as long as they're empty.
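
For reference, the kind of no-op dummies meant here would be roughly this
(a sketch with simplified signatures, not taken from the patch):

#ifndef HAVE_AIO_SUPPORT
/*
 * No-op dummies so the calling code compiles without #ifdefs; with
 * opts.nr_cblocks forced to 0 in this build they are never reached.
 */
static void record__aio_sync(struct perf_mmap *md __maybe_unused)
{
}

static int perf_mmap__aio_push(struct perf_mmap *md __maybe_unused,
			       void *to __maybe_unused)
{
	return 0;
}
#endif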

Thanks,
Namhyung


Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Alexey Budankov
Hi,

On 05.10.2018 11:48, Namhyung Kim wrote:
> On Fri, Oct 05, 2018 at 11:31:11AM +0300, Alexey Budankov wrote:

>>
>> Well, this could be implemented like this avoiding lseek() in else branch:
>>
>>  off = lseek(trace_fd, 0, SEEK_CUR);
>>  ret = record__aio_write(cblock, trace_fd, bf, size, off);
>>  if (!ret) {
>>  lseek(trace_fd, off + size, SEEK_SET);
>>  rec->bytes_written += size;
>>
>>  if (switch_output_size(rec))
>>  trigger_hit(&switch_output_trigger);
>>  }
> 
> Oh I meant the both like:
> 
>   off = rec->bytes_written;
>   ret = record__aio_write(cblock, trace_fd, bf, size, off);
>   if (!ret) {
>   rec->bytes_written += size;
> 
>   ...
> 

It still has to adjust the file pos through lseek() prior to leaving 
record__aio_pushfn() so that space in the trace file is pre-allocated for 
the enqueued record and the file pos is moved beyond the record data, 
possibly for the next record.

> 

> 
> Why not expose opts.nr_cblocks regardless of the #ifdef?  It'll be
> 0 when it's not compiled in.  Then it could be like below (assuming
> you have all the dummy aio functions):
> 
> 
>>
>>  if (map->base) {
>>  if (!rec->opts.nr_cblocks) {
>>  if (perf_mmap__push(map, rec, record__pushfn) 
>> != 0) {
>>  rc = -1;
>>  goto out;
>>  }
>>  } else {
>>  int idx;
>>  /*
>>   * Call record__aio_sync() to wait till 
>> map->data buffer
>>   * becomes available after previous aio write 
>> request.
>>   */
>>  idx = record__aio_sync(map, false);
>>  if (perf_mmap__aio_push(map, rec, idx, 
>> record__aio_pushfn) != 0) {
>>  rc = -1;
>>  goto out;
>>  }
>>  }
>>  }
>>

Well, if the AIO symbols plus opts.nr_cblocks are exposed unconditionally of 
HAVE_AIO_SUPPORT, but the symbols' implementation is kept under the define, 
then, as long as the aio-cblocks option is not exposed through the command 
line, we end up with a whole bunch of symbols referenced under the else 
branch that, after all, can increase the Perf binary size, which is probably 
worth avoiding.

Thanks,
Alexey


Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Namhyung Kim
On Fri, Oct 05, 2018 at 11:31:11AM +0300, Alexey Budankov wrote:
> Hi,

Hi,

> 
> On 05.10.2018 10:16, Namhyung Kim wrote:
> > On Wed, Oct 03, 2018 at 07:12:10PM +0300, Alexey Budankov wrote:
> 
> >> +static void record__aio_sync(struct perf_mmap *md)
> >> +{
> >> +  struct aiocb *cblock = &md->cblock;
> >> +  struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
> >> +
> >> +  do {
> >> +  if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
> >> +  return;
> >> +
> >> +  while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) {
> >> +  if (!(errno == EAGAIN || errno == EINTR))
> >> +  pr_err("failed to sync perf data, error: %m\n");
> > 
> > Is there something we can do in this error case?  Any chance it gets
> > stuck in the loop?
> 
> Not really. Currently, in glibc, it can block on a mutex only.

OK, I was thinking of a situation where aio_suspend() keeps
returning the same error code repeatedly..


> 
> > 
> > 
> >> +  }
> >> +  } while (1);
> >> +}
> >> +
> >> +static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, 
> >> size_t size)
> >> +{
> >> +  off_t off;
> >> +  struct record *rec = to;
> >> +  int ret, trace_fd = rec->session->data->file.fd;
> >> +
> >> +  rec->samples++;
> >> +
> >> +  off = lseek(trace_fd, 0, SEEK_CUR);
> >> +  lseek(trace_fd, off + size, SEEK_SET);
> > 
> > It'd be nice if these lseek() calls could be removed and
> > rec->bytes_written used instead.
> 
> Well, this could be implemented like this avoiding lseek() in else branch:
> 
>   off = lseek(trace_fd, 0, SEEK_CUR);
>   ret = record__aio_write(cblock, trace_fd, bf, size, off);
>   if (!ret) {
>   lseek(trace_fd, off + size, SEEK_SET);
>   rec->bytes_written += size;
> 
>   if (switch_output_size(rec))
>   trigger_hit(&switch_output_trigger);
>   }

Oh, I meant both of them, like:

off = rec->bytes_written;
ret = record__aio_write(cblock, trace_fd, bf, size, off);
if (!ret) {
rec->bytes_written += size;

...
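
Spelled out, that would look something like this (a sketch combining the two
snippets above; the file-position/pre-allocation question discussed below is
left aside):

static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, size_t size)
{
	struct record *rec = to;
	int trace_fd = rec->session->data->file.fd;
	off_t off = rec->bytes_written;	/* no lseek(): the running byte count is the offset */
	int ret;

	rec->samples++;

	ret = record__aio_write(cblock, trace_fd, bf, size, off);
	if (!ret) {
		rec->bytes_written += size;

		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	}

	return ret;
}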


> > 
> > 
> >> +  ret = record__aio_write(cblock, trace_fd, bf, size, off);
> >> +  if (!ret) {
> >> +  rec->bytes_written += size;
> >> +
> >> +  if (switch_output_size(rec))
> >> +  trigger_hit(&switch_output_trigger);
> > 
> > Doesn't it need the _sync() before the trigger?  Maybe it should be
> > moved to record__mmap_read_evlist() or so..
> 
> Currently the trigger just updates variable state.
> The state is then checked through a separate API in __cmd_record(), where 
> record__mmap_read_sync() is called prior to switching to a new trace file 
> or finishing collection.
> 
> >>  
> 
> >>if (map->base) {
> >> +#ifndef HAVE_AIO_SUPPORT
> >>if (perf_mmap__push(map, rec, record__pushfn) != 0) {
> >>rc = -1;
> >>goto out;
> >>}
> >> +#else
> >> +  if (!rec->opts.nr_cblocks) {
> >> +  if (perf_mmap__push(map, rec, record__pushfn) 
> >> != 0) {
> >> +  rc = -1;
> >> +  goto out;
> >> +  }
> >> +  } else {
> >> +  /*
> >> +   * Call record__aio_sync() to wait till 
> >> map->data buffer
> >> +   * becomes available after previous aio write 
> >> request.
> >> +   */
> >> +  record__aio_sync(map);
> >> +  if (perf_mmap__aio_push(map, rec, 
> >> record__aio_pushfn) != 0) {
> >> +  rc = -1;
> >> +  goto out;
> >> +  }
> >> +  }
> >> +#endif
> > 
> > If dummy aio functions are provided, the #ifdef can be removed and
> > just use the #else part assuming opts.nr_cblocks == 0.
> 
> Yes, it looks a little bit cumbersome. Would this be more compact?

Why not expose opts.nr_cblocks regardless of the #ifdef?  It'll be
0 when it's not compiled in.  Then it could be like below (assuming
you have all the dummy aio functions):


> 
>   if (map->base) {
>   if (!rec->opts.nr_cblocks) {
>   if (perf_mmap__push(map, rec, record__pushfn) 
> != 0) {
>   rc = -1;
>   goto out;
>   }
>   } else {
>   int idx;
>   /*
>* Call record__aio_sync() to wait till 
> map->data buffer
>* becomes available after previous aio write 
> request.
>*/
>   idx = 

Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Alexey Budankov
Hi,

On 05.10.2018 10:16, Namhyung Kim wrote:
> On Wed, Oct 03, 2018 at 07:12:10PM +0300, Alexey Budankov wrote:

>> +static void record__aio_sync(struct perf_mmap *md)
>> +{
>> +struct aiocb *cblock = &md->cblock;
>> +struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
>> +
>> +do {
>> +if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
>> +return;
>> +
>> +while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) {
>> +if (!(errno == EAGAIN || errno == EINTR))
>> +pr_err("failed to sync perf data, error: %m\n");
> 
> Is there something we can do in this error case?  Any chance it gets
> stuck in the loop?

Not really. Currently, in glibc, it can block on a mutex only.

> 
> 
>> +}
>> +} while (1);
>> +}
>> +
>> +static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, 
>> size_t size)
>> +{
>> +off_t off;
>> +struct record *rec = to;
>> +int ret, trace_fd = rec->session->data->file.fd;
>> +
>> +rec->samples++;
>> +
>> +off = lseek(trace_fd, 0, SEEK_CUR);
>> +lseek(trace_fd, off + size, SEEK_SET);
> 
> It'd be nice if these lseek() calls could be removed and
> rec->bytes_written used instead.

Well, this could be implemented like this, avoiding lseek() in the else branch:

	off = lseek(trace_fd, 0, SEEK_CUR);
	ret = record__aio_write(cblock, trace_fd, bf, size, off);
	if (!ret) {
		lseek(trace_fd, off + size, SEEK_SET);
		rec->bytes_written += size;

		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	}
> 
> 
>> +ret = record__aio_write(cblock, trace_fd, bf, size, off);
>> +if (!ret) {
>> +rec->bytes_written += size;
>> +
>> +if (switch_output_size(rec))
>> +trigger_hit(&switch_output_trigger);
> 
> Doesn't it need the _sync() before the trigger?  Maybe it should be
> moved to record__mmap_read_evlist() or so..

Currently the trigger just updates variable state.
The state is then checked through a separate API in __cmd_record(), where 
record__mmap_read_sync() is called prior to switching to a new trace file 
or finishing collection.
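
Roughly, the flow there looks like this (an abridged sketch, not the exact
code; trigger_is_hit() and record__switch_output() are the existing helpers
it relies on):

	/* in the __cmd_record() polling loop */
	if (trigger_is_hit(&switch_output_trigger)) {
		/* global barrier: wait for all in-flight aio writes first */
		record__mmap_read_sync(rec);

		fd = record__switch_output(rec, false);
		if (fd < 0) {
			err = fd;
			goto out_child;
		}
	}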

>>  

>>  if (map->base) {
>> +#ifndef HAVE_AIO_SUPPORT
>>  if (perf_mmap__push(map, rec, record__pushfn) != 0) {
>>  rc = -1;
>>  goto out;
>>  }
>> +#else
>> +if (!rec->opts.nr_cblocks) {
>> +if (perf_mmap__push(map, rec, record__pushfn) 
>> != 0) {
>> +rc = -1;
>> +goto out;
>> +}
>> +} else {
>> +/*
>> + * Call record__aio_sync() to wait till 
>> map->data buffer
>> + * becomes available after previous aio write 
>> request.
>> + */
>> +record__aio_sync(map);
>> +if (perf_mmap__aio_push(map, rec, 
>> record__aio_pushfn) != 0) {
>> +rc = -1;
>> +goto out;
>> +}
>> +}
>> +#endif
> 
> If dummy aio functions are provided, the #ifdef can be removed and
> just use the #else part assuming opts.nr_cblocks == 0.

Yes, it looks a little bit cumbersome. Would this be more compact?

	if (map->base) {
#ifdef HAVE_AIO_SUPPORT
		if (!rec->opts.nr_cblocks) {
#endif
			if (perf_mmap__push(map, rec, record__pushfn) != 0) {
				rc = -1;
				goto out;
			}
#ifdef HAVE_AIO_SUPPORT
		} else {
			int idx;
			/*
			 * Call record__aio_sync() to wait till map->data buffer
			 * becomes available after previous aio write request.
			 */
			idx = record__aio_sync(map, false);
			if (perf_mmap__aio_push(map, rec, idx, record__aio_pushfn) != 0) {
				rc = -1;
				goto out;
			}
		}
#endif
	}

Thanks,
Alexey


Re: [PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-05 Thread Namhyung Kim
On Wed, Oct 03, 2018 at 07:12:10PM +0300, Alexey Budankov wrote:
> 
> Trace file offset is calculated and updated linearly prior
> enqueuing aio write at record__pushfn().
> 
> record__aio_sync() blocks till completion of started AIO operation 
> and then proceeds.
> 
> record__mmap_read_sync() implements a barrier for all incomplete
> aio write requests.
> 
> Signed-off-by: Alexey Budankov 
> ---
>  Changes in v9:
>  - enable AIO streaming only when --aio-cblocks option is specified explicitly
>  Changes in v8:
>  - split AIO completion check into separate record__aio_complete()
>  Changes in v6:
>  - handled errno == EAGAIN case from aio_write();
>  Changes in v5:
>  - data loss metrics decreased from 25% to 2x in trialed configuration;
>  - avoided nanosleep() prior calling aio_suspend();
>  - switched to per cpu multi record__aio_sync() aio
>  - record_mmap_read_sync() now does global barrier just before 
>switching trace file or collection stop;
>  - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero 
> of=/dev/null count=10
>  Changes in v4:
>  - converted void *bf to struct perf_mmap *md in signatures
>  - written comment in perf_mmap__push() just before perf_mmap__get();
>  - written comment in record__mmap_read_sync() on possible restarting 
>of aio_write() operation and releasing perf_mmap object after all;
>  - added perf_mmap__put() for the cases of failed aio_write();
>  Changes in v3:
>  - written comments about nanosleep(0.5ms) call prior aio_suspend()
>to cope with intrusiveness of its implementation in glibc;
>  - written comments about rationale behind coping profiling data 
>into mmap->data buffer;
> ---
>  tools/perf/builtin-record.c | 154 
> +++-
>  tools/perf/perf.h   |   3 +
>  tools/perf/util/mmap.c  |  73 +
>  tools/perf/util/mmap.h  |   4 ++
>  4 files changed, 233 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> index 0980dfe3396b..c63fe8c021a2 100644
> --- a/tools/perf/builtin-record.c
> +++ b/tools/perf/builtin-record.c
> @@ -124,6 +124,118 @@ static int record__write(struct record *rec, struct 
> perf_mmap *map __maybe_unuse
>   return 0;
>  }
>  
> +#ifdef HAVE_AIO_SUPPORT
> +static int record__aio_write(struct aiocb *cblock, int trace_fd,
> + void *buf, size_t size, off_t off)
> +{
> + int rc;
> +
> + cblock->aio_fildes = trace_fd;
> + cblock->aio_buf= buf;
> + cblock->aio_nbytes = size;
> + cblock->aio_offset = off;
> + cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
> +
> + do {
> + rc = aio_write(cblock);
> + if (rc == 0) {
> + break;
> + } else if (errno != EAGAIN) {
> + cblock->aio_fildes = -1;
> + pr_err("failed to queue perf data, error: %m\n");
> + break;
> + }
> + } while (1);
> +
> + return rc;
> +}
> +
> +static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
> +{
> + void *rem_buf;
> + off_t rem_off;
> + size_t rem_size;
> + int rc, aio_errno;
> + ssize_t aio_ret, written;
> +
> + aio_errno = aio_error(cblock);
> + if (aio_errno == EINPROGRESS)
> + return 0;
> +
> + written = aio_ret = aio_return(cblock);
> + if (aio_ret < 0) {
> + if (aio_errno != EINTR)
> + pr_err("failed to write perf data, error: %m\n");
> + written = 0;
> + }
> +
> + rem_size = cblock->aio_nbytes - written;
> +
> + if (rem_size == 0) {
> + cblock->aio_fildes = -1;
> + /*
> +  * md->refcount is incremented in perf_mmap__push() for
> +  * every enqueued aio write request so decrement it because
> +  * the request is now complete.
> +  */
> + perf_mmap__put(md);
> + rc = 1;
> + } else {
> + /*
> +  * aio write request may require restart with the
> +  * reminder if the kernel didn't write whole
> +  * chunk at once.
> +  */
> + rem_off = cblock->aio_offset + written;
> + rem_buf = (void *)(cblock->aio_buf + written);
> + record__aio_write(cblock, cblock->aio_fildes,
> + rem_buf, rem_size, rem_off);
> + rc = 0;
> + }
> +
> + return rc;
> +}
> +
> +static void record__aio_sync(struct perf_mmap *md)
> +{
> + struct aiocb *cblock = >cblock;
> + struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
> +
> + do {
> + if (cblock->aio_fildes == -1 || record__aio_complete(md, 
> cblock))
> + return;
> +
> + while (aio_suspend((const struct aiocb**), 1, )) 
> {
> + if (!(errno == EAGAIN || errno == EINTR))
> + 

[PATCH v9 2/3]: perf record: enable asynchronous trace writing

2018-10-03 Thread Alexey Budankov


Trace file offset is calculated and updated linearly prior to
enqueuing the aio write at record__pushfn().

record__aio_sync() blocks until the started AIO operation completes
and then proceeds.

record__mmap_read_sync() implements a barrier for all incomplete
aio write requests.

Signed-off-by: Alexey Budankov 
---
 Changes in v9:
 - enable AIO streaming only when --aio-cblocks option is specified explicitly
 Changes in v8:
 - split AIO completion check into separate record__aio_complete()
 Changes in v6:
 - handled errno == EAGAIN case from aio_write();
 Changes in v5:
 - data loss metrics decreased from 25% to 2x in trialed configuration;
 - avoided nanosleep() prior to calling aio_suspend();
 - switched to per cpu multi record__aio_sync() aio
 - record_mmap_read_sync() now does global barrier just before 
   switching trace file or collection stop;
 - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=10
 Changes in v4:
 - converted void *bf to struct perf_mmap *md in signatures
 - written comment in perf_mmap__push() just before perf_mmap__get();
 - written comment in record__mmap_read_sync() on possible restarting 
   of aio_write() operation and releasing perf_mmap object after all;
 - added perf_mmap__put() for the cases of failed aio_write();
 Changes in v3:
 - written comments about nanosleep(0.5ms) call prior to aio_suspend()
   to cope with intrusiveness of its implementation in glibc;
 - written comments about rationale behind copying profiling data 
   into mmap->data buffer;
---
 tools/perf/builtin-record.c | 154 +++-
 tools/perf/perf.h   |   3 +
 tools/perf/util/mmap.c  |  73 +
 tools/perf/util/mmap.h  |   4 ++
 4 files changed, 233 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0980dfe3396b..c63fe8c021a2 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -124,6 +124,118 @@ static int record__write(struct record *rec, struct 
perf_mmap *map __maybe_unuse
return 0;
 }
 
+#ifdef HAVE_AIO_SUPPORT
+static int record__aio_write(struct aiocb *cblock, int trace_fd,
+   void *buf, size_t size, off_t off)
+{
+   int rc;
+
+   cblock->aio_fildes = trace_fd;
+   cblock->aio_buf = buf;
+   cblock->aio_nbytes = size;
+   cblock->aio_offset = off;
+   cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+   do {
+   rc = aio_write(cblock);
+   if (rc == 0) {
+   break;
+   } else if (errno != EAGAIN) {
+   cblock->aio_fildes = -1;
+   pr_err("failed to queue perf data, error: %m\n");
+   break;
+   }
+   } while (1);
+
+   return rc;
+}
+
+static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
+{
+   void *rem_buf;
+   off_t rem_off;
+   size_t rem_size;
+   int rc, aio_errno;
+   ssize_t aio_ret, written;
+
+   aio_errno = aio_error(cblock);
+   if (aio_errno == EINPROGRESS)
+   return 0;
+
+   written = aio_ret = aio_return(cblock);
+   if (aio_ret < 0) {
+   if (aio_errno != EINTR)
+   pr_err("failed to write perf data, error: %m\n");
+   written = 0;
+   }
+
+   rem_size = cblock->aio_nbytes - written;
+
+   if (rem_size == 0) {
+   cblock->aio_fildes = -1;
+   /*
+* md->refcount is incremented in perf_mmap__push() for
+* every enqueued aio write request so decrement it because
+* the request is now complete.
+*/
+   perf_mmap__put(md);
+   rc = 1;
+   } else {
+   /*
+* aio write request may require restart with the
+* remainder if the kernel didn't write the whole
+* chunk at once.
+*/
+   rem_off = cblock->aio_offset + written;
+   rem_buf = (void *)(cblock->aio_buf + written);
+   record__aio_write(cblock, cblock->aio_fildes,
+   rem_buf, rem_size, rem_off);
+   rc = 0;
+   }
+
+   return rc;
+}
+
+static void record__aio_sync(struct perf_mmap *md)
+{
+   struct aiocb *cblock = &md->cblock;
+   struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
+
+   do {
+   if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
+   return;
+
+   while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) {
+   if (!(errno == EAGAIN || errno == EINTR))
+   pr_err("failed to sync perf data, error: %m\n");
+   }
+   } while (1);
+}
+
+static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, size_t size)
+{
+   off_t 
