[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Daniel Vetter
On Wed, May 04, 2016 at 02:24:14PM +0100, Robert Bragg wrote:
> On Wed, May 4, 2016 at 1:24 PM, Daniel Vetter  wrote:
> 
> > On Wed, May 04, 2016 at 10:49:53AM +0100, Robert Bragg wrote:
> > > On Wed, May 4, 2016 at 10:09 AM, Martin Peres <
> > martin.peres at linux.intel.com>
> > > wrote:
> > >
> > > > On 03/05/16 23:03, Robert Bragg wrote:
> > > >
> > > >>
> > > >>
> > > >> On Tue, May 3, 2016 at 8:34 PM, Robert Bragg  > > >> > wrote:
> > > >>
> > > >> Sorry for the delay replying to this, I missed it.
> > > >>
> > > >> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres <
> > martin.peres at free.fr
> > > >> > wrote:
> > > >>
> > > >> On 20/04/16 17:23, Robert Bragg wrote:
> > > >>
> > > >> Gen graphics hardware can be set up to periodically write
> > > >> snapshots of
> > > >> performance counters into a circular buffer via its
> > > >> Observation
> > > >> Architecture and this patch exposes that capability to
> > > >> userspace via the
> > > >> i915 perf interface.
> > > >>
> > > >> Cc: Chris Wilson  > > >> >
> > > >> Signed-off-by: Robert Bragg  > > >> >
> > > >> Signed-off-by: Zhenyu Wang  > > >> >
> > > >>
> > > >> ---
> > > >>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
> > > >>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
> > > >>   drivers/gpu/drm/i915/i915_perf.c| 940
> > > >> +++-
> > > >>   drivers/gpu/drm/i915/i915_reg.h | 338
> > 
> > > >>   include/uapi/drm/i915_drm.h |  70 ++-
> > > >>   5 files changed, 1408 insertions(+), 20 deletions(-)
> > > >>
> > > >> +
> > > >> +
> > > >> +   /* It takes a fairly long time for a new MUX
> > > >> configuration to
> > > >> +* be applied after these register writes. This
> > > >> delay
> > > >> +* duration was derived empirically based on the
> > > >> render_basic
> > > >> +* config but hopefully it covers the maximum
> > > >> configuration
> > > >> +* latency...
> > > >> +*/
> > > >> +   mdelay(100);
> > > >>
> > > >>
> > > >> With such a HW and SW design, how can we ever hope to
> > get
> > > >> any
> > > >> kind of performance when we are trying to monitor different
> > > >> metrics on each
> > > >> draw call? This may be acceptable for system monitoring, but
> > it
> > > >> is problematic
> > > >> for the GL extensions :s
> > > >>
> > > >>
> > > >> Since it seems like we are going for a perf API, it means that
> > > >> for every change
> > > >> of metrics, we need to flush the commands, wait for the GPU to
> > > >> be done, then
> > > >> program the new set of metrics via an IOCTL, wait 100 ms, and
> > > >> then we may
> > > >> resume rendering ... until the next change. We are talking
> > about
> > > >> a latency of
> > > >> 6-7 frames at 60 Hz here... this is non-negligible...
> > > >>
> > > >>
> > > >> I understand that we have a ton of counters and we may hide
> > > >> latency by not
> > > >> allowing using more than half of the counters for every draw
> > > >> call or frame, but
> > > >> even then, this 100ms delay is killing this approach
> > altogether.
> > > >>
> > > >>
> > > >>
> > > >>
> > > >> So revisiting this to double check how things fail with my latest
> > > >> driver/tests without the delay, I apparently can't reproduce test
> > > >> failures without the delay any more...
> > > >>
> > > >> I think the explanation is that since first adding the delay to the
> > > >> driver I also made the driver a bit more careful to not forward
> > > >> spurious reports that look invalid due to a zeroed report id field,
> > and
> > > >> that mechanism keeps the unit tests happy, even though there are still
> > > >> some number of invalid reports generated if we don't wait.
> > > >>
> > > >> One problem with simply having no delay is that the driver prints an
> > > >> error if it sees an invalid report so I get a lot of 'Skipping
> > > >> spurious, invalid OA report' dmesg spam. Also this was intended more
> > as
> > > >> a last resort mechanism, and I wouldn't feel too happy about squashing
> > > >> the error message and potentially sweeping other error cases under the
> > > >> carpet.
> > > >>
> > > >> Experimenting to see if the delay can at least be reduced, I brought
> > the
> > > >> 

[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Robert Bragg
On Wed, May 4, 2016 at 2:24 PM, Robert Bragg  wrote:

>
>
> On Wed, May 4, 2016 at 1:24 PM, Daniel Vetter  wrote:
>
>> On Wed, May 04, 2016 at 10:49:53AM +0100, Robert Bragg wrote:
>> > On Wed, May 4, 2016 at 10:09 AM, Martin Peres <
>> martin.peres at linux.intel.com>
>> > wrote:
>> >
>> > > On 03/05/16 23:03, Robert Bragg wrote:
>> > >
>> > >>
>> > >>
>> > >> On Tue, May 3, 2016 at 8:34 PM, Robert Bragg > > >> > wrote:
>> > >>
>> > >> Sorry for the delay replying to this, I missed it.
>> > >>
>> > >> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres <
>> martin.peres at free.fr
>> > >> > wrote:
>> > >>
>> > >> On 20/04/16 17:23, Robert Bragg wrote:
>> > >>
>> > >> Gen graphics hardware can be set up to periodically write
>> > >> snapshots of
>> > >> performance counters into a circular buffer via its
>> > >> Observation
>> > >> Architecture and this patch exposes that capability to
>> > >> userspace via the
>> > >> i915 perf interface.
>> > >>
>> > >> Cc: Chris Wilson > > >> >
>> > >> Signed-off-by: Robert Bragg > > >> >
>> > >> Signed-off-by: Zhenyu Wang > > >> >
>> > >>
>> > >> ---
>> > >>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>> > >>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>> > >>   drivers/gpu/drm/i915/i915_perf.c| 940
>> > >> +++-
>> > >>   drivers/gpu/drm/i915/i915_reg.h | 338
>> 
>> > >>   include/uapi/drm/i915_drm.h |  70 ++-
>> > >>   5 files changed, 1408 insertions(+), 20 deletions(-)
>> > >>
>> > >> +
>> > >> +
>> > >> +   /* It takes a fairly long time for a new MUX
>> > >> configuration to
>> > >> +* be applied after these register writes.
>> This
>> > >> delay
>> > >> +* duration was derived empirically based on the
>> > >> render_basic
>> > >> +* config but hopefully it covers the maximum
>> > >> configuration
>> > >> +* latency...
>> > >> +*/
>> > >> +   mdelay(100);
>> > >>
>> > >>
>> > >> With such a HW and SW design, how can we ever hope to
>> get
>> > >> any
>> > >> kind of performance when we are trying to monitor different
>> > >> metrics on each
>> > >> draw call? This may be acceptable for system monitoring, but
>> it
>> > >> is problematic
>> > >> for the GL extensions :s
>> > >>
>> > >>
>> > >> Since it seems like we are going for a perf API, it means
>> that
>> > >> for every change
>> > >> of metrics, we need to flush the commands, wait for the GPU
>> to
>> > >> be done, then
>> > >> program the new set of metrics via an IOCTL, wait 100 ms, and
>> > >> then we may
>> > >> resume rendering ... until the next change. We are talking
>> about
>> > >> a latency of
>> > >> 6-7 frames at 60 Hz here... this is non-negligible...
>> > >>
>> > >>
>> > >> I understand that we have a ton of counters and we may hide
>> > >> latency by not
>> > >> allowing using more than half of the counters for every draw
>> > >> call or frame, but
>> > >> even then, this 100ms delay is killing this approach
>> altogether.
>> > >>
>> > >>
>> > >>
>> > >>
>> > >> So revisiting this to double check how things fail with my latest
>> > >> driver/tests without the delay, I apparently can't reproduce test
>> > >> failures without the delay any more...
>> > >>
>> > >> I think the explanation is that since first adding the delay to the
>> > >> driver I also made the driver a bit more careful to not forward
>> > >> spurious reports that look invalid due to a zeroed report id field,
>> and
>> > >> that mechanism keeps the unit tests happy, even though there are
>> still
>> > >> some number of invalid reports generated if we don't wait.
>> > >>
>> > >> One problem with simply having no delay is that the driver prints an
>> > >> error if it sees an invalid report so I get a lot of 'Skipping
>> > >> spurious, invalid OA report' dmesg spam. Also this was intended more
>> as
>> > >> a last resort mechanism, and I wouldn't feel too happy about
>> squashing
>> > >> the error message and potentially sweeping other error cases under
>> the
>> > >> carpet.
>> > >>
>> > >> Experimenting to see if the delay can at least be reduced, I brought
>> the
>> > >> delay up in millisecond increments and found that although I still
>> see a
>> > >> lot of spurious 

[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Daniel Vetter
On Wed, May 04, 2016 at 10:49:53AM +0100, Robert Bragg wrote:
> On Wed, May 4, 2016 at 10:09 AM, Martin Peres  linux.intel.com>
> wrote:
> 
> > On 03/05/16 23:03, Robert Bragg wrote:
> >
> >>
> >>
> >> On Tue, May 3, 2016 at 8:34 PM, Robert Bragg  >> > wrote:
> >>
> >> Sorry for the delay replying to this, I missed it.
> >>
> >> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres  >> > wrote:
> >>
> >> On 20/04/16 17:23, Robert Bragg wrote:
> >>
> >> Gen graphics hardware can be set up to periodically write
> >> snapshots of
> >> performance counters into a circular buffer via its
> >> Observation
> >> Architecture and this patch exposes that capability to
> >> userspace via the
> >> i915 perf interface.
> >>
> >> Cc: Chris Wilson  >> >
> >> Signed-off-by: Robert Bragg  >> >
> >> Signed-off-by: Zhenyu Wang  >> >
> >>
> >> ---
> >>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
> >>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
> >>   drivers/gpu/drm/i915/i915_perf.c| 940
> >> +++-
> >>   drivers/gpu/drm/i915/i915_reg.h | 338 
> >>   include/uapi/drm/i915_drm.h |  70 ++-
> >>   5 files changed, 1408 insertions(+), 20 deletions(-)
> >>
> >> +
> >> +
> >> +   /* It takes a fairly long time for a new MUX
> >> configuration to
> >> +* be applied after these register writes. This
> >> delay
> >> +* duration was derived empirically based on the
> >> render_basic
> >> +* config but hopefully it covers the maximum
> >> configuration
> >> +* latency...
> >> +*/
> >> +   mdelay(100);
> >>
> >>
> >> With such a HW and SW design, how can we ever hope to get
> >> any
> >> kind of performance when we are trying to monitor different
> >> metrics on each
> >> draw call? This may be acceptable for system monitoring, but it
> >> is problematic
> >> for the GL extensions :s
> >>
> >>
> >> Since it seems like we are going for a perf API, it means that
> >> for every change
> >> of metrics, we need to flush the commands, wait for the GPU to
> >> be done, then
> >> program the new set of metrics via an IOCTL, wait 100 ms, and
> >> then we may
> >> resume rendering ... until the next change. We are talking about
> >> a latency of
> >> 6-7 frames at 60 Hz here... this is non-negligible...
> >>
> >>
> >> I understand that we have a ton of counters and we may hide
> >> latency by not
> >> allowing using more than half of the counters for every draw
> >> call or frame, but
> >> even then, this 100ms delay is killing this approach altogether.
> >>
> >>
> >>
> >>
> >> So revisiting this to double check how things fail with my latest
> >> driver/tests without the delay, I apparently can't reproduce test
> >> failures without the delay any more...
> >>
> >> I think the explanation is that since first adding the delay to the
> >> driver I also made the driver a bit more careful to not forward
> >> spurious reports that look invalid due to a zeroed report id field, and
> >> that mechanism keeps the unit tests happy, even though there are still
> >> some number of invalid reports generated if we don't wait.
> >>
> >> One problem with simply having no delay is that the driver prints an
> >> error if it sees an invalid report so I get a lot of 'Skipping
> >> spurious, invalid OA report' dmesg spam. Also this was intended more as
> >> a last resort mechanism, and I wouldn't feel too happy about squashing
> >> the error message and potentially sweeping other error cases under the
> >> carpet.
> >>
> >> Experimenting to see if the delay can at least be reduced, I brought the
> >> delay up in millisecond increments and found that although I still see a
> >> lot of spurious reports only waiting 1 or 5 milliseconds, at 10
> >> milliseconds it's reduced quite a bit and at 15 milliseconds I don't seem
> >> to have any errors.
> >>
> >> 15 milliseconds is still a long time, but at least not as long as 100.
> >>
> >
> > OK, so the issue does not come from the HW after all, great!
> >
> 
> Erm, I'm not sure that's a conclusion we can make here...
> 
> The upshot here was really just reducing the delay from 100ms to 15ms.
> Previously I arrived at a workable delay by jumping the delay in orders of
> 

[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Robert Bragg
On Wed, May 4, 2016 at 1:24 PM, Daniel Vetter  wrote:

> On Wed, May 04, 2016 at 10:49:53AM +0100, Robert Bragg wrote:
> > On Wed, May 4, 2016 at 10:09 AM, Martin Peres <
> martin.peres at linux.intel.com>
> > wrote:
> >
> > > On 03/05/16 23:03, Robert Bragg wrote:
> > >
> > >>
> > >>
> > >> On Tue, May 3, 2016 at 8:34 PM, Robert Bragg  > >> > wrote:
> > >>
> > >> Sorry for the delay replying to this, I missed it.
> > >>
> > >> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres <
> martin.peres at free.fr
> > >> > wrote:
> > >>
> > >> On 20/04/16 17:23, Robert Bragg wrote:
> > >>
> > >> Gen graphics hardware can be set up to periodically write
> > >> snapshots of
> > >> performance counters into a circular buffer via its
> > >> Observation
> > >> Architecture and this patch exposes that capability to
> > >> userspace via the
> > >> i915 perf interface.
> > >>
> > >> Cc: Chris Wilson  > >> >
> > >> Signed-off-by: Robert Bragg  > >> >
> > >> Signed-off-by: Zhenyu Wang  > >> >
> > >>
> > >> ---
> > >>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
> > >>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
> > >>   drivers/gpu/drm/i915/i915_perf.c| 940
> > >> +++-
> > >>   drivers/gpu/drm/i915/i915_reg.h | 338
> 
> > >>   include/uapi/drm/i915_drm.h |  70 ++-
> > >>   5 files changed, 1408 insertions(+), 20 deletions(-)
> > >>
> > >> +
> > >> +
> > >> +   /* It takes a fairly long time for a new MUX
> > >> configuration to
> > >> +* be applied after these register writes. This
> > >> delay
> > >> +* duration was derived empirically based on the
> > >> render_basic
> > >> +* config but hopefully it covers the maximum
> > >> configuration
> > >> +* latency...
> > >> +*/
> > >> +   mdelay(100);
> > >>
> > >>
> > >> With such a HW and SW design, how can we ever hope to
> get
> > >> any
> > >> kind of performance when we are trying to monitor different
> > >> metrics on each
> > >> draw call? This may be acceptable for system monitoring, but
> it
> > >> is problematic
> > >> for the GL extensions :s
> > >>
> > >>
> > >> Since it seems like we are going for a perf API, it means that
> > >> for every change
> > >> of metrics, we need to flush the commands, wait for the GPU to
> > >> be done, then
> > >> program the new set of metrics via an IOCTL, wait 100 ms, and
> > >> then we may
> > >> resume rendering ... until the next change. We are talking
> about
> > >> a latency of
> > >> 6-7 frames at 60 Hz here... this is non-negligible...
> > >>
> > >>
> > >> I understand that we have a ton of counters and we may hide
> > >> latency by not
> > >> allowing using more than half of the counters for every draw
> > >> call or frame, but
> > >> even then, this 100ms delay is killing this approach
> altogether.
> > >>
> > >>
> > >>
> > >>
> > >> So revisiting this to double check how things fail with my latest
> > >> driver/tests without the delay, I apparently can't reproduce test
> > >> failures without the delay any more...
> > >>
> > >> I think the explanation is that since first adding the delay to the
> > >> driver I also made the driver a bit more careful to not forward
> > >> spurious reports that look invalid due to a zeroed report id field,
> and
> > >> that mechanism keeps the unit tests happy, even though there are still
> > >> some number of invalid reports generated if we don't wait.
> > >>
> > >> One problem with simply having no delay is that the driver prints an
> > >> error if it sees an invalid report so I get a lot of 'Skipping
> > >> spurious, invalid OA report' dmesg spam. Also this was intended more
> as
> > >> a last resort mechanism, and I wouldn't feel too happy about squashing
> > >> the error message and potentially sweeping other error cases under the
> > >> carpet.
> > >>
> > >> Experimenting to see if the delay can at least be reduced, I brought
> the
> > >> delay up in millisecond increments and found that although I still
> see a
> > >> lot of spurious reports only waiting 1 or 5 milliseconds, at 10
> > >> milliseconds it's reduced quite a bit and at 15 milliseconds I don't
> seem
> > >> to have any errors.
> > >>
> > >> 15 milliseconds is still a 

[Intel-gfx] [PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Robert Bragg
On Wed, May 4, 2016 at 10:04 AM, Martin Peres 
wrote:

> On 03/05/16 22:34, Robert Bragg wrote:
>
>> Sorry for the delay replying to this, I missed it.
>>
>
> No worries!
>
>
>> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres > > wrote:
>>
>> On 20/04/16 17:23, Robert Bragg wrote:
>>
>> Gen graphics hardware can be set up to periodically write
>> snapshots of
>> performance counters into a circular buffer via its Observation
>> Architecture and this patch exposes that capability to userspace
>> via the
>> i915 perf interface.
>>
>> Cc: Chris Wilson > >
>> Signed-off-by: Robert Bragg > >
>> Signed-off-by: Zhenyu Wang > >
>>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>>   drivers/gpu/drm/i915/i915_perf.c| 940
>> +++-
>>   drivers/gpu/drm/i915/i915_reg.h | 338 
>>   include/uapi/drm/i915_drm.h |  70 ++-
>>   5 files changed, 1408 insertions(+), 20 deletions(-)
>>
>> +
>> +
>> +   /* It takes a fairly long time for a new MUX
>> configuration to
>> +* be applied after these register writes. This delay
>> +* duration was derived empirically based on the
>> render_basic
>> +* config but hopefully it covers the maximum
>> configuration
>> +* latency...
>> +*/
>> +   mdelay(100);
>>
>>
>> With such a HW and SW design, how can we ever hope to get any
>> kind of performance when we are trying to monitor different metrics
>> on each
>> draw call? This may be acceptable for system monitoring, but it is
>> problematic
>> for the GL extensions :s
>>
>>
>> Since it seems like we are going for a perf API, it means that for
>> every change
>> of metrics, we need to flush the commands, wait for the GPU to be
>> done, then
>> program the new set of metrics via an IOCTL, wait 100 ms, and then
>> we may
>> resume rendering ... until the next change. We are talking about a
>> latency of
>> 6-7 frames at 60 Hz here... this is non-negligible...
>>
>>
>> I understand that we have a ton of counters and we may hide latency
>> by not
>> allowing using more than half of the counters for every draw call or
>> frame, but
>> even then, this 100ms delay is killing this approach altogether.
>>
>>
>>
>> Although I'm also really unhappy about introducing this delay recently,
>> the impact of the delay is typically amortized somewhat by keeping a
>> configuration open as long as possible.
>>
>> Even without this explicit delay here the OA unit isn't suited to being
>> reconfigured on a per draw call basis, though it is able to support per
>> draw call queries with the same config.
>>
>> The above assessment assumes wanting to change config between draw calls
>> which is not something this driver aims to support - as the HW isn't
>> really designed for that model.
>>
>> E.g. in the case of INTEL_performance_query, the backend keeps the i915
>> perf stream open until all OA based query objects are deleted - so you
>> have to be pretty explicit if you want to change config.
>>
>
> OK, I get your point. However, I still want to state that applications
> changing the set would see a disastrous effect as 100 ms is enough to
> downclock both the CPU and GPU and that would dramatically alter the
> metrics. Should we make it clear somewhere, either in the
> INTEL_performance_query or as a warning in mesa_performance if changing the
> set while running? I would think the latter would be preferable as it could
> also cover the case of the AMD extension which, IIRC, does not talk about
> the performance cost of changing the metrics. With this caveat made clear,
> it seems reasonable.
>

Yeah a KHR_debug performance warning sounds like a good idea.


>
>
>> In case you aren't familiar with how the GL_INTEL_performance_query side
>> of things works for OA counters; one thing to be aware of is that
>> there's a separate MI_REPORT_PERF_COUNT command that Mesa writes either
>> side of a query which writes all the counters for the current OA config
>> (as configured via this i915 perf interface) to a buffer. In addition to
>> collecting reports via MI_REPORT_PERF_COUNT Mesa also configures the
>> unit for periodic sampling to be able to account for potential counter
>> overflow.
>>
>
> Oh, the overflow case is mean. Doesn't the spec mandate the application to
> read at least every second? This is the case for the timestamp queries.
>

For a Haswell GT3 system with 40EUs @ 1GHz some aggregate EU counters 

[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Martin Peres
On 03/05/16 23:03, Robert Bragg wrote:
>
>
> On Tue, May 3, 2016 at 8:34 PM, Robert Bragg  > wrote:
>
> Sorry for the delay replying to this, I missed it.
>
> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres  > wrote:
>
> On 20/04/16 17:23, Robert Bragg wrote:
>
> Gen graphics hardware can be set up to periodically write
> snapshots of
> performance counters into a circular buffer via its Observation
> Architecture and this patch exposes that capability to
> userspace via the
> i915 perf interface.
>
> Cc: Chris Wilson  >
> Signed-off-by: Robert Bragg  >
> Signed-off-by: Zhenyu Wang  >
> ---
>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>   drivers/gpu/drm/i915/i915_perf.c| 940
> +++-
>   drivers/gpu/drm/i915/i915_reg.h | 338 
>   include/uapi/drm/i915_drm.h |  70 ++-
>   5 files changed, 1408 insertions(+), 20 deletions(-)
>
> +
> +
> +   /* It takes a fairly long time for a new MUX
> configuration to
> +* be applied after these register writes. This delay
> +* duration was derived empirically based on the
> render_basic
> +* config but hopefully it covers the maximum
> configuration
> +* latency...
> +*/
> +   mdelay(100);
>
>
> With such a HW and SW design, how can we ever hope to get any
> kind of performance when we are trying to monitor different
> metrics on each
> draw call? This may be acceptable for system monitoring, but it
> is problematic
> for the GL extensions :s
>
>
> Since it seems like we are going for a perf API, it means that
> for every change
> of metrics, we need to flush the commands, wait for the GPU to
> be done, then
> program the new set of metrics via an IOCTL, wait 100 ms, and
> then we may
> resume rendering ... until the next change. We are talking about
> a latency of
> 6-7 frames at 60 Hz here... this is non-negligible...
>
>
> I understand that we have a ton of counters and we may hide
> latency by not
> allowing using more than half of the counters for every draw
> call or frame, but
> even then, this 100ms delay is killing this approach altogether.
>
>
>
>
> So revisiting this to double check how things fail with my latest
> driver/tests without the delay, I apparently can't reproduce test
> failures without the delay any more...
>
> I think the explanation is that since first adding the delay to the
> driver I also made the driver a bit more careful to not forward
> spurious reports that look invalid due to a zeroed report id field, and
> that mechanism keeps the unit tests happy, even though there are still
> some number of invalid reports generated if we don't wait.
>
> One problem with simply having no delay is that the driver prints an
> error if it sees an invalid report so I get a lot of 'Skipping
> spurious, invalid OA report' dmesg spam. Also this was intended more as
> a last resort mechanism, and I wouldn't feel too happy about squashing
> the error message and potentially sweeping other error cases under the
> carpet.
>
> Experimenting to see if the delay can at least be reduced, I brought the
> delay up in millisecond increments and found that although I still see a
> lot of spurious reports only waiting 1 or 5 milliseconds, at 10
> milliseconds it's reduced quite a bit and at 15 milliseconds I don't seem
> to have any errors.
>
> 15 milliseconds is still a long time, but at least not as long as 100.

OK, so the issue does not come from the HW after all, great!

Now, my main question is, why are spurious events generated when 
changing the MUX's value? I can understand that we would need to ignore 
the reading that came right after the change, but other than this,  I am 
a bit at a loss.

I am a bit swamped with other tasks right now, but I would love to spend 
more time reviewing your code as I really want to see this upstream!

Martin


[Intel-gfx] [PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Martin Peres
On 03/05/16 22:34, Robert Bragg wrote:
> Sorry for the delay replying to this, I missed it.

No worries!

>
> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres  > wrote:
>
> On 20/04/16 17:23, Robert Bragg wrote:
>
> Gen graphics hardware can be set up to periodically write
> snapshots of
> performance counters into a circular buffer via its Observation
> Architecture and this patch exposes that capability to userspace
> via the
> i915 perf interface.
>
> Cc: Chris Wilson  >
> Signed-off-by: Robert Bragg  >
> Signed-off-by: Zhenyu Wang  >
> ---
>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>   drivers/gpu/drm/i915/i915_perf.c| 940
> +++-
>   drivers/gpu/drm/i915/i915_reg.h | 338 
>   include/uapi/drm/i915_drm.h |  70 ++-
>   5 files changed, 1408 insertions(+), 20 deletions(-)
>
> +
> +
> +   /* It takes a fairly long time for a new MUX
> configuration to
> +* be applied after these register writes. This delay
> +* duration was derived empirically based on the
> render_basic
> +* config but hopefully it covers the maximum configuration
> +* latency...
> +*/
> +   mdelay(100);
>
>
> With such a HW and SW design, how can we ever hope to get any
> kind of performance when we are trying to monitor different metrics
> on each
> draw call? This may be acceptable for system monitoring, but it is
> problematic
> for the GL extensions :s
>
>
> Since it seems like we are going for a perf API, it means that for
> every change
> of metrics, we need to flush the commands, wait for the GPU to be
> done, then
> program the new set of metrics via an IOCTL, wait 100 ms, and then
> we may
> resume rendering ... until the next change. We are talking about a
> latency of
> 6-7 frames at 60 Hz here... this is non-negligible...
>
>
> I understand that we have a ton of counters and we may hide latency
> by not
> allowing using more than half of the counters for every draw call or
> frame, but
> even then, this 100ms delay is killing this approach altogether.
>
>
>
> Although I'm also really unhappy about introducing this delay recently,
> the impact of the delay is typically amortized somewhat by keeping a
> configuration open as long as possible.
>
> Even without this explicit delay here the OA unit isn't suited to being
> reconfigured on a per draw call basis, though it is able to support per
> draw call queries with the same config.
>
> The above assessment assumes wanting to change config between draw calls
> which is not something this driver aims to support - as the HW isn't
> really designed for that model.
>
> E.g. in the case of INTEL_performance_query, the backend keeps the i915
> perf stream open until all OA based query objects are deleted - so you
> have to be pretty explicit if you want to change config.

OK, I get your point. However, I still want to state that applications 
changing the set would see a disastrous effect as 100 ms is enough to 
downclock both the CPU and GPU and that would dramatically alter the
metrics. Should we make it clear somewhere, either in the 
INTEL_performance_query or as a warning in mesa_performance if changing 
the set while running? I would think the latter would be preferable as 
it could also cover the case of the AMD extension which, IIRC, does not 
talk about the performance cost of changing the metrics. With this 
caveat made clear, it seems reasonable.

>
> Considering the sets available on Haswell:
> * Render Metrics Basic
> * Compute Metrics Basic
> * Compute Metrics Extended
> * Memory Reads Distribution
> * Memory Writes Distribution
> * Metric set SamplerBalance
>
> Each of these configs can expose around 50 counters as a set.
>
> A GL application is most likely just going to use the render basic set,
> and In the case of a tool like gputop/GPA then changing config would
> usually be driven by some user interaction to select a set of metrics,
> where even a 100ms delay will go unnoticed.

100 ms is becoming visible, but I agree, it would not be a show stopper 
for sure.

On the APITRACE side, this should not be an issue, because we do not 
change the set of metrics while running.

>
> In case you aren't familiar with how the GL_INTEL_performance_query side
> of things works for OA counters; one thing to be aware of is that
> there's a separate MI_REPORT_PERF_COUNT command that Mesa writes either
> side of a query which writes all 

[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-04 Thread Robert Bragg
On Wed, May 4, 2016 at 10:09 AM, Martin Peres 
wrote:

> On 03/05/16 23:03, Robert Bragg wrote:
>
>>
>>
>> On Tue, May 3, 2016 at 8:34 PM, Robert Bragg > > wrote:
>>
>> Sorry for the delay replying to this, I missed it.
>>
>> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres > > wrote:
>>
>> On 20/04/16 17:23, Robert Bragg wrote:
>>
>> Gen graphics hardware can be set up to periodically write
>> snapshots of
>> performance counters into a circular buffer via its
>> Observation
>> Architecture and this patch exposes that capability to
>> userspace via the
>> i915 perf interface.
>>
>> Cc: Chris Wilson > >
>> Signed-off-by: Robert Bragg > >
>> Signed-off-by: Zhenyu Wang > >
>>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>>   drivers/gpu/drm/i915/i915_perf.c| 940
>> +++-
>>   drivers/gpu/drm/i915/i915_reg.h | 338 
>>   include/uapi/drm/i915_drm.h |  70 ++-
>>   5 files changed, 1408 insertions(+), 20 deletions(-)
>>
>> +
>> +
>> +   /* It takes a fairly long time for a new MUX
>> configuration to
>> +* be applied after these register writes. This
>> delay
>> +* duration was derived empirically based on the
>> render_basic
>> +* config but hopefully it covers the maximum
>> configuration
>> +* latency...
>> +*/
>> +   mdelay(100);
>>
>>
>> With such a HW and SW design, how can we ever hope to get
>> any
>> kind of performance when we are trying to monitor different
>> metrics on each
>> draw call? This may be acceptable for system monitoring, but it
>> is problematic
>> for the GL extensions :s
>>
>>
>> Since it seems like we are going for a perf API, it means that
>> for every change
>> of metrics, we need to flush the commands, wait for the GPU to
>> be done, then
>> program the new set of metrics via an IOCTL, wait 100 ms, and
>> then we may
>> resume rendering ... until the next change. We are talking about
>> a latency of
>> 6-7 frames at 60 Hz here... this is non-negligible...
>>
>>
>> I understand that we have a ton of counters and we may hide
>> latency by not
>> allowing using more than half of the counters for every draw
>> call or frame, but
>> even then, this 100ms delay is killing this approach altogether.
>>
>>
>>
>>
>> So revisiting this to double check how things fail with my latest
>> driver/tests without the delay, I apparently can't reproduce test
>> failures without the delay any more...
>>
>> I think the explanation is that since first adding the delay to the
driver I also made the driver a bit more careful to not forward
>> spurious reports that look invalid due to a zeroed report id field, and
>> that mechanism keeps the unit tests happy, even though there are still
>> some number of invalid reports generated if we don't wait.
>>
>> One problem with simply having no delay is that the driver prints an
error if it sees an invalid report so I get a lot of 'Skipping
>> spurious, invalid OA report' dmesg spam. Also this was intended more as
>> a last resort mechanism, and I wouldn't feel too happy about squashing
>> the error message and potentially sweeping other error cases under the
>> carpet.
>>
>> Experimenting to see if the delay can at least be reduced, I brought the
>> delay up in millisecond increments and found that although I still see a
>> lot of spurious reports only waiting 1 or 5 milliseconds, at 10
>> milliseconds its reduced quite a bit and at 15 milliseconds I don't seem
>> to have any errors.
>>
>> 15 milliseconds is still a long time, but at least not as long as 100.
>>
>
> OK, so the issue does not come from the HW after all, great!
>

Erm, I'm not sure that's a conclusion we can make here...

The upshot here was really just reducing the delay from 100ms to 15ms.
Previously I arrived at a workable delay by jumping the delay in orders of
magnitude with 10ms failing, 100ms passing and I didn't try and refine it
further. Here I've looked at delays between 10 and 100ms.

The other thing is observing that because the kernel is checking for
invalid reports (generated by the hardware) before forwarding to userspace
the lack of a delay no longer triggers i-g-t 

[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-03 Thread Robert Bragg
On Tue, May 3, 2016 at 8:34 PM, Robert Bragg  wrote:

> Sorry for the delay replying to this, I missed it.
>
> On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres 
> wrote:
>
>> On 20/04/16 17:23, Robert Bragg wrote:
>>
>>> Gen graphics hardware can be set up to periodically write snapshots of
>>> performance counters into a circular buffer via its Observation
>>> Architecture and this patch exposes that capability to userspace via the
>>> i915 perf interface.
>>>
>>> Cc: Chris Wilson 
>>> Signed-off-by: Robert Bragg 
>>> Signed-off-by: Zhenyu Wang 
>>> ---
>>>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>>>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>>>   drivers/gpu/drm/i915/i915_perf.c| 940
>>> +++-
>>>   drivers/gpu/drm/i915/i915_reg.h | 338 
>>>   include/uapi/drm/i915_drm.h |  70 ++-
>>>   5 files changed, 1408 insertions(+), 20 deletions(-)
>>>
>>> +
>>> +
>>> +   /* It takes a fairly long time for a new MUX configuration to
>>> +* be be applied after these register writes. This delay
>>> +* duration was derived empirically based on the render_basic
>>> +* config but hopefully it covers the maximum configuration
>>> +* latency...
>>> +*/
>>> +   mdelay(100);
>>>
>>
>> With such a HW and SW design, how can we ever expose hope to get any
>> kind of performance when we are trying to monitor different metrics on
>> each
>> draw call? This may be acceptable for system monitoring, but it is
>> problematic
>> for the GL extensions :s
>>
>
>> Since it seems like we are going for a perf API, it means that for every
>> change
>> of metrics, we need to flush the commands, wait for the GPU to be done,
>> then
>> program the new set of metrics via an IOCTL, wait 100 ms, and then we may
>> resume rendering ... until the next change. We are talking about a
>> latency of
>> 6-7 frames at 60 Hz here... this is non-negligeable...
>>
>
>> I understand that we have a ton of counters and we may hide latency by not
>> allowing using more than half of the counters for every draw call or
>> frame, but
>> even then, this 100ms delay is killing this approach altogether.
>>
>
>
>
So revisiting this to double check how things fail with my latest
driver/tests without the delay, I apparently can't reproduce test failures
without the delay any more...

I think the explanation is that since first adding the delay to the driver
I also made the driver a bit more careful to not forward spurious
reports that look invalid due to a zeroed report id field, and that
mechanism keeps the unit tests happy, even though there are still some
number of invalid reports generated if we don't wait.

One problem with simply having no delay is that the driver prints an error
if it sees an invalid report so I get a lot of 'Skipping spurious, invalid
OA report' dmesg spam. Also this was intended more as a last resort
mechanism, and I wouldn't feel too happy about squashing the error message
and potentially sweeping other error cases under the carpet.

Experimenting to see if the delay can at least be reduced, I brought the
delay up in millisecond increments and found that although I still see a
lot of spurious reports only waiting 1 or 5 milliseconds, at 10
milliseconds its reduced quite a bit and at 15 milliseconds I don't seem to
have any errors.

15 milliseconds is still a long time, but at least not as long as 100.

Regards,
- Robert
-- next part --
An HTML attachment was scrubbed...
URL: 



[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-05-03 Thread Robert Bragg
Sorry for the delay replying to this, I missed it.

On Sat, Apr 23, 2016 at 11:34 AM, Martin Peres  wrote:

> On 20/04/16 17:23, Robert Bragg wrote:
>
>> Gen graphics hardware can be set up to periodically write snapshots of
>> performance counters into a circular buffer via its Observation
>> Architecture and this patch exposes that capability to userspace via the
>> i915 perf interface.
>>
>> Cc: Chris Wilson 
>> Signed-off-by: Robert Bragg 
>> Signed-off-by: Zhenyu Wang 
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>>   drivers/gpu/drm/i915/i915_perf.c| 940
>> +++-
>>   drivers/gpu/drm/i915/i915_reg.h | 338 
>>   include/uapi/drm/i915_drm.h |  70 ++-
>>   5 files changed, 1408 insertions(+), 20 deletions(-)
>>
>> +
>> +
>> +   /* It takes a fairly long time for a new MUX configuration to
>> +* be be applied after these register writes. This delay
>> +* duration was derived empirically based on the render_basic
>> +* config but hopefully it covers the maximum configuration
>> +* latency...
>> +*/
>> +   mdelay(100);
>>
>
> With such a HW and SW design, how can we ever expose hope to get any
> kind of performance when we are trying to monitor different metrics on each
> draw call? This may be acceptable for system monitoring, but it is
> problematic
> for the GL extensions :s
>

> Since it seems like we are going for a perf API, it means that for every
> change
> of metrics, we need to flush the commands, wait for the GPU to be done,
> then
> program the new set of metrics via an IOCTL, wait 100 ms, and then we may
> resume rendering ... until the next change. We are talking about a latency
> of
> 6-7 frames at 60 Hz here... this is non-negligeable...
>

> I understand that we have a ton of counters and we may hide latency by not
> allowing using more than half of the counters for every draw call or
> frame, but
> even then, this 100ms delay is killing this approach altogether.
>


Although I'm also really unhappy about introducing this delay recently, the
impact of the delay is typically amortized somewhat by keeping a
configuration open as long as possible.

Even without this explicit delay here the OA unit isn't suited to being
reconfigured on a per draw call basis, though it is able to support per
draw call queries with the same config.

The above assessment assumes wanting to change config between draw calls
which is not something this driver aims to support - as the HW isn't really
designed for that model.

E.g. in the case of INTEL_performance_query, the backend keeps the i915
perf stream open until all OA based query objects are deleted - so you have
to be pretty explicit if you want to change config.

Considering the sets available on Haswell:
* Render Metrics Basic
* Compute Metrics Basic
* Compute Metrics Extended
* Memory Reads Distribution
* Memory Writes Distribution
* Metric set SamplerBalance

Each of these configs can expose around 50 counters as a set.

A GL application is most likely just going to use the render basic set, and
in the case of a tool like gputop/GPA then changing config would usually be
driven by some user interaction to select a set of metrics, where even a
100ms delay will go unnoticed.

In case you aren't familiar with how the GL_INTEL_performance_query side of
things works for OA counters; one thing to be aware of is that there's a
separate MI_REPORT_PERF_COUNT command that Mesa writes either side of a
query which writes all the counters for the current OA config (as
configured via this i915 perf interface) to a buffer. In addition to
collecting reports via MI_REPORT_PERF_COUNT Mesa also configures the unit
for periodic sampling to be able to account for potential counter overflow.


It also might be worth keeping in mind that per draw queries will anyway
trash the pipelining of work, since it's necessary to put stalls between
the draw calls to avoid conflated metrics (not to do with the details of
this driver) so use cases will probably be limited to those that just want
the draw call numbers but don't mind ruining overall frame/application
performance. Periodic sampling or swap-to-swap queries would be better
suited to cases that should minimize their impact.


>
> To be honest, if it indeed is an HW bug, then the approach that Samuel
> Pitoiset
> and I used for Nouveau involving pushing an handle representing a
> pre-computed configuration to the command buffer so as a software method
> can be ask the kernel to reprogram the counters with as little idle time as
> possible, would be useless as waiting for the GPU to be idle would usually
> not
> take more than a few ms... which is nothing compared to waiting 100ms.
>

Yeah, I think this is a really quite different programming model to what
the OA unit is geared for, even if we can somehow knock out this 100ms MUX
config delay.


>
> 

[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-23 Thread Martin Peres
On 20/04/16 17:23, Robert Bragg wrote:
> Gen graphics hardware can be set up to periodically write snapshots of
> performance counters into a circular buffer via its Observation
> Architecture and this patch exposes that capability to userspace via the
> i915 perf interface.
>
> Cc: Chris Wilson 
> Signed-off-by: Robert Bragg 
> Signed-off-by: Zhenyu Wang 
> ---
>   drivers/gpu/drm/i915/i915_drv.h |  56 +-
>   drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
>   drivers/gpu/drm/i915/i915_perf.c| 940 
> +++-
>   drivers/gpu/drm/i915/i915_reg.h | 338 
>   include/uapi/drm/i915_drm.h |  70 ++-
>   5 files changed, 1408 insertions(+), 20 deletions(-)
>
> +
> +
> + /* It takes a fairly long time for a new MUX configuration to
> +  * be be applied after these register writes. This delay
> +  * duration was derived empirically based on the render_basic
> +  * config but hopefully it covers the maximum configuration
> +  * latency...
> +  */
> + mdelay(100);

With such a HW and SW design, how can we ever hope to get any
kind of performance when we are trying to monitor different metrics on each
draw call? This may be acceptable for system monitoring, but it is 
problematic
for the GL extensions :s

Since it seems like we are going for a perf API, it means that for every 
change
of metrics, we need to flush the commands, wait for the GPU to be done, then
program the new set of metrics via an IOCTL, wait 100 ms, and then we may
resume rendering ... until the next change. We are talking about a 
latency of
6-7 frames at 60 Hz here... this is non-negligible...

I understand that we have a ton of counters and we may hide latency by not
allowing using more than half of the counters for every draw call or 
frame, but
even then, this 100ms delay is killing this approach altogether.

To be honest, if it indeed is an HW bug, then the approach that Samuel 
Pitoiset
and I used for Nouveau involving pushing a handle representing a
pre-computed configuration to the command buffer so that a software method
can ask the kernel to reprogram the counters with as little idle time as
possible, would be useless as waiting for the GPU to be idle would 
usually not
take more than a few ms... which is nothing compared to waiting 100ms.

So, now, the elephant in the room, how can it take that long to apply the
change? Are the OA registers double buffered (NVIDIA's are, so as we can
reconfigure and start monitoring multiple counters at the same time)?

Maybe this 100ms is the polling period and the HW does not allow changing
the configuration in the middle of a polling session. In this case, this 
delay
should be dependent on the polling frequency. But even then, I would really
hope that the HW would allow us to tear down everything, reconfigure and
start polling again without waiting for the next tick. If not possible, 
maybe we
can change the frequency for the polling clock to make the polling event 
happen
sooner.

HW delays are usually a few microseconds, not milliseconds, that really 
suggests
that something funny is happening and the HW design is not understood 
properly.
If the documentation has nothing on this and the HW teams cannot help, 
then I
suggest a little REing session. I really want to see this work land, but 
the way I see
it right now is that we cannot rely on it because of this bug. Maybe 
fixing this bug
would require changing the architecture, so better address it before 
landing the
patches.

Worst case scenario, do not hesitate to contact me if none of the proposed
explanation pans out, I will take the time to read through the OA 
material and try my
REing skills on it. As I said, I really want to see this upstream!

Sorry...

Martin


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-23 Thread Chris Wilson
On Thu, Apr 21, 2016 at 05:15:10PM +0100, Robert Bragg wrote:
>On Wed, Apr 20, 2016 at 10:11 PM, Chris Wilson
><[1]chris at chris-wilson.co.uk> wrote:
> 
>  On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
>  > +static void gen7_update_oacontrol_locked(struct drm_i915_private
>  *dev_priv)
>  > +{
>  > +     assert_spin_locked(_priv->perf.hook_lock);
>  > +
>  > +     if (dev_priv->perf.oa.exclusive_stream->enabled) {
>  > +             unsigned long ctx_id = 0;
>  > +
>  > +             if (dev_priv->perf.oa.exclusive_stream->ctx)
>  > +                     ctx_id = 
> dev_priv->perf.oa.specific_ctx_id;
>  > +
>  > +             if (dev_priv->perf.oa.exclusive_stream->ctx == 
> NULL ||
>  ctx_id) {
>  > +                     bool periodic = 
> dev_priv->perf.oa.periodic;
>  > +                     u32 period_exponent =
>  dev_priv->perf.oa.period_exponent;
>  > +                     u32 report_format =
>  dev_priv->perf.oa.oa_buffer.format;
>  > +
>  > +                     I915_WRITE(GEN7_OACONTROL,
>  > +                                (ctx_id & 
> GEN7_OACONTROL_CTX_MASK) |
>  > +                                (period_exponent <<
>  > +                                 
> GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
>  > +                                (periodic ?
>  > +                                 
> GEN7_OACONTROL_TIMER_ENABLE : 0) |
>  > +                                (report_format <<
>  > +                                 
> GEN7_OACONTROL_FORMAT_SHIFT) |
>  > +                                (ctx_id ?
>  > +                                 
> GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
>  > +                                
> GEN7_OACONTROL_ENABLE);
> 
>  So this works by only recording when the OACONTROL context address
>  matches the CCID.
> 
>  Rather than hooking into switch context and checking every batch whether
>  you have the exclusive context in case it changed address, you could
>  just pin the exclusive context when told by the user to bind perf to
>  that context. Then it will also have the same address until oa is
>  finished (and releases it vma pin).
> 
>Yeah, this was the approach I first went with when the driver was perf
>based, though we ended up deciding to got with hooking into pinning and
>updating the OA state in the end.
> 
>E.g. for reference:
>
> [2]https://lists.freedesktop.org/archives/intel-gfx/2014-November/055385.html
>(wow, sad face after seeing how long I've been kicking this stuff)
> 
>I'd prefer to stick with this approach now, unless you see a big problem
>with it.

Given no reason to have the hook, I don't see why we should. Pinning the
context in the GGTT and causing that bit of extra fragmentation isn't
the worst evil here. and is better practice overall to treat the OA
register as holding the pin on the object it is referencing, along with
lifetime tracking of that register (i.e. unpinning only when we know it
has completed its writes). Given that, the pin_notify is inadequate.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-22 Thread Chris Wilson
On Fri, Apr 22, 2016 at 12:04:26PM +0100, Robert Bragg wrote:
>On Wed, Apr 20, 2016 at 11:46 PM, Chris Wilson
><[1]chris at chris-wilson.co.uk> wrote:
> 
>  On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
>  > +static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
>  > +{
>  > +     /* Pre-DevBDW: OABUFFER must be set with counters off,
>  > +      * before OASTATUS1, but after OASTATUS2
>  > +      */
>  > +     I915_WRITE(GEN7_OASTATUS2,
>  dev_priv->perf.oa.oa_buffer.gtt_offset |
>  > +                OA_MEM_SELECT_GGTT); /* head */
>  > +     I915_WRITE(GEN7_OABUFFER,
>  dev_priv->perf.oa.oa_buffer.gtt_offset);
>  > +     I915_WRITE(GEN7_OASTATUS1,
>  dev_priv->perf.oa.oa_buffer.gtt_offset |
>  > +                OABUFFER_SIZE_16M); /* tail */
>  > +
>  > +     /* On Haswell we have to track which OASTATUS1 flags we've
>  > +      * already seen since they can't be cleared while periodic
>  > +      * sampling is enabled.
>  > +      */
>  > +     dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
>  > +
>  > +     /* We have a sanity check in gen7_append_oa_reports() that
>  > +      * looks at the report-id field to make sure it's non-zero
>  > +      * which relies on the assumption that new reports are
>  > +      * being written to zeroed memory...
>  > +      */
>  > +     memset(dev_priv->perf.oa.oa_buffer.addr, 0, SZ_16M);
> 
>  You allocated zeroed memory.
> 
>yup. currently I have this memset here because we may re-init the buffer
>if the stream is disabled then re-enabled (via I915_PERF_IOCTL_ENABLE) or
>if we have to reset the unit on error. In these cases there may be some
>number of reports in the buffer with non-zero report-id fields while we
>still want to be sure new reports are being written to zereod memory so
>that the sanity check that report-id != 0 will continue to be valid.
> 
>I've had it in mind to consider optimizing this at some point to minimize
>how much of the buffer is cleared, maybe just for the _DISABLE/_ENABLE
>case where I'd expect the buffer will mostly be empty before disabling the
>stream.

Or just make it clear that you are considering buffer reuse. Having the
memset here allows us to use non-shmemfs allocation, it wasn't that I
objected I just didn't understand the comment in the context of
allocation path.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-22 Thread Robert Bragg
On Wed, Apr 20, 2016 at 11:46 PM, Chris Wilson 
wrote:

> On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> > +static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
> > +{
> > + /* Pre-DevBDW: OABUFFER must be set with counters off,
> > +  * before OASTATUS1, but after OASTATUS2
> > +  */
> > + I915_WRITE(GEN7_OASTATUS2, dev_priv->perf.oa.oa_buffer.gtt_offset |
> > +OA_MEM_SELECT_GGTT); /* head */
> > + I915_WRITE(GEN7_OABUFFER, dev_priv->perf.oa.oa_buffer.gtt_offset);
> > + I915_WRITE(GEN7_OASTATUS1, dev_priv->perf.oa.oa_buffer.gtt_offset |
> > +OABUFFER_SIZE_16M); /* tail */
> > +
> > + /* On Haswell we have to track which OASTATUS1 flags we've
> > +  * already seen since they can't be cleared while periodic
> > +  * sampling is enabled.
> > +  */
> > + dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
> > +
> > + /* We have a sanity check in gen7_append_oa_reports() that
> > +  * looks at the report-id field to make sure it's non-zero
> > +  * which relies on the assumption that new reports are
> > +  * being written to zeroed memory...
> > +  */
> > + memset(dev_priv->perf.oa.oa_buffer.addr, 0, SZ_16M);
>
> You allocated zeroed memory.
>

yup. currently I have this memset here because we may re-init the buffer if
the stream is disabled then re-enabled (via I915_PERF_IOCTL_ENABLE) or if
we have to reset the unit on error. In these cases there may be some number
of reports in the buffer with non-zero report-id fields while we still want
to be sure new reports are being written to zeroed memory so that the
sanity check that report-id != 0 will continue to be valid.

I've had it in mind to consider optimizing this at some point to minimize
how much of the buffer is cleared, maybe just for the _DISABLE/_ENABLE case
where I'd expect the buffer will mostly be empty before disabling the
stream.

- Robert


-Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
>
-- next part --
An HTML attachment was scrubbed...
URL: 



[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-22 Thread Robert Bragg
On Thu, Apr 21, 2016 at 4:18 PM, Robert Bragg  wrote:

>
>
> On Thu, Apr 21, 2016 at 12:09 AM, Chris Wilson 
> wrote:
>
>> On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
>> > +static void i915_oa_stream_enable(struct i915_perf_stream *stream)
>> > +{
>> > + struct drm_i915_private *dev_priv = stream->dev_priv;
>> > +
>> > + dev_priv->perf.oa.ops.oa_enable(dev_priv);
>> > +
>> > + if (dev_priv->perf.oa.periodic)
>> > + hrtimer_start(_priv->perf.oa.poll_check_timer,
>> > +   ns_to_ktime(POLL_PERIOD),
>> > +   HRTIMER_MODE_REL_PINNED);
>> > +}
>>
>> > +static void i915_oa_stream_disable(struct i915_perf_stream *stream)
>> > +{
>> > + struct drm_i915_private *dev_priv = stream->dev_priv;
>> > +
>> > + dev_priv->perf.oa.ops.oa_disable(dev_priv);
>> > +
>> > + if (dev_priv->perf.oa.periodic)
>> > + hrtimer_cancel(_priv->perf.oa.poll_check_timer);
>> > +}
>>
>> > +static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer
>> *hrtimer)
>> > +{
>> > + struct drm_i915_private *dev_priv =
>> > + container_of(hrtimer, typeof(*dev_priv),
>> > +  perf.oa.poll_check_timer);
>> > +
>> > + if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv))
>> > + wake_up(_priv->perf.oa.poll_wq);
>> > +
>> > + hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
>> > +
>> > + return HRTIMER_RESTART;
>> > +}
>>
>> > @@ -424,8 +1313,37 @@ void i915_perf_init(struct drm_device *dev)
>> >  {
>> >   struct drm_i915_private *dev_priv = to_i915(dev);
>> >
>> > + if (!IS_HASWELL(dev))
>> > + return;
>> > +
>> > + hrtimer_init(_priv->perf.oa.poll_check_timer,
>> > +  CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> > + dev_priv->perf.oa.poll_check_timer.function =
>> oa_poll_check_timer_cb;
>> > + init_waitqueue_head(_priv->perf.oa.poll_wq);
>>
>> This timer only serves to wake up pollers / wait_unlocked, right? So why
>> is it always running (when the stream is enabled)?
>>
>>
> Right, it's unecessary. I'll look at limitting it to just while polling or
> for blocking reads.
>

Actually, looking at this, I couldn't see a clean way to synchronized with
do_sys_poll() returning to be able to cancel the hrtimer. The .poll fop is
only responsible for registering a wait queue via poll_wait() and checking
if there are already events pending before any wait.

The current hrtimer frequency was picked as a reasonable default to ensure
we pick up samples before an overflow with high frequency OA sampling but
also avoids latency in picking up samples written at a lower frequency.
Something I've considered a few times before is that we might want to add a
property that can influence the latency userspace is happy with, which
might also alleviate some of this concern that the hrtimer runs all the
time the stream is enabled when it could often be fine to run at a much
lower frequency than the current 200Hz.

For now, maybe it's ok to stick with the fixed frequency, and I'll plan to
experiment with a property for influencing the maximum latency?

Regards,
- Robert
-- next part --
An HTML attachment was scrubbed...
URL: 



[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Chris Wilson
On Thu, Apr 21, 2016 at 04:43:19PM +0100, Robert Bragg wrote:
>On Wed, Apr 20, 2016 at 11:52 PM, Chris Wilson
><[1]chris at chris-wilson.co.uk> wrote:
> 
>  On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
>  > +static int i915_oa_read(struct i915_perf_stream *stream,
>  > +                     struct i915_perf_read_state 
> *read_state)
>  > +{
>  > +     struct drm_i915_private *dev_priv = stream->dev_priv;
>  > +
>  > +     return dev_priv->perf.oa.ops.read(stream, read_state);
>  > +}
> 
>  > +     stream->destroy = i915_oa_stream_destroy;
>  > +     stream->enable = i915_oa_stream_enable;
>  > +     stream->disable = i915_oa_stream_disable;
>  > +     stream->can_read = i915_oa_can_read;
>  > +     stream->wait_unlocked = i915_oa_wait_unlocked;
>  > +     stream->poll_wait = i915_oa_poll_wait;
>  > +     stream->read = i915_oa_read;
> 
>  Why aren't these a const ops table?
> 
>No particular reason; I guess it just seemed straightforward enough at the
>time. I suppose it avoids some redundant pointer indirection and could
>suit defining streams in the future that might find it awkward to have
>static ops (don't have anything like that in mind though) but it's at the
>expense of a slightly larger stream struct (though also don't see that as
>a concern currently).
> 
>Can change if you like.

I think it is safe to say it is considered best practice to have vfunc
tables in read-only memory. Certainly raises an eyebrow when they look
like they could be modified on the fly.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Robert Bragg
On Wed, Apr 20, 2016 at 10:11 PM, Chris Wilson 
wrote:

> On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> > +static void gen7_update_oacontrol_locked(struct drm_i915_private
> *dev_priv)
> > +{
> > + assert_spin_locked(_priv->perf.hook_lock);
> > +
> > + if (dev_priv->perf.oa.exclusive_stream->enabled) {
> > + unsigned long ctx_id = 0;
> > +
> > + if (dev_priv->perf.oa.exclusive_stream->ctx)
> > + ctx_id = dev_priv->perf.oa.specific_ctx_id;
> > +
> > + if (dev_priv->perf.oa.exclusive_stream->ctx == NULL ||
> ctx_id) {
> > + bool periodic = dev_priv->perf.oa.periodic;
> > + u32 period_exponent =
> dev_priv->perf.oa.period_exponent;
> > + u32 report_format =
> dev_priv->perf.oa.oa_buffer.format;
> > +
> > + I915_WRITE(GEN7_OACONTROL,
> > +(ctx_id & GEN7_OACONTROL_CTX_MASK) |
> > +(period_exponent <<
> > + GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
> > +(periodic ?
> > + GEN7_OACONTROL_TIMER_ENABLE : 0) |
> > +(report_format <<
> > + GEN7_OACONTROL_FORMAT_SHIFT) |
> > +(ctx_id ?
> > + GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
> > +GEN7_OACONTROL_ENABLE);
>
> So this works by only recording when the OACONTROL context address
> matches the CCID.
>

> Rather than hooking into switch context and checking every batch whether
> you have the exclusive context in case it changed address, you could
> just pin the exclusive context when told by the user to bind perf to
> that context. Then it will also have the same address until oa is
> finished (and releases it vma pin).
>

Yeah, this was the approach I first went with when the driver was perf
based, though we ended up deciding to go with hooking into pinning and
updating the OA state in the end.

E.g. for reference:
https://lists.freedesktop.org/archives/intel-gfx/2014-November/055385.html
(wow, sad face after seeing how long I've been kicking this stuff)

I'd prefer to stick with this approach now, unless you see a big problem
with it.

Regards,
- Robert



> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
>
-- next part --
An HTML attachment was scrubbed...
URL: 



[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Robert Bragg
On Wed, Apr 20, 2016 at 11:52 PM, Chris Wilson 
wrote:

> On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> > +static int i915_oa_read(struct i915_perf_stream *stream,
> > + struct i915_perf_read_state *read_state)
> > +{
> > + struct drm_i915_private *dev_priv = stream->dev_priv;
> > +
> > + return dev_priv->perf.oa.ops.read(stream, read_state);
> > +}
>
> > + stream->destroy = i915_oa_stream_destroy;
> > + stream->enable = i915_oa_stream_enable;
> > + stream->disable = i915_oa_stream_disable;
> > + stream->can_read = i915_oa_can_read;
> > + stream->wait_unlocked = i915_oa_wait_unlocked;
> > + stream->poll_wait = i915_oa_poll_wait;
> > + stream->read = i915_oa_read;
>
> Why aren't these a const ops table?
>

No particular reason; I guess it just seemed straightforward enough at the
time. I suppose it avoids some redundant pointer indirection and could suit
defining streams in the future that might find it awkward to have static
ops (don't have anything like that in mind though) but it's at the expense
of a slightly larger stream struct (though also don't see that as a concern
currently).

Can change if you like.

Regards,
- Robert


> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
>
-- next part --
An HTML attachment was scrubbed...
URL: 



[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Robert Bragg
On Thu, Apr 21, 2016 at 12:09 AM, Chris Wilson 
wrote:

> On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> > +static void i915_oa_stream_enable(struct i915_perf_stream *stream)
> > +{
> > + struct drm_i915_private *dev_priv = stream->dev_priv;
> > +
> > + dev_priv->perf.oa.ops.oa_enable(dev_priv);
> > +
> > + if (dev_priv->perf.oa.periodic)
> > + hrtimer_start(_priv->perf.oa.poll_check_timer,
> > +   ns_to_ktime(POLL_PERIOD),
> > +   HRTIMER_MODE_REL_PINNED);
> > +}
>
> > +static void i915_oa_stream_disable(struct i915_perf_stream *stream)
> > +{
> > + struct drm_i915_private *dev_priv = stream->dev_priv;
> > +
> > + dev_priv->perf.oa.ops.oa_disable(dev_priv);
> > +
> > + if (dev_priv->perf.oa.periodic)
> > + hrtimer_cancel(_priv->perf.oa.poll_check_timer);
> > +}
>
> > +static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer
> *hrtimer)
> > +{
> > + struct drm_i915_private *dev_priv =
> > + container_of(hrtimer, typeof(*dev_priv),
> > +  perf.oa.poll_check_timer);
> > +
> > + if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv))
> > + wake_up(_priv->perf.oa.poll_wq);
> > +
> > + hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
> > +
> > + return HRTIMER_RESTART;
> > +}
>
> > @@ -424,8 +1313,37 @@ void i915_perf_init(struct drm_device *dev)
> >  {
> >   struct drm_i915_private *dev_priv = to_i915(dev);
> >
> > + if (!IS_HASWELL(dev))
> > + return;
> > +
> > + hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
> > +  CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> > + dev_priv->perf.oa.poll_check_timer.function =
> oa_poll_check_timer_cb;
> > + init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
>
> This timer only serves to wake up pollers / wait_unlocked, right? So why
> is it always running (when the stream is enabled)?
>
> What happens to poll / wait_unlocked if oa.periodic is not set? It seems
> like those functions would block indefinitely.
>

Right, it's unnecessary. I'll look at limiting it to just while polling or
for blocking reads.

Good point about the blocking case too.

I just started testing that scenario yesterday, writing an MI_RPC unit
test which opens a stream without requesting periodic sampling, but didn't
poll or read in that case so far so didn't hit this yet.

At least for the read() this is partially considered by returning -EIO if
attempting a blocking read while the stream is disabled, but it doesn't
consider the case that the stream is enabled but periodic sampling isn't
enabled.

Regards,
- Robert


> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
>
-- next part --
An HTML attachment was scrubbed...
URL: 



[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Robert Bragg
On Thu, Apr 21, 2016 at 12:16 AM, Chris Wilson 
wrote:

> On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> > +static int hsw_enable_metric_set(struct drm_i915_private *dev_priv)
> > +{
> > + int ret = i915_oa_select_metric_set_hsw(dev_priv);
> > +
> > + if (ret)
> > + return ret;
> > +
> > + I915_WRITE(GDT_CHICKEN_BITS, GT_NOA_ENABLE);
> > +
> > + /* PRM:
> > +  *
> > +  * OA unit is using “crclk” for its functionality. When trunk
> > +  * level clock gating takes place, OA clock would be gated,
> > +  * unable to count the events from non-render clock domain.
> > +  * Render clock gating must be disabled when OA is enabled to
> > +  * count the events from non-render domain. Unit level clock
> > +  * gating for RCS should also be disabled.
> > +  */
> > + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
> > + ~GEN7_DOP_CLOCK_GATE_ENABLE));
> > + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
> > +   GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> > +
> > + config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
> > +dev_priv->perf.oa.mux_regs_len);
> > +
> > + /* It takes a fairly long time for a new MUX configuration to
> > +  * be applied after these register writes. This delay
> > +  * duration was derived empirically based on the render_basic
> > +  * config but hopefully it covers the maximum configuration
> > +  * latency...
> > +  */
> > + mdelay(100);
>
> You really want to busy spin for 100ms? msleep() perhaps!
>

Ah, oops, I forgot to change this, thanks!


>
> Did you look for some register you can observe the change in when the
> mux is reconfigured? Is even reading one of the OA registers enough?
>

Although I can't really comprehend why the delay apparently needs to be
quite so long, based on my limited understanding of some of the NOA
microarchitecture involved here it makes some sense to me there would be a
delay that's also somewhat variable depending on the particular MUX config
and I don't know of a trick for getting explicit feedback of completion
unfortunately.

I did bring this up briefly, recently in discussion with others more
familiar with the HW side of things, but haven't had much feedback on this
so far. afaik other OS drivers aren't currently accounting for a need to
have a delay here.

For reference, 100ms was picked as I was experimenting with stepping up the
delay by orders of magnitude and found 10ms wasn't enough. Potentially I
could experiment further with delays between 10 and 100ms, but I suppose it
won't make a big difference.



>
> > + config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
> > +dev_priv->perf.oa.b_counter_regs_len);
> > +
> > + return 0;
> > +}
> > +
> > +static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
> > +{
> > + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
> > +   ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> > + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
> > + GEN7_DOP_CLOCK_GATE_ENABLE));
> > +
> > + I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
> > +   ~GT_NOA_ENABLE));
>
> You didn't preserve any other chicken bits during enable_metric_set.
>

Hmm, good point. I think I'll aim to preserve other bits when setting if
that works, just in case something else needs to fiddle with the same
register later.


> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
>
-- next part --
An HTML attachment was scrubbed...
URL: 



[Intel-gfx] [PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread kbuild test robot
Hi,

[auto build test ERROR on drm-intel/for-linux-next]
[also build test ERROR on next-20160420]
[cannot apply to v4.6-rc4]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improving the system]

url:
https://github.com/0day-ci/linux/commits/Robert-Bragg/Enable-Gen-7-Observation-Architecture/20160420-222746
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: i386-allmodconfig (attached as .config)
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

>> ERROR: "__udivdi3" [drivers/gpu/drm/i915/i915.ko] undefined!

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation
-- next part --
A non-text attachment was scrubbed...
Name: .config.gz
Type: application/octet-stream
Size: 54467 bytes
Desc: not available
URL: 



[Intel-gfx] [PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread kbuild test robot
Hi,

[auto build test ERROR on drm-intel/for-linux-next]
[also build test ERROR on next-20160420]
[cannot apply to v4.6-rc4]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improving the system]

url:
https://github.com/0day-ci/linux/commits/Robert-Bragg/Enable-Gen-7-Observation-Architecture/20160420-222746
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: i386-randconfig-s1-201616 (attached as .config)
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   drivers/built-in.o: In function `i915_perf_open_ioctl_locked':
>> (.text+0x2cadf4): undefined reference to `__udivdi3'
   drivers/built-in.o: In function `i915_perf_open_ioctl_locked':
   (.text+0x2cae0d): undefined reference to `__udivdi3'

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation
-- next part --
A non-text attachment was scrubbed...
Name: .config.gz
Type: application/octet-stream
Size: 25411 bytes
Desc: not available
URL: 



[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Chris Wilson
On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> +static int hsw_enable_metric_set(struct drm_i915_private *dev_priv)
> +{
> + int ret = i915_oa_select_metric_set_hsw(dev_priv);
> +
> + if (ret)
> + return ret;
> +
> + I915_WRITE(GDT_CHICKEN_BITS, GT_NOA_ENABLE);
> +
> + /* PRM:
> +  *
> +  * OA unit is using “crclk” for its functionality. When trunk
> +  * level clock gating takes place, OA clock would be gated,
> +  * unable to count the events from non-render clock domain.
> +  * Render clock gating must be disabled when OA is enabled to
> +  * count the events from non-render domain. Unit level clock
> +  * gating for RCS should also be disabled.
> +  */
> + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
> + ~GEN7_DOP_CLOCK_GATE_ENABLE));
> + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
> +   GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> +
> + config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
> +dev_priv->perf.oa.mux_regs_len);
> +
> + /* It takes a fairly long time for a new MUX configuration to
> +  * be applied after these register writes. This delay
> +  * duration was derived empirically based on the render_basic
> +  * config but hopefully it covers the maximum configuration
> +  * latency...
> +  */
> + mdelay(100);

You really want to busy spin for 100ms? msleep() perhaps!

Did you look for some register you can observe the change in when the
mux is reconfigured? Is even reading one of the OA registers enough?

> + config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
> +dev_priv->perf.oa.b_counter_regs_len);
> +
> + return 0;
> +}
> +
> +static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
> +{
> + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
> +   ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
> + GEN7_DOP_CLOCK_GATE_ENABLE));
> +
> + I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
> +   ~GT_NOA_ENABLE));

You didn't preserve any other chicken bits during enable_metric_set.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Chris Wilson
On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> +static void i915_oa_stream_enable(struct i915_perf_stream *stream)
> +{
> + struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> + dev_priv->perf.oa.ops.oa_enable(dev_priv);
> +
> + if (dev_priv->perf.oa.periodic)
> + hrtimer_start(&dev_priv->perf.oa.poll_check_timer,
> +   ns_to_ktime(POLL_PERIOD),
> +   HRTIMER_MODE_REL_PINNED);
> +}

> +static void i915_oa_stream_disable(struct i915_perf_stream *stream)
> +{
> + struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> + dev_priv->perf.oa.ops.oa_disable(dev_priv);
> +
> + if (dev_priv->perf.oa.periodic)
> + hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
> +}

> +static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
> +{
> + struct drm_i915_private *dev_priv =
> + container_of(hrtimer, typeof(*dev_priv),
> +  perf.oa.poll_check_timer);
> +
> + if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv))
> + wake_up(&dev_priv->perf.oa.poll_wq);
> +
> + hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
> +
> + return HRTIMER_RESTART;
> +}

> @@ -424,8 +1313,37 @@ void i915_perf_init(struct drm_device *dev)
>  {
>   struct drm_i915_private *dev_priv = to_i915(dev);
>  
> + if (!IS_HASWELL(dev))
> + return;
> +
> + hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
> +  CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
> + init_waitqueue_head(&dev_priv->perf.oa.poll_wq);

This timer only serves to wake up pollers / wait_unlocked, right? So why
is it always running (when the stream is enabled)?

What happens to poll / wait_unlocked if oa.periodic is not set? It seems
like those functions would block indefinitely.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Chris Wilson
On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> +static int i915_oa_read(struct i915_perf_stream *stream,
> + struct i915_perf_read_state *read_state)
> +{
> + struct drm_i915_private *dev_priv = stream->dev_priv;
> +
> + return dev_priv->perf.oa.ops.read(stream, read_state);
> +}

> + stream->destroy = i915_oa_stream_destroy;
> + stream->enable = i915_oa_stream_enable;
> + stream->disable = i915_oa_stream_disable;
> + stream->can_read = i915_oa_can_read;
> + stream->wait_unlocked = i915_oa_wait_unlocked;
> + stream->poll_wait = i915_oa_poll_wait;
> + stream->read = i915_oa_read;

Why aren't these a const ops table?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Chris Wilson
On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> +static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> + /* Pre-DevBDW: OABUFFER must be set with counters off,
> +  * before OASTATUS1, but after OASTATUS2
> +  */
> + I915_WRITE(GEN7_OASTATUS2, dev_priv->perf.oa.oa_buffer.gtt_offset |
> +OA_MEM_SELECT_GGTT); /* head */
> + I915_WRITE(GEN7_OABUFFER, dev_priv->perf.oa.oa_buffer.gtt_offset);
> + I915_WRITE(GEN7_OASTATUS1, dev_priv->perf.oa.oa_buffer.gtt_offset |
> +OABUFFER_SIZE_16M); /* tail */
> +
> + /* On Haswell we have to track which OASTATUS1 flags we've
> +  * already seen since they can't be cleared while periodic
> +  * sampling is enabled.
> +  */
> + dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
> +
> + /* We have a sanity check in gen7_append_oa_reports() that
> +  * looks at the report-id field to make sure it's non-zero
> +  * which relies on the assumption that new reports are
> +  * being written to zeroed memory...
> +  */
> + memset(dev_priv->perf.oa.oa_buffer.addr, 0, SZ_16M);

You allocated zeroed memory.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-21 Thread Chris Wilson
On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> + struct drm_i915_gem_object *bo;
> + int ret;
> +
> + BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
> +
> + ret = i915_mutex_lock_interruptible(dev_priv->dev);
> + if (ret)
> + return ret;
> +
> + bo = i915_gem_alloc_object(dev_priv->dev, OA_BUFFER_SIZE);
> + if (bo == NULL) {
> + DRM_ERROR("Failed to allocate OA buffer\n");
> + ret = -ENOMEM;
> + goto unlock;
> + }
> + dev_priv->perf.oa.oa_buffer.obj = bo;
> +
> + ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> + if (ret)
> + goto err_unref;
> +
> + /* PreHSW required 512K alignment, HSW requires 16M */
> + ret = i915_gem_obj_ggtt_pin(bo, SZ_16M, 0);
> + if (ret)
> + goto err_unref;
> +
> + dev_priv->perf.oa.oa_buffer.gtt_offset = i915_gem_obj_ggtt_offset(bo);
> + dev_priv->perf.oa.oa_buffer.addr = vmap_oa_buffer(bo);

Now i915_gem_object_pin_map(bo) instead of manually vmapping it, and
i915_gem_object_unpin_map() to release.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-20 Thread Chris Wilson
On Wed, Apr 20, 2016 at 03:23:10PM +0100, Robert Bragg wrote:
> +static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
> +{
> + assert_spin_locked(&dev_priv->perf.hook_lock);
> +
> + if (dev_priv->perf.oa.exclusive_stream->enabled) {
> + unsigned long ctx_id = 0;
> +
> + if (dev_priv->perf.oa.exclusive_stream->ctx)
> + ctx_id = dev_priv->perf.oa.specific_ctx_id;
> +
> + if (dev_priv->perf.oa.exclusive_stream->ctx == NULL || ctx_id) {
> + bool periodic = dev_priv->perf.oa.periodic;
> + u32 period_exponent = dev_priv->perf.oa.period_exponent;
> + u32 report_format = dev_priv->perf.oa.oa_buffer.format;
> +
> + I915_WRITE(GEN7_OACONTROL,
> +(ctx_id & GEN7_OACONTROL_CTX_MASK) |
> +(period_exponent <<
> + GEN7_OACONTROL_TIMER_PERIOD_SHIFT) |
> +(periodic ?
> + GEN7_OACONTROL_TIMER_ENABLE : 0) |
> +(report_format <<
> + GEN7_OACONTROL_FORMAT_SHIFT) |
> +(ctx_id ?
> + GEN7_OACONTROL_PER_CTX_ENABLE : 0) |
> +GEN7_OACONTROL_ENABLE);

So this works by only recording when the OACONTROL context address
matches the CCID.

Rather than hooking into switch context and checking every batch whether
you have the exclusive context in case it changed address, you could
just pin the exclusive context when told by the user to bind perf to
that context. Then it will also have the same address until oa is
finished (and releases it vma pin).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH 5/9] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-04-20 Thread Robert Bragg
Gen graphics hardware can be set up to periodically write snapshots of
performance counters into a circular buffer via its Observation
Architecture and this patch exposes that capability to userspace via the
i915 perf interface.

Cc: Chris Wilson 
Signed-off-by: Robert Bragg 
Signed-off-by: Zhenyu Wang 
---
 drivers/gpu/drm/i915/i915_drv.h |  56 +-
 drivers/gpu/drm/i915/i915_gem_context.c |  24 +-
 drivers/gpu/drm/i915/i915_perf.c| 940 +++-
 drivers/gpu/drm/i915/i915_reg.h | 338 
 include/uapi/drm/i915_drm.h |  70 ++-
 5 files changed, 1408 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5e959f3..972ae6c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1708,8 +1708,13 @@ struct intel_wm_config {
bool sprites_scaled;
 };

+struct i915_oa_format {
+   u32 format;
+   int size;
+};
+
 struct i915_oa_reg {
-   u32 addr;
+   i915_reg_t addr;
u32 value;
 };

@@ -1725,6 +1730,7 @@ struct i915_perf_stream {
struct list_head link;

u32 sample_flags;
+   int sample_size;

struct intel_context *ctx;
bool enabled;
@@ -1786,6 +1792,20 @@ struct i915_perf_stream {
void (*destroy)(struct i915_perf_stream *stream);
 };

+struct i915_oa_ops {
+   void (*init_oa_buffer)(struct drm_i915_private *dev_priv);
+   int (*enable_metric_set)(struct drm_i915_private *dev_priv);
+   void (*disable_metric_set)(struct drm_i915_private *dev_priv);
+   void (*oa_enable)(struct drm_i915_private *dev_priv);
+   void (*oa_disable)(struct drm_i915_private *dev_priv);
+   void (*update_oacontrol)(struct drm_i915_private *dev_priv);
+   void (*update_hw_ctx_id_locked)(struct drm_i915_private *dev_priv,
+   u32 ctx_id);
+   int (*read)(struct i915_perf_stream *stream,
+   struct i915_perf_read_state *read_state);
+   bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
+};
+
 struct drm_i915_private {
struct drm_device *dev;
struct kmem_cache *objects;
@@ -2059,16 +2079,46 @@ struct drm_i915_private {

struct {
bool initialized;
+
struct mutex lock;
struct list_head streams;

+   spinlock_t hook_lock;
+
struct {
-   u32 metrics_set;
+   struct i915_perf_stream *exclusive_stream;
+
+   u32 specific_ctx_id;
+
+   struct hrtimer poll_check_timer;
+   wait_queue_head_t poll_wq;
+
+   bool periodic;
+   int period_exponent;
+   int timestamp_frequency;
+
+   int tail_margin;
+
+   int metrics_set;

const struct i915_oa_reg *mux_regs;
int mux_regs_len;
const struct i915_oa_reg *b_counter_regs;
int b_counter_regs_len;
+
+   struct {
+   struct drm_i915_gem_object *obj;
+   u32 gtt_offset;
+   u8 *addr;
+   int format;
+   int format_size;
+   } oa_buffer;
+
+   u32 gen7_latched_oastatus1;
+
+   struct i915_oa_ops ops;
+   const struct i915_oa_format *oa_formats;
+   int n_builtin_sets;
} oa;
} perf;

@@ -3429,6 +3479,8 @@ int i915_gem_context_setparam_ioctl(struct drm_device 
*dev, void *data,

 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);
+void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
+   struct intel_context *context);

 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev,
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index e5acc39..ed5665f 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -133,6 +133,23 @@ static int get_context_size(struct drm_device *dev)
return ret;
 }

+static int i915_gem_context_pin_state(struct drm_device *dev,
+ struct intel_context *ctx)
+{
+   int ret;
+
+   BUG_ON(!mutex_is_locked(&dev->struct_mutex));
+
+   ret = i915_gem_obj_ggtt_pin(ctx->legacy_hw_ctx.rcs_state,
+   get_context_alignment(dev), 0);
+   if (ret)
+   return ret;
+
+   i915_oa_context_pin_notify(dev->dev_private, ctx);
+
+   return 0;
+}
+
 static void i915_gem_context_clean(struct