[PATCH v6 06/11] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-10-21 Thread Robert Bragg
On Thu, Oct 20, 2016 at 11:10 PM, Chris Wilson wrote:

> On Thu, Oct 20, 2016 at 10:19:05PM +0100, Robert Bragg wrote:
> > +int i915_gem_context_pin_legacy_rcs_state(struct drm_i915_private *dev_priv,
> > +   struct i915_gem_context *ctx,
> > +   u64 flags)
>
> This is still no.
>
> > +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> > +{
> > + struct drm_i915_gem_object *bo;
> > + enum i915_map_type map;
> > + struct i915_vma *vma;
> > + int ret;
> > +
> > + BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
> > +
> > + ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> > + if (ret)
> > + return ret;
> > +
> > + BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
> > + BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
> > +
> > + bo = i915_gem_object_create(&dev_priv->drm, OA_BUFFER_SIZE);
> > + if (IS_ERR(bo)) {
> > + DRM_ERROR("Failed to allocate OA buffer\n");
> > + ret = PTR_ERR(bo);
> > + goto unlock;
> > + }
> > + dev_priv->perf.oa.oa_buffer.obj = bo;
> > +
> > + ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> > + if (ret)
> > + goto err_unref;
> > +
> > + /* PreHSW required 512K alignment, HSW requires 16M */
> > + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);
> > + if (IS_ERR(vma)) {
> > + ret = PTR_ERR(vma);
> > + goto err_unref;
> > + }
> > + dev_priv->perf.oa.oa_buffer.vma = vma;
> > +
> > + map = HAS_LLC(dev_priv) ? I915_MAP_WB : I915_MAP_WC;
>
> You set the hw up to do coherent writes into the CPU cache, and then you
> request WC access to the pages? With set_cache_level(LLC) you can use
> MAP_WB on both llc and snoop based architectures. Fortunately this is
> only HSW!
>

Hmm, yeah, it looks like I unwittingly added this recently as part of a
rebase, I think from lazily copying some similar code from
intel_ringbuffer.c when I hit a conflict, without thinking it through.
Sorry.
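
For reference, presumably something like the following is all that's needed
here (just a sketch against the quoted code, untested):

	/*
	 * With set_cache_level(I915_CACHE_LLC) above, a WB CPU mapping is
	 * coherent on both LLC and snooping platforms, so the HAS_LLC()
	 * special case can simply go away:
	 */
	dev_priv->perf.oa.oa_buffer.addr =
		i915_gem_object_pin_map(bo, I915_MAP_WB);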


>
> > + dev_priv->perf.oa.oa_buffer.gtt_offset = i915_ggtt_offset(vma);
>
> I haven't spotted the advantage of storing both the ggtt_offset in
> addition to the vma (or the bo as well as the vma).
>

right, it looks like this can be cleaned up.
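
i.e. drop the cached gtt_offset (and probably the obj pointer too, as you
note) and derive them from the pinned vma where needed; roughly:

	/* No need to cache the GGTT offset; compute it from the vma at
	 * the point of use:
	 */
	u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);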


>
> > + dev_priv->perf.oa.oa_buffer.addr = i915_gem_object_pin_map(bo, map);
> > + if (IS_ERR(dev_priv->perf.oa.oa_buffer.addr)) {
> > + ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.addr);
> > + goto err_unpin;
> > + }
>
> --
> Chris Wilson, Intel Open Source Technology Centre
>

Thanks,
- Robert

[PATCH v6 06/11] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-10-21 Thread Robert Bragg
On Thu, Oct 20, 2016 at 11:10 PM, Chris Wilson wrote:

> On Thu, Oct 20, 2016 at 10:19:05PM +0100, Robert Bragg wrote:
> > +int i915_gem_context_pin_legacy_rcs_state(struct drm_i915_private *dev_priv,
> > +   struct i915_gem_context *ctx,
> > +   u64 flags)
>
> This is still no.
>

Okay, but it's a little frustrating for me to go in circles here :-/

I didn't originally do it this way; I originally looked at pinning the
context when opening the stream so I didn't have to consider it being
relocated. The feedback from Daniel Vetter was to do it this way instead,
I think because of a concern about some shrinker corner cases.

... just dug up the archive:
https://lists.freedesktop.org/archives/intel-gfx/2014-November/055385.html

Can you maybe please explain what's wrong with the current approach and
provide some justification for a different approach with some reassurance
that Daniel's original concern with the shrinker unpinning contexts isn't
actually a problem? I don't currently understand the concern with this, and
this approach seems to have been working well for quite a long time now.

- Robert

[PATCH v6 06/11] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-10-21 Thread Chris Wilson
On Thu, Oct 20, 2016 at 10:19:05PM +0100, Robert Bragg wrote:
> +int i915_gem_context_pin_legacy_rcs_state(struct drm_i915_private *dev_priv,
> +   struct i915_gem_context *ctx,
> +   u64 flags)

This is still no.

> +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> + struct drm_i915_gem_object *bo;
> + enum i915_map_type map;
> + struct i915_vma *vma;
> + int ret;
> +
> + BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
> +
> + ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> + if (ret)
> + return ret;
> +
> + BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
> + BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
> +
> + bo = i915_gem_object_create(&dev_priv->drm, OA_BUFFER_SIZE);
> + if (IS_ERR(bo)) {
> + DRM_ERROR("Failed to allocate OA buffer\n");
> + ret = PTR_ERR(bo);
> + goto unlock;
> + }
> + dev_priv->perf.oa.oa_buffer.obj = bo;
> +
> + ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> + if (ret)
> + goto err_unref;
> +
> + /* PreHSW required 512K alignment, HSW requires 16M */
> + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);
> + if (IS_ERR(vma)) {
> + ret = PTR_ERR(vma);
> + goto err_unref;
> + }
> + dev_priv->perf.oa.oa_buffer.vma = vma;
> +
> + map = HAS_LLC(dev_priv) ? I915_MAP_WB : I915_MAP_WC;

You set the hw up to do coherent writes into the CPU cache, and then you
request WC access to the pages? With set_cache_level(LLC) you can use
MAP_WB on both llc and snoop based architectures. Fortunately this is
only HSW!

> + dev_priv->perf.oa.oa_buffer.gtt_offset = i915_ggtt_offset(vma);

I haven't spotted the advantage of storing both the ggtt_offset in
addition to the vma (or the bo as well as the vma).

> + dev_priv->perf.oa.oa_buffer.addr = i915_gem_object_pin_map(bo, map);
> + if (IS_ERR(dev_priv->perf.oa.oa_buffer.addr)) {
> + ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.addr);
> + goto err_unpin;
> + }

-- 
Chris Wilson, Intel Open Source Technology Centre


[PATCH v6 06/11] drm/i915: Enable i915 perf stream for Haswell OA unit

2016-10-20 Thread Robert Bragg
Gen graphics hardware can be set up to periodically write snapshots of
performance counters into a circular buffer via its Observation
Architecture (OA), and this patch exposes that capability to userspace via
the i915 perf interface.
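
To illustrate the interface, here's a minimal userspace sketch of opening a
periodically sampled OA stream. The property and ioctl names below are the
ones that ended up in the upstream uapi; the exact names and values in this
v6 may differ, and the metrics set, format and exponent are illustrative:

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int open_oa_stream(int drm_fd)
{
	uint64_t properties[] = {
		/* Include raw OA reports in samples read() from the stream */
		DRM_I915_PERF_PROP_SAMPLE_OA, 1,

		/* Which counter configuration (metrics set) to program */
		DRM_I915_PERF_PROP_OA_METRICS_SET, 1,

		/* Haswell A45/B8/C8 report layout */
		DRM_I915_PERF_PROP_OA_FORMAT, I915_OA_FORMAT_A45_B8_C8,

		/* Periodic sampling; on HSW the sampling period is
		 * 80ns * 2^(exponent + 1) */
		DRM_I915_PERF_PROP_OA_EXPONENT, 16,
	};
	struct drm_i915_perf_open_param param = {
		.flags = I915_PERF_FLAG_FD_CLOEXEC,
		.num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
		.properties_ptr = (uintptr_t)properties,
	};

	/* On success this returns a new fd that can be read()/poll()ed
	 * for OA sample records. */
	return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
}

The (property, value) pair scheme keeps the ioctl extensible as new stream
types and properties are added.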

v2:
   Make sure to initialize ->specific_ctx_id when opening the stream,
   without relying on the pin_notify hook, in case the ctx is already
   pinned.

Cc: Chris Wilson 
Signed-off-by: Robert Bragg 
Signed-off-by: Zhenyu Wang 

factor out init_specific_ctx_id func
---
 drivers/gpu/drm/i915/i915_drv.h         |   72 ++-
 drivers/gpu/drm/i915/i915_gem_context.c |   22 +-
 drivers/gpu/drm/i915/i915_perf.c        | 1034 ++-
 drivers/gpu/drm/i915/i915_reg.h         |  338 ++
 drivers/gpu/drm/i915/intel_ringbuffer.c |   11 +-
 include/uapi/drm/i915_drm.h             |   70 ++-
 6 files changed, 1515 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 28f3f77..b234412 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1760,6 +1760,11 @@ struct intel_wm_config {
bool sprites_scaled;
 };

+struct i915_oa_format {
+   u32 format;
+   int size;
+};
+
 struct i915_oa_reg {
i915_reg_t addr;
u32 value;
@@ -1780,11 +1785,6 @@ struct i915_perf_stream_ops {
 */
void (*disable)(struct i915_perf_stream *stream);

-   /* Return: true if any i915 perf records are ready to read()
-* for this stream.
-*/
-   bool (*can_read)(struct i915_perf_stream *stream);
-
/* Call poll_wait, passing a wait queue that will be woken
 * once there is something ready to read() for the stream
 */
@@ -1794,9 +1794,7 @@ struct i915_perf_stream_ops {

/* For handling a blocking read, wait until there is something
 * to ready to read() for the stream. E.g. wait on the same
-* wait queue that would be passed to poll_wait() until
-* ->can_read() returns true (if its safe to call ->can_read()
-* without the i915 perf lock held).
+* wait queue that would be passed to poll_wait().
 */
int (*wait_unlocked)(struct i915_perf_stream *stream);

@@ -1836,11 +1834,28 @@ struct i915_perf_stream {
struct list_head link;

u32 sample_flags;
+   int sample_size;

struct i915_gem_context *ctx;
bool enabled;

-   struct i915_perf_stream_ops *ops;
+   const struct i915_perf_stream_ops *ops;
+};
+
+struct i915_oa_ops {
+   void (*init_oa_buffer)(struct drm_i915_private *dev_priv);
+   int (*enable_metric_set)(struct drm_i915_private *dev_priv);
+   void (*disable_metric_set)(struct drm_i915_private *dev_priv);
+   void (*oa_enable)(struct drm_i915_private *dev_priv);
+   void (*oa_disable)(struct drm_i915_private *dev_priv);
+   void (*update_oacontrol)(struct drm_i915_private *dev_priv);
+   void (*update_hw_ctx_id_locked)(struct drm_i915_private *dev_priv,
+   u32 ctx_id);
+   int (*read)(struct i915_perf_stream *stream,
+   char __user *buf,
+   size_t count,
+   size_t *offset);
+   bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv);
 };

 struct drm_i915_private {
@@ -2145,16 +2160,48 @@ struct drm_i915_private {

struct {
bool initialized;
+
struct mutex lock;
struct list_head streams;

+   spinlock_t hook_lock;
+
struct {
-   u32 metrics_set;
+   struct i915_perf_stream *exclusive_stream;
+
+   u32 specific_ctx_id;
+
+   struct hrtimer poll_check_timer;
+   wait_queue_head_t poll_wq;
+   atomic_t pollin;
+
+   bool periodic;
+   int period_exponent;
+   int timestamp_frequency;
+
+   int tail_margin;
+
+   int metrics_set;

const struct i915_oa_reg *mux_regs;
int mux_regs_len;
const struct i915_oa_reg *b_counter_regs;
int b_counter_regs_len;
+
+   struct {
+   struct drm_i915_gem_object *obj;
+   struct i915_vma *vma;
+   u32 gtt_offset;
+   u8 *addr;
+   int format;
+   int format_size;
+   } oa_buffer;
+
+   u32 gen7_latched_oastatus1;
+
+   struct i915_oa_ops ops;
+   const struct i915_oa_format *oa_formats;
+   int n_builtin_sets;
} oa;
} perf;

@@ -3525,6 +3572,9 @@ struct