Re: [Intel-gfx] [PATCH 2/2] drm/i915/perf: fix ctx_id read with GuC & ICL

2018-05-31 Thread Lionel Landwerlin

On 31/05/18 21:46, Michel Thierry wrote:

On 5/31/2018 12:56 PM, Lionel Landwerlin wrote:

One thing we didn't really understand about the OA report is that the
ContextID field (dword 2) is a copy of the context descriptor (dword 1).

On Gen8->10 and without using GuC we didn't notice the issue because
we only checked the 21bits of the ContextID field in the OA reports
which matches exactly the hw_id stored into the context descriptor.

When using GuC submission we have an issue of a non matching hw_id
because GuC uses bit 20 of the hw_id to signal proxy submission. This
change makes introduces a mask to compare only the relevant bits.

Choose one: makes or introduces ;)



On ICL the context descriptor format has changed and we failed to
address this. On top of using a mask we also need to shift the bits
properly.



Someone is going to complain this should be two patches (one to 
address the GuC-ness and one for Gen11), but not me.


Reviewed-by: Michel Thierry 



Kind of agree, but that's a pain.
We can put it as a fix of the first commit that enabled gen8 and I'll do 
the backport in stable versions.


Cheers,

-
Lionel




Signed-off-by: Lionel Landwerlin 
Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
BSpec: 1237
Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
---
  drivers/gpu/drm/i915/i915_drv.h  |   1 +
  drivers/gpu/drm/i915/i915_perf.c | 123 ---
  drivers/gpu/drm/i915/intel_lrc.c |   5 ++
  3 files changed, 101 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h 
b/drivers/gpu/drm/i915/i915_drv.h

index 58ab9259fb73..0ccda488a8db 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1953,6 +1953,7 @@ struct drm_i915_private {
    struct intel_context *pinned_ctx;
  u32 specific_ctx_id;
+    u32 specific_ctx_id_mask;
    struct hrtimer poll_check_timer;
  wait_queue_head_t poll_wq;
diff --git a/drivers/gpu/drm/i915/i915_perf.c 
b/drivers/gpu/drm/i915/i915_perf.c

index 4a62024cbf85..d5e5d4635f1f 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -738,12 +738,7 @@ static int gen8_append_oa_reports(struct 
i915_perf_stream *stream,

  continue;
  }
  -    /*
- * XXX: Just keep the lower 21 bits for now since I'm not
- * entirely sure if the HW touches any of the higher bits in
- * this field
- */
-    ctx_id = report32[2] & 0x1fffff;
+    ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
    /*
   * Squash whatever is in the CTX_ID field if it's marked as
@@ -1204,6 +1199,36 @@ static int i915_oa_read(struct 
i915_perf_stream *stream,

  return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
  }
  +static int oa_get_render_lrca(struct drm_i915_private *i915,
+  struct i915_gem_context *ctx,
+  u32 *lrca)
+{
+    struct intel_engine_cs *engine = i915->engine[RCS];
+    struct intel_context *ce;
+    int ret;
+
+    ret = i915_mutex_lock_interruptible(&i915->drm);
+    if (ret)
+    return ret;
+
+    /*
+ * As the ID is the gtt offset of the context's vma we
+ * pin the vma to ensure the ID remains fixed.
+ *
+ * NB: implied RCS engine...
+ */
+    ce = intel_context_pin(ctx, engine);
+    mutex_unlock(&i915->drm.struct_mutex);
+    if (IS_ERR(ce))
+    return PTR_ERR(ce);
+
+    i915->perf.oa.pinned_ctx = ce;
+
+    *lrca = i915_ggtt_offset(ce->state);
+
+    return 0;
+}
+
  /**
   * oa_get_render_ctx_id - determine and hold ctx hw id
   * @stream: An i915-perf stream opened for OA metrics
@@ -1216,40 +1241,81 @@ static int i915_oa_read(struct 
i915_perf_stream *stream,

   */
  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
  {
-    struct drm_i915_private *dev_priv = stream->dev_priv;
+    struct drm_i915_private *i915 = stream->dev_priv;
  -    if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
-    dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
-    } else {
-    struct intel_engine_cs *engine = dev_priv->engine[RCS];
-    struct intel_context *ce;
+    switch (INTEL_GEN(i915)) {
+    case 7: {
  int ret;
  -    ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+    ret = oa_get_render_lrca(i915, stream->ctx,
+ &i915->perf.oa.specific_ctx_id);
  if (ret)
  return ret;
    /*
- * As the ID is the gtt offset of the context's vma we
- * pin the vma to ensure the ID remains fixed.
- *
- * NB: implied RCS engine...
+ * On Haswell we don't do any post processing of the reports
+ * and don't need to use the mask.
   */
-    ce = intel_context_pin(stream->ctx, engine);
-    mutex_unlock(&dev_priv->drm.struct_mutex);
-    if 

Re: [Intel-gfx] [PATCH 2/2] drm/i915/perf: fix ctx_id read with GuC & ICL

2018-05-31 Thread Michel Thierry

On 5/31/2018 12:56 PM, Lionel Landwerlin wrote:

One thing we didn't really understand about the OA report is that the
ContextID field (dword 2) is a copy of the context descriptor (dword 1).

On Gen8->10 and without using GuC we didn't notice the issue because
we only checked the 21bits of the ContextID field in the OA reports
which matches exactly the hw_id stored into the context descriptor.

When using GuC submission we have an issue of a non matching hw_id
because GuC uses bit 20 of the hw_id to signal proxy submission. This
change makes introduces a mask to compare only the relevant bits.

Choose one: makes or introduces ;)



On ICL the context descriptor format has changed and we failed to
address this. On top of using a mask we also need to shift the bits
properly.



Someone is going to complain this should be two patches (one to address 
the GuC-ness and one for Gen11), but not me.


Reviewed-by: Michel Thierry 



Signed-off-by: Lionel Landwerlin 
Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
BSpec: 1237
Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
---
  drivers/gpu/drm/i915/i915_drv.h  |   1 +
  drivers/gpu/drm/i915/i915_perf.c | 123 ---
  drivers/gpu/drm/i915/intel_lrc.c |   5 ++
  3 files changed, 101 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 58ab9259fb73..0ccda488a8db 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1953,6 +1953,7 @@ struct drm_i915_private {
  
  			struct intel_context *pinned_ctx;

u32 specific_ctx_id;
+   u32 specific_ctx_id_mask;
  
  			struct hrtimer poll_check_timer;

wait_queue_head_t poll_wq;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4a62024cbf85..d5e5d4635f1f 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -738,12 +738,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream 
*stream,
continue;
}
  
-		/*

-* XXX: Just keep the lower 21 bits for now since I'm not
-* entirely sure if the HW touches any of the higher bits in
-* this field
-*/
-   ctx_id = report32[2] & 0x1fffff;
+   ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
  
  		/*

 * Squash whatever is in the CTX_ID field if it's marked as
@@ -1204,6 +1199,36 @@ static int i915_oa_read(struct i915_perf_stream *stream,
return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
  }
  
+static int oa_get_render_lrca(struct drm_i915_private *i915,

+ struct i915_gem_context *ctx,
+ u32 *lrca)
+{
+   struct intel_engine_cs *engine = i915->engine[RCS];
+   struct intel_context *ce;
+   int ret;
+
+   ret = i915_mutex_lock_interruptible(&i915->drm);
+   if (ret)
+   return ret;
+
+   /*
+* As the ID is the gtt offset of the context's vma we
+* pin the vma to ensure the ID remains fixed.
+*
+* NB: implied RCS engine...
+*/
+   ce = intel_context_pin(ctx, engine);
+   mutex_unlock(&i915->drm.struct_mutex);
+   if (IS_ERR(ce))
+   return PTR_ERR(ce);
+
+   i915->perf.oa.pinned_ctx = ce;
+
+   *lrca = i915_ggtt_offset(ce->state);
+
+   return 0;
+}
+
  /**
   * oa_get_render_ctx_id - determine and hold ctx hw id
   * @stream: An i915-perf stream opened for OA metrics
@@ -1216,40 +1241,81 @@ static int i915_oa_read(struct i915_perf_stream *stream,
   */
  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
  {
-   struct drm_i915_private *dev_priv = stream->dev_priv;
+   struct drm_i915_private *i915 = stream->dev_priv;
  
-	if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {

-   dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
-   } else {
-   struct intel_engine_cs *engine = dev_priv->engine[RCS];
-   struct intel_context *ce;
+   switch (INTEL_GEN(i915)) {
+   case 7: {
int ret;
  
  -		ret = i915_mutex_lock_interruptible(&dev_priv->drm);

+   ret = oa_get_render_lrca(i915, stream->ctx,
+&i915->perf.oa.specific_ctx_id);
if (ret)
return ret;
  
  		/*

-* As the ID is the gtt offset of the context's vma we
-* pin the vma to ensure the ID remains fixed.
-*
-* NB: implied RCS engine...
+* On Haswell we don't do any post processing of the reports
+* and don't need to use the mask.
 */
-   ce = intel_context_pin(stream->ctx, 

[Intel-gfx] [PATCH 2/2] drm/i915/perf: fix ctx_id read with GuC & ICL

2018-05-31 Thread Lionel Landwerlin
One thing we didn't really understand about the OA report is that the
ContextID field (dword 2) is a copy of the context descriptor (dword 1).

On Gen8->10 and without using GuC we didn't notice the issue because
we only checked the lower 21 bits of the ContextID field in the OA reports
which match exactly the hw_id stored into the context descriptor.

When using GuC submission we have an issue of a non-matching hw_id
because GuC uses bit 20 of the hw_id to signal proxy submission. This
change introduces a mask to compare only the relevant bits.

On ICL the context descriptor format has changed and we failed to
address this. On top of using a mask we also need to shift the bits
properly.

Signed-off-by: Lionel Landwerlin 
Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
BSpec: 1237
Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
---
 drivers/gpu/drm/i915/i915_drv.h  |   1 +
 drivers/gpu/drm/i915/i915_perf.c | 123 ---
 drivers/gpu/drm/i915/intel_lrc.c |   5 ++
 3 files changed, 101 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 58ab9259fb73..0ccda488a8db 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1953,6 +1953,7 @@ struct drm_i915_private {
 
struct intel_context *pinned_ctx;
u32 specific_ctx_id;
+   u32 specific_ctx_id_mask;
 
struct hrtimer poll_check_timer;
wait_queue_head_t poll_wq;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4a62024cbf85..d5e5d4635f1f 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -738,12 +738,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream 
*stream,
continue;
}
 
-   /*
-* XXX: Just keep the lower 21 bits for now since I'm not
-* entirely sure if the HW touches any of the higher bits in
-* this field
-*/
-   ctx_id = report32[2] & 0x1fffff;
+   ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
 
/*
 * Squash whatever is in the CTX_ID field if it's marked as
@@ -1204,6 +1199,36 @@ static int i915_oa_read(struct i915_perf_stream *stream,
return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
 }
 
+static int oa_get_render_lrca(struct drm_i915_private *i915,
+ struct i915_gem_context *ctx,
+ u32 *lrca)
+{
+   struct intel_engine_cs *engine = i915->engine[RCS];
+   struct intel_context *ce;
+   int ret;
+
+   ret = i915_mutex_lock_interruptible(&i915->drm);
+   if (ret)
+   return ret;
+
+   /*
+* As the ID is the gtt offset of the context's vma we
+* pin the vma to ensure the ID remains fixed.
+*
+* NB: implied RCS engine...
+*/
+   ce = intel_context_pin(ctx, engine);
+   mutex_unlock(&i915->drm.struct_mutex);
+   if (IS_ERR(ce))
+   return PTR_ERR(ce);
+
+   i915->perf.oa.pinned_ctx = ce;
+
+   *lrca = i915_ggtt_offset(ce->state);
+
+   return 0;
+}
+
 /**
  * oa_get_render_ctx_id - determine and hold ctx hw id
  * @stream: An i915-perf stream opened for OA metrics
@@ -1216,40 +1241,81 @@ static int i915_oa_read(struct i915_perf_stream *stream,
  */
 static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
 {
-   struct drm_i915_private *dev_priv = stream->dev_priv;
+   struct drm_i915_private *i915 = stream->dev_priv;
 
-   if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
-   dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
-   } else {
-   struct intel_engine_cs *engine = dev_priv->engine[RCS];
-   struct intel_context *ce;
+   switch (INTEL_GEN(i915)) {
+   case 7: {
int ret;
 
-   ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+   ret = oa_get_render_lrca(i915, stream->ctx,
+&i915->perf.oa.specific_ctx_id);
if (ret)
return ret;
 
/*
-* As the ID is the gtt offset of the context's vma we
-* pin the vma to ensure the ID remains fixed.
-*
-* NB: implied RCS engine...
+* On Haswell we don't do any post processing of the reports
+* and don't need to use the mask.
 */
-   ce = intel_context_pin(stream->ctx, engine);
-   mutex_unlock(&dev_priv->drm.struct_mutex);
-   if (IS_ERR(ce))
-   return PTR_ERR(ce);
+