Re: [Intel-gfx] [PATCH 21/43] drm/i915/bdw: Emission of requests with logical rings

2014-08-13 Thread Daniel, Thomas


 -Original Message-
 From: Daniel Vetter [mailto:daniel.vet...@ffwll.ch] On Behalf Of Daniel
 Vetter
 Sent: Monday, August 11, 2014 9:57 PM
 To: Daniel, Thomas
 Cc: intel-gfx@lists.freedesktop.org
 Subject: Re: [Intel-gfx] [PATCH 21/43] drm/i915/bdw: Emission of requests
 with logical rings
 
 On Thu, Jul 24, 2014 at 05:04:29PM +0100, Thomas Daniel wrote:
  From: Oscar Mateo oscar.ma...@intel.com
 
  On a previous iteration of this patch, I created an Execlists version
  of __i915_add_request and asbtracted it away as a vfunc. Daniel Vetter
  wondered then why that was needed:
 
  with the clean split in command submission I expect every function to
  know wether it'll submit to an lrc (everything in
  intel_lrc.c) or wether it'll submit to a legacy ring (existing code),
  so I don't see a need for an add_request vfunc.
 
  The honest, hairy truth is that this patch is the glue keeping the
  whole logical ring puzzle together:
 
 Oops, I didn't spot this and it's indeed not terribly pretty.
Are you saying you want to go back to a vfunc for add_request?

 
  - i915_add_request is used by intel_ring_idle, which in turn is
used by i915_gpu_idle, which in turn is used in several places
inside the eviction and gtt codes.
 
 This should probably be folded in with the lrc specific version of stop_rings
 and so should work out.
 
  - Also, it is used by i915_gem_check_olr, which is littered all
over i915_gem.c
 
 We now always preallocate the request struct, so olr is officially dead.
 Well almost, except for non-execbuf stuff that we emit through the rings.
 Which is nothing for lrc/execlist mode.
 
 Also there's the icky-bitty problem with ringbuf-ctx which makes this patch
 not apply any more. I think we need to revise or at least discuss a bit.
 
Context is required when doing an advance_and_submit so that we know
which context to queue in the execlist queue.

  - ...
 
  If I were to duplicate all the code that directly or indirectly uses
  __i915_add_request, I'll end up creating a separate driver.
 
  To show the differences between the existing legacy version and the
  new Execlists one, this time I have special-cased __i915_add_request
  instead of adding an add_request vfunc. I hope this helps to untangle
  this Gordian knot.
 
  Signed-off-by: Oscar Mateo oscar.ma...@intel.com
  ---
   drivers/gpu/drm/i915/i915_gem.c  |   72
 --
   drivers/gpu/drm/i915/intel_lrc.c |   30 +---
   drivers/gpu/drm/i915/intel_lrc.h |1 +
   3 files changed, 80 insertions(+), 23 deletions(-)
 
  diff --git a/drivers/gpu/drm/i915/i915_gem.c
  b/drivers/gpu/drm/i915/i915_gem.c index 9560b40..1c83b9c 100644
  --- a/drivers/gpu/drm/i915/i915_gem.c
  +++ b/drivers/gpu/drm/i915/i915_gem.c
  @@ -2327,10 +2327,21 @@ int __i915_add_request(struct intel_engine_cs
  *ring,  {
  struct drm_i915_private *dev_priv = ring-dev-dev_private;
  struct drm_i915_gem_request *request;
  +   struct intel_ringbuffer *ringbuf;
  u32 request_ring_position, request_start;
  int ret;
 
  -   request_start = intel_ring_get_tail(ring-buffer);
  +   request = ring-preallocated_lazy_request;
  +   if (WARN_ON(request == NULL))
  +   return -ENOMEM;
  +
  +   if (i915.enable_execlists) {
  +   struct intel_context *ctx = request-ctx;
  +   ringbuf = ctx-engine[ring-id].ringbuf;
  +   } else
  +   ringbuf = ring-buffer;
  +
  +   request_start = intel_ring_get_tail(ringbuf);
  /*
   * Emit any outstanding flushes - execbuf can fail to emit the flush
   * after having emitted the batchbuffer command. Hence we need to
  fix @@ -2338,24 +2349,32 @@ int __i915_add_request(struct
 intel_engine_cs *ring,
   * is that the flush _must_ happen before the next request, no
 matter
   * what.
   */
  -   ret = intel_ring_flush_all_caches(ring);
  -   if (ret)
  -   return ret;
  -
  -   request = ring-preallocated_lazy_request;
  -   if (WARN_ON(request == NULL))
  -   return -ENOMEM;
  +   if (i915.enable_execlists) {
  +   ret = logical_ring_flush_all_caches(ringbuf);
  +   if (ret)
  +   return ret;
  +   } else {
  +   ret = intel_ring_flush_all_caches(ring);
  +   if (ret)
  +   return ret;
  +   }
 
  /* Record the position of the start of the request so that
   * should we detect the updated seqno part-way through the
   * GPU processing the request, we never over-estimate the
   * position of the head.
   */
  -   request_ring_position = intel_ring_get_tail(ring-buffer);
  +   request_ring_position = intel_ring_get_tail(ringbuf);
 
  -   ret = ring-add_request(ring);
  -   if (ret)
  -   return ret;
  +   if (i915.enable_execlists) {
  +   ret = ring-emit_request(ringbuf);
  +   if (ret)
  +   return ret;
  +   } else {
  +   ret = ring-add_request(ring

Re: [Intel-gfx] [PATCH 21/43] drm/i915/bdw: Emission of requests with logical rings

2014-08-13 Thread Daniel Vetter
On Wed, Aug 13, 2014 at 01:34:28PM +, Daniel, Thomas wrote:
 
 
  -Original Message-
  From: Daniel Vetter [mailto:daniel.vet...@ffwll.ch] On Behalf Of Daniel
  Vetter
  Sent: Monday, August 11, 2014 9:57 PM
  To: Daniel, Thomas
  Cc: intel-gfx@lists.freedesktop.org
  Subject: Re: [Intel-gfx] [PATCH 21/43] drm/i915/bdw: Emission of requests
  with logical rings
  
  On Thu, Jul 24, 2014 at 05:04:29PM +0100, Thomas Daniel wrote:
   From: Oscar Mateo oscar.ma...@intel.com
  
   On a previous iteration of this patch, I created an Execlists version
   of __i915_add_request and asbtracted it away as a vfunc. Daniel Vetter
   wondered then why that was needed:
  
   with the clean split in command submission I expect every function to
   know wether it'll submit to an lrc (everything in
   intel_lrc.c) or wether it'll submit to a legacy ring (existing code),
   so I don't see a need for an add_request vfunc.
  
   The honest, hairy truth is that this patch is the glue keeping the
   whole logical ring puzzle together:
  
  Oops, I didn't spot this and it's indeed not terribly pretty.
 Are you saying you want to go back to a vfunc for add_request?

Nope, I'd have expected that there's no need for such a switch at all, and
that all these differences disappear behind the execbuf cmd submission
abstraction.

   - i915_add_request is used by intel_ring_idle, which in turn is
 used by i915_gpu_idle, which in turn is used in several places
 inside the eviction and gtt codes.
  
  This should probably be folded in with the lrc specific version of 
  stop_rings
  and so should work out.
  
   - Also, it is used by i915_gem_check_olr, which is littered all
 over i915_gem.c
  
  We now always preallocate the request struct, so olr is officially dead.
  Well almost, except for non-execbuf stuff that we emit through the rings.
  Which is nothing for lrc/execlist mode.
  
  Also there's the icky-bitty problem with ringbuf-ctx which makes this patch
  not apply any more. I think we need to revise or at least discuss a bit.
  
 Context is required when doing an advance_and_submit so that we know
 which context to queue in the execlist queue.

Might need to re-read, but imo it's not a good idea to do a submit in
there. If I understand this correctly there should only be a need for an
ELSP write (or queueing it up) at the end of the execlist-specific cmd
submission implementation. The ringbuffer writes we do before that for the
different pieces (flushes, seqno write, bb_start, whatever) should just
update the tail pointer in the ring.

Or do I miss something really big here?

 
   - ...
  
   If I were to duplicate all the code that directly or indirectly uses
   __i915_add_request, I'll end up creating a separate driver.
  
   To show the differences between the existing legacy version and the
   new Execlists one, this time I have special-cased __i915_add_request
   instead of adding an add_request vfunc. I hope this helps to untangle
   this Gordian knot.
  
   Signed-off-by: Oscar Mateo oscar.ma...@intel.com
   ---
drivers/gpu/drm/i915/i915_gem.c  |   72
  --
drivers/gpu/drm/i915/intel_lrc.c |   30 +---
drivers/gpu/drm/i915/intel_lrc.h |1 +
3 files changed, 80 insertions(+), 23 deletions(-)
  
   diff --git a/drivers/gpu/drm/i915/i915_gem.c
   b/drivers/gpu/drm/i915/i915_gem.c index 9560b40..1c83b9c 100644
   --- a/drivers/gpu/drm/i915/i915_gem.c
   +++ b/drivers/gpu/drm/i915/i915_gem.c
   @@ -2327,10 +2327,21 @@ int __i915_add_request(struct intel_engine_cs
   *ring,  {
 struct drm_i915_private *dev_priv = ring-dev-dev_private;
 struct drm_i915_gem_request *request;
   + struct intel_ringbuffer *ringbuf;
 u32 request_ring_position, request_start;
 int ret;
  
   - request_start = intel_ring_get_tail(ring-buffer);
   + request = ring-preallocated_lazy_request;
   + if (WARN_ON(request == NULL))
   + return -ENOMEM;
   +
   + if (i915.enable_execlists) {
   + struct intel_context *ctx = request-ctx;
   + ringbuf = ctx-engine[ring-id].ringbuf;
   + } else
   + ringbuf = ring-buffer;
   +
   + request_start = intel_ring_get_tail(ringbuf);
 /*
  * Emit any outstanding flushes - execbuf can fail to emit the flush
  * after having emitted the batchbuffer command. Hence we need to
   fix @@ -2338,24 +2349,32 @@ int __i915_add_request(struct
  intel_engine_cs *ring,
  * is that the flush _must_ happen before the next request, no
  matter
  * what.
  */
   - ret = intel_ring_flush_all_caches(ring);
   - if (ret)
   - return ret;
   -
   - request = ring-preallocated_lazy_request;
   - if (WARN_ON(request == NULL))
   - return -ENOMEM;
   + if (i915.enable_execlists) {
   + ret = logical_ring_flush_all_caches(ringbuf);
   + if (ret)
   + return ret;
   + } else {
   + ret = intel_ring_flush_all_caches(ring

Re: [Intel-gfx] [PATCH 21/43] drm/i915/bdw: Emission of requests with logical rings

2014-08-11 Thread Daniel Vetter
On Thu, Jul 24, 2014 at 05:04:29PM +0100, Thomas Daniel wrote:
 From: Oscar Mateo oscar.ma...@intel.com
 
 On a previous iteration of this patch, I created an Execlists
 version of __i915_add_request and asbtracted it away as a
 vfunc. Daniel Vetter wondered then why that was needed:
 
 with the clean split in command submission I expect every
 function to know wether it'll submit to an lrc (everything in
 intel_lrc.c) or wether it'll submit to a legacy ring (existing
 code), so I don't see a need for an add_request vfunc.
 
 The honest, hairy truth is that this patch is the glue keeping
 the whole logical ring puzzle together:

Oops, I didn't spot this and it's indeed not terribly pretty.
 
 - i915_add_request is used by intel_ring_idle, which in turn is
   used by i915_gpu_idle, which in turn is used in several places
   inside the eviction and gtt codes.

This should probably be folded in with the lrc specific version of
stop_rings and so should work out.

 - Also, it is used by i915_gem_check_olr, which is littered all
   over i915_gem.c

We now always preallocate the request struct, so olr is officially dead.
Well almost, except for non-execbuf stuff that we emit through the rings.
Which is nothing for lrc/execlist mode.

Also there's the icky-bitty problem with ringbuf-ctx which makes this
patch not apply any more. I think we need to revise or at least discuss a
bit.

 - ...
 
 If I were to duplicate all the code that directly or indirectly
 uses __i915_add_request, I'll end up creating a separate driver.
 
 To show the differences between the existing legacy version and
 the new Execlists one, this time I have special-cased
 __i915_add_request instead of adding an add_request vfunc. I
 hope this helps to untangle this Gordian knot.
 
 Signed-off-by: Oscar Mateo oscar.ma...@intel.com
 ---
  drivers/gpu/drm/i915/i915_gem.c  |   72 
 --
  drivers/gpu/drm/i915/intel_lrc.c |   30 +---
  drivers/gpu/drm/i915/intel_lrc.h |1 +
  3 files changed, 80 insertions(+), 23 deletions(-)
 
 diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
 index 9560b40..1c83b9c 100644
 --- a/drivers/gpu/drm/i915/i915_gem.c
 +++ b/drivers/gpu/drm/i915/i915_gem.c
 @@ -2327,10 +2327,21 @@ int __i915_add_request(struct intel_engine_cs *ring,
  {
   struct drm_i915_private *dev_priv = ring-dev-dev_private;
   struct drm_i915_gem_request *request;
 + struct intel_ringbuffer *ringbuf;
   u32 request_ring_position, request_start;
   int ret;
  
 - request_start = intel_ring_get_tail(ring-buffer);
 + request = ring-preallocated_lazy_request;
 + if (WARN_ON(request == NULL))
 + return -ENOMEM;
 +
 + if (i915.enable_execlists) {
 + struct intel_context *ctx = request-ctx;
 + ringbuf = ctx-engine[ring-id].ringbuf;
 + } else
 + ringbuf = ring-buffer;
 +
 + request_start = intel_ring_get_tail(ringbuf);
   /*
* Emit any outstanding flushes - execbuf can fail to emit the flush
* after having emitted the batchbuffer command. Hence we need to fix
 @@ -2338,24 +2349,32 @@ int __i915_add_request(struct intel_engine_cs *ring,
* is that the flush _must_ happen before the next request, no matter
* what.
*/
 - ret = intel_ring_flush_all_caches(ring);
 - if (ret)
 - return ret;
 -
 - request = ring-preallocated_lazy_request;
 - if (WARN_ON(request == NULL))
 - return -ENOMEM;
 + if (i915.enable_execlists) {
 + ret = logical_ring_flush_all_caches(ringbuf);
 + if (ret)
 + return ret;
 + } else {
 + ret = intel_ring_flush_all_caches(ring);
 + if (ret)
 + return ret;
 + }
  
   /* Record the position of the start of the request so that
* should we detect the updated seqno part-way through the
* GPU processing the request, we never over-estimate the
* position of the head.
*/
 - request_ring_position = intel_ring_get_tail(ring-buffer);
 + request_ring_position = intel_ring_get_tail(ringbuf);
  
 - ret = ring-add_request(ring);
 - if (ret)
 - return ret;
 + if (i915.enable_execlists) {
 + ret = ring-emit_request(ringbuf);
 + if (ret)
 + return ret;
 + } else {
 + ret = ring-add_request(ring);
 + if (ret)
 + return ret;
 + }
  
   request-seqno = intel_ring_get_seqno(ring);
   request-ring = ring;
 @@ -2370,12 +2389,14 @@ int __i915_add_request(struct intel_engine_cs *ring,
*/
   request-batch_obj = obj;
  
 - /* Hold a reference to the current context so that we can inspect
 -  * it later in case a hangcheck error event fires.
 -  */
 - request-ctx = ring-last_context;
 - if (request-ctx)
 - 

[Intel-gfx] [PATCH 21/43] drm/i915/bdw: Emission of requests with logical rings

2014-07-24 Thread Thomas Daniel
From: Oscar Mateo oscar.ma...@intel.com

On a previous iteration of this patch, I created an Execlists
version of __i915_add_request and asbtracted it away as a
vfunc. Daniel Vetter wondered then why that was needed:

with the clean split in command submission I expect every
function to know wether it'll submit to an lrc (everything in
intel_lrc.c) or wether it'll submit to a legacy ring (existing
code), so I don't see a need for an add_request vfunc.

The honest, hairy truth is that this patch is the glue keeping
the whole logical ring puzzle together:

- i915_add_request is used by intel_ring_idle, which in turn is
  used by i915_gpu_idle, which in turn is used in several places
  inside the eviction and gtt codes.
- Also, it is used by i915_gem_check_olr, which is littered all
  over i915_gem.c
- ...

If I were to duplicate all the code that directly or indirectly
uses __i915_add_request, I'll end up creating a separate driver.

To show the differences between the existing legacy version and
the new Execlists one, this time I have special-cased
__i915_add_request instead of adding an add_request vfunc. I
hope this helps to untangle this Gordian knot.

Signed-off-by: Oscar Mateo oscar.ma...@intel.com
---
 drivers/gpu/drm/i915/i915_gem.c  |   72 --
 drivers/gpu/drm/i915/intel_lrc.c |   30 +---
 drivers/gpu/drm/i915/intel_lrc.h |1 +
 3 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 9560b40..1c83b9c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2327,10 +2327,21 @@ int __i915_add_request(struct intel_engine_cs *ring,
 {
struct drm_i915_private *dev_priv = ring-dev-dev_private;
struct drm_i915_gem_request *request;
+   struct intel_ringbuffer *ringbuf;
u32 request_ring_position, request_start;
int ret;
 
-   request_start = intel_ring_get_tail(ring-buffer);
+   request = ring-preallocated_lazy_request;
+   if (WARN_ON(request == NULL))
+   return -ENOMEM;
+
+   if (i915.enable_execlists) {
+   struct intel_context *ctx = request-ctx;
+   ringbuf = ctx-engine[ring-id].ringbuf;
+   } else
+   ringbuf = ring-buffer;
+
+   request_start = intel_ring_get_tail(ringbuf);
/*
 * Emit any outstanding flushes - execbuf can fail to emit the flush
 * after having emitted the batchbuffer command. Hence we need to fix
@@ -2338,24 +2349,32 @@ int __i915_add_request(struct intel_engine_cs *ring,
 * is that the flush _must_ happen before the next request, no matter
 * what.
 */
-   ret = intel_ring_flush_all_caches(ring);
-   if (ret)
-   return ret;
-
-   request = ring-preallocated_lazy_request;
-   if (WARN_ON(request == NULL))
-   return -ENOMEM;
+   if (i915.enable_execlists) {
+   ret = logical_ring_flush_all_caches(ringbuf);
+   if (ret)
+   return ret;
+   } else {
+   ret = intel_ring_flush_all_caches(ring);
+   if (ret)
+   return ret;
+   }
 
/* Record the position of the start of the request so that
 * should we detect the updated seqno part-way through the
 * GPU processing the request, we never over-estimate the
 * position of the head.
 */
-   request_ring_position = intel_ring_get_tail(ring-buffer);
+   request_ring_position = intel_ring_get_tail(ringbuf);
 
-   ret = ring-add_request(ring);
-   if (ret)
-   return ret;
+   if (i915.enable_execlists) {
+   ret = ring-emit_request(ringbuf);
+   if (ret)
+   return ret;
+   } else {
+   ret = ring-add_request(ring);
+   if (ret)
+   return ret;
+   }
 
request-seqno = intel_ring_get_seqno(ring);
request-ring = ring;
@@ -2370,12 +2389,14 @@ int __i915_add_request(struct intel_engine_cs *ring,
 */
request-batch_obj = obj;
 
-   /* Hold a reference to the current context so that we can inspect
-* it later in case a hangcheck error event fires.
-*/
-   request-ctx = ring-last_context;
-   if (request-ctx)
-   i915_gem_context_reference(request-ctx);
+   if (!i915.enable_execlists) {
+   /* Hold a reference to the current context so that we can 
inspect
+* it later in case a hangcheck error event fires.
+*/
+   request-ctx = ring-last_context;
+   if (request-ctx)
+   i915_gem_context_reference(request-ctx);
+   }
 
request-emitted_jiffies = jiffies;
list_add_tail(request-list, ring-request_list);
@@ -2630,6 +2651,7 @@