Re: [Intel-gfx] [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission

2016-11-03 Thread Chris Wilson
On Thu, Nov 03, 2016 at 10:51:14AM +, Chris Wilson wrote:
> Yes. Worse is that the 2 comes from having different paths to this point
> with their own nesting pattern.

Not any more, that was a leftover from one version that managed to nest
signaling/execution.

The issue is:

__i915_gem_request_submit -> fence_signal(rq->execute)

Then if we hook the submit fence onto an earlier execute fence, we hit
submit_notify() and try to acquire the engine->timeline lock again.

Hindsight says we cannot control the recursion there, so don't hook up
the fences like that (submit is not allowed to listen to execute).
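
To spell that out (illustration only; the helper below is hypothetical and
the await call is the generic i915_sw_fence one, from memory), the forbidden
hookup looks like:

/* Hypothetical sketch, not the patch: make a later request's submit fence
 * wait on an earlier request's execute fence.  __i915_gem_request_submit()
 * signals earlier->execute while engine->timeline->lock is held, so
 * later->submit would complete right there, run submit_notify() and try to
 * take the same engine->timeline->lock again.
 */
static int await_execution(struct drm_i915_gem_request *later,
			   struct drm_i915_gem_request *earlier)
{
	return i915_sw_fence_await_sw_fence_gfp(&later->submit,
						&earlier->execute,
						GFP_KERNEL);
}
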
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission

2016-11-03 Thread Chris Wilson
On Thu, Nov 03, 2016 at 10:34:26AM +, Tvrtko Ursulin wrote:
> 
> On 02/11/2016 17:50, Chris Wilson wrote:
> >Defer the transfer from the client's timeline onto the execution
> >timeline from the point of readiness to the point of actual submission.
> >For example, in execlists, a request is finally submitted to hardware
> >when the hardware is ready, and only put onto the hardware queue when
> >the request is ready. By deferring the transfer, we ensure that the
> >timeline is maintained in retirement order if we decide to queue the
> >requests onto the hardware in a different order than fifo.
> >
> >Signed-off-by: Chris Wilson 
> >---
> > drivers/gpu/drm/i915/i915_gem_request.c| 36 ++
> > drivers/gpu/drm/i915/i915_gem_request.h|  2 ++
> > drivers/gpu/drm/i915/i915_guc_submission.c |  2 ++
> > drivers/gpu/drm/i915/intel_lrc.c   |  5 +
> > drivers/gpu/drm/i915/intel_ringbuffer.c|  2 ++
> > 5 files changed, 33 insertions(+), 14 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >index 1ae5a2f8953f..05544dec5de9 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
> > return atomic_inc_return(&tl->next_seqno);
> > }
> >
> >-static int __i915_sw_fence_call
> >-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
> > {
> >-struct drm_i915_gem_request *request =
> >-container_of(fence, typeof(*request), submit);
> > struct intel_engine_cs *engine = request->engine;
> > struct intel_timeline *timeline;
> >-unsigned long flags;
> > u32 seqno;
> >
> >-if (state != FENCE_COMPLETE)
> >-return NOTIFY_DONE;
> >-
> > /* Transfer from per-context onto the global per-engine timeline */
> > timeline = engine->timeline;
> > GEM_BUG_ON(timeline == request->timeline);
> >-
> >-/* Will be called from irq-context when using foreign DMA fences */
> >-spin_lock_irqsave(&timeline->lock, flags);
> >+assert_spin_locked(&timeline->lock);
> >
> > seqno = timeline_get_seqno(timeline->common);
> > GEM_BUG_ON(!seqno);
> >@@ -345,15 +336,32 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> > GEM_BUG_ON(!request->global_seqno);
> > engine->emit_breadcrumb(request,
> > request->ring->vaddr + request->postfix);
> >-engine->submit_request(request);
> >
> >-spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
> >+/* nested from submit_notify */
> >+spin_lock_nested(&request->timeline->lock, 2);
> 
> Is this because lockdep does not differentiate between
> request->timeline->lock and engine->timeline->lock?

Yes. Worse is that the 2 comes from having different paths to this point
with their own nesting pattern.

struct timeline {
	struct lock_class_key lockdep;
	struct mutex mutex;
};

__mutex_init(&timeline->mutex, "timeline", &timeline->lockdep);

? (Hopefully that works w/o lockdep enabled)

Better, then, would be

i915_gem_timeline_create()
i915_gem_timeline_create_global()

and just use one lock class for all user timelines, and a lock class per
engine on the global timeline.
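
A rough sketch of that split (helper and field names invented here; the
stock lockdep_set_class() machinery compiles to a no-op without lockdep):

/* Sketch only: all user (per-context) timelines share one lockdep class,
 * while each engine's slot on the global timeline gets its own class
 * supplied by the caller, so the request and engine timeline locks become
 * distinguishable without spin_lock_nested() annotations.
 */
static struct lock_class_key user_timeline_class;

static void timeline_lock_init(struct intel_timeline *tl,
			       struct lock_class_key *engine_class)
{
	spin_lock_init(&tl->lock);
	if (engine_class)	/* global, per-engine timeline */
		lockdep_set_class(&tl->lock, engine_class);
	else			/* client/user timeline */
		lockdep_set_class(&tl->lock, &user_timeline_class);
}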

Will try.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


Re: [Intel-gfx] [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission

2016-11-03 Thread Tvrtko Ursulin


On 02/11/2016 17:50, Chris Wilson wrote:

Defer the transfer from the client's timeline onto the execution
timeline from the point of readiness to the point of actual submission.
For example, in execlists, a request is finally submitted to hardware
when the hardware is ready, and only put onto the hardware queue when
the request is ready. By deferring the transfer, we ensure that the
timeline is maintained in retirement order if we decide to queue the
requests onto the hardware in a different order than fifo.

Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_gem_request.c| 36 ++
 drivers/gpu/drm/i915/i915_gem_request.h|  2 ++
 drivers/gpu/drm/i915/i915_guc_submission.c |  2 ++
 drivers/gpu/drm/i915/intel_lrc.c   |  5 +
 drivers/gpu/drm/i915/intel_ringbuffer.c|  2 ++
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1ae5a2f8953f..05544dec5de9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
return atomic_inc_return(&tl->next_seqno);
 }

-static int __i915_sw_fence_call
-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 {
-   struct drm_i915_gem_request *request =
-   container_of(fence, typeof(*request), submit);
struct intel_engine_cs *engine = request->engine;
struct intel_timeline *timeline;
-   unsigned long flags;
u32 seqno;

-   if (state != FENCE_COMPLETE)
-   return NOTIFY_DONE;
-
/* Transfer from per-context onto the global per-engine timeline */
timeline = engine->timeline;
GEM_BUG_ON(timeline == request->timeline);
-
-   /* Will be called from irq-context when using foreign DMA fences */
-   spin_lock_irqsave(&timeline->lock, flags);
+   assert_spin_locked(&timeline->lock);

seqno = timeline_get_seqno(timeline->common);
GEM_BUG_ON(!seqno);
@@ -345,15 +336,32 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
GEM_BUG_ON(!request->global_seqno);
engine->emit_breadcrumb(request,
request->ring->vaddr + request->postfix);
-   engine->submit_request(request);

-   spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
+   /* nested from submit_notify */
+   spin_lock_nested(&request->timeline->lock, 2);


Is this because lockdep does not differentiate between 
request->timeline->lock and engine->timeline->lock?


Could we pass different lock classes to i915_gem_timeline_init and
remove this deep nesting annotation? That would be better, I think.
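
For illustration, that could look roughly like this (the lock-class
parameter is the hypothetical part):

/* Hypothetical signature: let the caller supply the lockdep class when
 * initialising a timeline, so request->timeline->lock and
 * engine->timeline->lock end up in different classes and the
 * spin_lock_nested(..., 2) annotation in __i915_gem_request_submit()
 * can go away.
 */
int i915_gem_timeline_init(struct drm_i915_private *i915,
			   struct i915_gem_timeline *tl,
			   const char *name,
			   struct lock_class_key *lockclass);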



list_move_tail(&request->link, &timeline->requests);
spin_unlock(&request->timeline->lock);

i915_sw_fence_commit(&request->execute);
+}

-   spin_unlock_irqrestore(&timeline->lock, flags);
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+   if (state == FENCE_COMPLETE) {
+   struct drm_i915_gem_request *request =
+   container_of(fence, typeof(*request), submit);
+   struct intel_engine_cs *engine = request->engine;
+   unsigned long flags;
+
+   /* Will be called from irq-context when using foreign fences,
+* and may be called from the execution callback of another
+* engine.
+*/
+   spin_lock_irqsave_nested(&engine->timeline->lock, flags, 1);
+   engine->submit_request(request);
+   spin_unlock_irqrestore(&engine->timeline->lock, flags);
+   }

return NOTIFY_DONE;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index ed13f37fea0f..725d04128eaf 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -228,6 +228,8 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request_no_flush(req) \
__i915_add_request(req, false)

+void __i915_gem_request_submit(struct drm_i915_gem_request *request);
+
 struct intel_rps_client;
 #define NO_WAITBOOST ERR_PTR(-1)
 #define IS_RPS_CLIENT(p) (!IS_ERR(p))
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 857ef914cae7..ddb574d5ceda 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -640,6 +640,8 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
struct i915_guc_client *client = guc->execbuf_client;
int b_ret;

+   __i915_gem_request_submit(rq);
+
spin_lock(&client->wq_lock);
guc_wq_item_append(client, rq);

diff --git 

[Intel-gfx] [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission

2016-11-02 Thread Chris Wilson
Defer the transfer from the client's timeline onto the execution
timeline from the point of readiness to the point of actual submission.
For example, in execlists, a request is finally submitted to hardware
when the hardware is ready, and only put onto the hardware queue when
the request is ready. By deferring the transfer, we ensure that the
timeline is maintained in retirement order if we decide to queue the
requests onto the hardware in a different order than fifo.

Signed-off-by: Chris Wilson 
---
 drivers/gpu/drm/i915/i915_gem_request.c| 36 ++
 drivers/gpu/drm/i915/i915_gem_request.h|  2 ++
 drivers/gpu/drm/i915/i915_guc_submission.c |  2 ++
 drivers/gpu/drm/i915/intel_lrc.c   |  5 +
 drivers/gpu/drm/i915/intel_ringbuffer.c|  2 ++
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1ae5a2f8953f..05544dec5de9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
return atomic_inc_return(&tl->next_seqno);
 }
 
-static int __i915_sw_fence_call
-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 {
-   struct drm_i915_gem_request *request =
-   container_of(fence, typeof(*request), submit);
struct intel_engine_cs *engine = request->engine;
struct intel_timeline *timeline;
-   unsigned long flags;
u32 seqno;
 
-   if (state != FENCE_COMPLETE)
-   return NOTIFY_DONE;
-
/* Transfer from per-context onto the global per-engine timeline */
timeline = engine->timeline;
GEM_BUG_ON(timeline == request->timeline);
-
-   /* Will be called from irq-context when using foreign DMA fences */
-   spin_lock_irqsave(&timeline->lock, flags);
+   assert_spin_locked(&timeline->lock);
 
seqno = timeline_get_seqno(timeline->common);
GEM_BUG_ON(!seqno);
@@ -345,15 +336,32 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
GEM_BUG_ON(!request->global_seqno);
engine->emit_breadcrumb(request,
request->ring->vaddr + request->postfix);
-   engine->submit_request(request);
 
-   spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
+   /* nested from submit_notify */
+   spin_lock_nested(&request->timeline->lock, 2);
list_move_tail(&request->link, &timeline->requests);
spin_unlock(&request->timeline->lock);

i915_sw_fence_commit(&request->execute);
+}
 
-   spin_unlock_irqrestore(&timeline->lock, flags);
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+   if (state == FENCE_COMPLETE) {
+   struct drm_i915_gem_request *request =
+   container_of(fence, typeof(*request), submit);
+   struct intel_engine_cs *engine = request->engine;
+   unsigned long flags;
+
+   /* Will be called from irq-context when using foreign fences,
+* and may be called from the execution callback of another
+* engine.
+*/
+   spin_lock_irqsave_nested(&engine->timeline->lock, flags, 1);
+   engine->submit_request(request);
+   spin_unlock_irqrestore(&engine->timeline->lock, flags);
+   }
 
return NOTIFY_DONE;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index ed13f37fea0f..725d04128eaf 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -228,6 +228,8 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request_no_flush(req) \
__i915_add_request(req, false)
 
+void __i915_gem_request_submit(struct drm_i915_gem_request *request);
+
 struct intel_rps_client;
 #define NO_WAITBOOST ERR_PTR(-1)
 #define IS_RPS_CLIENT(p) (!IS_ERR(p))
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 857ef914cae7..ddb574d5ceda 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -640,6 +640,8 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
struct i915_guc_client *client = guc->execbuf_client;
int b_ret;
 
+   __i915_gem_request_submit(rq);
+
spin_lock(&client->wq_lock);
guc_wq_item_append(client, rq);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fa3012c342cc..b9aba16851ac 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 {
struct