Some error conditions just stop a channel and its fences get stuck, so
they either need to be kicked ready by overwriting hw seq numbers (as
nvgpu does) or faked with a sw flag like this. This is just a hack,
meant as an example of what would be needed.

Here, the id of a channel whose fences should be force-updated is passed
upwards with the uevent response. Normally, this is -1 so that it
matches no channel id, but some error paths fake an update event with an
explicit channel id.

Note: if userspace has some meaningful timeouts on the fences, then they
do finish, but without any notification that the channel is now broken
(how do you distinguish a gpu job that runs too long from a stuck one?).
In many cases, a channel needs to be shut down completely when it breaks
(e.g., on an mmu fault).

Signed-off-by: Konsta Hölttä <khol...@nvidia.com>
---
 drm/nouveau/include/nvif/event.h       |  1 +
 drm/nouveau/include/nvkm/engine/fifo.h |  2 +-
 drm/nouveau/nouveau_fence.c            | 13 ++++++++-----
 drm/nouveau/nvkm/engine/fifo/base.c    |  3 ++-
 drm/nouveau/nvkm/engine/fifo/gf100.c   |  2 +-
 drm/nouveau/nvkm/engine/fifo/gk104.c   |  7 ++++++-
 drm/nouveau/nvkm/engine/fifo/nv04.c    |  2 +-
 7 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/drm/nouveau/include/nvif/event.h b/drm/nouveau/include/nvif/event.h
index d148b85..a9ff4ee 100644
--- a/drm/nouveau/include/nvif/event.h
+++ b/drm/nouveau/include/nvif/event.h
@@ -52,16 +52,17 @@ struct nvif_notify_conn_rep_v0 {
 };
 
 struct nvif_notify_uevent_req {
        /* nvif_notify_req ... */
 };
 
 struct nvif_notify_uevent_rep {
        /* nvif_notify_rep ... */
+       __u32 force_chid;
 };
 
 struct nvif_notify_eevent_req {
        /* nvif_notify_req ... */
        u32 chid;
 };
 
 struct nvif_notify_eevent_rep {
diff --git a/drm/nouveau/include/nvkm/engine/fifo.h b/drm/nouveau/include/nvkm/engine/fifo.h
index cbca477..946eb68 100644
--- a/drm/nouveau/include/nvkm/engine/fifo.h
+++ b/drm/nouveau/include/nvkm/engine/fifo.h
@@ -117,15 +117,15 @@ extern struct nvkm_oclass *gf100_fifo_oclass;
 extern struct nvkm_oclass *gk104_fifo_oclass;
 extern struct nvkm_oclass *gk20a_fifo_oclass;
 extern struct nvkm_oclass *gk208_fifo_oclass;
 extern struct nvkm_oclass *gm204_fifo_oclass;
 extern struct nvkm_oclass *gm20b_fifo_oclass;
 
 int  nvkm_fifo_uevent_ctor(struct nvkm_object *, void *, u32,
                           struct nvkm_notify *);
-void nvkm_fifo_uevent(struct nvkm_fifo *);
+void nvkm_fifo_uevent(struct nvkm_fifo *, u32 force_chid);
 
 void nvkm_fifo_eevent(struct nvkm_fifo *, u32 chid, u32 error);
 
 void nv04_fifo_intr(struct nvkm_subdev *);
 int  nv04_fifo_context_attach(struct nvkm_object *, struct nvkm_object *);
 #endif
diff --git a/drm/nouveau/nouveau_fence.c b/drm/nouveau/nouveau_fence.c
index 38bccb0..b7d9987 100644
--- a/drm/nouveau/nouveau_fence.c
+++ b/drm/nouveau/nouveau_fence.c
@@ -123,50 +123,53 @@ nouveau_fence_context_put(struct kref *fence_ref)
 
 void
 nouveau_fence_context_free(struct nouveau_fence_chan *fctx)
 {
        kref_put(&fctx->fence_ref, nouveau_fence_context_put);
 }
 
 static int
-nouveau_fence_update(struct nouveau_channel *chan, struct nouveau_fence_chan *fctx)
+nouveau_fence_update(struct nouveau_channel *chan,
+               struct nouveau_fence_chan *fctx, u32 force_chid)
 {
        struct nouveau_fence *fence;
        int drop = 0;
        u32 seq = fctx->read(chan);
+       bool force = force_chid == chan->chid;
 
        while (!list_empty(&fctx->pending)) {
                fence = list_entry(fctx->pending.next, typeof(*fence), head);
 
-               if ((int)(seq - fence->base.seqno) < 0)
+               if ((int)(seq - fence->base.seqno) < 0 && !force)
                        break;
 
                drop |= nouveau_fence_signal(fence);
        }
 
        return drop;
 }
 
 static int
 nouveau_fence_wait_uevent_handler(struct nvif_notify *notify)
 {
        struct nouveau_fence_chan *fctx =
                container_of(notify, typeof(*fctx), notify);
+       const struct nvif_notify_uevent_rep *rep = notify->data;
        unsigned long flags;
        int ret = NVIF_NOTIFY_KEEP;
 
        spin_lock_irqsave(&fctx->lock, flags);
        if (!list_empty(&fctx->pending)) {
                struct nouveau_fence *fence;
                struct nouveau_channel *chan;
 
                fence = list_entry(fctx->pending.next, typeof(*fence), head);
                chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock));
-               if (nouveau_fence_update(fence->channel, fctx))
+               if (nouveau_fence_update(fence->channel, fctx, rep->force_chid))
                        ret = NVIF_NOTIFY_DROP;
        }
        spin_unlock_irqrestore(&fctx->lock, flags);
 
        return ret;
 }
 
 void
@@ -278,17 +281,17 @@ nouveau_fence_emit(struct nouveau_fence *fence, struct nouveau_channel *chan)
        kref_get(&fctx->fence_ref);
 
        trace_fence_emit(&fence->base);
        ret = fctx->emit(fence);
        if (!ret) {
                fence_get(&fence->base);
                spin_lock_irq(&fctx->lock);
 
-               if (nouveau_fence_update(chan, fctx))
+               if (nouveau_fence_update(chan, fctx, -1))
                        nvif_notify_put(&fctx->notify);
 
                list_add_tail(&fence->head, &fctx->pending);
                spin_unlock_irq(&fctx->lock);
        }
 
        return ret;
 }
@@ -302,17 +305,17 @@ nouveau_fence_done(struct nouveau_fence *fence)
                struct nouveau_channel *chan;
                unsigned long flags;
 
                if (test_bit(FENCE_FLAG_SIGNALED_BIT, &fence->base.flags))
                        return true;
 
                spin_lock_irqsave(&fctx->lock, flags);
                chan = rcu_dereference_protected(fence->channel, lockdep_is_held(&fctx->lock));
-               if (chan && nouveau_fence_update(chan, fctx))
+               if (chan && nouveau_fence_update(chan, fctx, -1))
                        nvif_notify_put(&fctx->notify);
                spin_unlock_irqrestore(&fctx->lock, flags);
        }
        return fence_is_signaled(&fence->base);
 }
 
 static long
 nouveau_fence_wait_legacy(struct fence *f, bool intr, long wait)
diff --git a/drm/nouveau/nvkm/engine/fifo/base.c b/drm/nouveau/nvkm/engine/fifo/base.c
index a5dc6c9..e35d711 100644
--- a/drm/nouveau/nvkm/engine/fifo/base.c
+++ b/drm/nouveau/nvkm/engine/fifo/base.c
@@ -184,19 +184,20 @@ nvkm_fifo_uevent_ctor(struct nvkm_object *object, void *data, u32 size,
                notify->types = 1;
                notify->index = 0;
        }
 
        return ret;
 }
 
 void
-nvkm_fifo_uevent(struct nvkm_fifo *fifo)
+nvkm_fifo_uevent(struct nvkm_fifo *fifo, u32 force_chid)
 {
        struct nvif_notify_uevent_rep rep = {
+               .force_chid = force_chid
        };
        nvkm_event_send(&fifo->uevent, 1, 0, &rep, sizeof(rep));
 }
 
 static int
 nvkm_fifo_eevent_ctor(struct nvkm_object *object, void *data, u32 size,
                      struct nvkm_notify *notify)
 {
diff --git a/drm/nouveau/nvkm/engine/fifo/gf100.c b/drm/nouveau/nvkm/engine/fifo/gf100.c
index b745252..ca86dfe 100644
--- a/drm/nouveau/nvkm/engine/fifo/gf100.c
+++ b/drm/nouveau/nvkm/engine/fifo/gf100.c
@@ -732,17 +732,17 @@ gf100_fifo_intr_engine_unit(struct gf100_fifo_priv *priv, int engn)
        u32 inte = nv_rd32(priv, 0x002628);
        u32 unkn;
 
        nv_wr32(priv, 0x0025a8 + (engn * 0x04), intr);
 
        for (unkn = 0; unkn < 8; unkn++) {
                u32 ints = (intr >> (unkn * 0x04)) & inte;
                if (ints & 0x1) {
-                       nvkm_fifo_uevent(&priv->base);
+                       nvkm_fifo_uevent(&priv->base, -1);
                        ints &= ~1;
                }
                if (ints) {
                        nv_error(priv, "ENGINE %d %d %01x", engn, unkn, ints);
                        nv_mask(priv, 0x002628, ints, 0);
                }
        }
 }
diff --git a/drm/nouveau/nvkm/engine/fifo/gk104.c b/drm/nouveau/nvkm/engine/fifo/gk104.c
index 53a464d..caecef1 100644
--- a/drm/nouveau/nvkm/engine/fifo/gk104.c
+++ b/drm/nouveau/nvkm/engine/fifo/gk104.c
@@ -908,16 +908,18 @@ gk104_fifo_intr_fault(struct gk104_fifo_priv *priv, int unit)
        object = engctx;
        while (object) {
                switch (nv_mclass(object)) {
                case KEPLER_CHANNEL_GPFIFO_A:
                case MAXWELL_CHANNEL_GPFIFO_A:
                        nvkm_fifo_eevent(&priv->base,
                                        ((struct nvkm_fifo_chan*)object)->chid,
                                        NOUVEAU_GEM_CHANNEL_FIFO_ERROR_MMU_ERR_FLT);
+                       nvkm_fifo_uevent(&priv->base,
+                                       ((struct nvkm_fifo_chan*)object)->chid);
                        gk104_fifo_recover(priv, engine, (void *)object);
                        break;
                }
                object = object->parent;
        }
 
        nvkm_engctx_put(engctx);
 }
@@ -978,18 +980,21 @@ gk104_fifo_intr_pbdma_0(struct gk104_fifo_priv *priv, int unit)
                nv_error(priv, "PBDMA%d:", unit);
                nvkm_bitfield_print(gk104_fifo_pbdma_intr_0, show);
                pr_cont("\n");
                nv_error(priv,
                         "PBDMA%d: ch %d [%s] subc %d mthd 0x%04x data 0x%08x\n",
                         unit, chid,
                         nvkm_client_name_for_fifo_chid(&priv->base, chid),
                         subc, mthd, data);
+
                nvkm_fifo_eevent(&priv->base, chid,
                                NOUVEAU_GEM_CHANNEL_PBDMA_ERROR);
+
+               nvkm_fifo_uevent(&priv->base, chid);
        }
 
        nv_wr32(priv, 0x040108 + (unit * 0x2000), stat);
 }
 
 static const struct nvkm_bitfield gk104_fifo_pbdma_intr_1[] = {
        { 0x00000001, "HCE_RE_ILLEGAL_OP" },
        { 0x00000002, "HCE_RE_ALIGNB" },
@@ -1030,17 +1035,17 @@ gk104_fifo_intr_runlist(struct gk104_fifo_priv *priv)
                nv_wr32(priv, 0x002a00, 1 << engn);
                mask &= ~(1 << engn);
        }
 }
 
 static void
 gk104_fifo_intr_engine(struct gk104_fifo_priv *priv)
 {
-       nvkm_fifo_uevent(&priv->base);
+       nvkm_fifo_uevent(&priv->base, -1);
 }
 
 static void
 gk104_fifo_intr(struct nvkm_subdev *subdev)
 {
        struct gk104_fifo_priv *priv = (void *)subdev;
        u32 mask = nv_rd32(priv, 0x002140);
        u32 stat = nv_rd32(priv, 0x002100) & mask;
diff --git a/drm/nouveau/nvkm/engine/fifo/nv04.c b/drm/nouveau/nvkm/engine/fifo/nv04.c
index 043e429..1749614 100644
--- a/drm/nouveau/nvkm/engine/fifo/nv04.c
+++ b/drm/nouveau/nvkm/engine/fifo/nv04.c
@@ -536,17 +536,17 @@ nv04_fifo_intr(struct nvkm_subdev *subdev)
        if (device->card_type == NV_50) {
                if (stat & 0x00000010) {
                        stat &= ~0x00000010;
                        nv_wr32(priv, 0x002100, 0x00000010);
                }
 
                if (stat & 0x40000000) {
                        nv_wr32(priv, 0x002100, 0x40000000);
-                       nvkm_fifo_uevent(&priv->base);
+                       nvkm_fifo_uevent(&priv->base, -1);
                        stat &= ~0x40000000;
                }
        }
 
        if (stat) {
                nv_warn(priv, "unknown intr 0x%08x\n", stat);
                nv_mask(priv, NV03_PFIFO_INTR_EN_0, stat, 0x00000000);
                nv_wr32(priv, NV03_PFIFO_INTR_0, stat);
-- 
2.1.4

_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau

Reply via email to