I think that the IB tests already set the ready flag to false when something goes wrong, but for the ring tests your probably want to double check and maybe generalize that.

Christian.

Am 24.02.20 um 19:15 schrieb Das, Nirmoy:

[AMD Official Use Only - Internal Distribution Only]


Thanks Christian. I will try to send a updated patch soon.

Get Outlook for Android <https://aka.ms/ghei36>

------------------------------------------------------------------------
*From:* Koenig, Christian <christian.koe...@amd.com>
*Sent:* Monday, February 24, 2020, 18:06
*To:* Nirmoy Das
*Cc:* amd-gfx@lists.freedesktop.org; Deucher, Alexander; Liu, Monk; Li, Dennis; Das, Nirmoy *Subject:* Re: [RFC PATCH 1/1] drm/amdgpu: wait for sched to become ready on job submit

Hi Nirmoy,

Am 24.02.2020 17:48 schrieb Nirmoy Das <nirmoy.ai...@gmail.com>:

    On reset, amdgpu can set a drm sched's ready status to false
    temporarily. drm job
    init will fail if all of the drm scheds are not ready for a HW IP.
    This patch tries to make
    kernel's internal drm job submit handle, amdgpu_job_submit() a bit
    more fault tolerant.


I don't think that this approach makes sense. Since it is a front end property we should rather stop setting the scheduler ready status to false during reset.

Instead we should only set it to false when the ring/IB test fails and we can't bring the ring back to life again.

Christian.


    Signed-off-by: Nirmoy Das <nirmoy....@amd.com>
    ---
     drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 +++++++++++++++++++--
     drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  5 +--
     drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  6 ++--
     drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c |  2 +-
     drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c |  2 +-
     drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  2 +-
     drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  2 +-
     7 files changed, 43 insertions(+), 11 deletions(-)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
    b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
    index d42be880a236..0745df80112f 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
    @@ -139,7 +139,38 @@ void amdgpu_job_free(struct amdgpu_job *job)
             kfree(job);
     }

    -int amdgpu_job_submit(struct amdgpu_job *job, struct
    drm_sched_entity *entity,
    +static int amdgpu_job_try_init(struct amdgpu_device *adev,
    +                              struct drm_sched_job *base,
    +                              struct drm_sched_entity *entity,
    +                              void *owner)
    +{
    +       int r, i;
    +
    +       r = drm_sched_job_init(base, entity, owner);
    +       if (r == -ENOENT) {
    +               /* retry till we come out of reset phase */
    +               while (!mutex_trylock(&adev->lock_reset))
    +                       msleep(10);
    +               /* retry for a second for the sched to get ready*/
    +               for (i = 0; i < 100; i++) {
    +                       msleep(10);
    +                       r = drm_sched_job_init(base, entity, owner);
    +                       if (r == -ENOENT)
    +                               continue;
    +               }
    +
    + mutex_unlock(&adev->lock_reset);
    +               /* If after all these we failed to initialize a job
    +                * it means the IP is unrecoverable */
    +               if (r == -ENOENT)
    +                       return -ENODEV;
    +       }
    +
    +       return r;
    +}
    +
    +int amdgpu_job_submit(struct amdgpu_device *adev,struct
    amdgpu_job *job,
    +                     struct drm_sched_entity *entity,
                           void *owner, struct dma_fence **f)
     {
             enum drm_sched_priority priority;
    @@ -149,7 +180,7 @@ int amdgpu_job_submit(struct amdgpu_job *job,
    struct drm_sched_entity *entity,
             if (!f)
                     return -EINVAL;

    -       r = drm_sched_job_init(&job->base, entity, owner);
    +       r = amdgpu_job_try_init(adev, &job->base, entity, owner);
             if (r)
                     return r;

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
    b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
    index 2e2110dddb76..fed87e96cacc 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
    @@ -70,8 +70,9 @@ int amdgpu_job_alloc_with_ib(struct
    amdgpu_device *adev, unsigned size,

     void amdgpu_job_free_resources(struct amdgpu_job *job);
     void amdgpu_job_free(struct amdgpu_job *job);
    -int amdgpu_job_submit(struct amdgpu_job *job, struct
    drm_sched_entity *entity,
    -                     void *owner, struct dma_fence **f);
    +int amdgpu_job_submit(struct amdgpu_device *adev, struct
    amdgpu_job *job,
    +                     struct drm_sched_entity *entity, void *owner,
    +                     struct dma_fence **f);
     int amdgpu_job_submit_direct(struct amdgpu_job *job, struct
    amdgpu_ring *ring,
                                  struct dma_fence **fence);

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
    b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
    index 660867cf2597..adfde07eb75f 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
    @@ -2066,7 +2066,7 @@ static int amdgpu_map_buffer(struct
    ttm_buffer_object *bo,
             if (r)
                     goto error_free;

    -       r = amdgpu_job_submit(job, &adev->mman.entity,
    +       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
    AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
             if (r)
                     goto error_free;
    @@ -2137,7 +2137,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring
    *ring, uint64_t src_offset,
             if (direct_submit)
                     r = amdgpu_job_submit_direct(job, ring, fence);
             else
    -               r = amdgpu_job_submit(job, &adev->mman.entity,
    +               r = amdgpu_job_submit(adev, job, &adev->mman.entity,
    AMDGPU_FENCE_OWNER_UNDEFINED, fence);
             if (r)
                     goto error_free;
    @@ -2231,7 +2231,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,

             amdgpu_ring_pad_ib(ring, &job->ibs[0]);
             WARN_ON(job->ibs[0].length_dw > num_dw);
    -       r = amdgpu_job_submit(job, &adev->mman.entity,
    +       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
    AMDGPU_FENCE_OWNER_UNDEFINED, fence);
             if (r)
                     goto error_free;
    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
    b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
    index 5fd32ad1c575..8ff97b24914e 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
    @@ -1104,7 +1104,7 @@ static int amdgpu_uvd_send_msg(struct
    amdgpu_ring *ring, struct amdgpu_bo *bo,
                     if (r)
                             goto err_free;

    -               r = amdgpu_job_submit(job, &adev->uvd.entity,
    +               r = amdgpu_job_submit(adev, job, &adev->uvd.entity,
    AMDGPU_FENCE_OWNER_UNDEFINED, &f);
                     if (r)
                             goto err_free;
    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
    b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
    index 59ddba137946..e721d3367783 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
    @@ -554,7 +554,7 @@ static int amdgpu_vce_get_destroy_msg(struct
    amdgpu_ring *ring, uint32_t handle,
             if (direct)
                     r = amdgpu_job_submit_direct(job, ring, &f);
             else
    -               r = amdgpu_job_submit(job, &ring->adev->vce.entity,
    +               r = amdgpu_job_submit(ring->adev, job,
    &ring->adev->vce.entity,
    AMDGPU_FENCE_OWNER_UNDEFINED, &f);
             if (r)
                     goto err;
    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
    b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
    index 4cc7881f438c..b536962c22d9 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
    @@ -100,7 +100,7 @@ static int amdgpu_vm_sdma_commit(struct
    amdgpu_vm_update_params *p,
             WARN_ON(ib->length_dw == 0);
             amdgpu_ring_pad_ib(ring, ib);
             WARN_ON(ib->length_dw > p->num_dw_left);
    -       r = amdgpu_job_submit(p->job, entity,
    AMDGPU_FENCE_OWNER_VM, &f);
    +       r = amdgpu_job_submit(p->adev, p->job, entity,
    AMDGPU_FENCE_OWNER_VM, &f);
             if (r)
                     goto error;

    diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
    b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
    index 9775eca6fe43..a4aaa2a1f878 100644
    --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
    +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
    @@ -377,7 +377,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct
    amdgpu_device *adev, uint32_t vmid,
             job->vm_needs_flush = true;
    job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;
             amdgpu_ring_pad_ib(ring, &job->ibs[0]);
    -       r = amdgpu_job_submit(job, &adev->mman.entity,
    +       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
    AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
             if (r)
                     goto error_submit;
-- 2.25.0




_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to