Hi all,

Sorry, this was actually a conflict between the amdgpu tree and the drm
tree (not the drm-misc tree, as stated in the report quoted below).

On Tue, 1 Jul 2025 13:07:02 +1000 Stephen Rothwell <s...@canb.auug.org.au> wrote:
>
> Today's linux-next merge of the drm-misc tree got a conflict in:
> 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> 
> between commits:
> 
>   183bccafa176 ("drm: Create a task info option for wedge events")
>   a72002cb181f ("drm/amdgpu: Make use of drm_wedge_task_info")
> 
> from the drm tree and commits:
> 
>   821aacb2dcf0 ("drm/amdgpu: rework queue reset scheduler interaction")
>   43ca5eb94b38 ("drm/amdgpu: move guilty handling into ring resets")
>   38b20968f3d8 ("drm/amdgpu: move scheduler wqueue handling into callbacks")
> 
> from the drm-misc tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
> 
> diff --cc drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 6b4ffa9ceb7a,f0b7080dccb8..000000000000
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@@ -89,11 -89,9 +89,10 @@@ static enum drm_gpu_sched_stat amdgpu_j
>   {
>       struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>       struct amdgpu_job *job = to_amdgpu_job(s_job);
>  +    struct drm_wedge_task_info *info = NULL;
>       struct amdgpu_task_info *ti;
>       struct amdgpu_device *adev = ring->adev;
> -     int idx;
> -     int r;
> +     int idx, r;
>   
>       if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>               dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s",
> @@@ -133,47 -133,22 +132,24 @@@
>       if (unlikely(adev->debug_disable_gpu_ring_reset)) {
>               dev_err(adev->dev, "Ring reset disabled by debug mask\n");
>       } else if (amdgpu_gpu_recovery && ring->funcs->reset) {
> -             bool is_guilty;
> - 
> -             dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name);
> -             /* stop the scheduler, but don't mess with the
> -              * bad job yet because if ring reset fails
> -              * we'll fall back to full GPU reset.
> -              */
> -             drm_sched_wqueue_stop(&ring->sched);
> - 
> -             /* for engine resets, we need to reset the engine,
> -              * but individual queues may be unaffected.
> -              * check here to make sure the accounting is correct.
> -              */
> -             if (ring->funcs->is_guilty)
> -                     is_guilty = ring->funcs->is_guilty(ring);
> -             else
> -                     is_guilty = true;
> - 
> -             if (is_guilty)
> -                     dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
> - 
> -             r = amdgpu_ring_reset(ring, job->vmid);
> +             dev_err(adev->dev, "Starting %s ring reset\n",
> +                     s_job->sched->name);
> +             r = amdgpu_ring_reset(ring, job->vmid, NULL);
>               if (!r) {
> -                     if (amdgpu_ring_sched_ready(ring))
> -                             drm_sched_stop(&ring->sched, s_job);
> -                     if (is_guilty) {
> -                             atomic_inc(&ring->adev->gpu_reset_counter);
> -                             amdgpu_fence_driver_force_completion(ring);
> -                     }
> -                     if (amdgpu_ring_sched_ready(ring))
> -                             drm_sched_start(&ring->sched, 0);
> -                     dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
> -                     drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, info);
> +                     atomic_inc(&ring->adev->gpu_reset_counter);
> +                     dev_err(adev->dev, "Ring %s reset succeeded\n",
> +                             ring->sched.name);
> +                     drm_dev_wedged_event(adev_to_drm(adev),
>  -                                         DRM_WEDGE_RECOVERY_NONE);
> ++                                         DRM_WEDGE_RECOVERY_NONE, info);
>                       goto exit;
>               }
> -             dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
> +             dev_err(adev->dev, "Ring %s reset failed\n", ring->sched.name);
>       }
> + 
>       dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>   
>  +    amdgpu_vm_put_task_info(ti);
>  +
>       if (amdgpu_device_should_recover_gpu(ring->adev)) {
>               struct amdgpu_reset_context reset_context;
>               memset(&reset_context, 0, sizeof(reset_context));

-- 
Cheers,
Stephen Rothwell

Attachment: pgpam5T9c4Knb.pgp
Description: OpenPGP digital signature

Reply via email to