[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
     Thanks for your review. I will update it according to your suggestion. 

Best Regards
Dennis Li
-----Original Message-----
From: Christian König <ckoenig.leichtzumer...@gmail.com> 
Sent: Thursday, August 20, 2020 5:11 PM
To: Li, Dennis <dennis...@amd.com>; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander <alexander.deuc...@amd.com>; Kuehling, Felix 
<felix.kuehl...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; Koenig, 
Christian <christian.koe...@amd.com>
Subject: Re: [PATCH] drm/amdgpu: change reset lock from mutex to rw_semaphore

Am 20.08.20 um 04:09 schrieb Dennis Li:
> clients don't need reset-lock for synchronization when no GPU 
> recovery.
>
> Signed-off-by: Dennis Li <dennis...@amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index c8aec832b244..ec11ed2a9ca4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -954,7 +954,7 @@ struct amdgpu_device {
>   
>       atomic_t                        in_gpu_reset;
>       enum pp_mp1_state               mp1_state;
> -     struct mutex  lock_reset;
> +     struct rw_semaphore reset_sem;
>       struct amdgpu_doorbell_index doorbell_index;
>   
>       struct mutex                    notifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 79b397800cbc..0090e850eab9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -101,14 +101,17 @@ static int amdgpu_debugfs_autodump_open(struct 
> inode *inode, struct file *file)
>   
>       file->private_data = adev;
>   
> -     mutex_lock(&adev->lock_reset);
> +     if (down_read_killable(&adev->reset_sem))
> +             return -EINTR;

Better use ret = down_read_killable(); if (ret) return ret; here. Same for all 
other places of course.

> +
>       if (adev->autodump.dumping.done) {
>               reinit_completion(&adev->autodump.dumping);
>               ret = 0;
>       } else {
>               ret = -EBUSY;
>       }
> -     mutex_unlock(&adev->lock_reset);
> +
> +     up_read(&adev->reset_sem);
>   
>       return ret;
>   }
> @@ -1242,7 +1245,8 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, 
> void *data)
>       }
>   
>       /* Avoid accidently unparking the sched thread during GPU reset */
> -     mutex_lock(&adev->lock_reset);
> +     if (down_read_killable(&adev->reset_sem))
> +             return -EINTR;
>   
>       /* hold on the scheduler */
>       for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1269,7 +1273,7 @@ 
> static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
>               kthread_unpark(ring->sched.thread);
>       }
>   
> -     mutex_unlock(&adev->lock_reset);
> +     up_read(&adev->reset_sem);
>   
>       pm_runtime_mark_last_busy(dev->dev);
>       pm_runtime_put_autosuspend(dev->dev);
> @@ -1459,7 +1463,10 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>               return -ENOMEM;
>   
>       /* Avoid accidently unparking the sched thread during GPU reset */
> -     mutex_lock(&adev->lock_reset);
> +     if (down_read_killable(&adev->reset_sem)) {
> +             kfree(fences);
> +             return -EINTR;

Maybe better use a "goto err;" style error handling here.

> +     }
>   
>       /* stop the scheduler */
>       kthread_park(ring->sched.thread);
> @@ -1500,7 +1507,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 
> val)
>       /* restart the scheduler */
>       kthread_unpark(ring->sched.thread);
>   
> -     mutex_unlock(&adev->lock_reset);
> +     up_read(&adev->reset_sem);
>   
>       ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 78fd2c9a7b7d..82242e2f5658 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>       mutex_init(&adev->virt.vf_errors.lock);
>       hash_init(adev->mn_hash);
>       atomic_set(&adev->in_gpu_reset, 0);
> -     mutex_init(&adev->lock_reset);
> +     init_rwsem(&adev->reset_sem);
>       mutex_init(&adev->psp.mutex);
>       mutex_init(&adev->notifier_lock);
>   
> @@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct 
> amdgpu_device *adev)
>       if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
>               return false;
>   
> -     mutex_lock(&adev->lock_reset);
> +     down_write(&adev->reset_sem);
>   
>       atomic_inc(&adev->gpu_reset_counter);
>       switch (amdgpu_asic_reset_method(adev)) { @@ -4229,7 +4229,7 @@ 
> static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>       amdgpu_vf_error_trans_all(adev);
>       adev->mp1_state = PP_MP1_STATE_NONE;
>       atomic_set(&adev->in_gpu_reset, 0);
> -     mutex_unlock(&adev->lock_reset);
> +     up_write(&adev->reset_sem);
>   }
>   
>   static void amdgpu_device_resume_display_audio(struct amdgpu_device 
> *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index f27d83f2de78..8ac63f13fc6f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -238,19 +238,12 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
> *work)
>       struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
> flr_work);
>       struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
> virt);
>       int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
> -     int locked;
>   
>       /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>        * otherwise the mailbox msg will be ruined/reseted by
>        * the VF FLR.
> -      *
> -      * we can unlock the lock_reset to allow "amdgpu_job_timedout"
> -      * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
> -      * which means host side had finished this VF's FLR.
>        */
> -     locked = mutex_trylock(&adev->lock_reset);
> -     if (locked)
> -             atomic_set(&adev->in_gpu_reset, 1);
> +     down_read(&adev->reset_sem);

Somebody from the virtualization team should take a look at this as well.

Christian.

>   
>       do {
>               if (xgpu_ai_mailbox_peek_msg(adev) == 
> IDH_FLR_NOTIFICATION_CMPL) 
> @@ -261,10 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct 
> *work)
>       } while (timeout > 1);
>   
>   flr_done:
> -     if (locked) {
> -             atomic_set(&adev->in_gpu_reset, 0);
> -             mutex_unlock(&adev->lock_reset);
> -     }
> +     up_read(&adev->reset_sem);
>   
>       /* Trigger recovery for world switch failure if no TDR */
>       if (amdgpu_device_should_recover_gpu(adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 3cb10ab943a6..bcc583f087e7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -259,19 +259,12 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
> *work)
>       struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, 
> flr_work);
>       struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, 
> virt);
>       int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
> -     int locked;
>   
>       /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
>        * otherwise the mailbox msg will be ruined/reseted by
>        * the VF FLR.
> -      *
> -      * we can unlock the lock_reset to allow "amdgpu_job_timedout"
> -      * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
> -      * which means host side had finished this VF's FLR.
>        */
> -     locked = mutex_trylock(&adev->lock_reset);
> -     if (locked)
> -             atomic_set(&adev->in_gpu_reset, 1);
> +     down_read(&adev->reset_sem);
>   
>       do {
>               if (xgpu_nv_mailbox_peek_msg(adev) == 
> IDH_FLR_NOTIFICATION_CMPL) 
> @@ -282,10 +275,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct 
> *work)
>       } while (timeout > 1);
>   
>   flr_done:
> -     if (locked) {
> -             atomic_set(&adev->in_gpu_reset, 0);
> -             mutex_unlock(&adev->lock_reset);
> -     }
> +     up_read(&adev->reset_sem);
>   
>       /* Trigger recovery for world switch failure if no TDR */
>       if (amdgpu_device_should_recover_gpu(adev)
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to