[+Amber, moving amd-gfx to BCC]

Amber worked on a related problem on an NPI branch recently in the
nocpsch version of this code. We should port that fix to
amd-staging-drm-next. Then let's come up with a common solution for the
cpsch code path as well.

See one comment inline.


Am 2021-06-15 um 4:02 a.m. schrieb xinhui pan:
> We call free_mqd without dqm lock hold, that causes double free of
> mqd_mem_obj. Fix it by using a tmp pointer.
> We need walk through the queues_list with dqm lock hold. Otherwise hit
> list corruption.
>
> Signed-off-by: xinhui pan <[email protected]>
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c   | 17 +++++++++++++----
>  1 file changed, 13 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index e6366b408420..575c895fc241 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1484,6 +1484,7 @@ static int destroy_queue_cpsch(struct 
> device_queue_manager *dqm,
>       struct mqd_manager *mqd_mgr;
>       uint64_t sdma_val = 0;
>       struct kfd_process_device *pdd = qpd_to_pdd(qpd);
> +     void *ptr = NULL;
>  
>       /* Get the SDMA queue stats */
>       if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
> @@ -1543,10 +1544,12 @@ static int destroy_queue_cpsch(struct 
> device_queue_manager *dqm,
>       pr_debug("Total of %d queues are accountable so far\n",
>                       dqm->total_queue_count);
>  
> +     swap(ptr, q->mqd_mem_obj);
>       dqm_unlock(dqm);
>  
>       /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
> -     mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +     if (ptr)
> +             mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr);
>  
>       return retval;
>  
> @@ -1709,6 +1712,7 @@ static int process_termination_cpsch(struct 
> device_queue_manager *dqm,
>       enum kfd_unmap_queues_filter filter =
>               KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
>       bool found = false;
> +     void *ptr = NULL;
>  
>       retval = 0;
>  
> @@ -1737,8 +1741,6 @@ static int process_termination_cpsch(struct 
> device_queue_manager *dqm,
>                               qpd->mapped_gws_queue = false;
>                       }
>               }
> -
> -             dqm->total_queue_count--;
>       }
>  
>       /* Unregister process */
> @@ -1770,13 +1772,20 @@ static int process_termination_cpsch(struct 
> device_queue_manager *dqm,
>       /* Lastly, free mqd resources.
>        * Do free_mqd() after dqm_unlock to avoid circular locking.
>        */
> +     dqm_lock(dqm);
>       list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
>               mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>                               q->properties.type)];
>               list_del(&q->list);
>               qpd->queue_count--;
> -             mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +             dqm->total_queue_count--;
> +             swap(ptr, q->mqd_mem_obj);
> +             dqm_unlock(dqm);

This still risks list corruption because the list can be modified while
the lock is dropped. However you can make it safe by changing the loop
to something like

        dqm_lock(dqm);
        while (!list_empty(...)) {
                q = list_first_entry(...);
                dqm_unlock(dqm);
                ...
                mqd_mgr->free_mqd(...);
                ...
                dqm_lock(dqm);
        }
        dqm_unlock(dqm);

Regards,
  Felix


> +             if (ptr)
> +                     mqd_mgr->free_mqd(mqd_mgr, q->mqd, ptr);
> +             dqm_lock(dqm);
>       }
> +     dqm_unlock(dqm);
>  
>       return retval;
>  }
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to