Re: [PATCH 6/6] drm/amdkfd: Fix sdma queue allocate race condition

2019-06-06 Thread Kuehling, Felix
Patches 5 and 6 are Reviewed-by: Felix Kuehling 

On 2019-06-06 2:25 p.m., Zeng, Oak wrote:
> SDMA queue allocation requires the dqm lock at it modify
> the global dqm members. Move up the dqm_lock so sdma
> queue allocation is enclosed in the critical section. Move
> mqd allocation out of critical section to avoid circular
> lock dependency.
>
> Change-Id: I96abd42eae6e77c82a5ba1b8e600af3efe8d791d
> Signed-off-by: Oak Zeng 
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 24 
> +++---
>   1 file changed, 12 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 166636c..cd259b8 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1133,23 +1133,27 @@ static int create_queue_cpsch(struct 
> device_queue_manager *dqm, struct queue *q,
>   if (dqm->total_queue_count >= max_num_of_queues_per_device) {
>   pr_warn("Can't create new usermode queue because %d queues were 
> already created\n",
>   dqm->total_queue_count);
> - retval = -EPERM;
> - goto out;
> + return -EPERM;
>   }
>   
> + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> + q->properties.type)];
> + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, >properties);
> + if (!q->mqd_mem_obj)
> + return -ENOMEM;
> +
> + dqm_lock(dqm);
>   if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   retval = allocate_sdma_queue(dqm, q);
>   if (retval)
> - goto out;
> + goto out_unlock;
>   }
>   
>   retval = allocate_doorbell(qpd, q);
>   if (retval)
>   goto out_deallocate_sdma_queue;
>   
> - mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> - q->properties.type)];
>   /*
>* Eviction state logic: mark all queues as evicted, even ones
>* not currently active. Restoring inactive queues later only
> @@ -1161,12 +1165,8 @@ static int create_queue_cpsch(struct 
> device_queue_manager *dqm, struct queue *q,
>   dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
>   q->properties.tba_addr = qpd->tba_addr;
>   q->properties.tma_addr = qpd->tma_addr;
> - q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, >properties);
> - if (!q->mqd_mem_obj)
> - goto out_deallocate_doorbell;
>   mqd_mgr->init_mqd(mqd_mgr, >mqd, q->mqd_mem_obj,
>   >gart_mqd_addr, >properties);
> - dqm_lock(dqm);
>   
>   list_add(>list, >queues_list);
>   qpd->queue_count++;
> @@ -1192,13 +1192,13 @@ static int create_queue_cpsch(struct 
> device_queue_manager *dqm, struct queue *q,
>   dqm_unlock(dqm);
>   return retval;
>   
> -out_deallocate_doorbell:
> - deallocate_doorbell(qpd, q);
>   out_deallocate_sdma_queue:
>   if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
>   q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
>   deallocate_sdma_queue(dqm, q);
> -out:
> +out_unlock:
> + dqm_unlock(dqm);
> + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>   return retval;
>   }
>   
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 6/6] drm/amdkfd: Fix sdma queue allocate race condition

2019-06-05 Thread Kuehling, Felix
I think the simpler way to fix this, is to restructure 
create_queue_cpsch similar to the nocpsch version where we allocate the 
MQD early and take the DQM lock right after that. That way you don't 
need locked and unlocked variants of allocate_sdma_queue and 
deallocate_sdma_queue.

Regards,
   Felix


On 2019-06-05 12:06 p.m., Zeng, Oak wrote:
> SDMA queue allocation requires the dqm lock as it modify
> the global dqm members. Introduce functions to allocate/deallocate
> in locked/unlocked circumstance.
>
> Change-Id: Id3084524c5f65d9629b12cf6b4862a7516945cb1
> Signed-off-by: Oak Zeng 
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 34 
> --
>   1 file changed, 25 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 6b1a2ee..52e4ede 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -53,6 +53,8 @@ static int map_queues_cpsch(struct device_queue_manager 
> *dqm);
>   
>   static void deallocate_sdma_queue(struct device_queue_manager *dqm,
>   struct queue *q);
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
> + struct queue *q);
>   
>   static inline void deallocate_hqd(struct device_queue_manager *dqm,
>   struct queue *q);
> @@ -434,10 +436,10 @@ static int destroy_queue_nocpsch_locked(struct 
> device_queue_manager *dqm,
>   deallocate_hqd(dqm, q);
>   } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
>   } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
>   } else {
>   pr_debug("q->properties.type %d is invalid\n",
>   q->properties.type);
> @@ -914,9 +916,12 @@ static int allocate_sdma_queue(struct 
> device_queue_manager *dqm,
>   {
>   int bit;
>   
> + dqm_lock(dqm);
>   if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> - if (dqm->sdma_bitmap == 0)
> + if (dqm->sdma_bitmap == 0) {
> + dqm_unlock(dqm);
>   return -ENOMEM;
> + }
>   bit = __ffs64(dqm->sdma_bitmap);
>   dqm->sdma_bitmap &= ~(1ULL << bit);
>   q->sdma_id = bit;
> @@ -925,8 +930,10 @@ static int allocate_sdma_queue(struct 
> device_queue_manager *dqm,
>   q->properties.sdma_queue_id = q->sdma_id /
>   get_num_sdma_engines(dqm);
>   } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> - if (dqm->xgmi_sdma_bitmap == 0)
> + if (dqm->xgmi_sdma_bitmap == 0) {
> + dqm_unlock(dqm);
>   return -ENOMEM;
> + }
>   bit = __ffs64(dqm->xgmi_sdma_bitmap);
>   dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
>   q->sdma_id = bit;
> @@ -942,13 +949,14 @@ static int allocate_sdma_queue(struct 
> device_queue_manager *dqm,
>   get_num_xgmi_sdma_engines(dqm);
>   }
>   
> + dqm_unlock(dqm);
>   pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
>   pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
>   
>   return 0;
>   }
>   
> -static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> +static void deallocate_sdma_queue_locked(struct device_queue_manager *dqm,
>   struct queue *q)
>   {
>   if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
> @@ -962,6 +970,14 @@ static void deallocate_sdma_queue(struct 
> device_queue_manager *dqm,
>   }
>   }
>   
> +static void deallocate_sdma_queue(struct device_queue_manager *dqm,
> + struct queue *q)
> +{
> + dqm_lock(dqm);
> + deallocate_sdma_queue_locked(dqm, q);
> + dqm_unlock(dqm);
> +}
> +
>   /*
>* Device Queue Manager implementation for cp scheduler
>*/
> @@ -1356,10 +1372,10 @@ static int destroy_queue_cpsch(struct 
> device_queue_manager *dqm,
>   
>   if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
>   dqm->sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
>   } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
>   dqm->xgmi_sdma_queue_count--;
> - deallocate_sdma_queue(dqm, q);
> + deallocate_sdma_queue_locked(dqm, q);
>   }
>   
>   list_del(>list);
> @@ -1585,10 +1601,10 @@ static int