Am 2022-03-17 um 15:37 schrieb Tushar Patel:
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 21 ++++++++++++---------
  2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 4c20c23d6ba0..bda1b5132ee8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -680,7 +680,7 @@ MODULE_PARM_DESC(sched_policy,
   * Maximum number of processes that HWS can schedule concurrently. The 
maximum is the
   * number of VMIDs assigned to the HWS, which is also the default.
   */
-int hws_max_conc_proc = 8;
+int hws_max_conc_proc = -1;
  module_param(hws_max_conc_proc, int, 0444);
  MODULE_PARM_DESC(hws_max_conc_proc,
        "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no 
concurrency, #VMIDs for KFD = Maximum(default))");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 339e12c94cff..39073f72fe5f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -483,15 +483,18 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
        }
/* Verify module parameters regarding mapped process number*/
-       if ((hws_max_conc_proc < 0)
-                       || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
-               dev_err(kfd_device,
-                       "hws_max_conc_proc %d must be between 0 and %d, use %d 
instead\n",
-                       hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
-                       kfd->vm_info.vmid_num_kfd);
-               kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
-       } else
-               kfd->max_proc_per_quantum = hws_max_conc_proc;
+       kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
+       if (hws_max_conc_proc != -1) {
+               if ((hws_max_conc_proc > 0)
+                               && (hws_max_conc_proc < 
kfd->vm_info.vmid_num_kfd)) {

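I think this should be <= kfd->vm_info.vmid_num_kfd. Otherwise setting hws_max_conc_proc to exactly the number of VMIDs falls into the else branch and triggers the error message.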


+                       kfd->max_proc_per_quantum = hws_max_conc_proc;
+               } else {
+                                       dev_err(kfd_device,
+                               "hws_max_conc_proc %d must be between 0 and %d, use 
%d instead\n",
+                               hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
+                               kfd->vm_info.vmid_num_kfd);

I think this error message is the wrong approach. hws_max_conc_proc is a global setting that affects all GPUs. Different GPUs may have different numbers of VMIDs. So we can't treat (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd) as an error. It may be an error on one GPU but perfectly fine on another.

I think you can simplify this if-else like this and get rid of the dev_err:

    kfd->max_proc_per_quantum = min(hws_max_conc_proc, kfd->vm_info.vmid_num_kfd);

Regards,
  Felix


+               }
+       }
/* calculate max size of mqds needed for queues */
        size = max_num_of_queues_per_device *

Reply via email to