Re: [PATCH] drm/amdkfd: restore_process_worker race with GPU reset

2024-08-30 Thread Felix Kuehling


On 2024-08-29 18:16, Philip Yang wrote:
> 
> On 2024-08-29 17:15, Felix Kuehling wrote:
>> On 2024-08-23 15:49, Philip Yang wrote:
>>> If a GPU reset kicks in while the KFD restore_process_worker is
>>> running, this may cause different issues, for example the rcu stall
>>> warning below, because the restore work may move BOs and evict queues
>>> under VRAM pressure.
>>>
>>> Fix this race by taking the adev reset_domain read semaphore to
>>> prevent GPU reset in restore_process_worker; the reset read semaphore
>>> can be taken recursively if the adev has multiple partitions.
>>>
>>> Then there is a livelock issue if the CP hangs while
>>> restore_process_worker runs: GPU reset waits for the semaphore to
>>> start, and restore_process_worker cannot finish to release the
>>> semaphore. We need to signal the eviction fence to break the livelock
>>> if evicting queues returns -ETIMEDOUT (for the MES path) or -ETIME
>>> (for the HWS path) because the CP hangs,
>>>
>>>   amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded!
>>>   rcu: INFO: rcu_sched self-detected stall on CPU
>>>
>>>   Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
>>>   Call Trace:
>>>    update_process_times+0x94/0xd0
>>>   RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
>>>    amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
>>>    restore_process_helper+0x27/0x80 [amdgpu]
>>>
>>> Signed-off-by: Philip Yang 
>>
>> See comments inline. I'd also like Christian to take a look at this patch 
>> since he's the expert on the reset locking stuff.
>>
>>
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++-
>>>   1 file changed, 55 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> index a902950cc060..53a814347522 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>>> @@ -35,6 +35,7 @@
>>>   #include 
>>>   #include "amdgpu_amdkfd.h"
>>>   #include "amdgpu.h"
>>> +#include "amdgpu_reset.h"
>>>     struct mm_struct;
>>>   @@ -1972,8 +1973,14 @@ static void evict_process_worker(struct 
>>> work_struct *work)
>>>   kfd_process_restore_queues(p);
>>>     pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
>>> -    } else
>>> +    } else if (ret == -ETIMEDOUT || ret == -ETIME) {
>>> +    /* If CP hangs, signal the eviction fence, then restore_bo_worker
>>> + * can finish to up_read GPU reset semaphore to start GPU reset.
>>> + */
>>> +    signal_eviction_fence(p);
>>> +    } else {
>>>   pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
>>> +    }
>>>   }
>>>     static int restore_process_helper(struct kfd_process *p)
>>> @@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process 
>>> *p)
>>>   return ret;
>>>   }
>>>   +/*
>>> + * kfd_hold_devices_reset_semaphore
>>> + *
>>> + * return:
>>> + *   true : hold reset domain semaphore to prevent device reset
>>> + *   false: one of the devices is resetting or already reset
>>> + *
>>> + */
>>> +static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)
>>
>> I find the function naming of these functions (hold/unhold) a bit weird. I'd 
>> suggest kfd_process_trylock_reset_sems/kfd_process_unlock_reset_sems.
> ok
>>
>>
>>> +{
>>> +    struct amdgpu_device *adev;
>>> +    int i;
>>> +
>>> +    for (i = 0; i < p->n_pdds; i++) {
>>> +    adev = p->pdds[i]->dev->adev;
>>> +    if (!down_read_trylock(&adev->reset_domain->sem))
>>> +    goto out_upread;
>>> +    }
>>> +    return true;
>>> +
>>> +out_upread:
>>> +    while (i--) {
>>> +    adev = p->pdds[i]->dev->adev;
>>> +    up_read(&adev->reset_domain->sem);
>>> +    }
>>> +    return false;
>>> +}
>>> +
>>> +static void kfd_unhold_devices_reset_semaphore(struct kfd_process *p)
>>> +{
>>> +    struct amdgpu_device *adev;
>>> +    int i;
>>> +
>>> +    for (i = 0; i < p->n_pdds; i++) {

Re: [PATCH] drm/amdkfd: restore_process_worker race with GPU reset

2024-08-29 Thread Felix Kuehling

On 2024-08-23 15:49, Philip Yang wrote:

If a GPU reset kicks in while the KFD restore_process_worker is running,
this may cause different issues, for example the rcu stall warning below,
because the restore work may move BOs and evict queues under VRAM pressure.

Fix this race by taking the adev reset_domain read semaphore to prevent
GPU reset in restore_process_worker; the reset read semaphore can be
taken recursively if the adev has multiple partitions.

Then there is a livelock issue if the CP hangs while
restore_process_worker runs: GPU reset waits for the semaphore to start,
and restore_process_worker cannot finish to release the semaphore. We
need to signal the eviction fence to break the livelock if evicting
queues returns -ETIMEDOUT (for the MES path) or -ETIME (for the HWS
path) because the CP hangs,

  amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded!
  rcu: INFO: rcu_sched self-detected stall on CPU

  Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
  Call Trace:
   update_process_times+0x94/0xd0
  RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
   amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
   restore_process_helper+0x27/0x80 [amdgpu]

Signed-off-by: Philip Yang 


See comments inline. I'd also like Christian to take a look at this 
patch since he's the expert on the reset locking stuff.




---
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++-
  1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..53a814347522 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -35,6 +35,7 @@
  #include 
  #include "amdgpu_amdkfd.h"
  #include "amdgpu.h"
+#include "amdgpu_reset.h"
  
  struct mm_struct;
  
@@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work)

kfd_process_restore_queues(p);
  
  		pr_debug("Finished evicting pasid 0x%x\n", p->pasid);

-   } else
+   } else if (ret == -ETIMEDOUT || ret == -ETIME) {
+   /* If CP hangs, signal the eviction fence, then 
restore_bo_worker
+* can finish to up_read GPU reset semaphore to start GPU reset.
+*/
+   signal_eviction_fence(p);
+   } else {
pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
+   }
  }
  
  static int restore_process_helper(struct kfd_process *p)

@@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p)
return ret;
  }
  
+/*

+ * kfd_hold_devices_reset_semaphore
+ *
+ * return:
+ *   true : hold reset domain semaphore to prevent device reset
+ *   false: one of the devices is resetting or already reset
+ *
+ */
+static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)


I find the function naming of these functions (hold/unhold) a bit weird. 
I'd suggest kfd_process_trylock_reset_sems/kfd_process_unlock_reset_sems.




+{
+   struct amdgpu_device *adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   adev = p->pdds[i]->dev->adev;
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   goto out_upread;
+   }
+   return true;
+
+out_upread:
+   while (i--) {
+   adev = p->pdds[i]->dev->adev;
+   up_read(&adev->reset_domain->sem);
+   }
+   return false;
+}
+
+static void kfd_unhold_devices_reset_semaphore(struct kfd_process *p)
+{
+   struct amdgpu_device *adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   adev = p->pdds[i]->dev->adev;
+   up_read(&adev->reset_domain->sem);
+   }
+}
+
  static void restore_process_worker(struct work_struct *work)
  {
struct delayed_work *dwork;
@@ -2009,6 +2055,12 @@ static void restore_process_worker(struct work_struct 
*work)
 * lifetime of this thread, kfd_process p will be valid
 */
p = container_of(dwork, struct kfd_process, restore_work);
+
+   if (!kfd_hold_devices_reset_semaphore(p)) {
+   pr_debug("GPU resetting, restore bo and queue skipped\n");


Should we reschedule the restore worker to make sure it runs again after 
the reset is done?


Thanks,
  Felix



+   return;
+   }
+
pr_debug("Started restoring pasid 0x%x\n", p->pasid);
  
  	/* Setting last_restore_timestamp before successful restoration.

@@ -2031,6 +2083,8 @@ static void restore_process_worker(struct work_struct 
*work)
 msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
kfd_process_restore_queues(p);
}
+
+   kfd_unhold_devices_reset_semaphore(p);
  }
  
  void kfd_suspend_all_processes(void)
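
For reference, a minimal sketch of the rescheduling Felix asks about
above, reusing the kfd_restore_wq workqueue and PROCESS_BACK_OFF_TIME_MS
delay that already exist in kfd_process.c; the delay value and exact
placement are assumptions, not part of the posted patch:

	if (!kfd_hold_devices_reset_semaphore(p)) {
		pr_debug("GPU resetting, reschedule restore work\n");
		/* try again once the reset has presumably completed */
		queue_delayed_work(kfd_restore_wq, &p->restore_work,
				   msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
		return;
	}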


Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW module parameter

2024-08-28 Thread Felix Kuehling



On 2024-08-28 17:38, Chen, Xiaogang wrote:



On 8/28/2024 4:05 PM, Felix Kuehling wrote:


On 2024-08-28 16:34, Chen, Xiaogang wrote:



On 8/28/2024 3:26 PM, Errabolu, Ramesh wrote:


Responses inline

Regards,

Ramesh

*From:*Chen, Xiaogang 
*Sent:* Wednesday, August 28, 2024 3:01 PM
*To:* Errabolu, Ramesh ; 
amd-gfx@lists.freedesktop.org
*Subject:* Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW 
module parameter


On 8/28/2024 2:52 PM, Errabolu, Ramesh wrote:

    Response inline

    Regards,

    Ramesh


    -Original Message-

    From: Chen, Xiaogang 
<mailto:xiaogang.c...@amd.com>

    Sent: Wednesday, August 28, 2024 2:43 PM

    To: Errabolu, Ramesh 
<mailto:ramesh.errab...@amd.com>;amd-gfx@lists.freedesktop.org


    Subject: Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW 
module parameter


    Why do we need this driver parameter? KFD has the
KFD_IOCTL_SVM_ATTR_GRANULARITY API that allows user space to set the
migration granularity per prange. If both are set, which one takes
precedence?


    Ramesh: Use of the KFD ioctl is available to users of registered
memory. It allows users to control GOBM at a per-buffer level, including
overriding the default value. For ranges that do not specify GOBM, the
default value is used.


If user space uses KFD_IOCTL_SVM_ATTR_GRANULARITY, it will override
this parameter value for a prange; then how do we know which
granularity takes effect? That is decided by when the user sets this
parameter and when the API gets used.


Ramesh: The value bound by the KFD ioctl will take effect. In the life
cycle of a prange it can go from the default value to one that is set
by the user via a set_attr() call. However, it is generally understood
that users of set_attr() will not call it directly, i.e. they rely on
higher-level APIs from ROCr or HIP.


A driver parameter can be set at run time, not only at boot time. It
is not predictable when the user sets this driver parameter and when
the API gets called.


I don't think this is a problem. The module parameter determines the 
granularity if the application doesn't set the virtual address range 
attribute. The default is captured in the per-process svms structure. 
So all mappings of the same process will use the same default, even 
if the module parameter is changed after the process is started. The 
get_attr ioctl will always return the actual granularity, no matter 
whether it comes from the default or was overridden by user mode for 
the virtual address range.


My concern is that there are two ways to set a prange's granularity, and
both can be used at run time. It can be confusing to know which one
takes effect, as the user can use the driver parameter and the API to
change the granularity with any timing.


But it's not "any timing". For the module parameter to take effect, it 
has to be set _before_ the process starts. Any changes to the module 
parameter after the process starts will not take effect.
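
The behavior described here amounts to sampling the parameter once per
process, roughly like the sketch below (field and symbol names follow the
renames suggested elsewhere in this review and are assumptions, not the
actual patch):

/* Called once when the process's SVM list is initialized; later
 * writes to the module parameter do not affect this process. */
static void svm_range_set_default_granularity(struct svm_range_list *svms)
{
	/* clamp to the documented maximum of 0x3F */
	svms->default_granularity =
		min_t(uint8_t, amdgpu_svm_default_granularity, 0x3F);
}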


Regards,
  Felix



Regards

Xiaogang




    On 8/26/2024 2:34 PM, Ramesh Errabolu wrote:


    Enables users to update the default size of buffer used in migration
    either from Sysmem to VRAM or vice versa.

    The param GOBM refers to granularity of buffer migration, and is
    specified in terms of log(numPages(buffer)). It facilitates users of
    unregistered memory to control GOBM, albeit at a coarse level

    Signed-off-by: Ramesh Errabolu 
<mailto:ramesh.errab...@amd.com>


    ---

   drivers/gpu/drm/amd/amdgpu/amdgpu.h |  4 
   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 +
   drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 12 
   drivers/gpu/drm/amd/amdkfd/kfd_svm.c    | 26 -
   4 files changed, 51 insertions(+), 9 deletions(-)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
    b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
    index e8c284aea1f2..73dd816b01f2 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
    @@ -237,6 +237,7 @@ extern int sched_policy;
       extern bool debug_evictions;
       extern bool no_system_mem_limit;
       extern int halt_if_hws_hang;
    +extern uint amdgpu_svm_attr_gobm;
       #else
       static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
       static const bool __maybe_unused debug_evictions; /* = false */
    @@ -313,6 +314,9 @@ extern int amdgpu_wbrf;
       /* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */
       #define AMDGPU_SWCTF_EXTRA_DELAY   50
    +/* D

Re: [PATCH 2/3] drm/amdgpu: sync to KFD fences before clearing PTEs

2024-08-28 Thread Felix Kuehling



On 2024-08-22 05:07, Christian König wrote:

Am 21.08.24 um 22:01 schrieb Felix Kuehling:

On 2024-08-21 08:03, Christian König wrote:

This patch tries to solve the basic problem that we also need to sync to
the KFD fences of the BO, because otherwise it can happen that we clear
PTEs while the KFD queues are still running.


This is going to trigger a lot of phantom KFD evictions and will tank 
performance. It's probably not what you intended.


I tried to avoid that by waiting for the KFD fence only in the
particular situation where we can't lock the cleared BO because of
contention.


OK. It's hard to make out where you're adding that call with the small 
context in the patch. As far as I can tell it's in the "if (clear || 
!bo)" branch. The "clear" case is as you mention, only used when the BO 
cannot be locked. The !bo case is PRT?


Contention would happen if this runs concurrently with a 
restore-from-eviction, in which case we're already on a slow path and 
another eviction doesn't matter (as long as we're not getting into a 
live-lock situation). Or if a KFD BO is in the middle of being mapped or 
unmapped by another thread, which should be unlikely. So maybe this 
won't have a huge impact in practice. It's worth a try.


The patch is

Acked-by: Felix Kuehling 




The only short term alternative I can see is to lock all BOs during CS 
and that is a) a really large rework and b) will most likely hurt 
performance.


Then there is the alternative to lock the VM during BO eviction, but 
that means we need to wait on using the drm_exec object inside TTM as 
well. So that won't get this fixed in the next halve year or so.


Regards,
Christian.



Regards,
  Felix




Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 30 


  drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   |  6 +
  3 files changed, 37 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c

index bdf1ef825d89..c586ab4c911b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -260,6 +260,36 @@ int amdgpu_sync_resv(struct amdgpu_device 
*adev, struct amdgpu_sync *sync,

  return 0;
  }
  +/**
+ * amdgpu_sync_kfd - sync to KFD fences
+ *
+ * @sync: sync object to add KFD fences to
+ * @resv: reservation object with KFD fences
+ *
+ * Extract all KFD fences and add them to the sync object.
+ */
+int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv)
+{
+    struct dma_resv_iter cursor;
+    struct dma_fence *f;
+    int r = 0;
+
+    dma_resv_iter_begin(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP);
+    dma_resv_for_each_fence_unlocked(&cursor, f) {
+    void *fence_owner = amdgpu_sync_get_owner(f);
+
+    if (fence_owner != AMDGPU_FENCE_OWNER_KFD)
+    continue;
+
+    r = amdgpu_sync_fence(sync, f);
+    if (r)
+    break;
+    }
+    dma_resv_iter_end(&cursor);
+
+    return r;
+}
+
  /* Free the entry back to the slab */
  static void amdgpu_sync_entry_free(struct amdgpu_sync_entry *e)
  {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h

index cf1e9e858efd..e3272dce798d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
@@ -51,6 +51,7 @@ int amdgpu_sync_fence(struct amdgpu_sync *sync, 
struct dma_fence *f);
  int amdgpu_sync_resv(struct amdgpu_device *adev, struct 
amdgpu_sync *sync,

   struct dma_resv *resv, enum amdgpu_sync_mode mode,
   void *owner);
+int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv);
  struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
   struct amdgpu_ring *ring);
  struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index ba99d428610a..13d429b91327 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1168,6 +1168,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
*adev, struct amdgpu_bo_va *bo_va,

   AMDGPU_SYNC_EQ_OWNER, vm);
  if (r)
  goto error_free;
+    if (bo) {
+    r = amdgpu_sync_kfd(&sync, bo->tbo.base.resv);
+    if (r)
+    goto error_free;
+    }
+
  } else {
  struct drm_gem_object *obj = &bo->tbo.base;




Re: [PATCH 1/3] drm/amdgpu: re-work VM syncing

2024-08-28 Thread Felix Kuehling



On 2024-08-22 03:28, Friedrich Vock wrote:

On 21.08.24 22:46, Felix Kuehling wrote:


On 2024-08-21 08:03, Christian König wrote:

Rework how VM operations synchronize to submissions. Provide an
amdgpu_sync container to the backends instead of a reservation
object, and fill in the amdgpu_sync object in the higher layers
of the code.

No intended functional change, just prepares for upcoming changes.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 84 
+

  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  | 11 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  |  7 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 16 +---


There are two calls to amdgpu_vm_update_range in amdkfd/kfd_svm.c 
that would need to be updated as well.


I don't think any change should be needed there? Both calls pass NULL 
for the resv.


Right, sorry, the change to the function signature looked bigger than it 
was due to formatting changes. The patch is


Acked-by: Felix Kuehling 
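
For reference, the reworked calling convention boils down to the pattern
below (a sketch assembled from the hunks in this series, with error
handling abbreviated):

	struct amdgpu_sync sync;
	int r;

	amdgpu_sync_create(&sync);
	/* the higher layer fills the container, e.g. implicit sync to
	 * command submissions in the same VM before unmapping */
	r = amdgpu_sync_resv(adev, &sync, vm->root.bo->tbo.base.resv,
			     AMDGPU_SYNC_EQ_OWNER, vm);
	if (!r)
		r = vm->update_funcs->prepare(&params, &sync);
	amdgpu_sync_free(&sync);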


All this patch changes is that we're now passing NULL for the 
amdgpu_sync - but the behavior with a NULL amdgpu_sync with this patch 
is the same as with a NULL dma_resv without this patch, so nothing 
needs to change.


Regards,
Friedrich



Regards,
   Felix



  5 files changed, 65 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index bcb729094521..ba99d428610a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -838,7 +838,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device 
*adev,

  params.vm = vm;
  params.immediate = immediate;
-    r = vm->update_funcs->prepare(&params, NULL, AMDGPU_SYNC_EXPLICIT);

+    r = vm->update_funcs->prepare(&params, NULL);
  if (r)
  goto error;
@@ -933,7 +933,7 @@ amdgpu_vm_tlb_flush(struct 
amdgpu_vm_update_params *params,

   * @unlocked: unlocked invalidation during MM callback
   * @flush_tlb: trigger tlb invalidation after update completed
   * @allow_override: change MTYPE for local NUMA nodes
- * @resv: fences we need to sync to
+ * @sync: fences we need to sync to
   * @start: start of mapped range
   * @last: last mapped entry
   * @flags: flags for the entries
@@ -949,16 +949,16 @@ amdgpu_vm_tlb_flush(struct 
amdgpu_vm_update_params *params,

   * 0 for success, negative erro code for failure.
   */
  int amdgpu_vm_update_range(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
-   bool immediate, bool unlocked, bool flush_tlb, bool 
allow_override,

-   struct dma_resv *resv, uint64_t start, uint64_t last,
-   uint64_t flags, uint64_t offset, uint64_t vram_base,
+   bool immediate, bool unlocked, bool flush_tlb,
+   bool allow_override, struct amdgpu_sync *sync,
+   uint64_t start, uint64_t last, uint64_t flags,
+   uint64_t offset, uint64_t vram_base,
 struct ttm_resource *res, dma_addr_t *pages_addr,
 struct dma_fence **fence)
  {
  struct amdgpu_vm_tlb_seq_struct *tlb_cb;
  struct amdgpu_vm_update_params params;
  struct amdgpu_res_cursor cursor;
-    enum amdgpu_sync_mode sync_mode;
  int r, idx;
  if (!drm_dev_enter(adev_to_drm(adev), &idx))
@@ -991,14 +991,6 @@ int amdgpu_vm_update_range(struct amdgpu_device 
*adev, struct amdgpu_vm *vm,

  params.allow_override = allow_override;
  INIT_LIST_HEAD(¶ms.tlb_flush_waitlist);
-    /* Implicitly sync to command submissions in the same VM before
- * unmapping. Sync to moving fences before mapping.
- */
-    if (!(flags & AMDGPU_PTE_VALID))
-    sync_mode = AMDGPU_SYNC_EQ_OWNER;
-    else
-    sync_mode = AMDGPU_SYNC_EXPLICIT;
-
  amdgpu_vm_eviction_lock(vm);
  if (vm->evicting) {
  r = -EBUSY;
@@ -1013,7 +1005,7 @@ int amdgpu_vm_update_range(struct 
amdgpu_device *adev, struct amdgpu_vm *vm,

  dma_fence_put(tmp);
  }
-    r = vm->update_funcs->prepare(&params, resv, sync_mode);
+    r = vm->update_funcs->prepare(&params, sync);
  if (r)
  goto error_free;
@@ -1155,23 +1147,30 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
*adev, struct amdgpu_bo_va *bo_va,

  struct amdgpu_bo *bo = bo_va->base.bo;
  struct amdgpu_vm *vm = bo_va->base.vm;
  struct amdgpu_bo_va_mapping *mapping;
+    struct dma_fence **last_update;
  dma_addr_t *pages_addr = NULL;
  struct ttm_resource *mem;
-    struct dma_fence **last_update;
+    struct amdgpu_sync sync;
  bool flush_tlb = clear;
-    bool uncached;
-    struct dma_resv *resv;
  uint64_t vram_base;
  uint64_t flags;
+    bool uncached;
  int r;
+    amdgpu_sync_create(&sync);
  if (clear || !bo) {
  mem = NULL;
-    resv = vm->root.bo->tbo.base.resv;
+
+    /* 

Re: [PATCH] drm/amdkfd: restore_process_worker race with GPU reset

2024-08-28 Thread Felix Kuehling



On 2024-08-23 15:49, Philip Yang wrote:

If a GPU reset kicks in while the KFD restore_process_worker is running,
this may cause different issues, for example the rcu stall warning below,
because the restore work may move BOs and evict queues under VRAM pressure.

Fix this race by taking the adev reset_domain read semaphore to prevent
GPU reset in restore_process_worker; the reset read semaphore can be
taken recursively if the adev has multiple partitions.


Are you sure that an rw_sem can be read-locked recursively in the same
thread? I can't find any evidence that this is true.


Regards,
  Felix




Then there is a livelock issue if the CP hangs while
restore_process_worker runs: GPU reset waits for the semaphore to start,
and restore_process_worker cannot finish to release the semaphore. We
need to signal the eviction fence to break the livelock if evicting
queues returns -ETIMEDOUT (for the MES path) or -ETIME (for the HWS
path) because the CP hangs,

  amdgpu :af:00.0: amdgpu: GPU reset(21) succeeded!
  rcu: INFO: rcu_sched self-detected stall on CPU

  Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
  Call Trace:
   update_process_times+0x94/0xd0
  RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
   amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
   restore_process_helper+0x27/0x80 [amdgpu]

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++-
  1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..53a814347522 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -35,6 +35,7 @@
  #include 
  #include "amdgpu_amdkfd.h"
  #include "amdgpu.h"
+#include "amdgpu_reset.h"
  
  struct mm_struct;
  
@@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work)

kfd_process_restore_queues(p);
  
  		pr_debug("Finished evicting pasid 0x%x\n", p->pasid);

-   } else
+   } else if (ret == -ETIMEDOUT || ret == -ETIME) {
+   /* If CP hangs, signal the eviction fence, then 
restore_bo_worker
+* can finish to up_read GPU reset semaphore to start GPU reset.
+*/
+   signal_eviction_fence(p);
+   } else {
pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
+   }
  }
  
  static int restore_process_helper(struct kfd_process *p)

@@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p)
return ret;
  }
  
+/*

+ * kfd_hold_devices_reset_semaphore
+ *
+ * return:
+ *   true : hold reset domain semaphore to prevent device reset
+ *   false: one of the devices is resetting or already reset
+ *
+ */
+static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)
+{
+   struct amdgpu_device *adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   adev = p->pdds[i]->dev->adev;
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   goto out_upread;
+   }
+   return true;
+
+out_upread:
+   while (i--) {
+   adev = p->pdds[i]->dev->adev;
+   up_read(&adev->reset_domain->sem);
+   }
+   return false;
+}
+
+static void kfd_unhold_devices_reset_semaphore(struct kfd_process *p)
+{
+   struct amdgpu_device *adev;
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   adev = p->pdds[i]->dev->adev;
+   up_read(&adev->reset_domain->sem);
+   }
+}
+
  static void restore_process_worker(struct work_struct *work)
  {
struct delayed_work *dwork;
@@ -2009,6 +2055,12 @@ static void restore_process_worker(struct work_struct 
*work)
 * lifetime of this thread, kfd_process p will be valid
 */
p = container_of(dwork, struct kfd_process, restore_work);
+
+   if (!kfd_hold_devices_reset_semaphore(p)) {
+   pr_debug("GPU resetting, restore bo and queue skipped\n");
+   return;
+   }
+
pr_debug("Started restoring pasid 0x%x\n", p->pasid);
  
  	/* Setting last_restore_timestamp before successful restoration.

@@ -2031,6 +2083,8 @@ static void restore_process_worker(struct work_struct 
*work)
 msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
kfd_process_restore_queues(p);
}
+
+   kfd_unhold_devices_reset_semaphore(p);
  }
  
  void kfd_suspend_all_processes(void)
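
On the recursion question above: one way to sidestep it entirely is to
take each distinct reset domain only once, since partitions of one device
share a single reset domain. A rough sketch, using the function name
suggested in the review (illustration only, not the posted patch):

static bool kfd_process_trylock_reset_sems(struct kfd_process *p)
{
	struct amdgpu_reset_domain *domain;
	int i, j;

	for (i = 0; i < p->n_pdds; i++) {
		domain = p->pdds[i]->dev->adev->reset_domain;
		/* skip domains that are already read-locked */
		for (j = 0; j < i; j++)
			if (p->pdds[j]->dev->adev->reset_domain == domain)
				break;
		if (j < i)
			continue;
		if (!down_read_trylock(&domain->sem))
			goto unwind;
	}
	return true;

unwind:
	while (i--) {
		domain = p->pdds[i]->dev->adev->reset_domain;
		for (j = 0; j < i; j++)
			if (p->pdds[j]->dev->adev->reset_domain == domain)
				break;
		if (j == i)	/* first occurrence holds the lock */
			up_read(&domain->sem);
	}
	return false;
}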


Re: [PATCH] drm/amdgpu: revert "use CPU for page table update if SDMA is unavailable"

2024-08-28 Thread Felix Kuehling



On 2024-08-27 10:16, Christian König wrote:

That is clearly not something we should do upstream. The SDMA is
mandatory for the driver to work correctly.

We could do this for emulation and bringup, but in those cases the
engineer should probably enable CPU-based updates manually.

This reverts commit 23335f9577e0b509c20ad8d65d9fdedd14545b55.

Signed-off-by: Christian König 


Acked-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 --
  1 file changed, 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3464a7a880f0..f0ccc560fd5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2392,7 +2392,6 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
   int32_t xcp_id)
  {
-   struct amdgpu_ip_block *ip_block;
struct amdgpu_bo *root_bo;
struct amdgpu_bo_vm *root;
int r, i;
@@ -2422,11 +2421,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
AMDGPU_VM_USE_CPU_FOR_GFX);
  
-	/* use CPU for page table update if SDMA is unavailable */

-   ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_SDMA);
-   if (!ip_block || ip_block->status.valid == false)
-   vm->use_cpu_for_update = true;
-
DRM_DEBUG_DRIVER("VM update mode is %s\n",
 vm->use_cpu_for_update ? "CPU" : "SDMA");
WARN_ONCE((vm->use_cpu_for_update &&


Re: [PATCH] drm/amdkfd: fix missed queue reset on queue destroy

2024-08-28 Thread Felix Kuehling



On 2024-08-22 11:17, Jonathan Kim wrote:

If a queue is being destroyed but causes a HWS hang on removal, the KFD
may issue an unnecessary gpu reset if the destroyed queue can be fixed
by a queue reset.

This is because the queue has been removed from the KFD's queue list
prior to the preemption action on destroy so the reset call will fail to
match the HQD PQ reset information against the KFD's queue record to do
the actual reset.

To fix this, deactivate the queue prior to preemption, since it's being
destroyed anyway, and remove the queue from the KFD's queue list after
preemption.

v2: early deactivate queue and delete queue from list later as-per
description instead of destroy queue referencing hack.

Signed-off-by: Jonathan Kim 
---
  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 577d121cc6d1..6d5a632b95eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2407,10 +2407,10 @@ static int destroy_queue_cpsch(struct 
device_queue_manager *dqm,
pdd->sdma_past_activity_counter += sdma_val;
}
  
-	list_del(&q->list);

qpd->queue_count--;


You may need to move the queue_count update as well to keep things 
consistent. Please make sure this passes KFD queue tests on GPUs with 
HWS and MES.


Other than that, this patch is

Reviewed-by: Felix Kuehling 



if (q->properties.is_active) {
decrement_queue_count(dqm, qpd, q);
+   q->properties.is_active = false;
if (!dqm->dev->kfd->shared_resources.enable_mes) {
retval = execute_queues_cpsch(dqm,
  
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
@@ -2421,6 +2421,7 @@ static int destroy_queue_cpsch(struct 
device_queue_manager *dqm,
retval = remove_queue_mes(dqm, q, qpd);
}
}
+   list_del(&q->list);
  
  	/*

 * Unconditionally decrement this counter, regardless of the queue's
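
With the queue_count update moved as suggested, the destroy path would
look roughly like this (illustrative ordering only, not the final patch):

	if (q->properties.is_active) {
		decrement_queue_count(dqm, qpd, q);
		q->properties.is_active = false;
		/* preempt via execute_queues_cpsch() or remove_queue_mes() */
	}
	/* remove the queue from KFD's records only after preemption,
	 * keeping the counter update next to the list removal */
	list_del(&q->list);
	qpd->queue_count--;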


Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW module parameter

2024-08-28 Thread Felix Kuehling



On 2024-08-28 16:34, Chen, Xiaogang wrote:



On 8/28/2024 3:26 PM, Errabolu, Ramesh wrote:


Responses inline

Regards,

Ramesh

*From:*Chen, Xiaogang 
*Sent:* Wednesday, August 28, 2024 3:01 PM
*To:* Errabolu, Ramesh ; 
amd-gfx@lists.freedesktop.org
*Subject:* Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW 
module parameter


On 8/28/2024 2:52 PM, Errabolu, Ramesh wrote:

Response inline

Regards,

Ramesh

  


-Original Message-

From: Chen, Xiaogang    


Sent: Wednesday, August 28, 2024 2:43 PM

To: Errabolu, Ramesh  
;amd-gfx@lists.freedesktop.org

Subject: Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW module 
parameter

Why do we need this driver parameter? KFD has the KFD_IOCTL_SVM_ATTR_GRANULARITY
API that allows user space to set the migration granularity per prange. If both
are set, which one takes precedence?

Ramesh: Use of the KFD ioctl is available to users of registered memory. It
allows users to control GOBM at a per-buffer level, including overriding the
default value. For ranges that do not specify GOBM, the default value is used.

If user space uses KFD_IOCTL_SVM_ATTR_GRANULARITY, it will override
this parameter value for a prange; then how do we know which granularity
takes effect? That is decided by when the user sets this parameter and
when the API gets used.


Ramesh: The value bound by the KFD ioctl will take effect. In the life
cycle of a prange it can go from the default value to one that is set
by the user via a set_attr() call. However, it is generally understood
that users of set_attr() will not call it directly, i.e. they rely on
higher-level APIs from ROCr or HIP.


A driver parameter can be set at run time, not only at boot time. It is
not predictable when the user sets this driver parameter and when the API
gets called.


I don't think this is a problem. The module parameter determines the 
granularity if the application doesn't set the virtual address range 
attribute. The default is captured in the per-process svms structure. So 
all mappings of the same process will use the same default, even if the 
module parameter is changed after the process is started. The get_attr 
ioctl will always return the actual granularity, no matter whether it 
comes from the default or was overridden by user mode for the virtual 
address range.


Regards,
  Felix



Regards

Xiaogang



On 8/26/2024 2:34 PM, Ramesh Errabolu wrote:


Enables users to update the default size of buffer used in migration

either from Sysmem to VRAM or vice versa.

The param GOBM refers to granularity of buffer migration, and is

specified in terms of log(numPages(buffer)). It facilitates users of

unregistered memory to control GOBM, albeit at a coarse level

Signed-off-by: Ramesh Errabolu  


---

   drivers/gpu/drm/amd/amdgpu/amdgpu.h |  4 
   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 +
   drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 12 
   drivers/gpu/drm/amd/amdkfd/kfd_svm.c    | 26 -
   4 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e8c284aea1f2..73dd816b01f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -237,6 +237,7 @@ extern int sched_policy;
   extern bool debug_evictions;
   extern bool no_system_mem_limit;
   extern int halt_if_hws_hang;
+extern uint amdgpu_svm_attr_gobm;
   #else
   static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
   static const bool __maybe_unused debug_evictions; /* = false */
@@ -313,6 +314,9 @@ extern int amdgpu_wbrf;
   /* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */
   #define AMDGPU_SWCTF_EXTRA_DELAY   50
+/* Default size of buffer to use in migrating buffer */
+#define AMDGPU_SVM_ATTR_GOBM   9
+
   struct amdgpu_xcp_mgr;
   struct amdgpu_device;
   struct amdgpu_irq_src;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b9529948f2b2..09c501753a3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -169,6 +169,17 @@ uint amdgpu_sdma_phase_quantum = 32;
   char *amdgpu_disable_cu;
   char *amdgpu_virtual_display;


Re: [PATCH v2] drm/amdgpu: Surface svm_attr_gobm, a RW module parameter

2024-08-28 Thread Felix Kuehling



On 2024-08-26 15:34, Ramesh Errabolu wrote:

Enables users to update the default size of buffer used
in migration either from Sysmem to VRAM or vice versa.
The param GOBM refers to granularity of buffer migration,
and is specified in terms of log(numPages(buffer)). It
facilitates users of unregistered memory to control GOBM,
albeit at a coarse level


Can we change the name of this to something more human-readable? I 
suggest something like svm_default_granularity.





Signed-off-by: Ramesh Errabolu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  4 
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 12 
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c| 26 -
  4 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e8c284aea1f2..73dd816b01f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -237,6 +237,7 @@ extern int sched_policy;
  extern bool debug_evictions;
  extern bool no_system_mem_limit;
  extern int halt_if_hws_hang;
+extern uint amdgpu_svm_attr_gobm;
  #else
  static const int __maybe_unused sched_policy = KFD_SCHED_POLICY_HWS;
  static const bool __maybe_unused debug_evictions; /* = false */
@@ -313,6 +314,9 @@ extern int amdgpu_wbrf;
  /* Extra time delay(in ms) to eliminate the influence of temperature 
momentary fluctuation */
  #define AMDGPU_SWCTF_EXTRA_DELAY  50
  
+/* Default size of buffer to use in migrating buffer */

+#define AMDGPU_SVM_ATTR_GOBM   9


I'd change this name, too: AMDGPU_SVM_DEFAULT_GRANULARITY.



+
  struct amdgpu_xcp_mgr;
  struct amdgpu_device;
  struct amdgpu_irq_src;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b9529948f2b2..09c501753a3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -169,6 +169,17 @@ uint amdgpu_sdma_phase_quantum = 32;
  char *amdgpu_disable_cu;
  char *amdgpu_virtual_display;
  bool enforce_isolation;
+
+/* Specifies the default size of buffer to use in
+ * migrating buffer from Sysmem to VRAM and vice
+ * versa
+ *
+ * GOBM - Granularity of Buffer Migration


This is really the granularity for page faults as well as migration. 
Hence the suggested name change. Also, GOBM is a new acronym. If you 
only define that in this comment, users won't know what it means. I'd 
remove this comment and instead include enough information in the 
PARM_DESC to let users know what this means. This information is visible 
to them in the output of the "modinfo" command.




+ *
+ * Defined as log2(sizeof(buffer)/PAGE_SIZE)
+ */
+uint amdgpu_svm_attr_gobm = AMDGPU_SVM_ATTR_GOBM;
+
  /*
   * OverDrive(bit 14) disabled by default
   * GFX DCS(bit 19) disabled by default
@@ -320,6 +331,13 @@ module_param_named(pcie_gen2, amdgpu_pcie_gen2, int, 0444);
  MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)");
  module_param_named(msi, amdgpu_msi, int, 0444);
  
+/**

+ * DOC: svm_attr_gobm (uint)
+ * Size of buffer to use in migrating buffer from Sysmem to VRAM and vice versa
+ */
+MODULE_PARM_DESC(svm_attr_gobm, "Defined as log2(sizeof(buffer)/PAGE_SIZE), e.g. 9 
for 2 MiB");


Suggested description: "Default SVM page fault granularity in 2^x pages, 
default 9 = 2MiB".




+module_param_named(svm_attr_gobm, amdgpu_svm_attr_gobm, uint, 0644);


All the other writable options use permissions 0600. I'd stay consistent 
with that.
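
Putting these suggestions together, the declaration could end up looking
like the sketch below (suggested rename, PARM_DESC wording, and 0600
permissions; not the final patch):

uint amdgpu_svm_default_granularity = 9;	/* log2(pages), 9 = 2 MiB */

MODULE_PARM_DESC(svm_default_granularity,
		 "Default SVM page fault granularity in 2^x pages, default 9 = 2MiB");
module_param_named(svm_default_granularity,
		   amdgpu_svm_default_granularity, uint, 0600);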




+
  /**
   * DOC: lockup_timeout (string)
   * Set GPU scheduler timeout value in ms.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9ae9abc6eb43..c2e54b18c167 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -868,6 +868,18 @@ struct svm_range_list {
struct task_struct  *faulting_task;
/* check point ts decides if page fault recovery need be dropped */
uint64_tcheckpoint_ts[MAX_GPU_INSTANCE];
+
+   /* Indicates the default size to use in migrating
+* buffers of a process from Sysmem to VRAM and vice
+* versa. The max legal value cannot be greater than
+* 0x3F
+*
+* @note: A side effect of this symbol being part of
+* struct svm_range_list is that it forces all buffers
+* of the process of unregistered kind to use the same
+* size in buffer migration
+*/
+   uint8_t attr_gobm;


Attr is not a good name for this. Attrs are the per-virtual-address-range
attributes. This is the default setting for just one of these attributes.
Instead of a long comment, just use a more descriptive name:


uint8_t default_granularity;



  };
  
  /* Process data */

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd

Re: [PATCH 2/2] drm/amdgpu/gfx9: put queue resets behind a debug option

2024-08-21 Thread Felix Kuehling



On 2024-08-20 16:25, Alex Deucher wrote:

Pending extended validation.

Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 4 
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 
  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c   | 6 ++
  3 files changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index c63528a4e8941..1254a43ec96b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -1151,6 +1151,10 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device 
*adev,
uint32_t low, high;
uint64_t queue_addr = 0;
  
+	if (!adev->debug_exp_resets &&

+   !adev->gfx.num_gfx_rings)
+   return 0;
+


Did you put this in the HW-specific code path intentionally? If you want 
this check to apply to all ASICs, you should put it into 
detect_queue_hang in kfd_device_queue_manager.c. But maybe the extended 
validation is HW-specific.


Either way, the patch is

Acked-by: Felix Kuehling 



kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

index 21089aadbb7b4..8cf5d7925b51c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -7233,6 +7233,10 @@ static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
unsigned long flags;
int i, r;
  
+	if (!adev->debug_exp_resets &&

+   !adev->gfx.num_gfx_rings)
+   return -EINVAL;
+
if (amdgpu_sriov_vf(adev))
return -EINVAL;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c

index 2067f26d3a9d8..f8649546b9c4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3052,6 +3052,9 @@ static void gfx_v9_4_3_ring_soft_recovery(struct 
amdgpu_ring *ring,
struct amdgpu_device *adev = ring->adev;
uint32_t value = 0;
  
+	if (!adev->debug_exp_resets)

+   return;
+
value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
@@ -3475,6 +3478,9 @@ static int gfx_v9_4_3_reset_kcq(struct amdgpu_ring *ring,
unsigned long flags;
int r, i;
  
+	if (!adev->debug_exp_resets)

+   return -EINVAL;
+
if (amdgpu_sriov_vf(adev))
return -EINVAL;
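
The ASIC-independent placement Felix mentions would gate the reset once
in the generic hang-detection path instead of in each per-ASIC function,
roughly like this (a sketch; the exact function and check are
assumptions):

	/* at the top of detect_queue_hang() in kfd_device_queue_manager.c */
	if (!dqm->dev->adev->debug_exp_resets &&
	    !dqm->dev->adev->gfx.num_gfx_rings)
		return false;	/* no per-queue reset; fall back to GPU reset */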
  


Re: [PATCH] drm/amdkfd: fix missed queue reset on queue destroy

2024-08-21 Thread Felix Kuehling



On 2024-08-21 17:17, Jonathan Kim wrote:

If a queue is being destroyed but causes a HWS hang on removal, the KFD
may issue an unnecessary gpu reset if the destroyed queue can be fixed
by a queue reset.

This is because the queue has been removed from the KFD's queue list
prior to the preemption action on destroy so the reset call will fail to
match the HQD PQ reset information against the KFD's queue record to do
the actual reset.

Since a queue destroy request is under the same device lock as any other
preemption request (which subsumes queue reset calls), transiently
store the destroyed queue's reference so that a potential subsequent queue
reset call can check against this queue as well.


Maybe this could be simplified by disabling the queue before destroying
it. That way the queue would still exist when it's being unmapped, and
you don't need to hack the special case "cur_destroyed_queue" into the
queue reset code.


Regards,
  Felix




Signed-off-by: Jonathan Kim 
---
  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 10 +-
  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
  2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 577d121cc6d1..09e39a72ca31 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1842,6 +1842,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
goto fail_detect_hang_buffer;
}
  
+	dqm->cur_destroyed_queue = NULL;

+
dqm_unlock(dqm);
  
  	return 0;

@@ -2105,7 +2107,7 @@ static void set_queue_as_reset(struct 
device_queue_manager *dqm, struct queue *q
q->properties.queue_id, q->process->pasid);
  
  	pdd->has_reset_queue = true;

-   if (q->properties.is_active) {
+   if (q->properties.is_active && dqm->cur_destroyed_queue != q) {
q->properties.is_active = false;
decrement_queue_count(dqm, qpd, q);
}
@@ -2160,6 +2162,10 @@ static struct queue *find_queue_by_address(struct 
device_queue_manager *dqm, uin
struct qcm_process_device *qpd;
struct queue *q;
  
+	if (dqm->cur_destroyed_queue &&

+   dqm->cur_destroyed_queue->properties.queue_address == queue_address)
+   return dqm->cur_destroyed_queue;
+
list_for_each_entry(cur, &dqm->queues, list) {
qpd = cur->qpd;
list_for_each_entry(q, &qpd->queues_list, list) {
@@ -2409,6 +2415,7 @@ static int destroy_queue_cpsch(struct 
device_queue_manager *dqm,
  
  	list_del(&q->list);

qpd->queue_count--;
+   dqm->cur_destroyed_queue = q;
if (q->properties.is_active) {
decrement_queue_count(dqm, qpd, q);
if (!dqm->dev->kfd->shared_resources.enable_mes) {
@@ -2421,6 +2428,7 @@ static int destroy_queue_cpsch(struct 
device_queue_manager *dqm,
retval = remove_queue_mes(dqm, q, qpd);
}
}
+   dqm->cur_destroyed_queue = NULL;
  
  	/*

 * Unconditionally decrement this counter, regardless of the queue's
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 08b40826ad1e..5425c1dd7924 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -285,6 +285,7 @@ struct device_queue_manager {
struct dqm_detect_hang_info *detect_hang_info;
size_t detect_hang_info_size;
int detect_hang_count;
+   struct queue *cur_destroyed_queue;
  };
  
  void device_queue_manager_init_cik(


Re: [PATCH 1/3] drm/amdgpu: re-work VM syncing

2024-08-21 Thread Felix Kuehling



On 2024-08-21 08:03, Christian König wrote:

Rework how VM operations synchronize to submissions. Provide an
amdgpu_sync container to the backends instead of a reservation
object, and fill in the amdgpu_sync object in the higher layers
of the code.

No intended functional change, just prepares for upcoming changes.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 84 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  | 11 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c  |  7 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 16 +---


There are two calls to amdgpu_vm_update_range in amdkfd/kfd_svm.c that 
would need to be updated as well.


Regards,
  Felix



  5 files changed, 65 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index bcb729094521..ba99d428610a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -838,7 +838,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
params.vm = vm;
params.immediate = immediate;
  
-	r = vm->update_funcs->prepare(&params, NULL, AMDGPU_SYNC_EXPLICIT);

+   r = vm->update_funcs->prepare(&params, NULL);
if (r)
goto error;
  
@@ -933,7 +933,7 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,

   * @unlocked: unlocked invalidation during MM callback
   * @flush_tlb: trigger tlb invalidation after update completed
   * @allow_override: change MTYPE for local NUMA nodes
- * @resv: fences we need to sync to
+ * @sync: fences we need to sync to
   * @start: start of mapped range
   * @last: last mapped entry
   * @flags: flags for the entries
@@ -949,16 +949,16 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params 
*params,
   * 0 for success, negative erro code for failure.
   */
  int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
-  bool immediate, bool unlocked, bool flush_tlb, bool 
allow_override,
-  struct dma_resv *resv, uint64_t start, uint64_t last,
-  uint64_t flags, uint64_t offset, uint64_t vram_base,
+  bool immediate, bool unlocked, bool flush_tlb,
+  bool allow_override, struct amdgpu_sync *sync,
+  uint64_t start, uint64_t last, uint64_t flags,
+  uint64_t offset, uint64_t vram_base,
   struct ttm_resource *res, dma_addr_t *pages_addr,
   struct dma_fence **fence)
  {
struct amdgpu_vm_tlb_seq_struct *tlb_cb;
struct amdgpu_vm_update_params params;
struct amdgpu_res_cursor cursor;
-   enum amdgpu_sync_mode sync_mode;
int r, idx;
  
  	if (!drm_dev_enter(adev_to_drm(adev), &idx))

@@ -991,14 +991,6 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
params.allow_override = allow_override;
INIT_LIST_HEAD(¶ms.tlb_flush_waitlist);
  
-	/* Implicitly sync to command submissions in the same VM before

-* unmapping. Sync to moving fences before mapping.
-*/
-   if (!(flags & AMDGPU_PTE_VALID))
-   sync_mode = AMDGPU_SYNC_EQ_OWNER;
-   else
-   sync_mode = AMDGPU_SYNC_EXPLICIT;
-
amdgpu_vm_eviction_lock(vm);
if (vm->evicting) {
r = -EBUSY;
@@ -1013,7 +1005,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
dma_fence_put(tmp);
}
  
-	r = vm->update_funcs->prepare(&params, resv, sync_mode);

+   r = vm->update_funcs->prepare(&params, sync);
if (r)
goto error_free;
  
@@ -1155,23 +1147,30 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,

struct amdgpu_bo *bo = bo_va->base.bo;
struct amdgpu_vm *vm = bo_va->base.vm;
struct amdgpu_bo_va_mapping *mapping;
+   struct dma_fence **last_update;
dma_addr_t *pages_addr = NULL;
struct ttm_resource *mem;
-   struct dma_fence **last_update;
+   struct amdgpu_sync sync;
bool flush_tlb = clear;
-   bool uncached;
-   struct dma_resv *resv;
uint64_t vram_base;
uint64_t flags;
+   bool uncached;
int r;
  
+	amdgpu_sync_create(&sync);

if (clear || !bo) {
mem = NULL;
-   resv = vm->root.bo->tbo.base.resv;
+
+   /* Implicitly sync to command submissions in the same VM before
+* unmapping.
+*/
+   r = amdgpu_sync_resv(adev, &sync, vm->root.bo->tbo.base.resv,
+AMDGPU_SYNC_EQ_OWNER, vm);
+   if (r)
+   goto error_free;
} else {
struct drm_gem_object *obj = &bo->tbo.base;
  
-	

Re: [PATCH 2/3] drm/amdgpu: sync to KFD fences before clearing PTEs

2024-08-21 Thread Felix Kuehling

On 2024-08-21 08:03, Christian König wrote:

This patch tries to solve the basic problem that we also need to sync to
the KFD fences of the BO, because otherwise it can happen that we clear
PTEs while the KFD queues are still running.


This is going to trigger a lot of phantom KFD evictions and will tank 
performance. It's probably not what you intended.


Regards,
  Felix




Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 30 
  drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   |  6 +
  3 files changed, 37 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index bdf1ef825d89..c586ab4c911b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -260,6 +260,36 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct 
amdgpu_sync *sync,
return 0;
  }
  
+/**

+ * amdgpu_sync_kfd - sync to KFD fences
+ *
+ * @sync: sync object to add KFD fences to
+ * @resv: reservation object with KFD fences
+ *
+ * Extract all KFD fences and add them to the sync object.
+ */
+int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv)
+{
+   struct dma_resv_iter cursor;
+   struct dma_fence *f;
+   int r = 0;
+
+   dma_resv_iter_begin(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP);
+   dma_resv_for_each_fence_unlocked(&cursor, f) {
+   void *fence_owner = amdgpu_sync_get_owner(f);
+
+   if (fence_owner != AMDGPU_FENCE_OWNER_KFD)
+   continue;
+
+   r = amdgpu_sync_fence(sync, f);
+   if (r)
+   break;
+   }
+   dma_resv_iter_end(&cursor);
+
+   return r;
+}
+
  /* Free the entry back to the slab */
  static void amdgpu_sync_entry_free(struct amdgpu_sync_entry *e)
  {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
index cf1e9e858efd..e3272dce798d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
@@ -51,6 +51,7 @@ int amdgpu_sync_fence(struct amdgpu_sync *sync, struct 
dma_fence *f);
  int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
 struct dma_resv *resv, enum amdgpu_sync_mode mode,
 void *owner);
+int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv);
  struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
 struct amdgpu_ring *ring);
  struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ba99d428610a..13d429b91327 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1168,6 +1168,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, 
struct amdgpu_bo_va *bo_va,
 AMDGPU_SYNC_EQ_OWNER, vm);
if (r)
goto error_free;
+   if (bo) {
+   r = amdgpu_sync_kfd(&sync, bo->tbo.base.resv);
+   if (r)
+   goto error_free;
+   }
+
} else {
struct drm_gem_object *obj = &bo->tbo.base;
  


Re: [PATCHv3 3/3] drm/amdkfd: Update BadOpcode Interrupt handling with MES

2024-08-16 Thread Felix Kuehling



On 2024-08-16 14:01, Mukul Joshi wrote:

Based on the recommendation of MEC FW, update BadOpcode interrupt
handling by unmapping all queues, removing the queue that got the
interrupt from scheduling, and remapping the rest of the queues when
using the MES scheduler. This is done to prevent the case where
unmapping the bad queue can fail, thereby causing a GPU reset.

Signed-off-by: Mukul Joshi 
Acked-by: Harish Kasiviswanathan 
Acked-by: Alex Deucher 


Reviewed-by: Felix Kuehling 



---
v1->v2:
- No change.

v2->v3:
- No change.

  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 51 +++
  .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |  9 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
  3 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 0ca933d2099c..d7db33f378e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2875,6 +2875,57 @@ void device_queue_manager_uninit(struct 
device_queue_manager *dqm)
kfree(dqm);
  }
  
+int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id)

+{
+   struct kfd_process_device *pdd;
+   struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+   struct device_queue_manager *dqm = knode->dqm;
+   struct device *dev = dqm->dev->adev->dev;
+   struct qcm_process_device *qpd;
+   struct queue *q = NULL;
+   int ret = 0;
+
+   if (!p)
+   return -EINVAL;
+
+   dqm_lock(dqm);
+
+   pdd = kfd_get_process_device_data(dqm->dev, p);
+   if (pdd) {
+   qpd = &pdd->qpd;
+
+   list_for_each_entry(q, &qpd->queues_list, list) {
+   if (q->doorbell_id == doorbell_id && 
q->properties.is_active) {
+   ret = suspend_all_queues_mes(dqm);
+   if (ret) {
+   dev_err(dev, "Suspending all queues 
failed");
+   goto out;
+   }
+
+   q->properties.is_evicted = true;
+   q->properties.is_active = false;
+   decrement_queue_count(dqm, qpd, q);
+
+   ret = remove_queue_mes(dqm, q, qpd);
+   if (ret) {
+   dev_err(dev, "Removing bad queue 
failed");
+   goto out;
+   }
+
+   ret = resume_all_queues_mes(dqm);
+   if (ret)
+   dev_err(dev, "Resuming all queues 
failed");
+
+   break;
+   }
+   }
+   }
+
+out:
+   dqm_unlock(dqm);
+   return ret;
+}
+
  static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
   struct qcm_process_device *qpd)
  {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index f524a55eee11..b3f988b275a8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -330,11 +330,14 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
-
KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)))
-   kfd_set_dbg_ev_from_interrupt(dev, pasid,
-   KFD_CTXID0_DOORBELL_ID(context_id0),
+
KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) {
+   u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0);
+
+   kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id,

KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
NULL, 0);
+   kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id);
+   }
  
  		/* SDMA */

else if (source_id == SOC21_INTSRC_SDMA_TRAP)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f7c12d4f0abb..7bba6bed2f48 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1324,6 +1324,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_node 
*dev,
enum kfd_queue_type type);
  void kernel_queu

Re: [PATCHv3 2/3] drm/amdkfd: Update queue unmap after VM fault with MES

2024-08-16 Thread Felix Kuehling



On 2024-08-16 14:01, Mukul Joshi wrote:

MEC FW expects MES to unmap all queues when a VM fault is observed
on a queue, and then to resume them once the affected process is terminated.
Use the MES Suspend and Resume APIs to achieve this.

Signed-off-by: Mukul Joshi 
Acked-by: Alex Deucher 


Reviewed-by: Felix Kuehling 



---
v1->v2:
- Add MES FW version check.
- Separate out the kfd_dqm_evict_pasid into another function.
- Use amdgpu_mes_suspend/amdgpu_mes_resume to suspend/resume queues.

v2->v3:
- Use down_read_trylock/up_read instead of dqm->is_hws_hang.
- Increase eviction count if the process is already evicted in
   kfd_dqm_evict_pasid_mes to make sure the process stays evicted.

  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 87 ++-
  1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f6e211070299..0ca933d2099c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -319,6 +319,46 @@ static int remove_all_queues_mes(struct 
device_queue_manager *dqm)
return retval;
  }
  
+static int suspend_all_queues_mes(struct device_queue_manager *dqm)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+   int r = 0;
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+
+   r = amdgpu_mes_suspend(adev);
+   up_read(&adev->reset_domain->sem);
+
+   if (r) {
+   dev_err(adev->dev, "failed to suspend gangs from MES\n");
+   dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU 
reset\n");
+   kfd_hws_hang(dqm);
+   }
+
+   return r;
+}
+
+static int resume_all_queues_mes(struct device_queue_manager *dqm)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+   int r = 0;
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+
+   r = amdgpu_mes_resume(adev);
+   up_read(&adev->reset_domain->sem);
+
+   if (r) {
+   dev_err(adev->dev, "failed to resume gangs from MES\n");
+   dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU 
reset\n");
+   kfd_hws_hang(dqm);
+   }
+
+   return r;
+}
+
  static void increment_queue_count(struct device_queue_manager *dqm,
  struct qcm_process_device *qpd,
  struct queue *q)
@@ -2835,6 +2875,44 @@ void device_queue_manager_uninit(struct 
device_queue_manager *dqm)
kfree(dqm);
  }
  
+static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
+  struct qcm_process_device *qpd)
+{
+   struct device *dev = dqm->dev->adev->dev;
+   int ret = 0;
+
+   /* Check if process is already evicted */
+   dqm_lock(dqm);
+   if (qpd->evicted) {
+   /* Increment the evicted count to make sure the
+* process stays evicted before it's terminated.
+*/
+   qpd->evicted++;
+   dqm_unlock(dqm);
+   goto out;
+   }
+   dqm_unlock(dqm);
+
+   ret = suspend_all_queues_mes(dqm);
+   if (ret) {
+   dev_err(dev, "Suspending all queues failed");
+   goto out;
+   }
+
+   ret = dqm->ops.evict_process_queues(dqm, qpd);
+   if (ret) {
+   dev_err(dev, "Evicting process queues failed");
+   goto out;
+   }
+
+   ret = resume_all_queues_mes(dqm);
+   if (ret)
+   dev_err(dev, "Resuming all queues failed");
+
+out:
+   return ret;
+}
+
  int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
  {
struct kfd_process_device *pdd;
@@ -2845,8 +2923,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager 
*dqm, u32 pasid)
return -EINVAL;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
pdd = kfd_get_process_device_data(dqm->dev, p);
-   if (pdd)
-   ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+   if (pdd) {
+   if (dqm->dev->kfd->shared_resources.enable_mes)
+   ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
+   else
+   ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+   }
+
kfd_unref_process(p);
  
  	return ret;


Re: [PATCH v5] drm/amdgpu: Take IOMMU remapping into account for p2p checks

2024-08-16 Thread Felix Kuehling


On 2024-08-16 3:29, Rahul Jain wrote:
> When trying to enable p2p, amdgpu_device_is_peer_accessible()
> checks the condition where address_mask overlaps the aper_base
> and hence returns 0, due to which p2p is disabled for this platform.
> 
> IOMMU should remap the BAR addresses so the device can access
> them. Hence check if peer_adev is remapping DMA.
> 
> v5: (Felix, Alex)
> - fixing comment as per Alex feedback
> - refactor code as per Felix
> 
> v4: (Alex)
> - fix the comment and description
> 
> v3:
> - remove iommu_remap variable
> 
> v2: (Alex)
> - Fix as per review comments
> - add new function amdgpu_device_check_iommu_remap to check if iommu
>   remap
> 
> Signed-off-by: Rahul Jain 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +-
>  1 file changed, 34 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a6b8d0ba4758..e03b3357ae09 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3952,6 +3952,25 @@ static void 
> amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
>   adev->ram_is_direct_mapped = true;
>  }
>  
> +/**
> + * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
> + *
> + * @adev: amdgpu_device pointer
> + *
> + * return if IOMMU remapping bar address
> + */
> +static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
> +{
> + struct iommu_domain *domain;
> +
> + domain = iommu_get_domain_for_dev(adev->dev);
> + if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
> + domain->type == IOMMU_DOMAIN_DMA_FQ))
> + return true;
> +
> + return false;
> +}
> +
>  static const struct attribute *amdgpu_dev_attributes[] = {
>   &dev_attr_pcie_replay_count.attr,
>   NULL
> @@ -6127,21 +6146,26 @@ bool amdgpu_device_is_peer_accessible(struct 
> amdgpu_device *adev,
> struct amdgpu_device *peer_adev)
>  {
>  #ifdef CONFIG_HSA_AMD_P2P
> - uint64_t address_mask = peer_adev->dev->dma_mask ?
> - ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
> - resource_size_t aper_limit =
> - adev->gmc.aper_base + adev->gmc.aper_size - 1;
>   bool p2p_access =
>   !adev->gmc.xgmi.connected_to_cpu &&
>   !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
>  
> - return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
> - adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
> - !(adev->gmc.aper_base & address_mask ||
> -   aper_limit & address_mask));
> -#else
> - return false;
> + bool is_large_bar = adev->gmc.visible_vram_size &&
> + adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
> + bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
> +
> + if (!p2p_addressable) {
> + uint64_t address_mask = peer_adev->dev->dma_mask ?
> + ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
> + resource_size_t aper_limit =
> + adev->gmc.aper_base + adev->gmc.aper_size - 1;
> +
> + p2p_addressable = !(adev->gmc.aper_base & address_mask ||
> +  aper_limit & address_mask);
> + }
> + return is_large_bar && p2p_access && p2p_addressable;
>  #endif
> + return false;

You changed the #else into a #endif. Logically that's OK, but it may cause a 
compiler warning about unreachable code because the program can never reach the 
"return false;" statement when CONFIG_HSA_AMD_P2P is enabled. It's probably 
safer to leave the #else to make sure this compiles without warnings on current 
and future compilers.
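
Something like this is what I mean (just the structure, not the
complete function):

#ifdef CONFIG_HSA_AMD_P2P
	...
	return is_large_bar && p2p_access && p2p_addressable;
#else
	return false;
#endif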

With that fixed, this patch is
Reviewed-by: Felix Kuehling 

>  }
>  
>  int amdgpu_device_baco_enter(struct drm_device *dev)


Re: [PATCHv2 2/3] drm/amdkfd: Update queue unmap after VM fault with MES

2024-08-15 Thread Felix Kuehling



On 2024-08-15 17:08, Joshi, Mukul wrote:

[AMD Official Use Only - AMD Internal Distribution Only]

Hi Felix,


-Original Message-
From: Kuehling, Felix 
Sent: Thursday, August 15, 2024 2:25 PM
To: Joshi, Mukul ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: Re: [PATCHv2 2/3] drm/amdkfd: Update queue unmap after VM fault
with MES

On 2024-08-14 19:27, Mukul Joshi wrote:

MEC FW expects MES to unmap all queues when a VM fault is observed on
a queue, and then to resume them once the affected process is terminated.
Use the MES Suspend and Resume APIs to achieve this.

Signed-off-by: Mukul Joshi 
---
v1->v2:
- Add MES FW version check.
- Separate out the kfd_dqm_evict_pasid into another function.
- Use amdgpu_mes_suspend/amdgpu_mes_resume to suspend/resume

queues.

   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 79

++-

   1 file changed, 77 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f6e211070299..cb5b866eee3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -319,6 +319,42 @@ static int remove_all_queues_mes(struct

device_queue_manager *dqm)

 return retval;
   }

+static int suspend_all_queues_mes(struct device_queue_manager *dqm)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+   int r = 0;
+
+   if (dqm->is_hws_hang)
+   return -EIO;
+
+   r = amdgpu_mes_suspend(adev);
+   if (r) {
+   dev_err(adev->dev, "failed to suspend gangs from MES\n");
+   dev_err(adev->dev, "MES might be in unrecoverable state,

issue a GPU reset\n");

+   kfd_hws_hang(dqm);
+   }
+
+   return r;
+}
+
+static int resume_all_queues_mes(struct device_queue_manager *dqm)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+   int r = 0;
+
+   if (dqm->is_hws_hang)
+   return -EIO;
+
+   r = amdgpu_mes_resume(adev);
+   if (r) {
+   dev_err(adev->dev, "failed to resume gangs from MES\n");
+   dev_err(adev->dev, "MES might be in unrecoverable state,

issue a GPU reset\n");

+   kfd_hws_hang(dqm);
+   }
+
+   return r;
+}
+
   static void increment_queue_count(struct device_queue_manager *dqm,
   struct qcm_process_device *qpd,
   struct queue *q)
@@ -2835,6 +2871,40 @@ void device_queue_manager_uninit(struct

device_queue_manager *dqm)

 kfree(dqm);
   }

+static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
+  struct qcm_process_device *qpd)
+{
+   struct device *dev = dqm->dev->adev->dev;
+   int ret = 0;
+
+   /* Check if process is already evicted */
+   dqm_lock(dqm);
+   if (qpd->evicted) {
+   dqm_unlock(dqm);
+   goto out;

qpd->evicted is a reference count. Without this shortcut,
dqm->ops.evict_process_queues will increment the ref count. You probably
need to increment it here before dropping the lock. Otherwise two things can
go wrong:

  1. The corresponding dqm->ops.restore_process_queues will underflow the
 reference count
  2. A race condition where the queues get restored too early


The intent here is to check if the process queues are already evicted or not.
If they are not, then we want to suspend all queues, evict all queues of the
affected process (which also updates the evicted refcount), and resume all
queues.
If I increment the refcount here, then dqm->ops.evict_process_queues will not
evict the queues unless we change that function.
And this function would be called only for the VM fault case, so the process is
going to be terminated. Is it possible to have
dqm->ops.restore_process_queues called on it? Even if it is called, I don't
think we can have underflow of the refcount with the current code.

Can you please explain the case where the dqm->ops.restore_process_queues can 
cause an underflow
with the current code? And the scenario for the race?


On GPUs with MES, you pair kfd_dqm_evict_pasid_mes with 
dqm->ops.restore_process_queues. For every call of 
kfd_dqm_evict_pasid_mes there will be a corresponding call of 
dqm->ops.restore_process_queues. If kfd_dqm_evict_pasid_mes doesn't 
increment the qpd->evicted refcount for some cases, the refcount will 
underflow in dqm->ops.restore_process_queues.
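
Something like this in the shortcut path would keep the counting
symmetric (untested sketch):

	dqm_lock(dqm);
	if (qpd->evicted) {
		/* Count this eviction so the matching restore call
		 * decrements the refcount without underflowing and
		 * doesn't restore the queues too early.
		 */
		qpd->evicted++;
		dqm_unlock(dqm);
		return 0;
	}
	dqm_unlock(dqm);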


Regards,
  Felix





Regards,
Mukul


Regards,
Felix



+   }
+   dqm_unlock(dqm);
+
+   ret = suspend_all_queues_mes(dqm);
+   if (ret) {
+   dev_err(dev, "Suspending all queues failed");
+   goto out;
+   }
+
+   ret = dqm->ops.evict_process_queues(dqm, qpd);
+   if (ret) {
+   dev_err(dev, "Evicting process queues failed");
+   goto out;
+   }
+
+   ret = resume_all_queues_mes(dqm);
+   if (ret)
+   dev_err(dev, "Resuming all queues failed");
+
+out:
+   return ret;
+}
+
   int kfd_dqm_evict_pasid(s

Re: [PATCHv2 2/3] drm/amdkfd: Update queue unmap after VM fault with MES

2024-08-15 Thread Felix Kuehling

On 2024-08-14 19:27, Mukul Joshi wrote:

MEC FW expects MES to unmap all queues when a VM fault is observed
on a queue, and then to resume them once the affected process is terminated.
Use the MES Suspend and Resume APIs to achieve this.

Signed-off-by: Mukul Joshi 
---
v1->v2:
- Add MES FW version check.
- Separate out the kfd_dqm_evict_pasid into another function.
- Use amdgpu_mes_suspend/amdgpu_mes_resume to suspend/resume queues.

  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 79 ++-
  1 file changed, 77 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index f6e211070299..cb5b866eee3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -319,6 +319,42 @@ static int remove_all_queues_mes(struct 
device_queue_manager *dqm)
return retval;
  }
  
+static int suspend_all_queues_mes(struct device_queue_manager *dqm)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+   int r = 0;
+
+   if (dqm->is_hws_hang)
+   return -EIO;
+
+   r = amdgpu_mes_suspend(adev);
+   if (r) {
+   dev_err(adev->dev, "failed to suspend gangs from MES\n");
+   dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU 
reset\n");
+   kfd_hws_hang(dqm);
+   }
+
+   return r;
+}
+
+static int resume_all_queues_mes(struct device_queue_manager *dqm)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
+   int r = 0;
+
+   if (dqm->is_hws_hang)
+   return -EIO;
+
+   r = amdgpu_mes_resume(adev);
+   if (r) {
+   dev_err(adev->dev, "failed to resume gangs from MES\n");
+   dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU 
reset\n");
+   kfd_hws_hang(dqm);
+   }
+
+   return r;
+}
+
  static void increment_queue_count(struct device_queue_manager *dqm,
  struct qcm_process_device *qpd,
  struct queue *q)
@@ -2835,6 +2871,40 @@ void device_queue_manager_uninit(struct 
device_queue_manager *dqm)
kfree(dqm);
  }
  
+static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm,
+  struct qcm_process_device *qpd)
+{
+   struct device *dev = dqm->dev->adev->dev;
+   int ret = 0;
+
+   /* Check if process is already evicted */
+   dqm_lock(dqm);
+   if (qpd->evicted) {
+   dqm_unlock(dqm);
+   goto out;


qpd->evicted is a reference count. Without this shortcut, 
dqm->ops.evict_process_queues will increment the ref count. You probably 
need to increment it here before dropping the lock. Otherwise two things 
can go wrong:


1. The corresponding dqm->ops.restore_process_queues will underflow the
   reference count
2. A race condition where the queues get restored too early

Regards,
  Felix



+   }
+   dqm_unlock(dqm);
+
+   ret = suspend_all_queues_mes(dqm);
+   if (ret) {
+   dev_err(dev, "Suspending all queues failed");
+   goto out;
+   }
+
+   ret = dqm->ops.evict_process_queues(dqm, qpd);
+   if (ret) {
+   dev_err(dev, "Evicting process queues failed");
+   goto out;
+   }
+
+   ret = resume_all_queues_mes(dqm);
+   if (ret)
+   dev_err(dev, "Resuming all queues failed");
+
+out:
+   return ret;
+}
+
  int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
  {
struct kfd_process_device *pdd;
@@ -2845,8 +2915,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager 
*dqm, u32 pasid)
return -EINVAL;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
pdd = kfd_get_process_device_data(dqm->dev, p);
-   if (pdd)
-   ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+   if (pdd) {
+   if (dqm->dev->kfd->shared_resources.enable_mes)
+   ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
+   else
+   ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+   }
+
kfd_unref_process(p);
  
  	return ret;


Re: [PATCH 2/4] amdgpu: fix a race in kfd_mem_export_dmabuf()

2024-08-14 Thread Felix Kuehling



On 2024-08-12 02:59, Al Viro wrote:

Using drm_gem_prime_handle_to_fd() to set dmabuf up and insert it into
descriptor table, only to have it looked up by file descriptor and
remove it from descriptor table is not just too convoluted - it's
racy; another thread might have modified the descriptor table while
we'd been going through that song and dance.

Switch kfd_mem_export_dmabuf() to using drm_gem_prime_handle_to_dmabuf()
and leave the descriptor table alone...

Signed-off-by: Al Viro 


This patch is

Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +++-
  1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 11672bfe4fad..bc5401de2948 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -25,7 +25,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
  
  #include 

@@ -818,18 +817,13 @@ static int kfd_mem_export_dmabuf(struct kgd_mem *mem)
if (!mem->dmabuf) {
struct amdgpu_device *bo_adev;
struct dma_buf *dmabuf;
-   int r, fd;
  
  		bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);

-   r = drm_gem_prime_handle_to_fd(&bo_adev->ddev, bo_adev->kfd.client.file,
+   dmabuf = drm_gem_prime_handle_to_dmabuf(&bo_adev->ddev, bo_adev->kfd.client.file,
   mem->gem_handle,
mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
-  DRM_RDWR : 0, &fd);
-   if (r)
-   return r;
-   dmabuf = dma_buf_get(fd);
-   close_fd(fd);
-   if (WARN_ON_ONCE(IS_ERR(dmabuf)))
+  DRM_RDWR : 0);
+   if (IS_ERR(dmabuf))
return PTR_ERR(dmabuf);
mem->dmabuf = dmabuf;
}


Re: [PATCH 2/2] drm/amdgpu: fix incomplete access issue in amdgpu_ttm_access_memory_sdma()

2024-08-14 Thread Felix Kuehling



On 2024-08-12 02:59, Samuel Zhang wrote:

The requested access range may span 2 adjacent buddy blocks of a
BO. In this case, it needs to issue 2 sdma copy commands to fully access
the data range. But the current implementation only issues 1 sdma copy
command, resulting in incomplete access.

The fix is to loop over the res cursor when emitting copy commands so that
multiple (2) copy commands get issued when necessary.

Signed-off-by: Samuel Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 26 ++---
  1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index a6e90eada367..c423574acd5c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1484,7 +1484,7 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
struct dma_fence *fence;
uint64_t src_addr, dst_addr;
unsigned int num_dw;
-   int r, idx;
+   int r, idx, count = 0;
  
  	if (len > PAGE_SIZE)

return -EINVAL;
@@ -1498,7 +1498,7 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
if (write)
memcpy(adev->mman.sdma_access_ptr, buf, len);
  
-	num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);

+   num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw * 2, 8);
r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr,
 AMDGPU_FENCE_OWNER_UNDEFINED,
 num_dw * 4, AMDGPU_IB_POOL_DELAYED,
@@ -1507,15 +1507,19 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
goto out;
  
  	amdgpu_res_first(abo->tbo.resource, offset, len, &src_mm);

-   src_addr = amdgpu_ttm_domain_start(adev, bo->resource->mem_type) +
-   src_mm.start;
-   dst_addr = amdgpu_bo_gpu_offset(adev->mman.sdma_access_bo);
-   if (write)
-   swap(src_addr, dst_addr);
-
-   amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,
-   len, 0);
-
+   while (src_mm.remaining) {
+   src_addr = amdgpu_ttm_domain_start(adev, bo->resource->mem_type) +
+   src_mm.start;
+   dst_addr = amdgpu_bo_gpu_offset(adev->mman.sdma_access_bo) + count;
+   if (write)
+   swap(src_addr, dst_addr);
+
+   amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,
+   src_mm.size, 0);
+
+   count += src_mm.size;


You could just increment dst_addr instead. And move the initialization 
of dst_addr outside the loop. Other than that, this patch is


Reviewed-by: Felix Kuehling 
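
Something like this is what I had in mind (untested sketch; the from/to
locals are just for illustration):

	dst_addr = amdgpu_bo_gpu_offset(adev->mman.sdma_access_bo);
	while (src_mm.remaining) {
		uint64_t from = amdgpu_ttm_domain_start(adev,
				bo->resource->mem_type) + src_mm.start;
		uint64_t to = dst_addr;

		if (write)
			swap(from, to);

		amdgpu_emit_copy_buffer(adev, &job->ibs[0], from, to,
					src_mm.size, 0);

		dst_addr += src_mm.size;
		amdgpu_res_next(&src_mm, src_mm.size);
	}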



+   amdgpu_res_next(&src_mm, src_mm.size);
+}
amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);
WARN_ON(job->ibs[0].length_dw > num_dw);
  


Re: [PATCH 1/2] drm/amdgpu: fix KFDMemoryTest.PtraceAccessInvisibleVram fail on SRIOV

2024-08-14 Thread Felix Kuehling



On 2024-08-12 02:59, Samuel Zhang wrote:

Ptrace access to a VRAM bo will first try sdma access in
amdgpu_ttm_access_memory_sdma(); if that fails, it will fall back to mmio
access.

Since ptrace only accesses 8 bytes at a time and
amdgpu_ttm_access_memory_sdma() only allows PAGE_SIZE-byte accesses,
it fails.
On SRIOV, mmio access will also fail as MM_INDEX/MM_DATA register writes
are blocked for security reasons.

The fix is to change the len check in amdgpu_ttm_access_memory_sdma() so
that len in (0, PAGE_SIZE] is allowed. This will not only fix the ptrace
test case on SRIOV, but also improve the access performance when the
access length is < PAGE_SIZE.
Support for the len > PAGE_SIZE case is not needed as larger sizes will
be broken into chunks of at most PAGE_SIZE in mem_rw().

Signed-off-by: Samuel Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 5daa05e23ddf..a6e90eada367 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1486,7 +1486,7 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
unsigned int num_dw;
int r, idx;
  
-	if (len != PAGE_SIZE)

+   if (len > PAGE_SIZE)
OK, I'll spell it out explicitly. This needs an SRIOV VF-specific 
condition if you want to allow smaller accesses with SDMA on SRIOV. On 
bare metal we want to be able to fall back to the FB BAR for smaller 
accesses. On a VF it will use SDMA for everything.


if (!amdgpu_sriov_vf(adev) && len != PAGE_SIZE)
return -EINVAL;


Regards,
  Felix



return -EINVAL;
  
  	if (!adev->mman.sdma_access_ptr)

@@ -1514,7 +1514,7 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
swap(src_addr, dst_addr);
  
  	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,

-   PAGE_SIZE, 0);
+   len, 0);
  
  	amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);

WARN_ON(job->ibs[0].length_dw > num_dw);


Re: [PATCH v3] drm/amdgpu: Take IOMMU remapping into account for p2p checks

2024-08-14 Thread Felix Kuehling



On 2024-08-14 11:17, Alex Deucher wrote:

On Wed, Aug 14, 2024 at 5:15 AM Rahul Jain  wrote:

When trying to enable p2p, amdgpu_device_is_peer_accessible()
checks the condition where address_mask overlaps the aper_base
and hence returns 0, due to which p2p is disabled for this platform.

IOMMU should remap the BAR addresses so the device can access
them. Hence check if peer_adev is remapping DMA.

v3:
- remove iommu_remap variable

v2: (Alex)
- Fix as per review comments
- add new function amdgpu_device_check_iommu_remap to check if iommu
   remap

Signed-off-by: Rahul Jain 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++---
  1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a6b8d0ba4758..040c75c491cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3952,6 +3952,25 @@ static void amdgpu_device_check_iommu_direct_map(struct 
amdgpu_device *adev)
 adev->ram_is_direct_mapped = true;
  }

+/**
+ * amdgpu_device_check_iommu_remap - check if iommu remaped BAR

change this to:
Check if DMA remapping is enabled.

since it's not just the BAR, all system address space accesses will be remapped.


+ *
+ * @adev: amdgpu_device pointer
+ *
+ * return if IOMMU remapping bar address
+ */
+static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
+{
+   struct iommu_domain *domain;
+
+   domain = iommu_get_domain_for_dev(adev->dev);
+   if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
+   domain->type == IOMMU_DOMAIN_DMA_FQ))
+   return true;
+
+   return false;
+}
+
  static const struct attribute *amdgpu_dev_attributes[] = {
 &dev_attr_pcie_replay_count.attr,
 NULL
@@ -6127,6 +6146,8 @@ bool amdgpu_device_is_peer_accessible(struct 
amdgpu_device *adev,
   struct amdgpu_device *peer_adev)
  {
  #ifdef CONFIG_HSA_AMD_P2P
+   bool peer_remap = amdgpu_device_check_iommu_remap(peer_adev);
+
 uint64_t address_mask = peer_adev->dev->dma_mask ?
 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
 resource_size_t aper_limit =
@@ -6135,13 +6156,26 @@ bool amdgpu_device_is_peer_accessible(struct 
amdgpu_device *adev,
 !adev->gmc.xgmi.connected_to_cpu &&
 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);

-   return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
-   adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
-   !(adev->gmc.aper_base & address_mask ||
- aper_limit & address_mask));
-#else
-   return false;
+   if (peer_remap)
+   /**

These don't need to be kerneldoc comments.  Replace /** with /*


+* IOMMU is remapping DMA for peer_adev so all accesses
+* should be within peer_adev's DMA mask
+*/
+   return pcie_p2p && p2p_access &&
+   (adev->gmc.visible_vram_size &&
+adev->gmc.real_vram_size == 
adev->gmc.visible_vram_size);
+   else
+   /**

Same here.

With those fixed, it looks good to me, but it would be good if Ramesh
took a look as well as he wrote this code originally.


Looks reasonable to me. But it could be refactored to avoid duplicating 
a bunch of the condition. Maybe something like:


bool is_large_bar = adev->gmc.visible_vram_size &&
adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
bool p2p_access =
!adev->gmc.xgmi.connected_to_cpu &&
!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);

if (!p2p_addressable) {
uint64_t address_mask = peer_adev->dev->dma_mask ?
~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
resource_size_t aper_limit =
adev->gmc.aper_base + adev->gmc.aper_size - 1;

p2p_addressable = !(adev->gmc.aper_base & address_mask ||
 aper_limit & address_mask);
}
return is_large_bar && p2p_access && p2p_addressable;

Regards,
  Felix




Alex



+* No IOMMU remapping so make sure the adev's aperture
+* fits into peer_adev's dma mask
+*/
+   return pcie_p2p && p2p_access &&
+   (adev->gmc.visible_vram_size &&
+   adev->gmc.real_vram_size == adev->gmc.visible_vram_size 
&&
+   !(adev->gmc.aper_base & address_mask ||
+   aper_limit & address_mask));
  #endif
+   return false;
  }

  int amdgpu_device_baco_enter(struct drm_device *dev)
--
2.34.1



Re: AMD drm patch workflow is broken for stable trees

2024-08-14 Thread Felix Kuehling

On 2024-08-12 11:00, Greg KH wrote:

Hi all,

As some of you have noticed, there's a TON of failure messages being
sent out for AMD gpu driver commits that are tagged for stable
backports.  In short, you all are doing something really wrong with how
you are tagging these.

Hi Greg,

I got notifications about one KFD patch failing to apply on six branches 
(6.10, 6.6, 6.1, 5.15, 5.10 and 5.4). The funny thing is that you
already applied this patch on two branches back in May. The emails had a
suspicious looking date in the header (Sep 17, 2001). I wonder if there 
was some date glitch that caused a whole bunch of patches to be re-sent 
to stable somehow:


   -- original commit in Linus's tree
   -- From 24e82654e98e96cece5d8b919c522054456eeec6 Mon
   Sep 17 00:00:00 2001 From: Alex Deucher
   Date: Sun, 14 Apr 2024 13:06:39 -0400
   Subject: [PATCH] drm/amdkfd: don't allow mapping the MMIO HDP page
   with large pages ...

On 6.1 and 6.6, the patch was already applied by you in May:

   $ git log --pretty=fuller stable/linux-6.6.y --grep "drm/amdkfd: don't allow 
mapping the MMIO HDP page with large pages"
   commit 4b4cff994a27ebf7bd3fb9a798a1cdfa8d01b724
   Author: Alex Deucher 
   AuthorDate: Sun Apr 14 13:06:39 2024 -0400
   Commit: Greg Kroah-Hartman 
   CommitDate: Fri May 17 12:02:34 2024 +0200

drm/amdkfd: don't allow mapping the MMIO HDP page with large pages
   ...

On 6.10 it was already upstream.

On 5.4-5.15 it doesn't apply because of conflicts. I can resolve those 
and send the fixed patches out for you.


Regards,
  Felix




Please fix it up to NOT have duplicates in multiple branches that end up
in Linus's tree at different times.  Or if you MUST do that, then give
us a chance to figure out that it IS a duplicate.  As-is, it's not
working at all, and I think I need to just drop all patches for this
driver that are tagged for stable going forward and rely on you all to
provide a proper set of backported fixes when you say they are needed.

Again, what you are doing today is NOT ok and is broken.  Please fix.

greg k-h


Re: [PATCH] drm/amdkfd: keep create queue success if cwsr save area doesn't match

2024-08-14 Thread Felix Kuehling



On 2024-08-14 2:35, Zhang, Yifan wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
> AFAIK, for low level libraries, e.g. LLVM, ROCr, Hip/OpenCL runtimes, all 
> GPUs are supported. But for the mathlibs and frameworks, only limited GPUs 
> are supported. E.g. :
> 
> https://github.com/ROCm/rocBLAS/blob/28877e5e134a157b7ea56b88a1a12ba551d53cbf/CMakeLists.txt#L111
> 
> set( TARGET_LIST_ROCM_6.3 
> "gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
> 
> On the unsupported GPUs, HSA_OVERRIDE_GFX_VERSION currently works as a 
> workaround.

Then HSA_OVERRIDE_GFX_VERSION probably does more than we need it to for working 
around support in the math libs. What math libs care about is mainly the ISA 
target version. There should be no need to allocate different sizes for CWSR 
areas.

Regards,
  Felix


> 
> 
> Best Regards,
> Yifan
> 
> -Original Message-
> From: Christopher Snowhill 
> Sent: Wednesday, August 14, 2024 10:01 AM
> To: Kuehling, Felix 
> Cc: Zhang, Yifan ; Kasiviswanathan, Harish 
> ; amd-gfx@lists.freedesktop.org; Yang, Philip 
> 
> Subject: Re: [PATCH] drm/amdkfd: keep create queue success if cwsr save area 
> doesn't match
> 
> 
> 
>> On Aug 13, 2024, at 6:52 PM, Felix Kuehling  wrote:
>>
>> Hang on a second. If there are production GPUs that only work with 
>> HSA_OVERRIDE_GFX_VERSION right now, then we should make those GPUs properly 
>> supported. I thought this was only used internally for bring-up or maybe 
>> externally as a short-term solution before we upstream proper support for 
>> new GPUs.
> 
> For instance, for a bunch of compute things, I have to override 10.3.0 for my 
> 10.3.1 GPU, a 6700 XT, because nobody builds or packages the kernels for 
> 10.3.1.
> 
>>
>> Regards,
>>  Felix
>>
>>
>>> On 2024-08-11 22:10, Zhang, Yifan wrote:
>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>
>>> I agree that adding exp_hw_support is a safer approach. My concern is that 
>>> HSA_OVERRIDE_GFX_VERSION has been used for a while and has become a status 
>>> quo for running ROCm on unsupported APUs. I'm not sure if this approach 
>>> will be a burden for APU end users. Adding driver load parameters is more 
>>> complicated than simply adding an environment variable on consumer PCs.
>>>
>>> Best Regards,
>>> Yifan
>>>
>>> -Original Message-
>>> From: Kuehling, Felix 
>>> Sent: Saturday, August 10, 2024 7:37 AM
>>> To: Zhang, Yifan ; Kasiviswanathan, Harish
>>> ; amd-gfx@lists.freedesktop.org
>>> Cc: Yang, Philip 
>>> Subject: Re: [PATCH] drm/amdkfd: keep create queue success if cwsr
>>> save area doesn't match
>>>
>>> Maybe we can turn this check into a warning if, and only if the 
>>> exp_hw_support module param is set. That way we don't water down the checks 
>>> on the production code path but allow experimental setups to run without a 
>>> seat belt.
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>> On 2024-08-09 01:39, Zhang, Yifan wrote:
>>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>>
>>>> Yes, I think we need that change for a normal code path, but this case is 
>>>> introduced only with the HSA_OVERRIDE_GFX_VERSION environment setting, 
>>>> which implies that "the override ASIC is compatible with the real ASIC." 
>>>> It is intended for experimental purposes. When a user is using 
>>>> HSA_OVERRIDE_GFX_VERSION, they should be aware of the potential risks it 
>>>> may bring. Usually, HSA_OVERRIDE_GFX_VERSION is used to force an 
>>>> unsupported APU to be recognized as a ROCm-supported high-end dGPU, which 
>>>> has a large cwsr save area, making the operation safe. This check was 
>>>> added to KFD two weeks ago, the HSA_OVERRIDE_GFX_VERSION environment had 
>>>> been working fine before that.
>>>>
>>>> Best Regards,
>>>> Yifan
>>>>
>>>> -Original Message-
>>>> From: Kasiviswanathan, Harish 
>>>> Sent: Thursday, August 8, 2024 10:46 PM
>>>> To: Zhang, Yifan ;
>>>> amd-gfx@lists.freedesktop.org
>>>> Cc: Kuehling, Felix ; Yang, Philip
>>>> ; Zhang, Yifan 
>>>> Subject: RE: [PATCH] drm/amdkfd: keep create queue success if cwsr
>>>> save area doesn't match

Re: [PATCH] drm/amdkfd: keep create queue success if cwsr save area doesn't match

2024-08-13 Thread Felix Kuehling
Hang on a second. If there are production GPUs that only work with 
HSA_OVERRIDE_GFX_VERSION right now, then we should make those GPUs properly 
supported. I thought this was only used internally for bring-up or maybe 
externally as a short-term solution before we upstream proper support for new 
GPUs.

Regards,
  Felix


On 2024-08-11 22:10, Zhang, Yifan wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
> 
> I agree that adding exp_hw_support is a safer approach. My concern is that 
> HSA_OVERRIDE_GFX_VERSION has been used for a while and has become a status 
> quo for running ROCm on unsupported APUs. I'm not sure if this approach will 
> be a burden for APU end users. Adding driver load parameters is more 
> complicated than simply adding an environment variable on consumer PCs.
> 
> Best Regards,
> Yifan
> 
> -Original Message-
> From: Kuehling, Felix 
> Sent: Saturday, August 10, 2024 7:37 AM
> To: Zhang, Yifan ; Kasiviswanathan, Harish 
> ; amd-gfx@lists.freedesktop.org
> Cc: Yang, Philip 
> Subject: Re: [PATCH] drm/amdkfd: keep create queue success if cwsr save area 
> doesn't match
> 
> Maybe we can turn this check into a warning if, and only if the 
> exp_hw_support module param is set. That way we don't water down the checks 
> on the production code path but allow experimental setups to run without a 
> seat belt.
> 
> Regards,
>Felix
> 
> 
> On 2024-08-09 01:39, Zhang, Yifan wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>> Yes, I think we need that change for a normal code path, but this case is 
>> introduced only with the HSA_OVERRIDE_GFX_VERSION environment setting, which 
>> implies that "the override ASIC is compatible with the real ASIC." It is 
>> intended for experimental purposes. When a user is using 
>> HSA_OVERRIDE_GFX_VERSION, they should be aware of the potential risks it may 
>> bring. Usually, HSA_OVERRIDE_GFX_VERSION is used to force an unsupported APU 
>> to be recognized as a ROCm-supported high-end dGPU, which has a large cwsr 
>> save area, making the operation safe. This check was added to KFD two weeks 
>> ago, the HSA_OVERRIDE_GFX_VERSION environment had been working fine before 
>> that.
>>
>> Best Regards,
>> Yifan
>>
>> -Original Message-
>> From: Kasiviswanathan, Harish 
>> Sent: Thursday, August 8, 2024 10:46 PM
>> To: Zhang, Yifan ; amd-gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix ; Yang, Philip
>> ; Zhang, Yifan 
>> Subject: RE: [PATCH] drm/amdkfd: keep create queue success if cwsr
>> save area doesn't match
>>
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>> In this case, shouldn't larger of two sizes be used. Also, we should have an 
>> upper bound check.
>>
>> -Original Message-
>> From: amd-gfx  On Behalf Of
>> Yifan Zhang
>> Sent: Thursday, August 8, 2024 4:44 AM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Kuehling, Felix ; Yang, Philip
>> ; Zhang, Yifan 
>> Subject: [PATCH] drm/amdkfd: keep create queue success if cwsr save
>> area doesn't match
>>
>> If HSA_OVERRIDE_GFX_VERSION is used in ROCm workload, user space and kernel 
>> use different spec to calculate cwsr save area, current check may fail 
>> create queue ioctl. Change error to warn to make create queue succeed in 
>> that case.
>>
>> Signed-off-by: Yifan Zhang 
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 4 +---
>>   1 file changed, 1 insertion(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
>> index e0a073ae4a49..9f283aff057a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
>> @@ -295,11 +295,9 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
>> *pdd, struct queue_prope
>>  }
>>
>>  if (properties->ctx_save_restore_area_size != 
>> topo_dev->node_props.cwsr_size) {
>> -   pr_debug("queue cwsr size 0x%x not equal to node cwsr size 
>> 0x%x\n",
>> +   pr_warn("queue cwsr size 0x%x not equal to node cwsr
>> + size 0x%x\n",
>>  properties->ctx_save_restore_area_size,
>>  topo_dev->node_props.cwsr_size);
>> -   err = -EINVAL;
>> -   goto out_err_unreserve;
>>  }
>>
>>  total_cwsr_size = (topo_dev->node_props.cwsr_size +
>> topo_dev->node_props.debug_memory_size)
>> --
>> 2.37.3
>>
>>


Re: [PATCH] drm/amdgpu: fix KFDMemoryTest.PtraceAccessInvisibleVram fail on SRIOV

2024-08-09 Thread Felix Kuehling



On 2024-08-07 04:36, Samuel Zhang wrote:

Ptrace access to a VRAM bo will first try sdma access in
amdgpu_ttm_access_memory_sdma(); if that fails, it will fall back to mmio
access.

Since ptrace only accesses 8 bytes at a time and
amdgpu_ttm_access_memory_sdma() only allows PAGE_SIZE-byte accesses,
it fails.
On SRIOV, mmio access will also fail as MM_INDEX/MM_DATA register writes
are blocked for security reasons.

The fix is to change the len check in amdgpu_ttm_access_memory_sdma() so
that len in (0, PAGE_SIZE] is allowed. This will not only fix the ptrace
test case on SRIOV, but also improve the access performance when the
access length is < PAGE_SIZE.
Support for the len > PAGE_SIZE case is not needed as larger sizes will
be broken into chunks of at most PAGE_SIZE in mem_rw().


I'm not convinced that using SDMA for small accesses is the best 
solution for all cases. For example, on large-BAR GPUs we should fall 
back to access through the FB BAR before we use indirect register 
access. That may still perform better than SDMA especially for very 
small accesses like 4-bytes typical for ptrace accesses. Maybe this 
needs an SRIOV-VF-specific condition if MMIO register access is not an 
option there.


@Jonathan Kim, can you chime in as well?

Thanks,
  Felix




Signed-off-by: Samuel Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 5daa05e23ddf..a6e90eada367 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1486,7 +1486,7 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
unsigned int num_dw;
int r, idx;
  
-	if (len != PAGE_SIZE)

+   if (len > PAGE_SIZE)
return -EINVAL;
  
  	if (!adev->mman.sdma_access_ptr)

@@ -1514,7 +1514,7 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
swap(src_addr, dst_addr);
  
  	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr,

-   PAGE_SIZE, 0);
+   len, 0);
  
  	amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);

WARN_ON(job->ibs[0].length_dw > num_dw);


Re: [PATCH] drm/amdkfd: keep create queue success if cwsr save area doesn't match

2024-08-09 Thread Felix Kuehling
Maybe we can turn this check into a warning if, and only if the 
exp_hw_support module param is set. That way we don't water down the 
checks on the production code path but allow experimental setups to run 
without a seat belt.
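
For example (untested sketch, assuming the amdgpu exp_hw_support module
param is visible here):

	if (properties->ctx_save_restore_area_size != topo_dev->node_props.cwsr_size) {
		pr_warn("queue cwsr size 0x%x not equal to node cwsr size 0x%x\n",
			properties->ctx_save_restore_area_size,
			topo_dev->node_props.cwsr_size);
		/* Only experimental setups get to continue past the check */
		if (!amdgpu_exp_hw_support) {
			err = -EINVAL;
			goto out_err_unreserve;
		}
	}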


Regards,
  Felix


On 2024-08-09 01:39, Zhang, Yifan wrote:

[AMD Official Use Only - AMD Internal Distribution Only]

Yes, I think we need that change for a normal code path, but this case is introduced only 
with the HSA_OVERRIDE_GFX_VERSION environment setting, which implies that "the 
override ASIC is compatible with the real ASIC." It is intended for experimental 
purposes. When a user is using HSA_OVERRIDE_GFX_VERSION, they should be aware of the 
potential risks it may bring. Usually, HSA_OVERRIDE_GFX_VERSION is used to force an 
unsupported APU to be recognized as a ROCm-supported high-end dGPU, which has a large 
cwsr save area, making the operation safe. This check was added to KFD two weeks ago, the 
HSA_OVERRIDE_GFX_VERSION environment had been working fine before that.

Best Regards,
Yifan

-Original Message-
From: Kasiviswanathan, Harish 
Sent: Thursday, August 8, 2024 10:46 PM
To: Zhang, Yifan ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Yang, Philip ; 
Zhang, Yifan 
Subject: RE: [PATCH] drm/amdkfd: keep create queue success if cwsr save area 
doesn't match

[AMD Official Use Only - AMD Internal Distribution Only]

In this case, shouldn't larger of two sizes be used. Also, we should have an 
upper bound check.

-Original Message-
From: amd-gfx  On Behalf Of Yifan Zhang
Sent: Thursday, August 8, 2024 4:44 AM
To: amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Yang, Philip ; 
Zhang, Yifan 
Subject: [PATCH] drm/amdkfd: keep create queue success if cwsr save area 
doesn't match

If HSA_OVERRIDE_GFX_VERSION is used in ROCm workload, user space and kernel use 
different spec to calculate cwsr save area, current check may fail create queue 
ioctl. Change error to warn to make create queue succeed in that case.

Signed-off-by: Yifan Zhang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 4 +---
  1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index e0a073ae4a49..9f283aff057a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -295,11 +295,9 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
 }

 if (properties->ctx_save_restore_area_size != 
topo_dev->node_props.cwsr_size) {
-   pr_debug("queue cwsr size 0x%x not equal to node cwsr size 
0x%x\n",
+   pr_warn("queue cwsr size 0x%x not equal to node cwsr
+ size 0x%x\n",
 properties->ctx_save_restore_area_size,
 topo_dev->node_props.cwsr_size);
-   err = -EINVAL;
-   goto out_err_unreserve;
 }

 total_cwsr_size = (topo_dev->node_props.cwsr_size + 
topo_dev->node_props.debug_memory_size)
--
2.37.3




Re: [PATCH] drm/amdkfd: Handle queue destroy buffer access race

2024-08-02 Thread Felix Kuehling



On 2024-08-02 11:28, Philip Yang wrote:

Add helper function kfd_queue_unreference_buffers to reduce the queue
buffer refcount, separating it from releasing queue buffers.

Because taking the vm lock while holding the dqm_lock is circular
locking, kfd_ioctl_destroy_queue should take the vm lock and unreference
queue buffers first, but not release the queue buffers, to handle the
error case where it fails to take the vm lock. Then hold the dqm_lock to
remove the queue from the queue list and release the queue buffers.

The restore process worker restores queues while holding the dqm_lock,
so it will always find the queue with valid queue buffers.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +-
  .../amd/amdkfd/kfd_process_queue_manager.c|  8 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 62 ---
  4 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 0622ebd7e8ef..10d6e29b23cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -400,6 +400,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
return 0;
  
  err_create_queue:

+   kfd_queue_unreference_buffers(pdd, &q_properties);
kfd_queue_release_buffers(pdd, &q_properties);


The naming of these functions is a bit unfortunate because 
kfd_queue_release_buffers actually unreferences the buffers, and 
kfd_queue_unreference_buffers affects the virtual address mappings 
(technically amdgpu_bo_vas), not the buffers themselves. I would suggest 
the following rename:


kfd_queue_unreference_buffers -> kfd_queue_unref_bo_vas



  err_acquire_queue_buf:
  err_sdma_engine_id:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 057d20446c31..e38484b40467 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1298,9 +1298,12 @@ void print_queue_properties(struct queue_properties *q);
  void print_queue(struct queue *q);
  int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size);
-void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
+void kfd_queue_buffer_put(struct amdgpu_bo **bo);
  int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
  int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
+void kfd_queue_unreference_buffer(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
+int kfd_queue_unreference_buffers(struct kfd_process_device *pdd,
+ struct queue_properties *properties);
  void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev);
  
  struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index f732ee35b531..ef76a9cbc7e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm)
list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
if (pqn->q) {
pdd = kfd_get_process_device_data(pqn->q->device, 
pqm->process);
+   kfd_queue_unreference_buffers(pdd, &pqn->q->properties);
kfd_queue_release_buffers(pdd, &pqn->q->properties);
pqm_clean_queue_resource(pqm, pqn);
}
@@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, 
unsigned int qid)
}
  
  	if (pqn->q) {

-   retval = kfd_queue_release_buffers(pdd, &pqn->q->properties);
+   retval = kfd_queue_unreference_buffers(pdd, 
&pqn->q->properties);
if (retval)
goto err_destroy_queue;
  
@@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)

if (retval != -ETIME)
goto err_destroy_queue;
}
-
+   kfd_queue_release_buffers(pdd, &pqn->q->properties);
pqm_clean_queue_resource(pqm, pqn);
uninit_queue(pqn->q);
}
@@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct 
process_queue_manager *pqm,
return -EFAULT;
}
  
-		kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo);

+   kfd_queue_unreference_buffer(vm, &pqn->q->properties.ring_bo);
+   kfd_queue_buffer_put(&pqn->q->properties.ring_bo);
amdgpu_bo_unreserve(vm->root.bo);
  
  		pqn->q->properties.ring_bo = p->ring_bo;

diff --git a/drivers/gpu/drm/amd/am

Re: [PATCH] drm/amdkfd: support per-queue reset on gfx9

2024-07-31 Thread Felix Kuehling



On 2024-07-31 09:37, Jonathan Kim wrote:

Support per-queue reset for GFX9.  The recommendation is for the driver
to target reset the HW queue via a SPI MMIO register write.

Since this requires pipe and HW queue info and MEC FW is limited to
doorbell reports of hung queues after an unmap failure, scan the HW
queue slots defined by SET_RESOURCES first to identify the user queue
candidates to reset.

Only signal reset events to processes that have had a queue reset.

If queue reset fails, fall back to GPU reset.

v3: address nitpicks
- handle hang detect buffer ENOMEM
- warn on multiple detect hang misuse
- reset hang detect buffer to NULL on free
- update DRM_ERR on reset to drm_err app warning message


I meant dev_err here to make sure we print the device identifier. That's 
what we mostly use in KFD. If drm_err does the same, that's fine, too. 
Looking at the definitions in drm_print.h, the only thing that drm_err 
adds is a "[drm]" tag in the message.
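
In other words, both of these print the device identifier, and drm_err
only adds the extra tag (illustrative only, using adev_to_drm() to get
the drm_device; the exact prefixes come from drm_print.h):

	dev_err(dqm->dev->adev->dev, "queue reset failed\n");
	/* -> amdgpu 0000:af:00.0: queue reset failed */
	drm_err(adev_to_drm(dqm->dev->adev), "queue reset failed\n");
	/* -> amdgpu 0000:af:00.0: [drm] *ERROR* queue reset failed */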


See one more comment inline.




v2: move reset queue flag for house keeping to process device.
split detect and reset into separate functions.
make reset call safe during power saving modes.
clean up some other nitpicks.

Signed-off-by: Jonathan Kim 


[snip]

@@ -1929,6 +1966,135 @@ static int map_queues_cpsch(struct 
device_queue_manager *dqm)

return retval;
  }
  
+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,

+  struct qcm_process_device *qpd)
+{
+   struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+
+   pr_err("queue id 0x%0x at pasid 0x%0x is reset\n",
+  q->properties.queue_id, q->process->pasid);


This could also be a dev_err(dqm->dev->adev->dev, ...) or 
drm_err(dqm->dev->adev->ddev, ...). With that fixed, the patch is


Reviewed-by: Felix Kuehling 



+
+   pdd->has_reset_queue = true;
+   if (q->properties.is_active) {
+   q->properties.is_active = false;
+   decrement_queue_count(dqm, qpd, q);
+   }
+}
+
+static int detect_queue_hang(struct device_queue_manager *dqm)
+{
+   int i;
+
+   /* detect should be used only in dqm locked queue reset */
+   if (WARN_ON(dqm->detect_hang_count > 0))
+   return 0;
+
+   memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);
+
+   for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
+   uint32_t mec, pipe, queue;
+   int xcc_id;
+
+   mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
+   / dqm->dev->kfd->shared_resources.num_pipe_per_mec;
+
+   if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap))
+   continue;
+
+   amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue);
+
+   for_each_inst(xcc_id, dqm->dev->xcc_mask) {
+   uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr(
+   dqm->dev->adev, pipe, queue, xcc_id);
+   struct dqm_detect_hang_info hang_info;
+
+   if (!queue_addr)
+   continue;
+
+   hang_info.pipe_id = pipe;
+   hang_info.queue_id = queue;
+   hang_info.xcc_id = xcc_id;
+   hang_info.queue_address = queue_addr;
+
+   dqm->detect_hang_info[dqm->detect_hang_count] = hang_info;
+   dqm->detect_hang_count++;
+   }
+   }
+
+   return dqm->detect_hang_count;
+}
+
+static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address)
+{
+   struct device_process_node *cur;
+   struct qcm_process_device *qpd;
+   struct queue *q;
+
+   list_for_each_entry(cur, &dqm->queues, list) {
+   qpd = cur->qpd;
+   list_for_each_entry(q, &qpd->queues_list, list) {
+   if (queue_address == q->properties.queue_address)
+   return q;
+   }
+   }
+
+   return NULL;
+}
+
+/* only for compute queue */
+static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
+{
+   int r = 0, reset_count = 0, i;
+
+   if (!dqm->detect_hang_info || dqm->is_hws_hang)
+   return -EIO;
+
+   /* assume dqm locked. */
+   if (!detect_queue_hang(dqm))
+   return -ENOTRECOVERABLE;
+
+   for (i = 0; i < dqm->detect_hang_count; i++) {
+   struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i];
+   struct queue *q = find_queue_by_address(dqm, hang_info.queue_address);
+   struct kfd_process_device *pdd;
+

Re: [PATCH 2/2] drm/amdkfd: support the debugger during per-queue reset

2024-07-30 Thread Felix Kuehling



On 2024-07-26 11:30, Jonathan Kim wrote:
> In order to allow ROCm GDB to handle reset queues, raise an
> EC_QUEUE_RESET exception so that the debugger can subscribe and
> query this exception.
> 
> Reset queues should still be considered suspendable with a status
> flag of KFD_DBG_QUEUE_RESET_MASK.
> However they should not be resumable since user space will no longer
> be able to access reset queues.
> 
> v2: move per-queue reset flag to this patch
> rebase based on patch 1 changes
> 
> Signed-off-by: Jonathan Kim 
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 31 ---
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
>  include/uapi/linux/kfd_ioctl.h|  4 +++
>  3 files changed, 31 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index e335703eff84..cb7b5bbf5c40 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -164,6 +164,10 @@ static void kfd_hws_hang(struct device_queue_manager 
> *dqm)
>   struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>  
>   pdd->has_reset_queue = true;
> + q->properties.is_reset = true;
> + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET),
> +  q->process, q->device, q->doorbell_id,
> +  false, NULL, 0);
>   }
>   }
>  
> @@ -986,7 +990,7 @@ static int suspend_single_queue(struct 
> device_queue_manager *dqm,
>  {
>   bool is_new;
>  
> - if (q->properties.is_suspended)
> + if (q->properties.is_suspended || q->properties.is_reset)
>   return 0;
>  
>   pr_debug("Suspending PASID %u queue [%i]\n",
> @@ -1007,6 +1011,9 @@ static int suspend_single_queue(struct 
> device_queue_manager *dqm,
>   if (dqm->dev->kfd->shared_resources.enable_mes) {
>   int r = remove_queue_mes(dqm, q, &pdd->qpd);
>  
> + if (q->properties.is_reset)
> + return 0;
> +
>   if (r)
>   return r;
>   }
> @@ -1967,10 +1974,14 @@ static void set_queue_as_reset(struct 
> device_queue_manager *dqm, struct queue *q
>  q->properties.queue_id, q->process->pasid);
>  
>   pdd->has_reset_queue = true;
> + q->properties.is_reset = true;
>   if (q->properties.is_active) {
>   q->properties.is_active = false;
>   decrement_queue_count(dqm, qpd, q);
>   }
> +
> + kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_RESET), q->process, q->device,
> +  q->doorbell_id, false, NULL, 0);
>  }
>  
>  static int detect_queue_hang(struct device_queue_manager *dqm)
> @@ -3037,7 +3048,8 @@ int resume_queues(struct kfd_process *p,
>   queue_ids[q_idx] &=
>   
> ~KFD_DBG_QUEUE_INVALID_MASK;
>   } else {
> - queue_ids[q_idx] |=
> + queue_ids[q_idx] |= 
> q->properties.is_reset ?
> + 
> KFD_DBG_QUEUE_RESET_MASK :
>   
> KFD_DBG_QUEUE_ERROR_MASK;
>   break;
>   }
> @@ -3072,7 +3084,7 @@ int resume_queues(struct kfd_process *p,
>   queue_ids);
>  
>   /* mask queue as error on resume fail */
> - if (q_idx != QUEUE_NOT_FOUND)
> + if (q_idx != QUEUE_NOT_FOUND && 
> !q->properties.is_reset)
>   queue_ids[q_idx] |=
>   
> KFD_DBG_QUEUE_ERROR_MASK;
>   }
> @@ -3119,6 +3131,7 @@ int suspend_queues(struct kfd_process *p,
>   struct qcm_process_device *qpd = &pdd->qpd;
>   struct queue *q;
>   int r, per_device_suspended = 0;
> + bool has_queue_reset_fail = false;
>  
>   mutex_lock(&p->event_mutex);
>   dqm_lock(dqm);
> @@ -3135,6 +3148,9 @@ int suspend_queues(struct kfd_process *p,
>  
>   if (!err) {
>   queue_ids[q_idx] &= 
> ~KFD_DBG_QUEUE_INVALID_MASK;
> + if (q->properties.is_reset)
> + queue_ids[q_idx] |= 
> KFD_DBG_QUEUE_RESET_MASK;
> +
>   if (exception_clear_mask && is_mes)
>   

Re: [PATCH 1/2] drm/amdkfd: support per-queue reset on gfx9

2024-07-30 Thread Felix Kuehling


On 2024-07-26 11:30, Jonathan Kim wrote:
> Support per-queue reset for GFX9.  The recommendation is for the driver
> to target reset the HW queue via a SPI MMIO register write.
> 
> Since this requires pipe and HW queue info and MEC FW is limited to
> doorbell reports of hung queues after an unmap failure, scan the HW
> queue slots defined by SET_RESOURCES first to identify the user queue
> candidates to reset.
> 
> Only signal reset events to processes that have had a queue reset.
> 
> If queue reset fails, fall back to GPU reset.
> 
> v2: move reset queue flag for housekeeping to process device.
> split detect and reset into separate functions.
> make reset call safe during power saving modes.
> clean up some other nitpicks.

Some more nit-picks inline.

> 
> Signed-off-by: Jonathan Kim 
> ---
>  .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   2 +
>  .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   4 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |   4 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  16 ++
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|   9 +
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   4 +-
>  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c|  18 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  85 +
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   9 +
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 172 +-
>  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  12 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_events.c   |  21 +++
>  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |   6 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +
>  .../gpu/drm/amd/include/kgd_kfd_interface.h   |   6 +
>  16 files changed, 360 insertions(+), 13 deletions(-)
> 
[snip]
> @@ -1680,6 +1700,14 @@ static int start_cpsch(struct device_queue_manager *dqm)
>  					   &dqm->wait_times);
>  	}
>  
> +	/* setup per-queue reset detection buffer */
> +	num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe *
> +			     dqm->dev->kfd->shared_resources.num_pipe_per_mec *
> +			     NUM_XCC(dqm->dev->xcc_mask);
> +
> +	dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info);
> +	dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL);

You need to check the return value and handle allocation failures.
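For instance (a sketch only -- the exact unwind depends on what start_cpsch
has already set up at this point):

	dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL);
	if (!dqm->detect_hang_info) {
		/* hypothetical error path: drop the lock and fail the start */
		dqm_unlock(dqm);
		return -ENOMEM;
	}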

> +
>   dqm_unlock(dqm);
>  
>   return 0;
> @@ -1713,6 +1741,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
>   kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
>   if (!dqm->dev->kfd->shared_resources.enable_mes)
>   pm_uninit(&dqm->packet_mgr);
> + kfree(dqm->detect_hang_info);

Reset dqm->detect_hang_info to NULL to avoid a dangling pointer.
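I.e.:

	kfree(dqm->detect_hang_info);
	dqm->detect_hang_info = NULL;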

>   dqm_unlock(dqm);
>  
>   return 0;
> @@ -1929,6 +1958,131 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
>   return retval;
>  }
>  
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q,
> +			       struct qcm_process_device *qpd)
> +{
> +	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
> +
> +	pr_err("queue id 0x%0x at pasid 0x%0x is reset\n",
> +	       q->properties.queue_id, q->process->pasid);
> +
> +	pdd->has_reset_queue = true;
> +	if (q->properties.is_active) {
> +		q->properties.is_active = false;
> +		decrement_queue_count(dqm, qpd, q);
> +	}
> +}
> +
> +static int detect_queue_hang(struct device_queue_manager *dqm)
> +{
> + int i;
> +
> + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);

Set dqm->detect_hang_count to 0 to avoid overflows in case multiple hang 
detections get kicked off. Or, if that's not possible, just print a WARN_ON if 
detect_hang_count is not 0 and return.
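Something like (sketch; the return value is just a guess at a sensible error
code):

	if (WARN_ON(dqm->detect_hang_count))
		return -EBUSY;	/* a detection pass is already in flight */

	memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size);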

> +
> + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
> + uint32_t mec, pipe, queue;
[snip]
> @@ -1244,12 +1245,32 @@ void kfd_signal_reset_event(struct kfd_node *dev)
>  	idx = srcu_read_lock(&kfd_processes_srcu);
>  	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>  		int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
> +		struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p);
>  
>  		if (unlikely(user_gpu_id == -EINVAL)) {
>  			WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
>  			continue;
>  		}
>  
> +		if (unlikely(!pdd)) {
> +			WARN_ONCE(1, "Could not get device data from pasid:0x%x\n", p->pasid);
> +			continue;
> +		}
> +
> +		if (dev->dqm->detect_hang_count && !pdd->has_reset_queue)
> +			continue;
> +
> +		if (dev->dqm->detect_hang_count) {

Re: [PATCH] drm/amdkfd: Fix compile error if HMM support not enabled

2024-07-26 Thread Felix Kuehling



On 2024-07-25 19:25, Philip Yang wrote:

Fix the build errors below when the kernel config does not enable HMM support:

drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:107:26: error:
implicit declaration of function 'svm_range_from_addr'

drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:107:24: error:
assignment to 'struct svm_range *' from 'int' makes pointer from integer
without a cast [-Wint-conversion]

drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_queue.c:111:28: error:
invalid use of undefined type 'struct svm_range'

Fixes: de165b53c93f ("drm/amdkfd: Validate user queue svm memory residency")
Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202407252127.zvnxakra-...@intel.com/
Signed-off-by: Philip Yang 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 14 ++
  1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 9807e8adf77d..64c292f0ba1b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -85,6 +85,8 @@ void uninit_queue(struct queue *q)
kfree(q);
  }
  
+#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
+
  static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, 
u64 size)
  {
struct kfd_process *p = pdd->process;
@@ -178,6 +180,18 @@ static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, u
 
 	mutex_unlock(&p->svms.lock);
 }
+#else
+
+static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+   return -EINVAL;
+}
+
+static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+}
+
+#endif
  
  int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo,
 			 u64 expected_size)


Re: [PATCH 1/2] drm/amdkfd: support per-queue reset on gfx9

2024-07-24 Thread Felix Kuehling



On 2024-07-18 13:56, Jonathan Kim wrote:

Support per-queue reset for GFX9.  The recommendation is for the driver
to target reset the HW queue via a SPI MMIO register write.

Since this requires pipe and HW queue info and MEC FW is limited to
doorbell reports of hung queues after an unmap failure, scan the HW
queue slots defined by SET_RESOURCES first to identify the user queue
candidates to reset.

Only signal reset events to processes that have had a queue reset.

If queue reset fails, fall back to GPU reset.

Signed-off-by: Jonathan Kim 
---
  .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c  |   1 +
  .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |   3 +-
  .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c   |   3 +-
  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|   9 +
  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h|   6 +
  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c  |   3 +-
  .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c|  11 +-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  56 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |   6 +
  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 195 --
  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  12 ++
  drivers/gpu/drm/amd/amdkfd/kfd_events.c   |  15 ++
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |   6 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   4 +
  drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +
  .../gpu/drm/amd/include/kgd_kfd_interface.h   |   4 +
  16 files changed, 310 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
index aff08321e976..1dc601e4518a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
@@ -191,4 +191,5 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
+   .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
  };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
index 3a3f3ce09f00..534975c722df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
@@ -418,5 +418,6 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v9_build_grace_period_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
-   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings
+   .program_trap_handler_settings = 
kgd_gfx_v9_program_trap_handler_settings,
+   .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
  };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
index a5c7259cf2a3..b53c1cfa34de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
@@ -541,5 +541,6 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
kgd_gfx_v9_4_3_set_wave_launch_trap_override,
.set_wave_launch_mode = kgd_aldebaran_set_wave_launch_mode,
.set_address_watch = kgd_gfx_v9_4_3_set_address_watch,
-   .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch
+   .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch,
+   .hqd_detect_and_reset = kgd_gfx_v9_hqd_detect_and_reset
  };
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 3ab6c3aa0ad1..dd449a0caba8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -1070,6 +1070,14 @@ static void program_trap_handler_settings(struct 
amdgpu_device *adev,
unlock_srbm(adev);
  }
  
+uint64_t kgd_gfx_v10_hqd_detect_and_reset(struct amdgpu_device *adev,
+					  uint32_t pipe_id, uint32_t queue_id,
+					  uint32_t inst, unsigned int utimeout,
+					  bool detect_only)
+{
+	return 0;
+}
+
  const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.program_sh_mem_settings = kgd_program_sh_mem_settings,
.set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping,
@@ -1097,4 +1105,5 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
.build_grace_period_packet_info = 
kgd_gfx_v10_build_grace_period_packet_info,
.program_trap_handler_settings = program_trap_handler_settings,
+   .hqd_detect_and_reset = kgd_gfx_v10_hqd_detect_and_reset
  };
diff --git a/drivers/gpu/drm/amd/amdgpu

Re: [PATCH v2] drm/amdkfd: Change kfd/svm page fault drain handling

2024-07-24 Thread Felix Kuehling



On 2024-07-19 18:17, Xiaogang.Chen wrote:

From: Xiaogang Chen 

When an app unmaps vm ranges (munmap), kfd/svm starts draining pending page
faults and does not handle any incoming page faults of this process until a
deferred work item gets executed by the default system wq. The period of "not
handling page faults" can be long and is unpredictable, which is adverse to
KFD page fault recovery performance.

This patch uses the time stamp of the incoming page fault to decide whether to
drop or handle the fault. When an app unmaps vm ranges, kfd records each gpu
device's current ih ring time stamp. These time stamps are then used in the
kfd page fault recovery routine.

Any page fault that happens on unmapped ranges after the unmap event is an app
bug that accesses a vm range after unmap. It is not the driver's job to cover
that.

By using the page fault time stamp, there is no need to drain page faults in
the deferred work, so the period during which kfd does not handle page faults
is reduced and can be controlled.
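(Not part of the patch -- the core of the idea is a comparison along these
lines, where checkpoint_ts is a stand-in name for the recorded per-device time
stamp:)

	/* in fault recovery, with fault_ts taken from the IH ring entry */
	if (fault_ts <= pdd->checkpoint_ts)	/* hypothetical field name */
		return 0;	/* fault predates the last munmap: drop it */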

Signed-off-by: Xiaogang.Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |   4 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |   2 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |   3 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |   4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  |   5 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 111 +
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   2 +-
  7 files changed, 88 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 3abfa66d72a2..d90b7ea3f020 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2763,7 +2763,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
   * shouldn't be reported any more.
   */
  bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-   u32 vmid, u32 node_id, uint64_t addr,
+   u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
bool write_fault)
  {
bool is_compute_context = false;
@@ -2789,7 +2789,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
u32 pasid,
addr /= AMDGPU_GPU_PAGE_SIZE;
  
  	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,

-   node_id, addr, write_fault)) {
+   node_id, addr, ts, write_fault)) {
amdgpu_bo_unref(&root);
return true;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 312a408b80d3..1d6a1381ede9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -548,7 +548,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
  void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
  
  bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,

-   u32 vmid, u32 node_id, uint64_t addr,
+   u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
bool write_fault);
  
  void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index d933e19e0cf5..5cceaba6e5c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
+					   entry->timestamp, write_fault))
 			return 1;
 	}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 350f6b6676f1..ac08d9424feb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			cam_index = entry->src_data[2] & 0x3ff;
 
 			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-						     addr, write_fault);
+						     addr, entry->timestamp, write_fault);
 			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
 			if (ret)
 				return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 		 * tables
 		 */
 		if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-					   addr, write_fault))
+

Re: [PATCH] drm/amdkfd: allow users to target recommended SDMA engines

2024-07-24 Thread Felix Kuehling



On 2024-07-24 13:56, Jonathan Kim wrote:

Certain GPUs have better copy performance over xGMI on specific
SDMA engines depending on the source and destination GPU.
Allow users to create SDMA queues on these recommended engines.
Close to 2x overall performance has been observed with this
optimization.

v2: remove unnecessary crat updates and refactor sdma resource
bit setting logic.

Signed-off-by: Jonathan Kim 


Reviewed-by: Felix Kuehling 
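
For reference, user space would exercise this roughly as follows (illustrative
sketch, error handling and the other mandatory queue fields omitted; eng_id
would come from the topology's recommended SDMA engine mask):

	struct kfd_ioctl_create_queue_args args = {0};

	args.gpu_id = gpu_id;
	args.queue_type = KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID;
	args.sdma_engine_id = eng_id;
	args.ring_base_address = (__u64)(uintptr_t)ring_buf;
	args.ring_size = ring_size;
	/* fails with -EINVAL if sdma_engine_id exceeds the maximum id */
	ioctl(kfd_fd, AMDKFD_IOC_CREATE_QUEUE, &args);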



---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 16 ++
  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 38 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +-
  .../amd/amdkfd/kfd_process_queue_manager.c|  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 52 +++
  drivers/gpu/drm/amd/amdkfd/kfd_topology.h |  1 +
  include/uapi/linux/kfd_ioctl.h|  6 ++-
  7 files changed, 116 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 32e5db509560..9610cb90a47e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -255,6 +255,7 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
args->ctx_save_restore_address;
q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
q_properties->ctl_stack_size = args->ctl_stack_size;
+   q_properties->sdma_engine_id = args->sdma_engine_id;
if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -262,6 +263,8 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
q_properties->type = KFD_QUEUE_TYPE_SDMA;
else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
+   else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
+   q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
else
return -ENOTSUPP;
  
@@ -334,6 +337,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,

goto err_bind_process;
}
  
+	if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {

+   int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
+ kfd_get_num_xgmi_sdma_engines(dev) - 1;
+
+   if (q_properties.sdma_engine_id > max_sdma_eng_id) {
+   err = -EINVAL;
+   pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
+  q_properties.sdma_engine_id, max_sdma_eng_id);
+   goto err_sdma_engine_id;
+   }
+   }
+
if (!pdd->qpd.proc_doorbells) {
err = kfd_alloc_process_doorbells(dev->kfd, pdd);
if (err) {
@@ -425,6 +440,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
if (wptr_bo)
amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
  err_wptr_map_gart:
+err_sdma_engine_id:
  err_bind_process:
  err_pdd:
mutex_unlock(&p->mutex);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4f48507418d2..69315885519d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1534,6 +1534,41 @@ static int allocate_sdma_queue(struct 
device_queue_manager *dqm,
q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
q->properties.sdma_queue_id = q->sdma_id /
kfd_get_num_xgmi_sdma_engines(dqm->dev);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+		int i, num_queues, num_engines, eng_offset = 0, start_engine;
+		bool free_bit_found = false, is_xgmi = false;
+
+		if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) {
+			num_queues = get_num_sdma_queues(dqm);
+			num_engines = kfd_get_num_sdma_engines(dqm->dev);
+			q->properties.type = KFD_QUEUE_TYPE_SDMA;
+		} else {
+			num_queues = get_num_xgmi_sdma_queues(dqm);
+			num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev);
+			eng_offset = kfd_get_num_sdma_engines(dqm->dev);
+			q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
+			is_xgmi = true;
+		}
+
+		/* Scan available bit based on target engine ID. */
+		start_engine

Re: [PATCH] drm/amdkfd: allow users to target recommended SDMA engines

2024-07-19 Thread Felix Kuehling

On 2024-07-18 19:05, Jonathan Kim wrote:

Certain GPUs have better copy performance over xGMI on specific
SDMA engines depending on the source and destination GPU.
Allow users to create SDMA queues on these recommended engines.
Close to 2x overall performance has been observed with this
optimization.

Signed-off-by: Jonathan Kim 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 16 ++
  drivers/gpu/drm/amd/amdkfd/kfd_crat.h |  3 +-
  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 39 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  5 +-
  .../amd/amdkfd/kfd_process_queue_manager.c|  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 52 +++
  drivers/gpu/drm/amd/amdkfd/kfd_topology.h |  1 +
  include/uapi/linux/kfd_ioctl.h|  6 ++-
  8 files changed, 119 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 32e5db509560..9610cb90a47e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -255,6 +255,7 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
args->ctx_save_restore_address;
q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
q_properties->ctl_stack_size = args->ctl_stack_size;
+   q_properties->sdma_engine_id = args->sdma_engine_id;
if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
@@ -262,6 +263,8 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
q_properties->type = KFD_QUEUE_TYPE_SDMA;
else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
+   else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
+   q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
else
return -ENOTSUPP;
  
@@ -334,6 +337,18 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,

goto err_bind_process;
}
  
+	if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {

+   int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
+ kfd_get_num_xgmi_sdma_engines(dev) - 1;
+
+   if (q_properties.sdma_engine_id > max_sdma_eng_id) {
+   err = -EINVAL;
+   pr_err("sdma_engine_id %i exceeds maximum id of %i\n",
+  q_properties.sdma_engine_id, max_sdma_eng_id);
+   goto err_sdma_engine_id;
+   }
+   }
+
if (!pdd->qpd.proc_doorbells) {
err = kfd_alloc_process_doorbells(dev->kfd, pdd);
if (err) {
@@ -425,6 +440,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
if (wptr_bo)
amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
  err_wptr_map_gart:
+err_sdma_engine_id:
  err_bind_process:
  err_pdd:
mutex_unlock(&p->mutex);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index a8ca7ecb6d27..e880a71837bc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -259,7 +259,7 @@ struct crat_subtype_ccompute {
  #define CRAT_IOLINK_TYPE_OTHER16
  #define CRAT_IOLINK_TYPE_MAX  255
  
-#define CRAT_IOLINK_RESERVED_LENGTH	24
+#define CRAT_IOLINK_RESERVED_LENGTH	20
  
  struct crat_subtype_iolink {

uint8_t type;
@@ -276,6 +276,7 @@ struct crat_subtype_iolink {
uint32_tminimum_bandwidth_mbs;
uint32_tmaximum_bandwidth_mbs;
uint32_trecommended_transfer_size;
+   uint32_trecommended_sdma_eng_id_mask;


This seems completely unnecessary because your code in kfd_topology 
doesn't depend on this info being in the CRAT table.




uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH - 1];
uint8_t weight_xgmi;
  };
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4f48507418d2..58d7710ebb30 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1534,6 +1534,42 @@ static int allocate_sdma_queue(struct 
device_queue_manager *dqm,
q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
q->properties.sdma_queue_id = q->sdma_id /
kfd_get_num_xgmi_sdma_engines(dqm->dev);
+   } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
+   int i, num_queues, num_engines, eng_offset = 0;
+   bool

Re: [PATCH v2 0/9] KFD user queue validation

2024-07-18 Thread Felix Kuehling


On 2024-07-18 17:05, Philip Yang wrote:
> This patch series does additional queue buffer validation in the queue
> creation IOCTLs, failing queue creation if buffers are not mapped on the GPU
> with the expected size.
> 
> Ensure queue buffer residency by tracking the GPUVM virtual addresses of
> queue buffers, to return an error if the user tries to free and unmap them
> while the queue is active, or to evict the queue if SVM memory is unmapped
> and freed from the CPU.
> 
> Patch 1-2 is preparation work and a general fix.
> 
> v2:
>  - patch 3/9, keep wptr_bo_gart in struct queue

The series is
Reviewed-by: Felix Kuehling 

> 
> Philip Yang (9):
>   drm/amdkfd: kfd_bo_mapped_dev support partition
>   drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer
>   drm/amdkfd: Refactor queue wptr_bo GART mapping
>   drm/amdkfd: Validate user queue buffers
>   drm/amdkfd: Ensure user queue buffers residency
>   drm/amdkfd: Validate user queue svm memory residency
>   drm/amdkfd: Validate user queue update
>   drm/amdkfd: Store queue cwsr area size to node properties
>   drm/amdkfd: Validate queue cwsr area and eop buffer size
> 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  14 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   6 +-
>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  24 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h|   6 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  61 +---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c   |   4 +-
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c |   8 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  |   2 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  19 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c  |   2 +-
>  .../amd/amdkfd/kfd_process_queue_manager.c|  79 +++-
>  drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 336 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  12 +
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   4 +
>  16 files changed, 489 insertions(+), 91 deletions(-)
> 


Re: [PATCH v2] drm/amdkfd: Correct svm prange overlapping handling at svm_range_set_attr ioctl

2024-07-18 Thread Felix Kuehling



On 2024-07-18 1:25, Chen, Xiaogang wrote:
> 
> On 7/17/2024 6:02 PM, Felix Kuehling wrote:
>>
>> On 2024-06-26 11:06, Xiaogang.Chen wrote:
>>> From: Xiaogang Chen 
>>>
>>> When a user adds a new vm range that overlaps with existing svm pranges,
>>> the current kfd creates a cloned prange, splits it, then replaces the
>>> original prange with it. That destroys the original prange's locks, and the
>>> cloned prange's locks do not inherit the original prange's lock contexts.
>>> This may cause issues if code still needs to use these locks. In general we
>>> should keep using the original prange, update its internal data that got
>>> changed during the split, then free the cloned prange.
>>
>> While splitting/updating ranges, the svms->lock needs to be held. You cannot 
>> have concurrent threads accessing ranges while we're updating the range 
>> list. If that is a possibility, you have a race condition anyway. You also 
>> can't split, shrink or otherwise modify a range while someone else is 
>> accessing that range. So keeping the same locking context is a non-issue.
>>
> We do hold svms->lock when calling svm_range_add. The patch does not aim to 
> fix a race condition. It keeps the original svm range's context when we need 
> to split or update it. The current implementation "duplicates" a new one, 
> then destroys the original svm range.
My point is, nobody can be using the lock context while we update the range 
list. Therefore it doesn't matter if we replace it with a new one.

> 
>>>
>>> This patch changes the vm range overlap handling so that it does not remove
>>> existing pranges; instead it updates them for the split and keeps their
>>> locks alive.
>>
>> It sounds like you're trying to fix a problem here. Is this an actual or a 
>> hypothetical problem?
>>
> It is not a problem in reality so far. It uses a way that does not destroy 
> the original svm range when splitting/updating it, so its locks (prange->lock, 
> prange->migrate_mutex) keep their context. The so-called "clone svm range" 
> creates new locks that are not related to the original locks. I think that is 
> not reasonable.

That's your opinion. In my opinion the current code is perfectly reasonable. 
There is no technical reason that the prange and its lock context must remain 
the same. I see no technical justification for a change that makes the code 
longer, more complex, and introduces the risk of regressions.

Regards,
  Felix

> 
>>
>>>
>>> Signed-off-by: Xiaogang Chen
>>> ---
>>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 112 ---
>>>   1 file changed, 82 insertions(+), 30 deletions(-)
>>
>> Just looking at the number of added and removed lines, this doesn't look 
>> like a simplification. I really question the justification for this change. 
>> If it makes the code more complicated or less robust, without a good reason, 
>> then it's not a good change.
>>
> As above, it does not make the code simpler or more complicated. It 
> splits/updates the svm range directly on the prange data instead of 
> destroying the original prange and generating a new one.
> 
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> index 407636a68814..a66b8c96ee14 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>>> @@ -1967,7 +1967,8 @@ svm_range_evict(struct svm_range *prange, struct 
>>> mm_struct *mm,
>>>   return r;
>>>   }
>>>   -static struct svm_range *svm_range_clone(struct svm_range *old)
>>> +/* create a prange that has same range/size/addr etc info as old */
>>> +static struct svm_range *svm_range_duplicate(struct svm_range *old)
>>
>> This seems like an unnecessary name change. "clone" and "duplicate" mean the 
>> same thing. But "clone" is shorter.
>>
> "clone" should mean identical to the existing one. Here we use some items 
> from the existing svm_range to build a new one, and the new one is not 
> entirely the same as the existing one; for example, prange->lock differs 
> between the old and new svm range.
> 
>>>   {
>>>   struct svm_range *new;
>>>   @@ -1999,6 +2000,25 @@ static struct svm_range *svm_range_clone(struct 
>>> svm_range *old)
>>>   return new;
>>>   }
>>>   +/* copy range/size/addr info from src to dst prange */
>>> +static void svm_range_copy(struct svm_range *dst, struct svm_range *src)
>>> +{
>>> +    dst->npages = src->npages;
>>> +    d

Re: [PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping

2024-07-18 Thread Felix Kuehling



On 2024-07-18 15:57, Philip Yang wrote:
> 
> On 2024-07-17 16:16, Felix Kuehling wrote:
>> Sorry, I see that this patch still doesn't propagate errors returned from 
>> kfd_queue_release_buffers correctly. And the later patches in the series 
>> don't seem to fix it either. See inline.
> The kfd_queue_release_buffers return value is handled in the queue destroy 
> path, to return -ERESTARTSYS if we fail to take the vm lock to release 
> buffers because a signal was received. See inline.

Sorry, I thought I had searched all the places that call 
kfd_queue_release_buffers, but I missed the one where the error handling 
matters most, which does handle it correctly. More inline.

>>
>> On 2024-07-15 08:34, Philip Yang wrote:
>>> Add helper function kfd_queue_acquire_buffers to get queue wptr_bo
>>> reference from queue write_ptr if it is mapped to the KFD node with
>>> expected size.
>>>
>>> Move wptr_bo to structure queue_properties from struct queue as queue is
>>> allocated after queue buffers are validated, then we can remove wptr_bo
>>> parameter from pqm_create_queue.
>>>
>>> Because amdgpu_bo_unref clear the pointer, queue_properties wptr_bo is
>>> used to acquire and release wptr_bo for validation, add wptr_bo_gart to
>>> queue_propertes, to hold wptr_bo reference for GART mapping and
>>> umapping.
>>>
>>> Move MES wptr_bo GART mapping to init_user_queue, the same location with
>>> queue ctx_bo GART mapping.
>>>
>>> Signed-off-by: Philip Yang 
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
>>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  5 +-
>>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 56 +++---
>>>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 +-
>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++--
>>>   .../amd/amdkfd/kfd_process_queue_manager.c    | 45 +++
>>>   drivers/gpu/drm/amd/amdkfd/kfd_queue.c    | 57 +++
>>>   7 files changed, 116 insertions(+), 69 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> index 6e591280774b..4ed49265c764 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>>> @@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct 
>>> kgd_mem *mem,
>>>    void **kptr, uint64_t *size);
>>>   void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
>>>   -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);
>>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct 
>>> amdgpu_bo **bo_gart);
>>>     int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>>>   struct dma_fence __rcu **ef);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 199e387d35f4..0ab37e7aec26 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>>>   /**
>>>    * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment 
>>> reference count
>>>    * @bo: Buffer object to be mapped
>>> + * @bo_gart: Return bo reference
>>>    *
>>>    * Before return, bo reference count is incremented. To release the 
>>> reference and unpin/
>>>    * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
>>>    */
>>> -int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
>>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct 
>>> amdgpu_bo **bo_gart)
>>>   {
>>>   int ret;
>>>   @@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct 
>>> amdgpu_bo *bo)
>>>     amdgpu_bo_unreserve(bo);
>>>   -    bo = amdgpu_bo_ref(bo);
>>> +    *bo_gart = amdgpu_bo_ref(bo);
>>>     return 0;
>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> index 823f245dc7d0..202f24ee4bd7 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>>> @@ -247,8 +247,8 @@ static int set_queue_properties_from_user(struct 
>>> queue_properties *q_pr

Re: [PATCH v2] drm/amdkfd: Correct svm prange overlapping handling at svm_range_set_attr ioctl

2024-07-17 Thread Felix Kuehling



On 2024-06-26 11:06, Xiaogang.Chen wrote:

From: Xiaogang Chen 

When a user adds a new vm range that overlaps with existing svm pranges, the
current kfd creates a cloned prange, splits it, then replaces the original
prange with it. That destroys the original prange's locks, and the cloned
prange's locks do not inherit the original prange's lock contexts. This may
cause issues if code still needs to use these locks. In general we should keep
using the original prange, update its internal data that got changed during
the split, then free the cloned prange.


While splitting/updating ranges, the svms->lock needs to be held. You 
cannot have concurrent threads accessing ranges while we're updating the 
range list. If that is a possibility, you have a race condition anyway. 
You also can't split, shrink or otherwise modify a range while someone 
else is accessing that range. So keeping the same locking context is a 
non-issue.
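(Concretely, the update happens inside a critical section along these lines --
a sketch based on the svm_range_add signature visible further down:

	mutex_lock(&p->svms.lock);
	r = svm_range_add(p, start, size, nattr, attrs, &update_list,
			  &insert_list, &remove_list, &remap_list);
	/* ... apply update_list/insert_list, free remove_list ... */
	mutex_unlock(&p->svms.lock);

so no other thread can observe the intermediate state, old locks or new.)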





This patch changes the vm range overlap handling so that it does not remove
existing pranges; instead it updates them for the split and keeps their locks
alive.


It sounds like you're trying to fix a problem here. Is this an actual or 
a hypothetical problem?





Signed-off-by: Xiaogang Chen
---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 112 ---
  1 file changed, 82 insertions(+), 30 deletions(-)


Just looking at the number of added and removed lines, this doesn't look 
like a simplification. I really question the justification for this 
change. If it makes the code more complicated or less robust, without a 
good reason, then it's not a good change.





diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 407636a68814..a66b8c96ee14 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1967,7 +1967,8 @@ svm_range_evict(struct svm_range *prange, struct 
mm_struct *mm,
return r;
  }
  
-static struct svm_range *svm_range_clone(struct svm_range *old)
+/* create a prange that has same range/size/addr etc info as old */
+static struct svm_range *svm_range_duplicate(struct svm_range *old)

This seems like an unnecessary name change. "clone" and "duplicate" mean 
the same thing. But "clone" is shorter.




  {
struct svm_range *new;
  
@@ -1999,6 +2000,25 @@ static struct svm_range *svm_range_clone(struct svm_range *old)

return new;
  }
  
+/* copy range/size/addr info from src to dst prange */
+static void svm_range_copy(struct svm_range *dst, struct svm_range *src)
+{
+	dst->npages = src->npages;
+	dst->start = src->start;
+	dst->last = src->last;
+
+	dst->vram_pages = src->vram_pages;
+	dst->offset = src->offset;
+
+	for (int i = 0; i < MAX_GPU_INSTANCE; i++) {
+		if (!src->dma_addr[i])
+			continue;
+
+		memcpy(dst->dma_addr[i], src->dma_addr[i],
+		       src->npages * sizeof(*src->dma_addr[i]));

This does not reallocate/resize the dma_addr arrays. Reallocating these 
arrays can't be done here, because this function is not allowed to fail. 
That's one reason to use the clone instead of modifying the existing range.
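To do it in place you would need something like this (sketch illustrating why
it can't work here):

	/* hypothetical in-place resize -- krealloc can fail, and
	 * svm_range_copy is a void function with no way to report it */
	dst->dma_addr[i] = krealloc(dst->dma_addr[i],
				    src->npages * sizeof(*src->dma_addr[i]),
				    GFP_KERNEL);
	if (!dst->dma_addr[i])
		return -ENOMEM;	/* no such error path exists */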


Regards,
  Felix



+   }
+}
+
  void svm_range_set_max_pages(struct amdgpu_device *adev)
  {
uint64_t max_pages;
@@ -2057,20 +2077,19 @@ svm_range_split_new(struct svm_range_list *svms, 
uint64_t start, uint64_t last,
   * @attrs: array of attributes
   * @update_list: output, the ranges need validate and update GPU mapping
   * @insert_list: output, the ranges need insert to svms
- * @remove_list: output, the ranges are replaced and need remove from svms
   * @remap_list: output, remap unaligned svm ranges
   *
   * Check if the virtual address range has overlap with any existing ranges,
   * split partly overlapping ranges and add new ranges in the gaps. All changes
   * should be applied to the range_list and interval tree transactionally. If
   * any range split or allocation fails, the entire update fails. Therefore any
- * existing overlapping svm_ranges are cloned and the original svm_ranges left
+ * existing overlapping svm_ranges are duplicated and the original svm_ranges left
  * unchanged.
  *
- * If the transaction succeeds, the caller can update and insert clones and
- * new ranges, then free the originals.
+ * If the transaction succeeds, the caller can update and insert split ranges and
+ * new ranges.
  *
- * Otherwise the caller can free the clones and new ranges, while the old
+ * Otherwise the caller can free the duplicated and new ranges, while the old
  * svm_ranges remain unchanged.
   *
   * Context: Process context, caller must hold svms->lock
@@ -2082,7 +2101,7 @@ static int
  svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
  uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
  struct list_head *update_list, struct list_head *insert_list,
- struct list_head *remove_list, struct 

Re: [PATCH 1/6] drm/amdgpu/gfx: add bad opcode interrupt

2024-07-17 Thread Felix Kuehling



On 2024-07-17 16:40, Alex Deucher wrote:

Add the irq source for bad opcodes.

Signed-off-by: Alex Deucher 


Looks like all the error IRQ handlers return 0, which means the 
interrupts will still get forwarded to KFD (which is good). The series is


Acked-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index ddda94e49db4..86d3fa7eef90 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -391,6 +391,7 @@ struct amdgpu_gfx {
struct amdgpu_irq_src   eop_irq;
struct amdgpu_irq_src   priv_reg_irq;
struct amdgpu_irq_src   priv_inst_irq;
+   struct amdgpu_irq_src   bad_op_irq;
struct amdgpu_irq_src   cp_ecc_error_irq;
struct amdgpu_irq_src   sq_irq;
struct amdgpu_irq_src   rlc_gc_fed_irq;
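
For context, a new irq source like this is typically wired up in the IP
block's sw_init with a funcs table and amdgpu_irq_add_id(). A hedged sketch --
the handler name and source ID below are placeholders, not taken from this
series:

	static int gfx_bad_op_irq_process(struct amdgpu_device *adev,
					  struct amdgpu_irq_src *source,
					  struct amdgpu_iv_entry *entry)
	{
		DRM_ERROR("Illegal opcode in command stream\n");
		return 0;	/* 0 so the IV still gets forwarded to KFD */
	}

	static const struct amdgpu_irq_src_funcs gfx_bad_op_irq_funcs = {
		.process = gfx_bad_op_irq_process,
	};

	/* in <ip>_sw_init(): */
	adev->gfx.bad_op_irq.funcs = &gfx_bad_op_irq_funcs;
	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP,
			      CP_BAD_OPCODE_SRCID /* placeholder */,
			      &adev->gfx.bad_op_irq);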


Re: [PATCH 5/9] drm/amdkfd: Ensure user queue buffers residency

2024-07-17 Thread Felix Kuehling

On 2024-07-15 08:34, Philip Yang wrote:

Add a queue_refcount to struct bo_va, and return -EBUSY to fail unmapping a
BO from the GPU if the bo_va queue_refcount is not zero.

Queue creation increases the bo_va queue_refcount and queue destruction
decreases it, to ensure the queue buffers stay mapped on the GPU while the
queue is active.
Signed-off-by: Philip Yang 
---
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 14 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h|  6 
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  3 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 34 ---
  5 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 0ab37e7aec26..6d5fd371d5ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1252,7 +1252,7 @@ static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
 	return ret;
 }
 
-static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
+static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
 				struct kfd_mem_attachment *entry,
 				struct amdgpu_sync *sync)
 {
@@ -1260,11 +1260,18 @@ static void unmap_bo_from_gpuvm(struct kgd_mem *mem,
 	struct amdgpu_device *adev = entry->adev;
 	struct amdgpu_vm *vm = bo_va->base.vm;
 
+	if (bo_va->queue_refcount) {
+		pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);
+		return -EBUSY;
+	}
+
 	amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
 
 	amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
 
 	amdgpu_sync_fence(sync, bo_va->last_pt_update);
+
+	return 0;
 }
  
  static int update_gpuvm_pte(struct kgd_mem *mem,

@@ -2191,7 +2198,10 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
 entry->va, entry->va + bo_size, entry);
  
-		unmap_bo_from_gpuvm(mem, entry, ctx.sync);

+   ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+   if (ret)
+   goto unreserve_out;
+
entry->is_mapped = false;
  
  		mem->mapped_to_gpu_memory--;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index bc42ccbde659..d7e27957013f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -90,6 +90,12 @@ struct amdgpu_bo_va {
 	bool				cleared;
 
 	bool				is_xgmi;
+
+	/*
+	 * protected by vm reservation lock
+	 * if non-zero, cannot unmap from GPU because user queues may still access it
+	 */
+	unsigned int			queue_refcount;
 };
  
  struct amdgpu_bo {

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 202f24ee4bd7..65a37ac5a0f0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1384,8 +1384,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file 
*filep,
err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
peer_pdd->dev->adev, (struct kgd_mem *)mem, 
peer_pdd->drm_priv);
if (err) {
-   pr_err("Failed to unmap from gpu %d/%d\n",
-  i, args->n_devices);
+   pr_debug("Failed to unmap from gpu %d/%d\n", i, 
args->n_devices);
goto unmap_memory_from_gpu_failed;
}
args->n_success = i+1;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d0dca20849d9..95fbdb12beb1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1291,6 +1291,7 @@ void print_queue_properties(struct queue_properties *q);
  void print_queue(struct queue *q);
  int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size);
+void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo);
  int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
  int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct 
queue_properties *properties);
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c

index 0e661160c295..3fd386dcb011 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -106,6 +106,7 @@ int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user 
*addr, struct amdgpu_
}
  
  	*pbo = amdgpu_bo_ref(mapping->bo_va->base.bo);

+   mapping->bo_va->qu

Re: [PATCH 6/9] drm/amdkfd: Validate user queue svm memory residency

2024-07-17 Thread Felix Kuehling



On 2024-07-15 08:34, Philip Yang wrote:

The queue CWSR area may be registered to the GPU as svm memory; on queue
creation, ensure the svm range is mapped to the GPU with the
KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED flag.

Add queue_refcount to struct svm_range, to track queue CWSR area usage.

Because the unmap mmu notifier callback's return value is ignored, print a
pr_warn message to the dmesg log if the application unmaps the CWSR area
while the queue is active. To be safe, also evict the user queue.

Signed-off-by: Philip Yang 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 110 -
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  12 +++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   1 +
  3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index 3fd386dcb011..67242ce051b5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -24,6 +24,7 @@
  
  #include 

  #include "kfd_priv.h"
+#include "kfd_svm.h"
  
  void print_queue_properties(struct queue_properties *q)

  {
@@ -83,6 +84,100 @@ void uninit_queue(struct queue *q)
kfree(q);
  }
  
+static int kfd_queue_buffer_svm_get(struct kfd_process_device *pdd, u64 addr, u64 size)
+{
+   struct kfd_process *p = pdd->process;
+   struct list_head update_list;
+   struct svm_range *prange;
+   int ret = -EINVAL;
+
+   INIT_LIST_HEAD(&update_list);
+   addr >>= PAGE_SHIFT;
+   size >>= PAGE_SHIFT;
+
+   mutex_lock(&p->svms.lock);
+
+	/*
+	 * range may split to multiple svm pranges aligned to granularity boundary.
+	 */
+   while (size) {
+   uint32_t gpuid, gpuidx;
+   int r;
+
+   prange = svm_range_from_addr(&p->svms, addr, NULL);
+   if (!prange)
+   break;
+
+   if (!prange->mapped_to_gpu)
+   break;
+
+   r = kfd_process_gpuid_from_node(p, pdd->dev, &gpuid, &gpuidx);
+   if (r < 0)
+   break;
+   if (!test_bit(gpuidx, prange->bitmap_access) &&
+   !test_bit(gpuidx, prange->bitmap_aip))
+   break;
+
+   if (!(prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED))
+   break;
+
+   list_add(&prange->update_list, &update_list);
+
+   if (prange->last - prange->start + 1 >= size) {
+   size = 0;
+   break;
+   }
+
+   size -= prange->last - prange->start + 1;
+   addr += prange->last - prange->start + 1;
+   }
+   if (size) {
+   pr_debug("[0x%llx 0x%llx] not registered\n", addr, addr + size 
- 1);
+   goto out_unlock;
+   }
+
+   list_for_each_entry(prange, &update_list, update_list)
+   atomic_inc(&prange->queue_refcount);
+   ret = 0;
+
+out_unlock:
+   mutex_unlock(&p->svms.lock);
+   return ret;
+}
+
+static void kfd_queue_buffer_svm_put(struct kfd_process_device *pdd, u64 addr, 
u64 size)
+{
+   struct kfd_process *p = pdd->process;
+   struct svm_range *prange, *pchild;
+   struct interval_tree_node *node;
+   unsigned long last;
+
+   addr >>= PAGE_SHIFT;
+   last = addr + (size >> PAGE_SHIFT) - 1;
+
+   mutex_lock(&p->svms.lock);
+
+   node = interval_tree_iter_first(&p->svms.objects, addr, last);
+   while (node) {
+   struct interval_tree_node *next_node;
+   unsigned long next_start;
+
+   prange = container_of(node, struct svm_range, it_node);
+   next_node = interval_tree_iter_next(node, addr, last);
+   next_start = min(node->last, last) + 1;
+
+   if (atomic_add_unless(&prange->queue_refcount, -1, 0)) {
+   list_for_each_entry(pchild, &prange->child_list, 
child_list)
+   atomic_add_unless(&pchild->queue_refcount, -1, 
0);
+   }
+
+   node = next_node;
+   addr = next_start;
+   }
+
+   mutex_unlock(&p->svms.lock);
+}
+
  int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct 
amdgpu_bo **pbo,
 u64 expected_size)
  {
@@ -165,8 +260,17 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
  
  	err = kfd_queue_buffer_get(vm, (void *)properties->ctx_save_restore_area_address,

   &properties->cwsr_bo, 0);
+   if (!err)
+   goto out_unreserve;
+
+   amdgpu_bo_unreserve(vm->root.bo);
+
+   err = kfd_queue_buffer_svm_get(pdd, 
properties-&

Re: [PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping

2024-07-17 Thread Felix Kuehling
Sorry, I see that this patch still doesn't propagate errors returned 
from kfd_queue_release_buffers correctly. And the later patches in the 
series don't seem to fix it either. See inline.


On 2024-07-15 08:34, Philip Yang wrote:

Add helper function kfd_queue_acquire_buffers to get queue wptr_bo
reference from queue write_ptr if it is mapped to the KFD node with
expected size.

Move wptr_bo to structure queue_properties from struct queue as queue is
allocated after queue buffers are validated, then we can remove wptr_bo
parameter from pqm_create_queue.

Because amdgpu_bo_unref clear the pointer, queue_properties wptr_bo is
used to acquire and release wptr_bo for validation, add wptr_bo_gart to
queue_propertes, to hold wptr_bo reference for GART mapping and
umapping.

Move MES wptr_bo GART mapping to init_user_queue, the same location with
queue ctx_bo GART mapping.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  5 +-
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 56 +++---
  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++--
  .../amd/amdkfd/kfd_process_queue_manager.c| 45 +++
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 57 +++
  7 files changed, 116 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 6e591280774b..4ed49265c764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem 
*mem,
 void **kptr, uint64_t *size);
  void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
  
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);

+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart);
  
  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,

struct dma_fence __rcu **ef);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 199e387d35f4..0ab37e7aec26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
  /**
   * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference 
count
   * @bo: Buffer object to be mapped
+ * @bo_gart: Return bo reference
   *
   * Before return, bo reference count is incremented. To release the reference 
and unpin/
   * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
   */
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart)
  {
int ret;
  
@@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
  
  	amdgpu_bo_unreserve(bo);
  
-	bo = amdgpu_bo_ref(bo);

+   *bo_gart = amdgpu_bo_ref(bo);
  
  	return 0;
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index 823f245dc7d0..202f24ee4bd7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -247,8 +247,8 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
q_properties->priority = args->queue_priority;
q_properties->queue_address = args->ring_base_address;
q_properties->queue_size = args->ring_size;
-   q_properties->read_ptr = (uint32_t *) args->read_pointer_address;
-   q_properties->write_ptr = (uint32_t *) args->write_pointer_address;
+   q_properties->read_ptr = (void __user *)args->read_pointer_address;
+   q_properties->write_ptr = (void __user *)args->write_pointer_address;
q_properties->eop_ring_buffer_address = args->eop_buffer_address;
q_properties->eop_ring_buffer_size = args->eop_buffer_size;
q_properties->ctx_save_restore_area_address =
@@ -306,7 +306,6 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
struct kfd_process_device *pdd;
struct queue_properties q_properties;
uint32_t doorbell_offset_in_process = 0;
-   struct amdgpu_bo *wptr_bo = NULL;
  
  	memset(&q_properties, 0, sizeof(struct queue_properties));
  
@@ -342,53 +341,17 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,

}
}
  
-	/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work

-* on unmapped queues for usermode queue oversubscription (no 
aggregated doorbell)
-*/
-   if (dev->kfd->shared_resources.enable_mes &&
-   ((dev->adev->mes.sched_version & 
AMDGPU_MES_API_VERSI

Re: [PATCH 3/9] drm/amdkfd: Refactor queue wptr_bo GART mapping

2024-07-17 Thread Felix Kuehling



On 2024-07-15 08:34, Philip Yang wrote:

Add helper function kfd_queue_acquire_buffers to get queue wptr_bo
reference from queue write_ptr if it is mapped to the KFD node with
expected size.

Move wptr_bo to structure queue_properties from struct queue as queue is
allocated after queue buffers are validated, then we can remove wptr_bo
parameter from pqm_create_queue.

Because amdgpu_bo_unref clear the pointer, queue_properties wptr_bo is
used to acquire and release wptr_bo for validation, add wptr_bo_gart to
queue_propertes, to hold wptr_bo reference for GART mapping and
umapping.

Move MES wptr_bo GART mapping to init_user_queue, the same location with
queue ctx_bo GART mapping.

Signed-off-by: Philip Yang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  5 +-
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 56 +++---
  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 14 +++--
  .../amd/amdkfd/kfd_process_queue_manager.c| 45 +++
  drivers/gpu/drm/amd/amdkfd/kfd_queue.c| 57 +++
  7 files changed, 116 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 6e591280774b..4ed49265c764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -322,7 +322,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem 
*mem,
 void **kptr, uint64_t *size);
  void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
  
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo);

+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart);
  
  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,

struct dma_fence __rcu **ef);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 199e387d35f4..0ab37e7aec26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2226,11 +2226,12 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
  /**
   * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference 
count
   * @bo: Buffer object to be mapped
+ * @bo_gart: Return bo reference
   *
   * Before return, bo reference count is incremented. To release the reference 
and unpin/
   * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
   */
-int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo 
**bo_gart)
  {
int ret;
  
@@ -2257,7 +2258,7 @@ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo)
  
  	amdgpu_bo_unreserve(bo);
  
-	bo = amdgpu_bo_ref(bo);

+   *bo_gart = amdgpu_bo_ref(bo);
  
  	return 0;
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c

index 823f245dc7d0..202f24ee4bd7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -247,8 +247,8 @@ static int set_queue_properties_from_user(struct 
queue_properties *q_properties,
q_properties->priority = args->queue_priority;
q_properties->queue_address = args->ring_base_address;
q_properties->queue_size = args->ring_size;
-   q_properties->read_ptr = (uint32_t *) args->read_pointer_address;
-   q_properties->write_ptr = (uint32_t *) args->write_pointer_address;
+   q_properties->read_ptr = (void __user *)args->read_pointer_address;
+   q_properties->write_ptr = (void __user *)args->write_pointer_address;
q_properties->eop_ring_buffer_address = args->eop_buffer_address;
q_properties->eop_ring_buffer_size = args->eop_buffer_size;
q_properties->ctx_save_restore_area_address =
@@ -306,7 +306,6 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
struct kfd_process_device *pdd;
struct queue_properties q_properties;
uint32_t doorbell_offset_in_process = 0;
-   struct amdgpu_bo *wptr_bo = NULL;
  
  	memset(&q_properties, 0, sizeof(struct queue_properties));
  
@@ -342,53 +341,17 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,

}
}
  
-	/* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
-	 * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
-	 */
-	if (dev->kfd->shared_resources.enable_mes &&
-	    ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK)
-	    >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) {
-		struct amdgpu_bo_va_mapping *wptr_mapping;
-		struct amdgpu_vm *wptr_vm;
-

Re: [PATCH 2/9] drm/amdkfd: amdkfd_free_gtt_mem clear the correct pointer

2024-07-17 Thread Felix Kuehling



On 2024-07-15 08:34, Philip Yang wrote:

Pass a pointer reference to amdgpu_bo_unref so it clears the correct pointer;
otherwise amdgpu_bo_unref clears only the local variable, the original pointer
is not set to NULL, and this could cause a use-after-free bug.
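
To illustrate the bug class with a hedged sketch (not the driver code
itself): amdgpu_bo_unref() takes a double pointer and NULLs out whatever it
is handed, so passing the address of a local copy leaves the caller's
pointer dangling:

/* Illustrative sketch of the pointer-clearing pitfall. */
static void buggy_free(void *mem_obj)
{
    struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj;

    amdgpu_bo_unref(&bo);   /* clears only the local 'bo' copy */
    /* the caller's pointer still holds the stale address */
}

static void fixed_free(void **mem_obj)
{
    struct amdgpu_bo **bo = (struct amdgpu_bo **)mem_obj;

    amdgpu_bo_unref(bo);    /* clears the caller's pointer too */
}
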

Signed-off-by: Philip Yang 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 14 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_device.c|  4 ++--
  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c   |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_process.c   |  2 +-
  .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |  4 ++--
  8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 03205e3c3746..c272461d70a9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -364,15 +364,15 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device 
*adev, size_t size,
return r;
  }
  
-void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj)

+void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj)
  {
-   struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj;
+   struct amdgpu_bo **bo = (struct amdgpu_bo **) mem_obj;
  
-	amdgpu_bo_reserve(bo, true);

-   amdgpu_bo_kunmap(bo);
-   amdgpu_bo_unpin(bo);
-   amdgpu_bo_unreserve(bo);
-   amdgpu_bo_unref(&(bo));
+   amdgpu_bo_reserve(*bo, true);
+   amdgpu_bo_kunmap(*bo);
+   amdgpu_bo_unpin(*bo);
+   amdgpu_bo_unreserve(*bo);
+   amdgpu_bo_unref(bo);
  }
  
  int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 66b1c72c81e5..6e591280774b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -235,7 +235,7 @@ int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo 
*bo,
  int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr, bool mqd_gfx9);
-void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void *mem_obj);
+void amdgpu_amdkfd_free_gtt_mem(struct amdgpu_device *adev, void **mem_obj);
  int amdgpu_amdkfd_alloc_gws(struct amdgpu_device *adev, size_t size,
void **mem_obj);
  void amdgpu_amdkfd_free_gws(struct amdgpu_device *adev, void *mem_obj);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1d9b21628be7..823f245dc7d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -423,7 +423,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
  
  err_create_queue:

if (wptr_bo)
-   amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
+   amdgpu_amdkfd_free_gtt_mem(dev->adev, (void **)&wptr_bo);
  err_wptr_map_gart:
  err_bind_process:
  err_pdd:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index f4d20adaa068..6619028dd58b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -907,7 +907,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
  kfd_doorbell_error:
kfd_gtt_sa_fini(kfd);
  kfd_gtt_sa_init_error:
-   amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
  alloc_gtt_mem_failure:
dev_err(kfd_device,
"device %x:%x NOT added due to errors\n",
@@ -925,7 +925,7 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
kfd_doorbell_fini(kfd);
ida_destroy(&kfd->doorbell_ida);
kfd_gtt_sa_fini(kfd);
-   amdgpu_amdkfd_free_gtt_mem(kfd->adev, kfd->gtt_mem);
+   amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
}
  
  	kfree(kfd);

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 4f48507418d2..420444eb8e98 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -2621,7 +2621,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev,
  {
WARN(!mqd, "No hiq sdma mqd trunk to free");
  
-	amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem);

+   amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem);
  }
  
  void device_queue_manager_uninit(str

Re: [PATCH] drm/amdgpu: Mark amdgpu_bo as invalid after moved

2024-07-17 Thread Felix Kuehling



On 2024-07-15 08:39, Christian König wrote:

Hi Felix,

yes that is a perfectly expected consequence.

The last time we talked about it, the problem with solving this was that
amdgpu_vm_sdma_prepare() couldn't read the fences from a resv object
which wasn't locked.


Why only amdgpu_vm_sdma_prepare? Doesn't CPU page table update have the 
same problem?





That happens both during amdgpu_vm_handle_moved() and in unlocked
validations of the page tables.


By "unlocked validations of page table entries" do you mean the 
"unlocked" flag in amdgpu_vm_update_range? That should only be used for 
invalidating page table entries in MMU notifiers for SVM ranges. It 
should not affect normal BOs.


amdgpu_vm_handle_moved tries to lock the reservations. But if it fails, 
it clears page table entries. So this is another case of "unlocked 
invalidations". This one does affect normal BOs. I think 
amdgpu_vm_handle_moved makes the assumption that the other user of the BO
is in a different VM, not the same VM, so that clearing the PTEs in this VM
is safe even though the BO move is still waiting for some other VM to
finish accessing it.


I think here we have a case where the BO is used by something else in 
the same VM. In this case we cannot safely clear the PTEs before the BO 
fence signals.


We want to clear the PTEs before the move happens. Otherwise we risk 
memory corruption. Maybe the same job that does the move blit should 
also invalidate the PTEs?


Regards,
  Felix




IIRC we postponed looking into the issue until it really becomes a 
problem which is probably now :)


Regards,
Christian.

On 12.07.24 at 16:56, Felix Kuehling wrote:
KFD eviction fences are triggered by the enable_signaling callback on 
the eviction fence. Any move operations scheduled by amdgpu_bo_move 
are held up by the GPU scheduler until the eviction fence is signaled 
by the KFD eviction handler, which only happens after the user mode 
queues are stopped.


As I understand it, VM BO invalidation does not unmap anything from 
the page table itself. So the KFD queues are OK to continue running
until the eviction handler stops them and signals the fence.


However, if amdgpu_vm_handle_moved gets called before the eviction 
fence is signaled, then there could be a problem. In applications 
that do compute-graphics interop, the VM is shared between compute 
and graphics. So graphics and compute submissions at the same time 
are possible. @Christian, this is a consequence of using libdrm and 
insisting that each process uses only a single VM per GPU.


Regards,
   Felix

On 2024-07-12 3:39, Christian König wrote:

Hi River,

well that isn't an error at all, this is perfectly expected behavior.

The VMs used by the KFD process are currently not meant to be used 
by classic CS at the same time.


This is one of the reasons for that.

Regards,
Christian.

On 12.07.24 at 09:35, YuanShang Mao (River) wrote:

[AMD Official Use Only - AMD Internal Distribution Only]

Add more info and CC @Kuehling, Felix @cao, lin

In amdgpu_amdkfd_fence.c, there is a design description:

/* Eviction Fence
   * Fence helper functions to deal with KFD memory eviction.
   * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
   *  evicted unless all the user queues for that process are evicted.
   *
   * All the BOs in a process share an eviction fence. When process X wants
   * to map VRAM memory but TTM can't find enough space, TTM will attempt to
   * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
   * by calling ttm_device_funcs->eviction_valuable().
   *
   * ttm_device_funcs->eviction_valuable() - will return false if the BO belongs
   *  to process X. Otherwise, it will return true to indicate BO can be
   *  evicted by TTM.
   *
   * If ttm_device_funcs->eviction_valuable returns true, then TTM will continue
   * the eviction process for that BO by calling ttm_bo_evict --> amdgpu_bo_move
   * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler.
   *
   * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
   *  notify when the BO is free to move. fence_add_callback -->
   *  enable_signaling --> amdgpu_amdkfd_fence.enable_signaling
   *
   * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will quiesce
   * user queues and signal fence. The work item will also start another delayed
   * work item to restore BOs
   */
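
The eviction_valuable() check described above amounts to recognizing the
KFD eviction fence on the BO's reservation object. A simplified sketch of
the idea follows; the real amdgpu_ttm_bo_eviction_valuable() additionally
compares the fence's owning process with the process trying to allocate,
so treat this as an illustration only:

/* Sketch: refuse TTM eviction for BOs guarded by a KFD eviction fence,
 * so the KFD path can quiesce the user queues first.
 */
static bool example_eviction_valuable(struct ttm_buffer_object *bo,
                                      const struct ttm_place *place)
{
    struct dma_resv_iter cursor;
    struct dma_fence *f;

    dma_resv_iter_begin(&cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP);
    dma_resv_for_each_fence_unlocked(&cursor, f) {
        /* an amdkfd eviction fence means user queues must be
         * stopped before this BO may move
         */
        if (to_amdgpu_amdkfd_fence(f)) {
            dma_resv_iter_end(&cursor);
            return false;
        }
    }
    dma_resv_iter_end(&cursor);

    return true;
}
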

If BOs are marked as invalidated before submitting the job to move the
buffer, the user queue is still active.
During the time before the user queue is evicted, if a drm job arrives,
amdgpu_cs_vm_handling will call amdgpu_vm_handle_moved to clear the PTEs of
invalidated BOs. Then a page fault happens because the compute shader is
still accessing the "invalidated" BO.


I am not familiar with the amdgpu_vm_bo state machine, so I don't know
if it is a code error or a design error.

Re: [PATCH] drm/amdgpu: Mark amdgpu_bo as invalid after moved

2024-07-12 Thread Felix Kuehling
KFD eviction fences are triggered by the enable_signaling callback on the 
eviction fence. Any move operations scheduled by amdgpu_bo_move are held up by 
the GPU scheduler until the eviction fence is signaled by the KFD eviction 
handler, which only happens after the user mode queues are stopped.
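
The shape of that mechanism, as a hedged sketch (the types and names are
simplified stand-ins; the real callback lives in amdgpu_amdkfd_fence.c):

struct example_kfd_fence {
    struct dma_fence base;
    struct work_struct evict_work;  /* quiesces user queues, then signals */
};

/* The eviction fence only kicks off eviction work when something actually
 * waits on it, i.e. from dma_fence_ops.enable_signaling.
 */
static bool example_enable_signaling(struct dma_fence *f)
{
    struct example_kfd_fence *fence =
        container_of(f, struct example_kfd_fence, base);

    if (dma_fence_is_signaled(f))
        return false;

    /* the move blit stays queued in the GPU scheduler until the worker
     * has stopped the user queues and signaled this fence
     */
    schedule_work(&fence->evict_work);

    return true;
}
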

As I understand it, VM BO invalidation does not unmap anything from the page 
table itself. So the KFD queues are OK to continue running until the eviction 
handler stops them and signals the fence.

However, if amdgpu_vm_handle_moved gets called before the eviction fence is 
signaled, then there could be a problem. In applications that do 
compute-graphics interop, the VM is shared between compute and graphics. So 
graphics and compute submissions at the same time are possible. @Christian, 
this is a consequence of using libdrm and insisting that each process uses only 
a single VM per GPU.

Regards,
  Felix

On 2024-07-12 3:39, Christian König wrote:
> Hi River,
> 
> well that isn't an error at all, this is perfectly expected behavior.
> 
> The VMs used by the KFD process are currently not meant to be used by classic 
> CS at the same time.
> 
> This is one of the reasons for that.
> 
> Regards,
> Christian.
> 
> On 12.07.24 at 09:35, YuanShang Mao (River) wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>> Add more info and CC @Kuehling, Felix @cao, lin
>>
>> In amdgpu_amdkfd_fence.c, there is a design description:
>>
>> /* Eviction Fence
>>   * Fence helper functions to deal with KFD memory eviction.
>>   * Big Idea - Since KFD submissions are done by user queues, a BO cannot be
>>   *  evicted unless all the user queues for that process are evicted.
>>   *
>>   * All the BOs in a process share an eviction fence. When process X wants
>>   * to map VRAM memory but TTM can't find enough space, TTM will attempt to
>>   * evict BOs from its LRU list. TTM checks if the BO is valuable to evict
>>   * by calling ttm_device_funcs->eviction_valuable().
>>   *
>>   * ttm_device_funcs->eviction_valuable() - will return false if the BO 
>> belongs
>>   *  to process X. Otherwise, it will return true to indicate BO can be
>>   *  evicted by TTM.
>>   *
>>   * If ttm_device_funcs->eviction_valuable returns true, then TTM will 
>> continue
>>   * the eviction process for that BO by calling ttm_bo_evict --> 
>> amdgpu_bo_move
>>   * --> amdgpu_copy_buffer(). This sets up job in GPU scheduler.
>>   *
>>   * GPU Scheduler (amd_sched_main) - sets up a cb (fence_add_callback) to
>>   *  notify when the BO is free to move. fence_add_callback --> 
>> enable_signaling
>>   *  --> amdgpu_amdkfd_fence.enable_signaling
>>   *
>>   * amdgpu_amdkfd_fence.enable_signaling - Start a work item that will 
>> quiesce
>>   * user queues and signal fence. The work item will also start another 
>> delayed
>>   * work item to restore BOs
>>   */
>>
>> If BOs are marked as invalidated before submitting the job to move the
>> buffer, the user queue is still active.
>> During the time before the user queue is evicted, if a drm job arrives,
>> amdgpu_cs_vm_handling will call amdgpu_vm_handle_moved to clear the PTEs of
>> invalidated BOs. Then a page fault happens because the compute shader is
>> still accessing the "invalidated" BO.
>>
>> I am not familiar with the amdgpu_vm_bo state machine, so I don't know if
>> it is a code error or a design error.
>>
>> Thanks
>> River
>>
>>
>> -Original Message-
>> From: YuanShang Mao (River)
>> Sent: Friday, July 12, 2024 10:55 AM
>> To: Christian König 
>> Cc: Huang, Trigger ; amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/amdgpu: Mark amdgpu_bo as invalid after moved
>>
>> We need to make sure that all BOs of an active kfd process are validated.
>> Moving a buffer will trigger process eviction.
>> If it is marked as invalidated before process eviction, the related kfd
>> process is still active and may attempt to access this invalidated BO.
>>
>> Agree with Trigger. It seems kfd eviction should be synced to move notify,
>> not the move action.
>>
>> Thanks
>> River
>>
>> -Original Message-
>> From: Christian König 
>> Sent: Thursday, July 11, 2024 8:39 PM
>> To: Huang, Trigger ; YuanShang Mao (River) 
>> ; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Mark amdgpu_bo as invalid after moved
>>
>> Yeah, completely agree. This patch doesn't really make sense.
>>
>> Please explain why you would want to do this?
>>
>> Regards,
>> Christian.
>>
>> On 11.07.24 at 13:56, Huang, Trigger wrote:
>>> [AMD Official Use Only - AMD Internal Distribution Only]
>>>
>>> This patch seems to be wrong.
>>> Quite a lot of preparations have been done in amdgpu_bo_move_notify
>>> For example, amdgpu_bo_kunmap() will be called to prevent the BO from being 
>>> accessed by CPU. If not called, the CPU may attempt to access the BO while 
>>> it is being moved.
>>>
>>> Thanks,
>>> Trigger
>>>
 -Original Message-
 From: amd-gfx  On Behalf Of
 YuanShang
 Sent: Thursday, July 11, 2024 5

Re: va range based memory management discussion (was: 回复:回复:Re:Proposal to add CRIU support to DRM render nodes)

2024-07-10 Thread Felix Kuehling

On 2024-07-09 22:38, 周春明(日月) wrote:







--
From: Felix Kuehling 
Sent: Wednesday, July 10, 2024 01:07
To: 周春明(日月) ; Tvrtko Ursulin 
; dri-de...@lists.freedesktop.org 
; amd-gfx@lists.freedesktop.org 
; Dave Airlie ; 
Daniel Vetter ; criu 
Cc: "Errabolu, Ramesh" ; "Christian König" 


Subject: Re: Reply: Re: Proposal to add CRIU support to DRM render nodes



On 2024-07-09 5:30, 周春明(日月) wrote:
>
>
>
>
>
>
> --
> From: Felix Kuehling 
> Sent: Tuesday, July 9, 2024 06:40
> To: 周春明(日月) ; Tvrtko Ursulin 
; dri-de...@lists.freedesktop.org 
; amd-gfx@lists.freedesktop.org 
; Dave Airlie ; 
Daniel Vetter ; criu 
> Cc: "Errabolu, Ramesh" ; "Christian König" 


> Subject: Re: Re: Proposal to add CRIU support to DRM render nodes
>
>
> On 2024-07-08 2:51, 周春明(日月) wrote:
>>
>>> Hi Felix,
>>>
>>> When I learned about the CRIU support you introduced in 
https://github.com/checkpoint-restore/criu/tree/criu-dev/plugins/amdgpu 
, there is a sentence
>>> "ROCm manages memory in the form of buffer objects (BOs). We are 
also working on a new memory management API that will be based on 
virtual address ranges...",
>>> Out of curiosity, how about "new memory management based on virtual 
address ranges"? Any introduction for that?

>>
>>>Hi David,
>>
>>>This refers to the SVM API that has been in the upstream driver for 
a while now: 
https://elixir.bootlin.com/linux/v6.9.8/source/include/uapi/linux/kfd_ioctl.h#L732 

>>
>> [David] Can all ROCm runtime memory management switch to use svm 
apis? No need BOs any more?


>I had thought about that when I started working on SVM years ago. But 
I came to the conclusion that we need to use BOs for VRAM to support 
DMABuf exports and imports to support P2P and IPC features.


[David] OK, I guessed you would say DMABuf and IPC factors. If we 
don't use dmabuf (as you know, dmabuf isn't popular in the compute area) 
and implement a new ipc based on va ranges, is it possible to use the 
svm api to cover all ROCm memory management?

When I tried the memory pool used by cuda graph, that seemed to be OK.


DMABuf and IPC are important for collective communications libraries 
used by distributed applications. You could get away without it when 
you're running a single-process application on a single machine. But 
changing all memory allocations to SVM would probably cause some 
performance regressions, because our BO allocators and memory mapping 
functions are simpler and easier to optimize than for unified memory.


That leaves the question, what's the expected benefit or a compelling 
reason for making such an invasive change?


Regards,
  Felix




Thanks,
-David

>Regards,
>  Felix


>
> Thanks,
> -David
>
> Regards,
>   Felix
>
>
>>
>> Thanks,
>> -David
>>
>> --
>>     From: Felix Kuehling 
>>     Sent: Friday, May 3, 2024 22:44
>>     To: Tvrtko Ursulin ; 
dri-de...@lists.freedesktop.org ; 
amd-gfx@lists.freedesktop.org ; Dave 
Airlie ; Daniel Vetter ; criu 

>>     Cc: "Errabolu, Ramesh" ; "Christian 
König" 

>>     Subject: Re: Proposal to add CRIU support to DRM render nodes
>>
>>
>>
>>     On 2024-04-16 10:04, Tvrtko Ursulin wrote:
>>     >
>>     > On 01/04/2024 18:58, Felix Kuehling wrote:
>>     >>
>>     >> On 2024-04-01 12:56, Tvrtko Ursulin wrote:
>>     >>>
>>     >>> On 01/04/2024 17:37, Felix Kuehling wrote:
>>     >>>> On 2024-04-01 11:09, Tvrtko Ursulin wrote:
>>     >>>>>
>>     >>>>> On 28/03/2024 20:42, Felix Kuehling wrote:
>>     >>>>>>
>>     >>>>>> On 2024-03-28 12:03, Tvrtko Ursulin wrote:
>>     >>>>>>>
>>     >>>>>>> Hi Felix,
>>     >>>>>>>
>>     >>&

Re: [PATCH] drm/amdgpu: Restore uncache behaviour on GFX12

2024-07-09 Thread Felix Kuehling



On 2024-07-08 17:41, David Belanger wrote:
> Always use MTYPE_UC if UNCACHED flag is specified.
> 
> This makes kernarg region uncached and it restores
> usermode cache disable debug flag functionality.
> 
> Do not set MTYPE_UC for the COHERENT flag; on GFX12, coherence is handled by
> shader code.
> 
> Signed-off-by: David Belanger 

Reviewed-by: Felix Kuehling 


> ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 21 ++---
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c   |  8 +---
>  2 files changed, 3 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> index fd3ac483760e..542225eb13b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> @@ -498,9 +498,6 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>uint64_t *flags)
>  {
>   struct amdgpu_bo *bo = mapping->bo_va->base.bo;
> - struct amdgpu_device *bo_adev;
> - bool coherent, is_system;
> -
>  
>   *flags &= ~AMDGPU_PTE_EXECUTABLE;
>   *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
> @@ -516,25 +513,11 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>   *flags &= ~AMDGPU_PTE_VALID;
>   }
>  
> - if (!bo)
> - return;
> -
> - if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
> -AMDGPU_GEM_CREATE_UNCACHED))
> - *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC);
> -
> - bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
> - coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
> - is_system = (bo->tbo.resource->mem_type == TTM_PL_TT) ||
> - (bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT);
> -
>   if (bo && bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC)
>   *flags |= AMDGPU_PTE_DCC;
>  
> - /* WA for HW bug */
> - if (is_system || ((bo_adev != adev) && coherent))
> - *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_NC);
> -
> + if (bo && bo->flags & AMDGPU_GEM_CREATE_UNCACHED)
> + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC);
>  }
>  
>  static unsigned gmc_v12_0_get_vbios_fb_size(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index bd9c2921e0dc..7b671aefab01 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -1248,13 +1248,7 @@ svm_range_get_pte_flags(struct kfd_node *node,
>   break;
>   case IP_VERSION(12, 0, 0):
>   case IP_VERSION(12, 0, 1):
> - if (domain == SVM_RANGE_VRAM_DOMAIN) {
> - if (bo_node != node)
> - mapping_flags |= AMDGPU_VM_MTYPE_NC;
> - } else {
> - mapping_flags |= coherent ?
> - AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
> - }
> + mapping_flags |= AMDGPU_VM_MTYPE_NC;
>   break;
>   default:
>   mapping_flags |= coherent ?


Re: Reply: Re: Proposal to add CRIU support to DRM render nodes

2024-07-09 Thread Felix Kuehling



On 2024-07-09 5:30, 周春明(日月) wrote:
> 
> 
> 
> 
> 
> 
> --
> From: Felix Kuehling 
> Sent: Tuesday, July 9, 2024 06:40
> To: 周春明(日月) ; Tvrtko Ursulin 
> ; dri-de...@lists.freedesktop.org 
> ; amd-gfx@lists.freedesktop.org 
> ; Dave Airlie ; Daniel 
> Vetter ; criu 
> Cc: "Errabolu, Ramesh" ; "Christian König" 
> 
> Subject: Re: Re: Proposal to add CRIU support to DRM render nodes
> 
> 
> On 2024-07-08 2:51, 周春明(日月) wrote:
>>
>> Hi Felix,
>>
>> When I learned about the CRIU support you introduced in 
>> https://github.com/checkpoint-restore/criu/tree/criu-dev/plugins/amdgpu , 
>> there is a sentence
>> "ROCm manages memory in the form of buffer objects (BOs). We are also 
>> working on a new memory management API that will be based on virtual address 
>> ranges...",
>> Out of curiosity, how about "new memory management based on virtual address 
>> ranges"? Any introduction for that?
> 
>>Hi David,
> 
>>This refers to the SVM API that has been in the upstream driver for a while 
>>now: 
>>https://elixir.bootlin.com/linux/v6.9.8/source/include/uapi/linux/kfd_ioctl.h#L732
> 
> [David] Can all ROCm runtime memory management switch to use svm apis? No 
> need BOs any more?

I had thought about that when I started working on SVM years ago. But I came to 
the conclusion that we need to use BOs for VRAM to support DMABuf exports and 
imports to support P2P and IPC features.

Regards,
  Felix


> 
> Thanks,
> -David
> 
> Regards,
>   Felix
> 
> 
>>
>> Thanks,
>> -David
>>
>>     --
>>     发件人:Felix Kuehling 
>>     发送时间:2024年5月3日(星期五) 22:44
>>     收件人:Tvrtko Ursulin ; 
>> dri-de...@lists.freedesktop.org ; 
>> amd-gfx@lists.freedesktop.org ; Dave Airlie 
>> ; Daniel Vetter ; criu 
>>     抄 送:"Errabolu, Ramesh" ; "Christian König" 
>> 
>>     主 题:Re: Proposal to add CRIU support to DRM render nodes
>>
>>
>>
>>     On 2024-04-16 10:04, Tvrtko Ursulin wrote:
>>     >
>>     > On 01/04/2024 18:58, Felix Kuehling wrote:
>>     >>
>>     >> On 2024-04-01 12:56, Tvrtko Ursulin wrote:
>>     >>>
>>     >>> On 01/04/2024 17:37, Felix Kuehling wrote:
>>     >>>> On 2024-04-01 11:09, Tvrtko Ursulin wrote:
>>     >>>>>
>>     >>>>> On 28/03/2024 20:42, Felix Kuehling wrote:
>>     >>>>>>
>>     >>>>>> On 2024-03-28 12:03, Tvrtko Ursulin wrote:
>>     >>>>>>>
>>     >>>>>>> Hi Felix,
>>     >>>>>>>
>>     >>>>>>> I had one more thought while browsing around the amdgpu CRIU 
>> plugin. It appears it relies on the KFD support being compiled in and 
>> /dev/kfd present, correct? AFAICT at least, it relies on that to figure out 
>> the amdgpu DRM node.
>>     >>>>>>>
>>     >>>>>>> It would probably be good to consider designing things without 
>> that dependency. So that checkpointing an application which does not use 
>> /dev/kfd is possible. Or if the kernel does not even have the KFD support 
>> compiled in.
>>     >>>>>>
>>     >>>>>> Yeah, if we want to support graphics apps that don't use KFD, we 
>> should definitely do that. Currently we get a lot of topology information 
>> from KFD, not even from the /dev/kfd device but from the sysfs nodes exposed 
>> by KFD. We'd need to get GPU device info from the render nodes instead. And 
>> if KFD is available, we may need to integrate both sources of information.
>>     >>>>>>
>>     >>>>>>
>>     >>>>>>>
>>     >>>>>>> It could perhaps mean no more than adding some GPU discovery 
>> code into CRIU. Which should be flexible enough to account for things like 
>> re-assigned minor numbers due to driver reload.
>>     >>>>>>
>>     >>>>>> Do you mean adding

Re: Re:Proposal to add CRIU support to DRM render nodes

2024-07-08 Thread Felix Kuehling


On 2024-07-08 2:51, 周春明(日月) wrote:
> 
> Hi Felix,
> 
> When I learned about the CRIU support you introduced in 
> https://github.com/checkpoint-restore/criu/tree/criu-dev/plugins/amdgpu , 
> there is a sentence
> "ROCm manages memory in the form of buffer objects (BOs). We are also working 
> on a new memory management API that will be based on virtual address 
> ranges...", 
> Out of curiosity, how about "new memory management based on virtual address 
> ranges"? Any introduction for that?

Hi David,

This refers to the SVM API that has been in the upstream driver for a while 
now: 
https://elixir.bootlin.com/linux/v6.9.8/source/include/uapi/linux/kfd_ioctl.h#L732
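
For reference, a minimal user-space sketch of driving that API. The
struct, op, and attribute names here are taken from kfd_ioctl.h as I
understand them; double-check the header before relying on this:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Sketch: set the preferred location of a VA range through the SVM API.
 * Assumes kfd_fd is an open /dev/kfd and gpu_id comes from the KFD
 * topology.
 */
static int example_svm_set_preferred_loc(int kfd_fd, void *addr, size_t size,
                                         uint32_t gpu_id)
{
    char buf[sizeof(struct kfd_ioctl_svm_args) +
             sizeof(struct kfd_ioctl_svm_attribute)];
    struct kfd_ioctl_svm_args *args = (struct kfd_ioctl_svm_args *)buf;

    memset(buf, 0, sizeof(buf));
    args->start_addr = (uint64_t)(uintptr_t)addr;
    args->size = size;
    args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
    args->nattr = 1;
    args->attrs[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC;
    args->attrs[0].value = gpu_id;

    return ioctl(kfd_fd, AMDKFD_IOC_SVM, args);
}
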

Regards,
  Felix


> 
> Thanks,
> -David
> 
> --
> From: Felix Kuehling 
> Sent: Friday, May 3, 2024 22:44
> To: Tvrtko Ursulin ; 
> dri-de...@lists.freedesktop.org ; 
> amd-gfx@lists.freedesktop.org ; Dave Airlie 
> ; Daniel Vetter ; criu 
> Cc: "Errabolu, Ramesh" ; "Christian König" 
> 
> Subject: Re: Proposal to add CRIU support to DRM render nodes
> 
> 
> 
> On 2024-04-16 10:04, Tvrtko Ursulin wrote:
> >
> > On 01/04/2024 18:58, Felix Kuehling wrote:
> >>
> >> On 2024-04-01 12:56, Tvrtko Ursulin wrote:
> >>>
> >>> On 01/04/2024 17:37, Felix Kuehling wrote:
> >>>> On 2024-04-01 11:09, Tvrtko Ursulin wrote:
> >>>>>
> >>>>> On 28/03/2024 20:42, Felix Kuehling wrote:
> >>>>>>
> >>>>>> On 2024-03-28 12:03, Tvrtko Ursulin wrote:
> >>>>>>>
> >>>>>>> Hi Felix,
> >>>>>>>
> >>>>>>> I had one more thought while browsing around the amdgpu CRIU 
> plugin. It appears it relies on the KFD support being compiled in and 
> /dev/kfd present, correct? AFAICT at least, it relies on that to figure out 
> the amdgpu DRM node.
> >>>>>>>
> >>>>>>> It would probably be good to consider designing things without 
> that dependency. So that checkpointing an application which does not use 
> /dev/kfd is possible. Or if the kernel does not even have the KFD support 
> compiled in.
> >>>>>>
> >>>>>> Yeah, if we want to support graphics apps that don't use KFD, we 
> should definitely do that. Currently we get a lot of topology information 
> from KFD, not even from the /dev/kfd device but from the sysfs nodes exposed 
> by KFD. We'd need to get GPU device info from the render nodes instead. And 
> if KFD is available, we may need to integrate both sources of information.
> >>>>>>
> >>>>>>
> >>>>>>>
> >>>>>>> It could perhaps mean no more than adding some GPU discovery code 
> into CRIU. Which should be flexible enough to account for things like 
> re-assigned minor numbers due to driver reload.
> >>>>>>
> >>>>>> Do you mean adding GPU discovery to the core CRIU, or to the 
> plugin. I was thinking this is still part of the plugin.
> >>>>>
> >>>>> Yes I agree. I was only thinking about adding some DRM device 
> discovery code in a more decoupled fashion from the current plugin, for both 
> the reason discussed above (decoupling a bit from reliance on kfd sysfs), and 
> then also if/when a new DRM driver might want to implement this the code 
> could be move to some common plugin area.
> >>>>>
> >>>>> I am not sure how feasible that would be though. The "gpu id" 
> concept and its matching in the current kernel code and CRIU plugin - is 
> that value tied to the physical GPU instance, or how does it work?
> >>>>
> >>>> The concept of the GPU ID is that it's stable while the system is 
> up, even when devices get added and removed dynamically. It was baked into 
> the API early on, but I don't think we ever fully validated device hot plug. 
> I think the closest we're getting is with our latest MI GPUs and dynamic 
> partition mode change.
> >>>
> >>> Doesn't it read the saved gpu id from the image file while doing 
> restore and tries to open the render node to match it? Maybe I am misreading 
> the code.. But if it does, does it imply that in practice it could be stable 
> acro

Re: [PATCH] drm/amdkfd: Update mm interval notifier tree without acquiring mm's mmap lock

2024-06-18 Thread Felix Kuehling



On 2024-06-12 16:11, Xiaogang.Chen wrote:

From: Xiaogang Chen 

The current kfd/svm driver acquires mm's mmap write lock before updating
mm->notifier_subscriptions->itree. This tree is already protected
by mm->notifier_subscriptions->lock in the mmu notifier. Each process mm
interval tree update from different components in the kernel goes through
the mmu interval notifier, where they get serialized. This patch removes the
mmap write lock acquisition in the kfd/svm driver when it needs to update
the process mm interval tree. It reduces the chance of deadlocks or lockdep
warnings and simplifies the driver code.

In addition, the patch adjusts the granularity of some locks to reduce the
number of locks the driver holds at the same time, which also reduces the
chance of deadlocks or lockdep warnings.
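
As background, a rough sketch of the two registration paths the patch is
choosing between (the invalidate callback here is a placeholder):

static bool example_invalidate(struct mmu_interval_notifier *mni,
                               const struct mmu_notifier_range *range,
                               unsigned long cur_seq)
{
    /* serialize against concurrent faults, then drop device mappings */
    mmu_interval_set_seq(mni, cur_seq);
    return true;
}

static const struct mmu_interval_notifier_ops example_mn_ops = {
    .invalidate = example_invalidate,
};

/* Both variants update the interval tree under
 * mm->notifier_subscriptions->lock; the _locked variant is for callers
 * that already hold mmap_lock for write.
 */
static int example_register(struct mmu_interval_notifier *mni,
                            struct mm_struct *mm, unsigned long start,
                            unsigned long length, bool mmap_write_locked)
{
    if (mmap_write_locked)
        return mmu_interval_notifier_insert_locked(mni, mm, start, length,
                                                   &example_mn_ops);

    return mmu_interval_notifier_insert(mni, mm, start, length,
                                        &example_mn_ops);
}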

Signed-off-by: Xiaogang Chen
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   3 +-
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |   6 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 181 +++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h |   2 +-
  4 files changed, 122 insertions(+), 70 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index fdf171ad4a3c..b52588ded567 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1078,9 +1078,8 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
*filep,
/* Flush pending deferred work to avoid racing with deferred actions
 * from previous memory map changes (e.g. munmap).
 */
-   svm_range_list_lock_and_flush_work(&p->svms, current->mm);
+   svm_range_list_flush_work(&p->svms);
mutex_lock(&p->svms.lock);
-   mmap_write_unlock(current->mm);
if (interval_tree_iter_first(&p->svms.objects,
 args->va_addr >> PAGE_SHIFT,
 (args->va_addr + args->size - 1) >> 
PAGE_SHIFT)) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 8ee3d07ffbdf..eb46643d96b2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -969,10 +969,12 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
mutex_lock(&p->svms.lock);
  
  	prange = svm_range_from_addr(&p->svms, addr, NULL);

+
+   mutex_unlock(&p->svms.lock);


You must continue to hold the svms.lock here. As soon as you drop the 
lock, the prange can be freed or changed, so cannot keep using this 
pointer without holding the lock.




if (!prange) {
pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, 
addr);
r = -EFAULT;
-   goto out_unlock_svms;
+   goto out_unref_process;
}
  
  	mutex_lock(&prange->migrate_mutex);

@@ -993,8 +995,6 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
  
  out_unlock_prange:

mutex_unlock(&prange->migrate_mutex);
-out_unlock_svms:
-   mutex_unlock(&p->svms.lock);
  out_unref_process:
pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr);
kfd_unref_process(p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 407636a68814..46f81c1215d9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -106,12 +106,31 @@ static void svm_range_unlink(struct svm_range *prange)
  }
  
  static void

-svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
+svm_range_add_notifier(struct mm_struct *mm, struct svm_range *prange, bool 
locked)
  {
pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
 prange, prange->start, prange->last);
  
-	mmu_interval_notifier_insert_locked(&prange->notifier, mm,

+   /* mm->notifier_subscriptions should have been setup for this process
+* ex: during kfd process creation
+*/
+   WARN_ON_ONCE(!mm->notifier_subscriptions);
+
+   /* not necessary to hold mmap lock to update mm interval notifier tree as
+* operations on mm->notifier_subscriptions->itree are serialized by
+* mm->notifier_subscriptions->lock
+*/
+   if (locked) {
+   /* if mmap write lock is already held, use the locked version to
+* update mm interval notifier tree
+*/
+   mmu_interval_notifier_insert_locked(&prange->notifier, mm,
+  prange->start << PAGE_SHIFT,
+  prange->npages << PAGE_SHIFT,
+  &svm_range_mn_ops);
+   } else
+   /* use no-mmap-lock version to update mm interval notifier tree 
*/
+   mmu_interval_notifier_insert(&prange->notifier, mm,
 prange->start << PAGE_SHIFT,
 prange->npages << PAGE_SHIFT,
 &svm_range_mn_ops);
@@ -

Re: [PATCH 1/2][RFC] amdgpu: fix a race in kfd_mem_export_dmabuf()

2024-06-06 Thread Felix Kuehling



On 2024-06-05 05:14, Christian König wrote:

Am 04.06.24 um 20:08 schrieb Felix Kuehling:


On 2024-06-03 22:13, Al Viro wrote:

Using drm_gem_prime_handle_to_fd() to set dmabuf up and insert it into
descriptor table, only to have it looked up by file descriptor and
remove it from descriptor table is not just too convoluted - it's
racy; another thread might have modified the descriptor table while
we'd been going through that song and dance.

It's not hard to fix - turn drm_gem_prime_handle_to_fd()
into a wrapper for a new helper that would simply return the
dmabuf, without messing with descriptor table.

Then kfd_mem_export_dmabuf() would simply use that new helper
and leave the descriptor table alone.

Signed-off-by: Al Viro 


This patch looks good to me on the amdgpu side. For the DRM side I'm 
adding dri-devel.


Yeah that patch should probably be split up and the DRM changes 
discussed separately.


On the other hand skimming over it it seems reasonable to me.

Felix, are you going to look into this or should I take a look and try 
to push it through drm-misc-next?


It doesn't matter much to me, as long as we submit both changes together.

Thanks,
  Felix




Thanks,
Christian.



Acked-by: Felix Kuehling 



---
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index 8975cf41a91a..793780bb819c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -25,7 +25,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
    #include 
@@ -812,18 +811,13 @@ static int kfd_mem_export_dmabuf(struct 
kgd_mem *mem)

  if (!mem->dmabuf) {
  struct amdgpu_device *bo_adev;
  struct dma_buf *dmabuf;
-    int r, fd;
    bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
-    r = drm_gem_prime_handle_to_fd(&bo_adev->ddev, bo_adev->kfd.client.file,
+    dmabuf = drm_gem_prime_handle_to_dmabuf(&bo_adev->ddev, bo_adev->kfd.client.file,

 mem->gem_handle,
  mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
-   DRM_RDWR : 0, &fd);
-    if (r)
-    return r;
-    dmabuf = dma_buf_get(fd);
-    close_fd(fd);
-    if (WARN_ON_ONCE(IS_ERR(dmabuf)))
+   DRM_RDWR : 0);
+    if (IS_ERR(dmabuf))
  return PTR_ERR(dmabuf);
  mem->dmabuf = dmabuf;
  }
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 03bd3c7bd0dc..622c51d3fe18 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -409,23 +409,9 @@ static struct dma_buf 
*export_and_register_object(struct drm_device *dev,

  return dmabuf;
  }
  -/**
- * drm_gem_prime_handle_to_fd - PRIME export function for GEM drivers
- * @dev: dev to export the buffer from
- * @file_priv: drm file-private structure
- * @handle: buffer handle to export
- * @flags: flags like DRM_CLOEXEC
- * @prime_fd: pointer to storage for the fd id of the create dma-buf
- *
- * This is the PRIME export function which must be used mandatorily by GEM
- * drivers to ensure correct lifetime management of the underlying GEM object.
- * The actual exporting from GEM object to a dma-buf is done through the
- * &drm_gem_object_funcs.export callback.
- */
-int drm_gem_prime_handle_to_fd(struct drm_device *dev,
+struct dma_buf *drm_gem_prime_handle_to_dmabuf(struct drm_device *dev,
 struct drm_file *file_priv, uint32_t handle,
-   uint32_t flags,
-   int *prime_fd)
+   uint32_t flags)
  {
  struct drm_gem_object *obj;
  int ret = 0;
@@ -434,14 +420,14 @@ int drm_gem_prime_handle_to_fd(struct 
drm_device *dev,

  mutex_lock(&file_priv->prime.lock);
  obj = drm_gem_object_lookup(file_priv, handle);
  if (!obj)  {
-    ret = -ENOENT;
+    dmabuf = ERR_PTR(-ENOENT);
  goto out_unlock;
  }
    dmabuf = drm_prime_lookup_buf_by_handle(&file_priv->prime, 
handle);

  if (dmabuf) {
  get_dma_buf(dmabuf);
-    goto out_have_handle;
+    goto out;
  }
    mutex_lock(&dev->object_name_lock);
@@ -463,7 +449,6 @@ int drm_gem_prime_handle_to_fd(struct drm_device 
*dev,

  /* normally the created dma-buf takes ownership of the ref,
   * but if that fails then drop the ref
   */
-    ret = PTR_ERR(dmabuf);
  mutex_unlock(&dev->object_name_lock);
  goto out;
  }
@@ -478,34 +463,49 @@ int drm_gem_prime_handle_to_fd(struct 
drm_device *dev,

  ret = drm_prime_add_buf_handle(&file_priv->prime,
 dmabuf, handle);
  mutex_unlock(&dev->object_name_lock);
-    if (ret)
-    goto fail_put_dmabuf;
-
-out_have_handle:
-    ret = dma_buf_fd(d

Re: [PATCH 3/3] drm/amdgpu: nuke the VM PD/PT shadow handling

2024-06-06 Thread Felix Kuehling

On 2024-06-06 04:47, Christian König wrote:

This was only used as a workaround for recovering the page tables after
VRAM was lost and is no longer necessary after the function
amdgpu_vm_bo_reset_state_machine() started to do the same.

Compute never used shadows either, so the only problematic case left is
SVM, and that is most likely not recoverable in any way when VRAM is
lost.
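
Conceptually, the replacement works by flipping per-VM BO state instead of
copying data back from GTT. A rough sketch of the idea only; the actual
amdgpu_vm_bo_reset_state_machine() implementation differs in detail:

/* Sketch: after VRAM loss, mark per-VM BOs as moved so the normal
 * validation path rebuilds the page tables, rather than restoring stale
 * contents from shadow copies.
 */
static void example_reset_vm_bo_states(struct amdgpu_vm *vm)
{
    struct amdgpu_vm_bo_base *bo_base;

    spin_lock(&vm->status_lock);
    list_for_each_entry(bo_base, &vm->idle, vm_status)
        bo_base->moved = true; /* forces revalidation/rebuild */
    spin_unlock(&vm->status_lock);
}
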


I agree. The series is

Acked-by: Felix Kuehling 




Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  4 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 87 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 67 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  | 21 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 17 
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   | 56 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 19 +
  7 files changed, 6 insertions(+), 265 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 1f71c7b98d77..c50e591aae5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1093,10 +1093,6 @@ struct amdgpu_device {
  
  	struct amdgpu_virt	virt;
  
-	/* link all shadow bo */

-   struct list_headshadow_list;
-   struct mutexshadow_list_lock;
-
/* record hw reset is performed */
bool has_hw_reset;
u8  reset_magic[AMDGPU_RESET_MAGIC_NUM];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4096cb3e937e..be30ed4d456a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4073,9 +4073,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
spin_lock_init(&adev->mm_stats.lock);
spin_lock_init(&adev->wb.lock);
  
-	INIT_LIST_HEAD(&adev->shadow_list);

-   mutex_init(&adev->shadow_list_lock);
-
INIT_LIST_HEAD(&adev->reset_list);
  
  	INIT_LIST_HEAD(&adev->ras_list);

@@ -4980,80 +4977,6 @@ static int amdgpu_device_ip_post_soft_reset(struct 
amdgpu_device *adev)
return 0;
  }
  
-/**

- * amdgpu_device_recover_vram - Recover some VRAM contents
- *
- * @adev: amdgpu_device pointer
- *
- * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
- * restore things like GPUVM page tables after a GPU reset where
- * the contents of VRAM might be lost.
- *
- * Returns:
- * 0 on success, negative error code on failure.
- */
-static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
-{
-   struct dma_fence *fence = NULL, *next = NULL;
-   struct amdgpu_bo *shadow;
-   struct amdgpu_bo_vm *vmbo;
-   long r = 1, tmo;
-
-   if (amdgpu_sriov_runtime(adev))
-   tmo = msecs_to_jiffies(8000);
-   else
-   tmo = msecs_to_jiffies(100);
-
-   dev_info(adev->dev, "recover vram bo from shadow start\n");
-   mutex_lock(&adev->shadow_list_lock);
-   list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
-   /* If vm is compute context or adev is APU, shadow will be NULL 
*/
-   if (!vmbo->shadow)
-   continue;
-   shadow = vmbo->shadow;
-
-   /* No need to recover an evicted BO */
-   if (!shadow->tbo.resource ||
-   shadow->tbo.resource->mem_type != TTM_PL_TT ||
-   shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
-   shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
-   continue;
-
-   r = amdgpu_bo_restore_shadow(shadow, &next);
-   if (r)
-   break;
-
-   if (fence) {
-   tmo = dma_fence_wait_timeout(fence, false, tmo);
-   dma_fence_put(fence);
-   fence = next;
-   if (tmo == 0) {
-   r = -ETIMEDOUT;
-   break;
-   } else if (tmo < 0) {
-   r = tmo;
-   break;
-   }
-   } else {
-   fence = next;
-   }
-   }
-   mutex_unlock(&adev->shadow_list_lock);
-
-   if (fence)
-   tmo = dma_fence_wait_timeout(fence, false, tmo);
-   dma_fence_put(fence);
-
-   if (r < 0 || tmo <= 0) {
-   dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, 
tmo is %ld\n", r, tmo);
-   return -EIO;
-   }
-
-   dev_info(adev->dev, "recover vram bo from shadow done\n");
-   return 0;
-}
-
-
  /**
   * amdgpu_device_reset_sriov - reset ASIC fo

Re: [PATCH v4 9/9] drm/amdgpu: add lock in kfd_process_dequeue_from_device

2024-06-06 Thread Felix Kuehling



On 2024-06-04 21:33, Yunxiang Li wrote:

We need to take the reset domain lock before talking to MES. While in
this case we could take the lock inside the MES helper, we can't do so for
most other MES helpers since they are used during reset. So for
consistency's sake we add the lock here.

Signed-off-by: Yunxiang Li 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 86ea610b16f3..21f5a1fb3bf8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -28,6 +28,7 @@
  #include "kfd_priv.h"
  #include "kfd_kernel_queue.h"
  #include "amdgpu_amdkfd.h"
+#include "amdgpu_reset.h"
  
  static inline struct process_queue_node *get_queue_by_qid(

struct process_queue_manager *pqm, unsigned int qid)
@@ -87,8 +88,12 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
return;
  
  	dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);

-   if (dev->kfd->shared_resources.enable_mes)
-   amdgpu_mes_flush_shader_debugger(dev->adev, 
pdd->proc_ctx_gpu_addr);
+   if (dev->kfd->shared_resources.enable_mes &&
+   down_read_trylock(&dev->adev->reset_domain->sem)) {
+   amdgpu_mes_flush_shader_debugger(dev->adev,
+pdd->proc_ctx_gpu_addr);
+   up_read(&dev->adev->reset_domain->sem);
+   }
pdd->already_dequeued = true;
  }
  


Re: [PATCH 10/12] drm/amdkfd: remove dead code in kq_initialize

2024-06-04 Thread Felix Kuehling



On 2024-06-03 04:49, Jesse Zhang wrote:

The queue type can only be KFD_QUEUE_TYPE_DIQ or KFD_QUEUE_TYPE_HIQ,
and the default cannot be reached.


I wonder, if you remove the default case, I guess you are relying on the 
compiler or a static checker to ensure that we can only pass valid enum 
values to this function. I don't think C compilers are that strict. You 
could pass a random integer to the function. That said, this function 
only has two callers, and both of them use a proper enum value.
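
A standalone illustration of that point (not driver code): C accepts any
integer where an enum is expected, so without a default case unexpected
values fall through silently.

#include <stdio.h>

enum kq_type { KQ_TYPE_DIQ, KQ_TYPE_HIQ };

static void init_queue(enum kq_type type)
{
    switch (type) {
    case KQ_TYPE_DIQ:
        printf("DIQ\n");
        break;
    case KQ_TYPE_HIQ:
        printf("HIQ\n");
        break;
    /* no default: nothing catches out-of-range values here */
    }
}

int main(void)
{
    init_queue(42); /* compiles fine in C; prints nothing */
    return 0;
}
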





Signed-off-by: Jesse Zhang 


Acked-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 32c926986dbb..3142b2593e2b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -67,9 +67,6 @@ static bool kq_initialize(struct kernel_queue *kq, struct 
kfd_node *dev,
case KFD_QUEUE_TYPE_HIQ:
kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
break;
-   default:
-   pr_err("Invalid queue type %d\n", type);
-   return false;
}
  
  	if (!kq->mqd_mgr)


Re: [PATCH 11/12] drm/amdkfd: remove logically dead code

2024-06-04 Thread Felix Kuehling



On 2024-06-03 04:49, Jesse Zhang wrote:

idr_for_each_entry ensures that mem is not NULL during the loop,
so there is no need to check mem again.
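
That guarantee comes from the macro itself: idr_for_each_entry() walks the
IDR via idr_get_next() and stops as soon as it returns NULL, so the entry
variable is always non-NULL inside the loop body. A small sketch:

/* Sketch: no NULL check is needed on 'mem' inside the loop. */
static void example_walk(struct idr *idr)
{
    void *mem;
    int id;

    idr_for_each_entry(idr, mem, id)
        pr_debug("id %d -> entry %p\n", id, mem);
}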

Signed-off-by: Jesse Zhang 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 -
  1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index fdf171ad4a3c..32e5db509560 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1913,11 +1913,6 @@ static int criu_checkpoint_bos(struct kfd_process *p,
struct kfd_criu_bo_priv_data *bo_priv;
int i, dev_idx = 0;
  
-			if (!mem) {

-   ret = -ENOMEM;
-   goto exit;
-   }
-
kgd_mem = (struct kgd_mem *)mem;
dumper_bo = kgd_mem->bo;
  


Re: [PATCH] Revert "drm/amdgpu: init iommu after amdkfd device init"

2024-06-04 Thread Felix Kuehling



On 2024-06-03 18:19, Armin Wolf wrote:

Am 23.05.24 um 19:30 schrieb Armin Wolf:


This reverts commit 56b522f4668167096a50c39446d6263c96219f5f.

A user reported that this commit breaks the integrated gpu of his
notebook, causing a black screen. He was able to bisect the problematic
commit and verified that by reverting it the notebook works again.
He also confirmed that kernel 6.8.1 also works on his device, so the
upstream commit itself seems to be ok.

An amdgpu developer (Alex Deucher) confirmed that this patch should
have never been ported to 5.15 in the first place, so revert this
commit from the 5.15 stable series.


Hi,

what is the status of this?


Which branch is this for? This patch won't apply to anything after Linux 
6.5. Support for IOMMUv2 was removed from amdgpu in Linux 6.6 by:


commit c99a2e7ae291e5b19b60443eb6397320ef9e8571
Author: Alex Deucher 
Date:   Fri Jul 28 12:20:12 2023 -0400

    drm/amdkfd: drop IOMMUv2 support

    Now that we use the dGPU path for all APUs, drop the
    IOMMUv2 support.

    v2: drop the now unused queue manager functions for gfx7/8 APUs

    Reviewed-by: Felix Kuehling 
    Acked-by: Christian König 
    Tested-by: Mike Lothian 
    Signed-off-by: Alex Deucher 

Regards,
  Felix




Armin Wolf



Reported-by: Barry Kauler 
Signed-off-by: Armin Wolf 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 222a1d9ecf16..5f6c32ec674d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2487,6 +2487,10 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  if (r)
  goto init_failed;

+    r = amdgpu_amdkfd_resume_iommu(adev);
+    if (r)
+    goto init_failed;
+
  r = amdgpu_device_ip_hw_init_phase1(adev);
  if (r)
  goto init_failed;
@@ -2525,10 +2529,6 @@ static int amdgpu_device_ip_init(struct 
amdgpu_device *adev)

  if (!adev->gmc.xgmi.pending_reset)
  amdgpu_amdkfd_device_init(adev);

-    r = amdgpu_amdkfd_resume_iommu(adev);
-    if (r)
-    goto init_failed;
-
  amdgpu_fru_get_product_info(adev);

  init_failed:
--
2.39.2




Re: [PATCH 2/2][RFC] amdkfd CRIU fixes

2024-06-04 Thread Felix Kuehling



On 2024-06-03 22:14, Al Viro wrote:

Instead of trying to use close_fd() on failure exits, just have
criu_get_prime_handle() store the file reference without inserting
it into the descriptor table.

Then, once the callers are past the last failure exit, they can go
and either insert all those file references into the corresponding
slots of the descriptor table, or drop all those file references and
free the unused descriptors.
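
The underlying pattern is the kernel's standard two-phase fd publication:
reserve descriptor numbers first, and only fd_install() the files once
nothing can fail anymore. A generic sketch of the pattern (not the patch
itself):

/* After get_unused_fd_flags() the descriptor number is reserved but
 * empty, so no other thread can observe or close it. fd_install() is the
 * point of no return; on error the two halves are dropped separately.
 */
static int example_publish_file(struct file *file, int flags,
                                bool late_failure)
{
    int fd = get_unused_fd_flags(flags);

    if (fd < 0)
        return fd;

    if (late_failure) {
        put_unused_fd(fd);  /* release the reserved slot */
        fput(file);         /* drop the file reference */
        return -EINVAL;
    }

    fd_install(fd, file);   /* fd becomes visible to user space */
    return fd;
}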

Signed-off-by: Al Viro 


Thank you for the patches and the explanation. One minor nit-pick 
inline. With that fixed, this patch is


Reviewed-by: Felix Kuehling 

I can apply this patch to amd-staging-drm-next, if you want. See one 
comment inline ...




---
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index fdf171ad4a3c..3f129e1c0daa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -36,7 +36,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
  #include "kfd_priv.h"
  #include "kfd_device_queue_manager.h"
@@ -1857,7 +1856,8 @@ static uint32_t get_process_num_bos(struct kfd_process *p)
  }
  
  static int criu_get_prime_handle(struct kgd_mem *mem,

-int flags, u32 *shared_fd)
+int flags, u32 *shared_fd,
+struct file **file)
  {
struct dma_buf *dmabuf;
int ret;
@@ -1868,13 +1868,14 @@ static int criu_get_prime_handle(struct kgd_mem *mem,
return ret;
}
  
-	ret = dma_buf_fd(dmabuf, flags);

+   ret = get_unused_fd_flags(flags);
if (ret < 0) {
pr_err("dmabuf create fd failed, ret:%d\n", ret);
goto out_free_dmabuf;
}
  
  	*shared_fd = ret;

+   *file = dmabuf->file;
return 0;
  
  out_free_dmabuf:

@@ -1882,6 +1883,24 @@ static int criu_get_prime_handle(struct kgd_mem *mem,
return ret;
  }
  
+static void commit_files(struct file **files,

+struct kfd_criu_bo_bucket *bo_buckets,
+unsigned int count,
+int err)
+{
+   while (count--) {
+   struct file *file = files[count];
+   if (!file)


checkpatch.pl would complain here without an empty line after the 
variable definition.


Regards,
  Felix



+   continue;
+   if (err) {
+   fput(file);
+   put_unused_fd(bo_buckets[count].dmabuf_fd);
+   } else {
+   fd_install(bo_buckets[count].dmabuf_fd, file);
+   }
+   }
+}
+
  static int criu_checkpoint_bos(struct kfd_process *p,
   uint32_t num_bos,
   uint8_t __user *user_bos,
@@ -1890,6 +1909,7 @@ static int criu_checkpoint_bos(struct kfd_process *p,
  {
struct kfd_criu_bo_bucket *bo_buckets;
struct kfd_criu_bo_priv_data *bo_privs;
+   struct file **files = NULL;
int ret = 0, pdd_index, bo_index = 0, id;
void *mem;
  
@@ -1903,6 +1923,12 @@ static int criu_checkpoint_bos(struct kfd_process *p,

goto exit;
}
  
+	files = kvzalloc(num_bos * sizeof(struct file *), GFP_KERNEL);

+   if (!files) {
+   ret = -ENOMEM;
+   goto exit;
+   }
+
for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
struct kfd_process_device *pdd = p->pdds[pdd_index];
struct amdgpu_bo *dumper_bo;
@@ -1950,7 +1976,7 @@ static int criu_checkpoint_bos(struct kfd_process *p,
ret = criu_get_prime_handle(kgd_mem,
bo_bucket->alloc_flags &

KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
-   &bo_bucket->dmabuf_fd);
+   &bo_bucket->dmabuf_fd, &files[bo_index]);
if (ret)
goto exit;
} else {
@@ -2001,12 +2027,8 @@ static int criu_checkpoint_bos(struct kfd_process *p,
*priv_offset += num_bos * sizeof(*bo_privs);
  
  exit:

-   while (ret && bo_index--) {
-   if (bo_buckets[bo_index].alloc_flags
-   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
-   close_fd(bo_buckets[bo_index].dmabuf_fd);
-   }
-
+   commit_files(files, bo_buckets, bo_index, ret);
+   kvfree(files);
kvfree(bo_buckets);
kvfree(bo_privs);
return ret;
@@ -2358,7 +2380,8 @@ static int criu_restore_memory_of_gpu(struct 
kfd_process_device *pdd,
  
  static int criu_restore_bo(struct kfd_process *p,

   struct kfd_

Re: [PATCH 1/2][RFC] amdgpu: fix a race in kfd_mem_export_dmabuf()

2024-06-04 Thread Felix Kuehling



On 2024-06-03 22:13, Al Viro wrote:

Using drm_gem_prime_handle_to_fd() to set dmabuf up and insert it into
descriptor table, only to have it looked up by file descriptor and
remove it from descriptor table is not just too convoluted - it's
racy; another thread might have modified the descriptor table while
we'd been going through that song and dance.

It's not hard to fix - turn drm_gem_prime_handle_to_fd()
into a wrapper for a new helper that would simply return the
dmabuf, without messing with descriptor table.

Then kfd_mem_export_dmabuf() would simply use that new helper
and leave the descriptor table alone.

Signed-off-by: Al Viro 


This patch looks good to me on the amdgpu side. For the DRM side I'm 
adding dri-devel.


Acked-by: Felix Kuehling 



---
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 8975cf41a91a..793780bb819c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -25,7 +25,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
  
  #include 

@@ -812,18 +811,13 @@ static int kfd_mem_export_dmabuf(struct kgd_mem *mem)
if (!mem->dmabuf) {
struct amdgpu_device *bo_adev;
struct dma_buf *dmabuf;
-   int r, fd;
  
  		bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);

-   r = drm_gem_prime_handle_to_fd(&bo_adev->ddev, bo_adev->kfd.client.file,
+   dmabuf = drm_gem_prime_handle_to_dmabuf(&bo_adev->ddev, bo_adev->kfd.client.file,
   mem->gem_handle,
mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
-  DRM_RDWR : 0, &fd);
-   if (r)
-   return r;
-   dmabuf = dma_buf_get(fd);
-   close_fd(fd);
-   if (WARN_ON_ONCE(IS_ERR(dmabuf)))
+  DRM_RDWR : 0);
+   if (IS_ERR(dmabuf))
return PTR_ERR(dmabuf);
mem->dmabuf = dmabuf;
}
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 03bd3c7bd0dc..622c51d3fe18 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -409,23 +409,9 @@ static struct dma_buf *export_and_register_object(struct drm_device *dev,
return dmabuf;
  }
  
-/**

- * drm_gem_prime_handle_to_fd - PRIME export function for GEM drivers
- * @dev: dev to export the buffer from
- * @file_priv: drm file-private structure
- * @handle: buffer handle to export
- * @flags: flags like DRM_CLOEXEC
- * @prime_fd: pointer to storage for the fd id of the create dma-buf
- *
- * This is the PRIME export function which must be used mandatorily by GEM
- * drivers to ensure correct lifetime management of the underlying GEM object.
- * The actual exporting from GEM object to a dma-buf is done through the
- * &drm_gem_object_funcs.export callback.
- */
-int drm_gem_prime_handle_to_fd(struct drm_device *dev,
+struct dma_buf *drm_gem_prime_handle_to_dmabuf(struct drm_device *dev,
   struct drm_file *file_priv, uint32_t handle,
-  uint32_t flags,
-  int *prime_fd)
+  uint32_t flags)
  {
struct drm_gem_object *obj;
int ret = 0;
@@ -434,14 +420,14 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev,
mutex_lock(&file_priv->prime.lock);
obj = drm_gem_object_lookup(file_priv, handle);
if (!obj)  {
-   ret = -ENOENT;
+   dmabuf = ERR_PTR(-ENOENT);
goto out_unlock;
}
  
  	dmabuf = drm_prime_lookup_buf_by_handle(&file_priv->prime, handle);

if (dmabuf) {
get_dma_buf(dmabuf);
-   goto out_have_handle;
+   goto out;
}
  
  	mutex_lock(&dev->object_name_lock);

@@ -463,7 +449,6 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev,
/* normally the created dma-buf takes ownership of the ref,
 * but if that fails then drop the ref
 */
-   ret = PTR_ERR(dmabuf);
mutex_unlock(&dev->object_name_lock);
goto out;
}
@@ -478,34 +463,49 @@ int drm_gem_prime_handle_to_fd(struct drm_device *dev,
ret = drm_prime_add_buf_handle(&file_priv->prime,
   dmabuf, handle);
mutex_unlock(&dev->object_name_lock);
-   if (ret)
-   goto fail_put_dmabuf;
-
-out_have_handle:
-   ret = dma_buf_fd(dmabuf, flags);
-   /*
-* We must _not_ remove the buffer from the handle cache since the newly
-* created dma buf is already link
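The fd-producing entry point then becomes a thin wrapper; roughly (a
simplified sketch of the resulting code, error paths abbreviated):

	int drm_gem_prime_handle_to_fd(struct drm_device *dev,
				       struct drm_file *file_priv, uint32_t handle,
				       uint32_t flags, int *prime_fd)
	{
		struct dma_buf *dmabuf;
		int fd;

		dmabuf = drm_gem_prime_handle_to_dmabuf(dev, file_priv, handle, flags);
		if (IS_ERR(dmabuf))
			return PTR_ERR(dmabuf);

		fd = dma_buf_fd(dmabuf, flags);	/* only now touch the fd table */
		if (fd < 0) {
			dma_buf_put(dmabuf);
			return fd;
		}
		*prime_fd = fd;
		return 0;
	}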

Re: [PATCH 8/8] drm/amdkfd: remove dead code in kfd_create_vcrat_image_gpu

2024-05-31 Thread Felix Kuehling



On 2024-05-30 21:44, Zhang, Jesse(Jie) wrote:

[AMD Official Use Only - AMD Internal Distribution Only]

Hi Felix,

-Original Message-
From: Kuehling, Felix 
Sent: Friday, May 31, 2024 4:37 AM
To: Zhang, Jesse(Jie) ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Koenig, Christian 
; Kim, Jonathan ; Huang, Tim 

Subject: Re: [PATCH 8/8] drm/amdkfd: remove dead code in 
kfd_create_vcrat_image_gpu


On 2024-05-29 23:50, Jesse Zhang wrote:

Since the value of avail_size is at least VCRAT_SIZE_FOR_GPU (16384),
subtracting sizeof(struct crat_header) (40) and
sizeof(struct crat_subtype_computeunit) (40) cannot make it less than 0.

Signed-off-by: Jesse Zhang 
---
   drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 6 --
   1 file changed, 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 71150d503dc7..ead43386a7ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -2213,9 +2213,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
* Modify length and total_entries as subunits are added.
*/
   avail_size -= sizeof(struct crat_header);
- if (avail_size < 0)
- return -ENOMEM;
-

Avail_size is passed in from the caller through the *size parameter.
You're making an assumption about the caller here.

[Zhang, Jesse(Jie)]  avail_size is checked at the beginning of
kfd_create_vcrat_image_gpu
and it cannot be smaller than VCRAT_SIZE_FOR_GPU (16384).

 if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
 return -EINVAL;


Ok, I missed that. Makes sense. Maybe mention it in the commit 
description that kfd_create_vcrat_image_gpu itself checks the avail_size 
at the start. The patch is


Reviewed-by: Felix Kuehling 





Regards
Jesse

Regards,
Felix



   memset(crat_table, 0, sizeof(struct crat_header));

   memcpy(&crat_table->signature, CRAT_SIGNATURE, @@ -2229,9 +2226,6
@@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
* First fill in the sub type header and then sub type data
*/
   avail_size -= sizeof(struct crat_subtype_computeunit);
- if (avail_size < 0)
- return -ENOMEM;
-
   sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
   memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));



Re: [PATCH 7/8] drm/amdkfd: Comment out the unused variable use_static in pm_map_queues_v9

2024-05-31 Thread Felix Kuehling



On 2024-05-30 22:51, Jesse Zhang wrote:

To fix the warning about an unused value,
remove use_static and use the parameter is_static directly.

Signed-off-by: Jesse Zhang 
Suggested-by: Felix Kuehling 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c | 4 +---
  1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
index 8ee2bedd301a..00776f08351c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -213,7 +213,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, 
uint32_t *buffer,
struct queue *q, bool is_static)
  {
struct pm4_mes_map_queues *packet;
-   bool use_static = is_static;
  
  	packet = (struct pm4_mes_map_queues *)buffer;

memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
@@ -234,7 +233,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, 
uint32_t *buffer,
  
  	switch (q->properties.type) {

case KFD_QUEUE_TYPE_COMPUTE:
-   if (use_static)
+   if (is_static)
packet->bitfields2.queue_type =
queue_type__mes_map_queues__normal_latency_static_queue_vi;
break;
@@ -244,7 +243,6 @@ static int pm_map_queues_v9(struct packet_manager *pm, 
uint32_t *buffer,
break;
case KFD_QUEUE_TYPE_SDMA:
case KFD_QUEUE_TYPE_SDMA_XGMI:
-   use_static = false; /* no static queues under SDMA */
if (q->properties.sdma_engine_id < 2 &&
!pm_use_ext_eng(q->device->kfd))
packet->bitfields2.engine_sel = 
q->properties.sdma_engine_id +


Re: [PATCH 2/8] drm/amdkfd: fix the kdf debugger issue

2024-05-31 Thread Felix Kuehling

On 2024-05-30 22:51, Jesse Zhang wrote:

The expression caps | HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED
and caps | HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED
are always 1/true regardless of the values of their operands.

Fixes: 75de8428c3d632 ("drm/amdkfd: enable single alu ops for gfx12")
Signed-off-by: Jesse Zhang 
Suggested-by: Felix Kuehling 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 3f27bab7a502..34a282540c7e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -503,13 +503,13 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, 
uint32_t *flags)

kfd_topology_device_by_id(target->pdds[i]->dev->id);
uint32_t caps = topo_dev->node_props.capability;
  
-		if (!(caps | HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&

+   if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&
(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
*flags = prev_flags;
return -EACCES;
}
  
-		if (!(caps | HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&

+   if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED) &&
(*flags & KFD_DBG_TRAP_FLAG_SINGLE_ALU_OP)) {
*flags = prev_flags;
return -EACCES;
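A standalone demo of the operator bug, compilable with any C compiler;
the bit value is a placeholder, not the real HSA_CAP definition:

	#include <assert.h>
	#include <stdint.h>

	#define PRECISE_MEM_SUPPORTED (1u << 5)	/* placeholder bit */

	int main(void)
	{
		uint32_t caps = 0;	/* capability genuinely absent */

		/* OR with a nonzero constant can never be zero, so the
		 * negated test in the old code was always false: */
		assert(caps | PRECISE_MEM_SUPPORTED);
		/* AND is the correct presence test: */
		assert(!(caps & PRECISE_MEM_SUPPORTED));
		return 0;
	}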


Re: [PATCH v2 09/10] drm/amdgpu: fix missing reset domain locks

2024-05-31 Thread Felix Kuehling


On 2024-05-31 2:52, Christian König wrote:
> Am 31.05.24 um 00:02 schrieb Felix Kuehling:
>> On 2024-05-28 13:23, Yunxiang Li wrote:
>>> These functions are missing the lock for reset domain.
>>>
>>> Signed-off-by: Yunxiang Li 
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 4 +++-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 8 ++--
>>>   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 9 +++--
>>>   3 files changed, 16 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> index eb172388d99e..ddc5e9972da8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> @@ -34,6 +34,7 @@
>>>   #include 
>>>   #endif
>>>   #include "amdgpu.h"
>>> +#include "amdgpu_reset.h"
>>>   #include 
>>>   #include 
>>>   @@ -401,13 +402,14 @@ void amdgpu_gart_invalidate_tlb(struct 
>>> amdgpu_device *adev)
>>>   {
>>>   int i;
>>>   -    if (!adev->gart.ptr)
>>> +    if (!adev->gart.ptr || !down_read_trylock(&adev->reset_domain->sem))
>>>   return;
>>>     mb();
>>>   amdgpu_device_flush_hdp(adev, NULL);
>>>   for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
>>>   amdgpu_gmc_flush_gpu_tlb(adev, 0, i, 0);
>>> +    up_read(&adev->reset_domain->sem);
>>>   }
>>>     /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index e4742b65032d..52a3170d15b7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -307,8 +307,12 @@ static struct dma_fence *amdgpu_job_run(struct 
>>> drm_sched_job *sched_job)
>>>   dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)",
>>>   ring->name);
>>>   } else {
>>> -    r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>>> -   &fence);
>>> +    r = -ETIME;
>>> +    if (down_read_trylock(&adev->reset_domain->sem)) {
>>> +    r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs,
>>> +   job, &fence);
>>> +    up_read(&adev->reset_domain->sem);
>>> +    }
>>>   if (r)
>>>   dev_err(adev->dev,
>>>   "Error scheduling IBs (%d) in ring(%s)", r,
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> index 86ea610b16f3..21f5a1fb3bf8 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>>> @@ -28,6 +28,7 @@
>>>   #include "kfd_priv.h"
>>>   #include "kfd_kernel_queue.h"
>>>   #include "amdgpu_amdkfd.h"
>>> +#include "amdgpu_reset.h"
>>>     static inline struct process_queue_node *get_queue_by_qid(
>>>   struct process_queue_manager *pqm, unsigned int qid)
>>> @@ -87,8 +88,12 @@ void kfd_process_dequeue_from_device(struct 
>>> kfd_process_device *pdd)
>>>   return;
>>>     dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);
>>> -    if (dev->kfd->shared_resources.enable_mes)
>>> -    amdgpu_mes_flush_shader_debugger(dev->adev, 
>>> pdd->proc_ctx_gpu_addr);
>>> +    if (dev->kfd->shared_resources.enable_mes &&
>>> + down_read_trylock(&dev->adev->reset_domain->sem)) {
>>> +    amdgpu_mes_flush_shader_debugger(dev->adev,
>>> + pdd->proc_ctx_gpu_addr);
>>> +
>>
>> It's not clear to me what's the requirement for reset domain locking around 
>> MES calls. We have a lot more of them in kfd_device_queue_manager.c (mostly 
>> calling adev->mes.funcs->... directly). Do they all need to be wrapped 
>> individually?
> 
> Whenever you call a MES function (or any other function directly interacting 
> with the rings or the HW registers) you need to make sure that at least the 
> read side of the reset lock is held.

Having to do that for each caller of amdgpu_mes functions seems error prone.

Would it make sense to wrap that inside amdgpu_mes_lock/unlock? Maybe turn it 
into amdgpu_mes_trylock/unlock and make sure that all the amdgpu_mes functions 
that take that lock can fail and return an error code. Add an attribute so the 
compiler can flag callers that ignore the return values. This would make it 
easier to let the compiler spot places that don't handle errors due to reset 
lock failures.
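For illustration, such a wrapper could look roughly like this; the names
amdgpu_mes_trylock/amdgpu_mes_unlock_reset are assumptions sketching the
suggestion, not existing amdgpu API (only amdgpu_mes_lock/unlock and the
reset_domain semaphore are taken from the quoted code):

	static inline int __must_check amdgpu_mes_trylock(struct amdgpu_mes *mes)
	{
		struct amdgpu_device *adev =
			container_of(mes, struct amdgpu_device, mes);

		/* bail out instead of touching the HW during a reset */
		if (!down_read_trylock(&adev->reset_domain->sem))
			return -EIO;
		amdgpu_mes_lock(mes);	/* existing serialization */
		return 0;
	}

	static inline void amdgpu_mes_unlock_reset(struct amdgpu_mes *mes)
	{
		struct amdgpu_device *adev =
			container_of(mes, struct amdgpu_device, mes);

		amdgpu_mes_unlock(mes);
		up_read(&adev->reset_domain->sem);
	}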

Regards,
  Felix

> 
> Regards,
> Christian.
> 
>>
>> Regards,
>>   Felix
>>
>>
>>> up_read(&dev->adev->reset_domain->sem);
>>> +    }
>>>   pdd->already_dequeued = true;
>>>   }
> 


Re: [PATCH v2 09/10] drm/amdgpu: fix missing reset domain locks

2024-05-30 Thread Felix Kuehling

On 2024-05-28 13:23, Yunxiang Li wrote:

These functions are missing the lock for reset domain.

Signed-off-by: Yunxiang Li 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 4 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 8 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 9 +++--
  3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index eb172388d99e..ddc5e9972da8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -34,6 +34,7 @@
  #include 
  #endif
  #include "amdgpu.h"
+#include "amdgpu_reset.h"
  #include 
  #include 
  
@@ -401,13 +402,14 @@ void amdgpu_gart_invalidate_tlb(struct amdgpu_device *adev)

  {
int i;
  
-	if (!adev->gart.ptr)

+   if (!adev->gart.ptr || !down_read_trylock(&adev->reset_domain->sem))
return;
  
  	mb();

amdgpu_device_flush_hdp(adev, NULL);
for_each_set_bit(i, adev->vmhubs_mask, AMDGPU_MAX_VMHUBS)
amdgpu_gmc_flush_gpu_tlb(adev, 0, i, 0);
+   up_read(&adev->reset_domain->sem);
  }
  
  /**

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index e4742b65032d..52a3170d15b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -307,8 +307,12 @@ static struct dma_fence *amdgpu_job_run(struct 
drm_sched_job *sched_job)
dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)",
ring->name);
} else {
-   r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
-  &fence);
+   r = -ETIME;
+   if (down_read_trylock(&adev->reset_domain->sem)) {
+   r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs,
+  job, &fence);
+   up_read(&adev->reset_domain->sem);
+   }
if (r)
dev_err(adev->dev,
"Error scheduling IBs (%d) in ring(%s)", r,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 86ea610b16f3..21f5a1fb3bf8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -28,6 +28,7 @@
  #include "kfd_priv.h"
  #include "kfd_kernel_queue.h"
  #include "amdgpu_amdkfd.h"
+#include "amdgpu_reset.h"
  
  static inline struct process_queue_node *get_queue_by_qid(

struct process_queue_manager *pqm, unsigned int qid)
@@ -87,8 +88,12 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
return;
  
  	dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);

-   if (dev->kfd->shared_resources.enable_mes)
-   amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr);
+   if (dev->kfd->shared_resources.enable_mes &&
+   down_read_trylock(&dev->adev->reset_domain->sem)) {
+   amdgpu_mes_flush_shader_debugger(dev->adev,
+pdd->proc_ctx_gpu_addr);
+   


It's not clear to me what's the requirement for reset domain locking 
around MES calls. We have a lot more of them in 
kfd_device_queue_manager.c (mostly calling adev->mes.funcs->... 
directly). Do they all need to be wrapped individually?


Regards,
  Felix



up_read(&dev->adev->reset_domain->sem);
+   }
pdd->already_dequeued = true;
  }
  


Re: [PATCH 6/8] drm/amdkfd: remove dead code in the function svm_range_get_pte_flags

2024-05-30 Thread Felix Kuehling

On 2024-05-29 23:49, Jesse Zhang wrote:

The variable uncached is set to false, so the condition uncached cannot be
true. Remove the dead code; mapping_flags gets AMDGPU_VM_MTYPE_UC set in
the else path instead.

Signed-off-by: Jesse Zhang 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 +
  1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 407636a68814..bd9c2921e0dc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1171,7 +1171,6 @@ svm_range_get_pte_flags(struct kfd_node *node,
bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | 
KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
-   bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
unsigned int mtype_local;
  
  	if (domain == SVM_RANGE_VRAM_DOMAIN)

@@ -1220,9 +1219,7 @@ svm_range_get_pte_flags(struct kfd_node *node,
mtype_local = amdgpu_mtype_local == 1 ? 
AMDGPU_VM_MTYPE_NC :
amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : 
AMDGPU_VM_MTYPE_RW;
snoop = true;
-   if (uncached) {
-   mapping_flags |= AMDGPU_VM_MTYPE_UC;
-   } else if (domain == SVM_RANGE_VRAM_DOMAIN) {
+   if (domain == SVM_RANGE_VRAM_DOMAIN) {
/* local HBM region close to partition */
if (bo_node->adev == node->adev &&
(!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == 
node->xcp->mem_id))


Re: [PATCH 7/8] drm/amdkfd: Comment out the unused variable use_static in pm_map_queues_v9

2024-05-30 Thread Felix Kuehling



On 2024-05-30 10:12, Christian König wrote:

Am 30.05.24 um 05:50 schrieb Jesse Zhang:
To fix the warning about an unused value, comment out the variable
use_static.


Commenting out variables with // will just get you another warning 
from checkpatch.


Christian.



Signed-off-by: Jesse Zhang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c

index 8ee2bedd301a..c09476273f73 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
@@ -244,7 +244,7 @@ static int pm_map_queues_v9(struct packet_manager 
*pm, uint32_t *buffer,

  break;
  case KFD_QUEUE_TYPE_SDMA:
  case KFD_QUEUE_TYPE_SDMA_XGMI:
-    use_static = false; /* no static queues under SDMA */
+    //use_static = false; /* no static queues under SDMA */


I'd just remove this line, remove the use_static variable and use the 
parameter is_static directly under case KFD_QUEUE_TYPE_COMPUTE.


Regards,
  Felix



  if (q->properties.sdma_engine_id < 2 &&
  !pm_use_ext_eng(q->device->kfd))
  packet->bitfields2.engine_sel = 
q->properties.sdma_engine_id +




Re: [PATCH 2/8] drm/amdkfd: fix the kdf debugger issue

2024-05-30 Thread Felix Kuehling



On 2024-05-29 23:47, Jesse Zhang wrote:

The expression caps | HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED
is always 1/true regardless of the values of its operands.

Signed-off-by: Jesse Zhang 


Please add a Fixes tag. I think this is the commit that introduced the 
problem:


commit 75de8428c3d632eacc7890b7cb39dbec04d286c3
Author: Jonathan Kim 
Date:   Mon Aug 21 11:47:47 2023 -0400

    drm/amdkfd: enable single alu ops for gfx12

    GFX12 debugging requires setting up precise ALU operation for catching
    ALU exceptions.

    Signed-off-by: Jonathan Kim 
    Tested-by: Lancelot Six 
    Reviewed-by: Eric Huang 
    Signed-off-by: Alex Deucher 


---
  drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 3f27bab7a502..4abd275056d6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -503,7 +503,7 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, 
uint32_t *flags)

kfd_topology_device_by_id(target->pdds[i]->dev->id);
uint32_t caps = topo_dev->node_props.capability;
  
-		if (!(caps | HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&

+   if (!(caps & HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) &&


Looks like the same mistake was copied and pasted for 
HSA_CAP_TRAP_DEBUG_PRECISE_ALU_OPERATIONS_SUPPORTED a few lines below. 
Please fix that as well while you're at it.


Thanks,
  Felix



(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
*flags = prev_flags;
return -EACCES;


Re: [PATCH 3/8] drm/amdkfd: fix overflow for the function criu_restore_bos

2024-05-30 Thread Felix Kuehling



On 2024-05-29 23:47, Jesse Zhang wrote:

When copying the information from user space fails, the code jumps to
exit. But the variable i remains at 0, and doing i-- will underflow.


i-- may underflow, but the loop will still exit. Why is the underflow a 
problem?





Signed-off-by: Jesse Zhang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index fdf171ad4a3c..dac8fdc49e3b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2480,10 +2480,11 @@ static int criu_restore_bos(struct kfd_process *p,
ret = -EFAULT;
  
  exit:

-   while (ret && i--) {
+   while (ret && i) {
if (bo_buckets[i].alloc_flags
   & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | 
KFD_IOC_ALLOC_MEM_FLAGS_GTT))
close_fd(bo_buckets[i].dmabuf_fd);
+   i--;


This changes the value of i in the loop body. To get the same behaviour 
you'd need to decrement i at the start of the loop body.
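A standalone demo of the two loop shapes; same visit order, different
final value of i:

	#include <stdio.h>

	int main(void)
	{
		int i = 3;

		/* original: test, then decrement - body sees 2, 1, 0 and
		 * i ends at -1 (the underflow for an unsigned i) */
		while (i--)
			printf("post-decrement: i=%d\n", i);

		/* rework: decrement at the start of the body - same
		 * indices, but i ends at 0 */
		i = 3;
		while (i) {
			i--;
			printf("explicit decrement: i=%d\n", i);
		}
		return 0;
	}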


Regards,
  Felix



}
kvfree(bo_buckets);
kvfree(bo_privs);


Re: [PATCH 5/8] drm/amdkfd: fix the return for the function kfd_dbg_trap_set_flags

2024-05-30 Thread Felix Kuehling



On 2024-05-29 23:48, Jesse Zhang wrote:

If the rewind flag is set, it should return the final result of
setting the MES debug mode or refreshing the run list.


No. We're rewinding because an error occurred. We want to return that 
error, not the success probably returned by refreshing the runlist.


Regards,
  Felix




Signed-off-by: Jesse Zhang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
index 4abd275056d6..d12e5f29919a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
@@ -548,9 +548,9 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, 
uint32_t *flags)
continue;
  
  			if (!pdd->dev->kfd->shared_resources.enable_mes)

-   debug_refresh_runlist(pdd->dev->dqm);
+   r = debug_refresh_runlist(pdd->dev->dqm);
else
-   kfd_dbg_set_mes_debug_mode(pdd, true);
+   r = kfd_dbg_set_mes_debug_mode(pdd, true);
}
}
  


Re: [PATCH 8/8] drm/amdkfd: remove dead code in kfd_create_vcrat_image_gpu

2024-05-30 Thread Felix Kuehling



On 2024-05-29 23:50, Jesse Zhang wrote:

Since the value of avail_size is at least VCRAT_SIZE_FOR_GPU (16384),
subtracting sizeof(struct crat_header) (40) and
sizeof(struct crat_subtype_computeunit) (40) cannot make it less than 0.

Signed-off-by: Jesse Zhang 
---
  drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 6 --
  1 file changed, 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 71150d503dc7..ead43386a7ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -2213,9 +2213,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 * Modify length and total_entries as subunits are added.
 */
avail_size -= sizeof(struct crat_header);
-   if (avail_size < 0)
-   return -ENOMEM;
-


Avail_size is passed in from the caller through the *size parameter. 
You're making an assumption about the caller here.


Regards,
  Felix



memset(crat_table, 0, sizeof(struct crat_header));
  
  	memcpy(&crat_table->signature, CRAT_SIGNATURE,

@@ -2229,9 +2226,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 * First fill in the sub type header and then sub type data
 */
avail_size -= sizeof(struct crat_subtype_computeunit);
-   if (avail_size < 0)
-   return -ENOMEM;
-
sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
  


Re: [PATCH v2 04/10] drm/amdgpu/kfd: remove is_hws_hang and is_resetting

2024-05-29 Thread Felix Kuehling



On 2024-05-28 13:23, Yunxiang Li wrote:
> is_hws_hang and is_resetting serve pretty much the same purpose and
> both duplicate the work of the reset_domain lock; just check that
> directly instead. This also eliminates a few bugs listed below and gets
> rid of dqm->ops.pre_reset.
> 
> kfd_hws_hang did not need to avoid scheduling another reset. If the
> on-going reset decided to skip the GPU reset we have a bad time;
> otherwise the extra reset will get cancelled anyway.
> 
> remove_queue_mes forgot to check the is_resetting flag, unlike the
> pre-MES path unmap_queue_cpsch, so it did not block hw access during
> reset correctly.
> 
> Signed-off-by: Yunxiang Li 

The patch looks good to me. It's been years since I worked on HWS hang and GPU 
reset handling in KFD, and at the time the reset domain stuff didn't exist. The 
result of this patch looks a lot cleaner, which is good. If there are 
regressions, they are hopefully not too hard to fix.

One thing I could see going wrong is, that 
down_read_trylock(&dqm->dev->adev->reset_domain->sem) will not fail immediately 
when the reset is scheduled. So there may be multipe attempts at HW access that 
detect an error or time out, which may get the HW into a worse state or delay 
the actual reset.

At a minimum, I'd recommend testing this with /sys/kernel/debug/hang_hws on a 
pre-MES GPU, while some ROCm workload is running.

Reviewed-by: Felix Kuehling 


> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  1 -
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 79 ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 -
>  drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 11 ++-
>  .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |  4 +-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  4 +-
>  .../amd/amdkfd/kfd_process_queue_manager.c|  4 +-
>  7 files changed, 45 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index fba9b9a258a5..3e0f46d60de5 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -935,7 +935,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>   for (i = 0; i < kfd->num_nodes; i++) {
>   node = kfd->nodes[i];
>   kfd_smi_event_update_gpu_reset(node, false);
> - node->dqm->ops.pre_reset(node->dqm);
>   }
>  
>   kgd2kfd_suspend(kfd, false);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4721b2fccd06..3a2dc31279a4 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -35,6 +35,7 @@
>  #include "cik_regs.h"
>  #include "kfd_kernel_queue.h"
>  #include "amdgpu_amdkfd.h"
> +#include "amdgpu_reset.h"
>  #include "mes_api_def.h"
>  #include "kfd_debug.h"
>  
> @@ -155,14 +156,7 @@ static void kfd_hws_hang(struct device_queue_manager 
> *dqm)
>   /*
>* Issue a GPU reset if HWS is unresponsive
>*/
> - dqm->is_hws_hang = true;
> -
> - /* It's possible we're detecting a HWS hang in the
> -  * middle of a GPU reset. No need to schedule another
> -  * reset in this case.
> -  */
> - if (!dqm->is_resetting)
> - schedule_work(&dqm->hw_exception_work);
> + schedule_work(&dqm->hw_exception_work);
>  }
>  
>  static int convert_to_mes_queue_type(int queue_type)
> @@ -194,7 +188,7 @@ static int add_queue_mes(struct device_queue_manager 
> *dqm, struct queue *q,
>   int r, queue_type;
>   uint64_t wptr_addr_off;
>  
> - if (dqm->is_hws_hang)
> + if (!down_read_trylock(&adev->reset_domain->sem))
>   return -EIO;
>  
>   memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
> @@ -245,6 +239,7 @@ static int add_queue_mes(struct device_queue_manager 
> *dqm, struct queue *q,
>   amdgpu_mes_lock(&adev->mes);
>   r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
>   amdgpu_mes_unlock(&adev->mes);
> + up_read(&adev->reset_domain->sem);
>   if (r) {
>   dev_err(adev->dev, "failed to add hardware queue to MES, 
> doorbell=0x%x\n",
>   q->properties.doorbell_off);
> @@ -262,7 +257,7 @@ static int remove_queue_mes(struct device_queue_manager 
> *dqm, struct queue *q,
>   int r;
>   struct mes_remove_queue_input queue_input;
>  
> - if (

Re: [PATCH] drm/amdgpu: Make CPX mode auto default in NPS4

2024-05-27 Thread Felix Kuehling

On 2024-05-22 15:15, Rajneesh Bhardwaj wrote:

On GFXIP 9.4.3, make CPX mode the default compute mode if the node is
set up in NPS4 memory partition mode. This change only applies to dGPUs;
APUs continue to use TPX mode.

Cc: Lijo Lazar 
Signed-off-by: Rajneesh Bhardwaj 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index d62cfa4e2d2b..2c9a0aa41e2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -422,7 +422,7 @@ __aqua_vanjaram_get_auto_mode(struct amdgpu_xcp_mgr 
*xcp_mgr)
  
  	if (adev->gmc.num_mem_partitions == num_xcc / 2)

return (adev->flags & AMD_IS_APU) ? AMDGPU_TPX_PARTITION_MODE :
-   AMDGPU_QPX_PARTITION_MODE;
+   AMDGPU_CPX_PARTITION_MODE;
  
  	if (adev->gmc.num_mem_partitions == 2 && !(adev->flags & AMD_IS_APU))

return AMDGPU_DPX_PARTITION_MODE;


Re: [PATCH] drm/amdkfd: simplify APU VRAM handling

2024-05-27 Thread Felix Kuehling

On 2024-05-24 10:08, Alex Deucher wrote:

With commit 89773b85599a
("drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs")
big and small APU "VRAM" handling in KFD was unified.  Since AMD_IS_APU
is set for both big and small APUs, we can simplify the checks in
the code.

v2: clean up a few more places (Lang)

Signed-off-by: Alex Deucher 


This is a lot cleaner, thanks. I was looking for something like this 
when I reviewed the original patch but missed it. I found it now in 
amdgpu_discovery_set_ip_blocks (I think).


Acked-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c |  6 ++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  1 -
  4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 336eb51c4839..3af00b57cd8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
return -EINVAL;
  
  		vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);

-   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
+   if (adev->flags & AMD_IS_APU) {
system_mem_needed = size;
ttm_mem_needed = size;
}
@@ -233,7 +233,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
if (adev && xcp_id >= 0) {
adev->kfd.vram_used[xcp_id] += vram_needed;
adev->kfd.vram_used_aligned[xcp_id] +=
-   (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
+   (adev->flags & AMD_IS_APU) ?
vram_needed :
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
@@ -261,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device 
*adev,
  
  		if (adev) {

adev->kfd.vram_used[xcp_id] -= size;
-   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
+   if (adev->flags & AMD_IS_APU) {
adev->kfd.vram_used_aligned[xcp_id] -= size;
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
@@ -894,7 +894,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 * if peer device has large BAR. In contrast, access over xGMI is
 * allowed for both small and large BAR configurations of peer device
 */
-   if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) &&
+   if ((adev != bo_adev && !(adev->flags & AMD_IS_APU)) &&
((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
 (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
 (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
@@ -1682,7 +1682,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *adev,
- atomic64_read(&adev->vram_pin_size)
- reserved_for_pt;
  
-	if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {

+   if (adev->flags & AMD_IS_APU) {
system_mem_available = no_system_mem_limit ?
kfd_mem_limit.max_system_mem_limit :
kfd_mem_limit.max_system_mem_limit -
@@ -1730,7 +1730,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
  
-		if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {

+   if (adev->flags & AMD_IS_APU) {
domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_flags = 0;
@@ -1981,7 +1981,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
if (size) {
if (!is_imported &&
   (mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM ||
-  ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&
+  ((adev->flags & AMD_IS_APU) &&
mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))
*size = bo_size;
else
@@ -2404,7 +2404,7 @@ static int import_obj_create(struct amdgpu_device *adev,

Re: [PATCH] drm/amdgpu: Update the impelmentation of AMDGPU_PTE_MTYPE_GFX12

2024-05-21 Thread Felix Kuehling


On 2024-05-20 5:14, Shane Xiao wrote:
> This patch changes the implementation of AMDGPU_PTE_MTYPE_GFX12 to
> clear the MTYPE bits before setting the new value.
> This fixes a potential issue of GFX12 unintentionally setting memory to NC.
> 
> v2: Clear mtype field before setting the new one (Alex)
> 
> Signed-off-by: longlyao 
> Signed-off-by: Shane Xiao 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  7 +--
>  drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 23 +++
>  2 files changed, 16 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index bc71b44387b2..99b246e82ed6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -116,8 +116,11 @@ struct amdgpu_mem_stats;
>  #define AMDGPU_PTE_PRT_FLAG(adev)\
>   ((amdgpu_ip_version((adev), GC_HWIP, 0) >= IP_VERSION(12, 0, 0)) ? 
> AMDGPU_PTE_PRT_GFX12 : AMDGPU_PTE_PRT)
>  
> -#define AMDGPU_PTE_MTYPE_GFX12(a)((uint64_t)(a) << 54)
> -#define AMDGPU_PTE_MTYPE_GFX12_MASK  AMDGPU_PTE_MTYPE_GFX12(3ULL)
> +#define AMDGPU_PTE_MTYPE_GFX12_SHIFT(mtype)  ((uint64_t)(mytype) << 54)

You have a typo here: mytype -> mtype .
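A standalone demo of the clear-before-set pattern the patch implements,
using the shift and mask from the quoted header (the other names are
placeholders):

	#include <assert.h>
	#include <stdint.h>

	#define MTYPE_SHIFT(m)	((uint64_t)(m) << 54)
	#define MTYPE_MASK	MTYPE_SHIFT(3ULL)
	#define SET_MTYPE(flags, m) \
		(((flags) & ~MTYPE_MASK) | MTYPE_SHIFT(m))

	int main(void)
	{
		uint64_t pte = SET_MTYPE(0, 3);	/* field = 3 */

		/* a plain OR would leave stale bits: (3 | 1) == 3 */
		assert((SET_MTYPE(pte, 1) & MTYPE_MASK) == MTYPE_SHIFT(1));
		return 0;
	}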

Regards,
  Felix


> +#define AMDGPU_PTE_MTYPE_GFX12_MASK  AMDGPU_PTE_MTYPE_GFX12_SHIFT(3ULL)
> +#define AMDGPU_PTE_MTYPE_GFX12(flags, mtype) \
> + ((flags) & ((~AMDGPU_PTE_MTYPE_GFX12_MASK)) |   \
> +   AMDGPU_PTE_MTYPE_GFX12_SHIFT(mtype))
>  
>  #define AMDGPU_PTE_IS_PTE(1ULL << 63)
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> index e2c6ec3cc4f3..f2d331d0181f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
> @@ -461,17 +461,17 @@ static uint64_t gmc_v12_0_map_mtype(struct 
> amdgpu_device *adev, uint32_t flags)
>  {
>   switch (flags) {
>   case AMDGPU_VM_MTYPE_DEFAULT:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC);
>   case AMDGPU_VM_MTYPE_NC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC);
>   case AMDGPU_VM_MTYPE_WC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_WC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_WC);
>   case AMDGPU_VM_MTYPE_CC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_CC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_CC);
>   case AMDGPU_VM_MTYPE_UC:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_UC);
>   default:
> - return AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + return AMDGPU_PTE_MTYPE_GFX12(0ULL,MTYPE_NC);
>   }
>  }
>  
> @@ -509,8 +509,8 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>   *flags &= ~AMDGPU_PTE_EXECUTABLE;
>   *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE;
>  
> - *flags &= ~AMDGPU_PTE_MTYPE_GFX12_MASK;
> - *flags |= (mapping->flags & AMDGPU_PTE_MTYPE_GFX12_MASK);
> + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, (mapping->flags &   \
> +  AMDGPU_PTE_MTYPE_GFX12_MASK) >> AMDGPU_PTE_MTYPE_GFX12_SHIFT);
>  
>   if (mapping->flags & AMDGPU_PTE_PRT_GFX12) {
>   *flags |= AMDGPU_PTE_PRT_GFX12;
> @@ -524,8 +524,7 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>  
>   if (bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
>  AMDGPU_GEM_CREATE_UNCACHED))
> - *flags = (*flags & ~AMDGPU_PTE_MTYPE_GFX12_MASK) |
> -  AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC);
> + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_UC);
>  
>   bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
>   coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
> @@ -534,7 +533,7 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device 
> *adev,
>  
>   /* WA for HW bug */
>   if (is_system || ((bo_adev != adev) && coherent))
> - *flags |= AMDGPU_PTE_MTYPE_GFX12(MTYPE_NC);
> + *flags = AMDGPU_PTE_MTYPE_GFX12(*flags, MTYPE_NC);
>  
>  }
>  
> @@ -707,7 +706,7 @@ static int gmc_v12_0_gart_init(struct amdgpu_device *adev)
>   return r;
>  
>   adev->gart.table_size = adev->gart.num_gpu_pages * 8;
> - adev->gart.gart_pte_flags = AMDGPU_PTE_MTYPE_GFX12(MTYPE_UC) |
> + adev->gart.gart_pte_flags = AMDGPU_PTE_MTYPE_GFX12(0ULL, MTYPE_UC) |
>   AMDGPU_PTE_EXECUTABLE |
>   AMDGPU_PTE_IS_PTE;
>  


Re: [PATCH] drm/kfd: Correct pined buffer handling at kfd restore and validate process

2024-05-13 Thread Felix Kuehling



On 2024-05-13 11:18, Xiaogang.Chen wrote:
> From: Xiaogang Chen 
> 
> This reverts 8a774fe912ff09e39c2d3a3589c729330113f388 "drm/amdgpu: avoid 
> restore
> process run into dead loop" since buffer got pined is not related whether it

Spelling: pined -> pinned

Same in the commit headline.


> needs mapping. And skip buffer validation at kfd driver if the buffer has been
> pinned.
> 
> Signed-off-by: Xiaogang Chen 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 3314821e4cf3..80018738bd1c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -415,6 +415,10 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo 
> *bo, uint32_t domain,
>"Called with userptr BO"))
>   return -EINVAL;
>  
> + /* bo has been pined, not need validate it */

pined -> pinned

With those typos fixed, the patch is

Reviewed-by: Felix Kuehling 


> + if (bo->tbo.pin_count)
> + return 0;
> +
>   amdgpu_bo_placement_from_domain(bo, domain);
>  
>   ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
> @@ -2736,7 +2740,7 @@ static int confirm_valid_user_pages_locked(struct 
> amdkfd_process_info *process_i
>  
>   /* keep mem without hmm range at userptr_inval_list */
>   if (!mem->range)
> -  continue;
> + continue;
>  
>   /* Only check mem with hmm range associated */
>   valid = amdgpu_ttm_tt_get_user_pages_done(
> @@ -2981,9 +2985,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, 
> struct dma_fence __rcu *
>   if (!attachment->is_mapped)
>   continue;
>  
> - if (attachment->bo_va->base.bo->tbo.pin_count)
> - continue;
> -
>   kfd_mem_dmaunmap_attachment(mem, attachment);
>   ret = update_gpuvm_pte(mem, attachment, &sync_obj);
>   if (ret) {


Re: [PATCH v2] drm/amdkfd: Check correct memory types for is_system variable

2024-05-10 Thread Felix Kuehling



On 2024-05-10 10:06, Sreekant Somasekharan wrote:

To catch GPU mapping of system memory, TTM_PL_TT and AMDGPU_PL_PREEMPT
must be checked.

'Fixes: 3b01ca1b860d ("drm/amdkfd: mark GFX12 system and peer
GPU memory mappings as MTYPE_NC")'


I don't think that's a valid format for the Fixes tag. It should be a 
single line and no single quotes. Other than that, the patch is


Reviewed-by: Felix Kuehling 



Signed-off-by: Sreekant Somasekharan 
---
  drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index df0363ad1a51..6eb370609d01 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -495,7 +495,8 @@ static void gmc_v12_0_get_vm_pte(struct amdgpu_device *adev,
struct amdgpu_bo *bo = mapping->bo_va->base.bo;
struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
-   bool is_system = bo->tbo.resource->mem_type == TTM_PL_SYSTEM;
+   bool is_system = (bo->tbo.resource->mem_type == TTM_PL_TT) ||
+   (bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT);
  
  
  	*flags &= ~AMDGPU_PTE_EXECUTABLE;


Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique

2024-05-10 Thread Felix Kuehling



On 2024-05-09 16:06, Harish Kasiviswanathan wrote:

gpu_id needs to be unique for user space to identify GPUs via the KFD
interface. In the current implementation there is a very small
probability of having non-unique gpu_ids.

v2: Add check to confirm if gpu_id is unique. If not unique, find one
 Changed commit header to reflect the above
v3: Use crc16 as suggested-by: Lijo Lazar 
 Ensure that gpu_id != 0

Signed-off-by: Harish Kasiviswanathan 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 40 +++
  1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 219dcf504f24..4954a3021f70 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -31,6 +31,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include "kfd_priv.h"

  #include "kfd_crat.h"
@@ -1091,14 +1092,17 @@ void kfd_topology_shutdown(void)
  
  static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)

  {
-   uint32_t hashout;
+   uint32_t gpu_id;
uint32_t buf[8];
uint64_t local_mem_size;
-   int i;
+   struct kfd_topology_device *dev;
+   bool is_unique;
+   uint8_t *crc_buf;
  
  	if (!gpu)

return 0;
  
+	crc_buf = (uint8_t*)&buf;

local_mem_size = gpu->local_mem_info.local_mem_size_private +
gpu->local_mem_info.local_mem_size_public;
buf[0] = gpu->adev->pdev->devfn;
@@ -,10 +1115,34 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node 
*gpu)
buf[6] = upper_32_bits(local_mem_size);
buf[7] = (ffs(gpu->xcc_mask) - 1) | (NUM_XCC(gpu->xcc_mask) << 16);
  
-	for (i = 0, hashout = 0; i < 8; i++)

-   hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
+   gpu_id = crc16(0, crc_buf, sizeof(buf)) &
+((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
  
-	return hashout;

+   /* There is a very small possibility when generating a
+* 16 (KFD_GPU_ID_HASH_WIDTH) bit value from 8 word buffer
+* that the value could be 0 or non-unique. So, check if
+* it is unique and non-zero. If not unique increment till
+* unique one is found. In case of overflow, restart from 1
+*/
+
+   down_read(&topology_lock);
+   do {
+   is_unique = true;
+   if (!gpu_id)
+   gpu_id = 1;
+   list_for_each_entry(dev, &topology_device_list, list) {
+   if (dev->gpu && dev->gpu_id == gpu_id) {
+   is_unique = false;
+   break;
+   }
+   }
+   if (unlikely(!is_unique))
+   gpu_id = (gpu_id + 1) &
+ ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
+   } while (!is_unique);
+   up_read(&topology_lock);
+
+   return gpu_id;
  }
  /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
   *the GPU device is not already present in the topology device
@@ -1945,7 +1973,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config;
struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info;
  
-	gpu_id = kfd_generate_gpu_id(gpu);

if (gpu->xcp && !gpu->xcp->ddev) {
dev_warn(gpu->adev->dev,
 "Won't add GPU to topology since it has no drm node 
assigned.");
@@ -1968,6 +1995,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
if (res)
return res;
  
+	gpu_id = kfd_generate_gpu_id(gpu);

dev->gpu_id = gpu_id;
gpu->id = gpu_id;
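A standalone sketch of the collision-avoidance loop above: mask to the
hash width, reserve 0 for CPU nodes, and linearly probe until unique
(the helper names here are placeholders):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define ID_MASK 0xffffu		/* KFD_GPU_ID_HASH_WIDTH == 16 */

	static bool id_taken(uint32_t id, const uint32_t *used, int n)
	{
		for (int i = 0; i < n; i++)
			if (used[i] == id)
				return true;
		return false;
	}

	static uint32_t make_unique(uint32_t id, const uint32_t *used, int n)
	{
		bool unique;

		do {
			unique = true;
			if (!id)
				id = 1;	/* 0 means CPU node */
			if (id_taken(id, used, n)) {
				unique = false;
				id = (id + 1) & ID_MASK;	/* wraps to 0 */
			}
		} while (!unique);
		return id;
	}

	int main(void)
	{
		uint32_t used[] = { 0xffff, 1 };

		/* 0xffff collides, wraps to 0, skips to 1, collides, ends at 2 */
		printf("0x%x\n", make_unique(0xffff, used, 2));
		return 0;
	}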
  


Re: [PATCH 11/11] drm/tegra: Use fbdev client helpers

2024-05-07 Thread Felix Kuehling



On 2024-05-07 07:58, Thomas Zimmermann wrote:

Implement struct drm_client_funcs with the respective helpers and
remove the custom code from the emulation. The generic helpers are
equivalent in functionality.

Signed-off-by: Thomas Zimmermann 
---
  drivers/gpu/drm/radeon/radeon_fbdev.c | 66 ++-


Was radeon meant to be a separate patch?

Regards,
  Felix



  drivers/gpu/drm/tegra/fbdev.c | 58 ++-
  2 files changed, 6 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_fbdev.c 
b/drivers/gpu/drm/radeon/radeon_fbdev.c
index 02bf25759059a..cf790922174ea 100644
--- a/drivers/gpu/drm/radeon/radeon_fbdev.c
+++ b/drivers/gpu/drm/radeon/radeon_fbdev.c
@@ -29,7 +29,6 @@
  #include 
  #include 
  
-#include 

  #include 
  #include 
  #include 
@@ -293,71 +292,12 @@ static const struct drm_fb_helper_funcs 
radeon_fbdev_fb_helper_funcs = {
  };
  
  /*

- * Fbdev client and struct drm_client_funcs
+ * struct drm_client_funcs
   */
  
-static void radeon_fbdev_client_unregister(struct drm_client_dev *client)

-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-   struct drm_device *dev = fb_helper->dev;
-   struct radeon_device *rdev = dev->dev_private;
-
-   if (fb_helper->info) {
-   vga_switcheroo_client_fb_set(rdev->pdev, NULL);
-   drm_helper_force_disable_all(dev);
-   drm_fb_helper_unregister_info(fb_helper);
-   } else {
-   drm_client_release(&fb_helper->client);
-   drm_fb_helper_unprepare(fb_helper);
-   kfree(fb_helper);
-   }
-}
-
-static int radeon_fbdev_client_restore(struct drm_client_dev *client)
-{
-   drm_fb_helper_lastclose(client->dev);
-   vga_switcheroo_process_delayed_switch();
-
-   return 0;
-}
-
-static int radeon_fbdev_client_hotplug(struct drm_client_dev *client)
-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-   struct drm_device *dev = client->dev;
-   struct radeon_device *rdev = dev->dev_private;
-   int ret;
-
-   if (dev->fb_helper)
-   return drm_fb_helper_hotplug_event(dev->fb_helper);
-
-   ret = drm_fb_helper_init(dev, fb_helper);
-   if (ret)
-   goto err_drm_err;
-
-   if (!drm_drv_uses_atomic_modeset(dev))
-   drm_helper_disable_unused_functions(dev);
-
-   ret = drm_fb_helper_initial_config(fb_helper);
-   if (ret)
-   goto err_drm_fb_helper_fini;
-
-   vga_switcheroo_client_fb_set(rdev->pdev, fb_helper->info);
-
-   return 0;
-
-err_drm_fb_helper_fini:
-   drm_fb_helper_fini(fb_helper);
-err_drm_err:
-   drm_err(dev, "Failed to setup radeon fbdev emulation (ret=%d)\n", ret);
-   return ret;
-}
-
  static const struct drm_client_funcs radeon_fbdev_client_funcs = {
-   .owner  = THIS_MODULE,
-   .unregister = radeon_fbdev_client_unregister,
-   .restore= radeon_fbdev_client_restore,
-   .hotplug= radeon_fbdev_client_hotplug,
+   .owner = THIS_MODULE,
+   DRM_FBDEV_HELPER_CLIENT_FUNCS,
  };
  
  void radeon_fbdev_setup(struct radeon_device *rdev)

diff --git a/drivers/gpu/drm/tegra/fbdev.c b/drivers/gpu/drm/tegra/fbdev.c
index db6eaac3d30e6..f9cc365cfed94 100644
--- a/drivers/gpu/drm/tegra/fbdev.c
+++ b/drivers/gpu/drm/tegra/fbdev.c
@@ -12,7 +12,6 @@
  #include 
  
  #include 

-#include 
  #include 
  #include 
  #include 
@@ -150,63 +149,12 @@ static const struct drm_fb_helper_funcs 
tegra_fb_helper_funcs = {
  };
  
  /*

- * struct drm_client
+ * struct drm_client_funcs
   */
  
-static void tegra_fbdev_client_unregister(struct drm_client_dev *client)

-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-
-   if (fb_helper->info) {
-   drm_fb_helper_unregister_info(fb_helper);
-   } else {
-   drm_client_release(&fb_helper->client);
-   drm_fb_helper_unprepare(fb_helper);
-   kfree(fb_helper);
-   }
-}
-
-static int tegra_fbdev_client_restore(struct drm_client_dev *client)
-{
-   drm_fb_helper_lastclose(client->dev);
-
-   return 0;
-}
-
-static int tegra_fbdev_client_hotplug(struct drm_client_dev *client)
-{
-   struct drm_fb_helper *fb_helper = drm_fb_helper_from_client(client);
-   struct drm_device *dev = client->dev;
-   int ret;
-
-   if (dev->fb_helper)
-   return drm_fb_helper_hotplug_event(dev->fb_helper);
-
-   ret = drm_fb_helper_init(dev, fb_helper);
-   if (ret)
-   goto err_drm_err;
-
-   if (!drm_drv_uses_atomic_modeset(dev))
-   drm_helper_disable_unused_functions(dev);
-
-   ret = drm_fb_helper_initial_config(fb_helper);
-   if (ret)
-   goto err_drm_fb_helper_fini;
-
-   return 0;
-
-err_drm_fb_helper_fini:
-   drm_fb_helper_fini(fb_helper);
-err_drm_err:
-   drm_err(dev, 

Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique

2024-05-06 Thread Felix Kuehling

On 2024-05-06 17:10, Harish Kasiviswanathan wrote:

On 2024-05-06 16:30, Felix Kuehling wrote:

On 2024-05-03 18:06, Harish Kasiviswanathan wrote:

gpu_id needs to be unique for user space to identify GPUs via the KFD
interface. In the current implementation there is a very small
probability of having non-unique gpu_ids.

v2: Add check to confirm if gpu_id is unique. If not unique, find one
  Changed commit header to reflect the above

Signed-off-by: Harish Kasiviswanathan 
---
   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 26 ++-
   1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index b93913934b03..01d4c2e10c6d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
   uint32_t hashout;
   uint32_t buf[8];
   uint64_t local_mem_size;
+    struct kfd_topology_device *dev;
+    bool is_unique;
   int i;
     if (!gpu)
@@ -1115,6 +1117,28 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
   for (i = 0, hashout = 0; i < 8; i++)
   hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
   +    /* hash generated could be non-unique. Check if it is unique.
+ * If not unique increment till unique one is found. In case
+ * of overflow, restart from 1
+    */
+    down_read(&topology_lock);
+    do {
+    is_unique = true;
+    list_for_each_entry(dev, &topology_device_list, list) {
+    if (dev->gpu && dev->gpu_id == hashout) {
+    is_unique = false;
+    break;
+    }
+    }
+    if (unlikely(!is_unique)) {
+    hashout = (hashout + 1) &
+  ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
+    if (!hashout)
+    hashout = 1;

This doesn't catch the case that hashout was 0 before incrementing it, and was 
found to be unique.

I didn't actively think about this case when I sent the patch out. However,
we don't allow gpu_id to be 0; there are places where gpu_id == 0 means it
is a CPU node.


I think we make that assumption in a few places, both in kernel mode and 
user mode, e.g.:


struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, 
uint32_t gpu_id)
{
int i;

if (gpu_id) {
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];

if (pdd->user_gpu_id == gpu_id)
return pdd;
}
}
return NULL;
}

Or in the Thunk in hsaKmtGetNodeProperties:

/* For CPU only node don't add any additional GPU memory banks. */
if (gpu_id) {
uint64_t base, limit;
if (is_dgpu)
NodeProperties->NumMemoryBanks += NUM_OF_DGPU_HEAPS;
else
NodeProperties->NumMemoryBanks += NUM_OF_IGPU_HEAPS;
if (fmm_get_aperture_base_and_limit(FMM_MMIO, gpu_id, &base,
&limit) == HSAKMT_STATUS_SUCCESS)
NodeProperties->NumMemoryBanks += 1;
}

Regards,
  Felix





Regards,
   Felix



+    }
+    } while (!is_unique);
+    up_read(&topology_lock);
+
   return hashout;
   }
   /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
@@ -1946,7 +1970,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
   struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config;
   struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info;
   -    gpu_id = kfd_generate_gpu_id(gpu);
   if (gpu->xcp && !gpu->xcp->ddev) {
   dev_warn(gpu->adev->dev,
    "Won't add GPU to topology since it has no drm node assigned.");
@@ -1969,6 +1992,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
   if (res)
   return res;
   +    gpu_id = kfd_generate_gpu_id(gpu);
   dev->gpu_id = gpu_id;
   gpu->id = gpu_id;
   


Re: [PATCH] drm/amdkfd: Ensure gpu_id is unique

2024-05-06 Thread Felix Kuehling



On 2024-05-03 18:06, Harish Kasiviswanathan wrote:

gpu_id needs to be unique for user space to identify GPUs via the KFD
interface. In the current implementation there is a very small
probability of having non-unique gpu_ids.

v2: Add check to confirm if gpu_id is unique. If not unique, find one
 Changed commit header to reflect the above

Signed-off-by: Harish Kasiviswanathan 
---
  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 26 ++-
  1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index b93913934b03..01d4c2e10c6d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
uint32_t hashout;
uint32_t buf[8];
uint64_t local_mem_size;
+   struct kfd_topology_device *dev;
+   bool is_unique;
int i;
  
  	if (!gpu)

@@ -1115,6 +1117,28 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node *gpu)
for (i = 0, hashout = 0; i < 8; i++)
hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
  
+	/* hash generated could be non-unique. Check if it is unique.

+* If not unique increment till unique one is found. In case
+* of overflow, restart from 1
+   */
+   down_read(&topology_lock);
+   do {
+   is_unique = true;
+   list_for_each_entry(dev, &topology_device_list, list) {
+   if (dev->gpu && dev->gpu_id == hashout) {
+   is_unique = false;
+   break;
+   }
+   }
+   if (unlikely(!is_unique)) {
+   hashout = (hashout + 1) &
+ ((1 << KFD_GPU_ID_HASH_WIDTH) - 1);
+   if (!hashout)
+   hashout = 1;


This doesn't catch the case that hashout was 0 before incrementing it, 
and was found to be unique.


Regards,
  Felix



+   }
+   } while (!is_unique);
+   up_read(&topology_lock);
+
return hashout;
  }
  /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
@@ -1946,7 +1970,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config;
struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info;
  
-	gpu_id = kfd_generate_gpu_id(gpu);

if (gpu->xcp && !gpu->xcp->ddev) {
dev_warn(gpu->adev->dev,
 "Won't add GPU to topology since it has no drm node 
assigned.");
@@ -1969,6 +1992,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
if (res)
return res;
  
+	gpu_id = kfd_generate_gpu_id(gpu);

dev->gpu_id = gpu_id;
gpu->id = gpu_id;
  


Re: [PATCH] drm/amdkfd: Refactor kfd CRIU into its own file

2024-05-06 Thread Felix Kuehling



On 2024-05-06 15:20, David Francis wrote:

The kfd CRIU code takes up about a thousand lines
in the kfd_chardev file; move it to its own file.

No functional change intended.

Signed-off-by: David Francis 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 972 +-
  drivers/gpu/drm/amd/amdkfd/kfd_criu.c| 989 +++
  drivers/gpu/drm/amd/amdkfd/kfd_criu.h|  50 ++
  4 files changed, 1046 insertions(+), 966 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_criu.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 0d3d8972240d..e06af4073ac5 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -32,6 +32,7 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_flat_memory.o \
$(AMDKFD_PATH)/kfd_process.o \
$(AMDKFD_PATH)/kfd_queue.o \
+   $(AMDKFD_PATH)/kfd_criu.o \


Any particular reason for adding this in the middle and not the end?



$(AMDKFD_PATH)/kfd_mqd_manager.o \
$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6b713fb0b818..e6e44a199a93 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -45,6 +45,7 @@


Can you remove #include  and "amdgpu_dma_buf.h" here? 
Or is it still needed by something else left in kfd_chardev.c?


Other than that, this patch is

Reviewed-by: Felix Kuehling 



  #include "kfd_smi_events.h"
  #include "amdgpu_dma_buf.h"
  #include "kfd_debug.h"
+#include "kfd_criu.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1751,967 +1752,6 @@ static int kfd_ioctl_svm(struct file *filep, struct 
kfd_process *p, void *data)
  }
  #endif
  
-static int criu_checkpoint_process(struct kfd_process *p,

-uint8_t __user *user_priv_data,
-uint64_t *priv_offset)
-{
-   struct kfd_criu_process_priv_data process_priv;
-   int ret;
-
-   memset(&process_priv, 0, sizeof(process_priv));
-
-   process_priv.version = KFD_CRIU_PRIV_VERSION;
-   /* For CR, we don't consider negative xnack mode which is used for
-* querying without changing it, here 0 simply means disabled and 1
-* means enabled so retry for finding a valid PTE.
-*/
-   process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;
-
-   ret = copy_to_user(user_priv_data + *priv_offset,
-   &process_priv, sizeof(process_priv));
-
-   if (ret) {
-   pr_err("Failed to copy process information to user\n");
-   ret = -EFAULT;
-   }
-
-   *priv_offset += sizeof(process_priv);
-   return ret;
-}
-
-static int criu_checkpoint_devices(struct kfd_process *p,
-uint32_t num_devices,
-uint8_t __user *user_addr,
-uint8_t __user *user_priv_data,
-uint64_t *priv_offset)
-{
-   struct kfd_criu_device_priv_data *device_priv = NULL;
-   struct kfd_criu_device_bucket *device_buckets = NULL;
-   int ret = 0, i;
-
-   device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), 
GFP_KERNEL);
-   if (!device_buckets) {
-   ret = -ENOMEM;
-   goto exit;
-   }
-
-   device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
-   if (!device_priv) {
-   ret = -ENOMEM;
-   goto exit;
-   }
-
-   for (i = 0; i < num_devices; i++) {
-   struct kfd_process_device *pdd = p->pdds[i];
-
-   device_buckets[i].user_gpu_id = pdd->user_gpu_id;
-   device_buckets[i].actual_gpu_id = pdd->dev->id;
-
-   /*
-* priv_data does not contain useful information for now and is 
reserved for
-* future use, so we do not set its contents.
-*/
-   }
-
-   ret = copy_to_user(user_addr, device_buckets, num_devices * 
sizeof(*device_buckets));
-   if (ret) {
-   pr_err("Failed to copy device information to user\n");
-   ret = -EFAULT;
-   goto exit;
-   }
-
-   ret = copy_to_user(user_priv_data + *priv_offset,
-  device_priv,
-  num_devices * sizeof(*device_priv));
-   if (ret) {
-   pr_err("Failed to copy device information to user\n");
-   ret = -EFAULT;

Re: [PATCH] drm/amdkfd: Remove arbitrary timeout for hmm_range_fault

2024-05-06 Thread Felix Kuehling



On 2024-05-01 18:56, Philip Yang wrote:

On systems with khugepaged enabled and use cases with THP buffers,
hmm_range_fault may take more than 15 seconds to return -EBUSY; the
arbitrary timeout value is not accurate and causes memory allocation
failures.

Remove the arbitrary timeout value and return -EAGAIN to the application
if hmm_range_fault returns -EBUSY; userspace libdrm and Thunk will then
call the ioctl again.

Change the -EAGAIN report to a debug message as this is not an error.

Signed-off-by: Philip Yang 


Assuming this passes your stress testing without CPU stall warnings, 
this patch is


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c |  5 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c  | 12 +++-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c |  5 +
  3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54198c3928c7..02696c2102f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1087,7 +1087,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t 
user_addr,
  
  	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);

if (ret) {
-   pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
+   if (ret == -EAGAIN)
+   pr_debug("Failed to get user pages, try again\n");
+   else
+   pr_err("%s: Failed to get user pages: %d\n", __func__, 
ret);
goto unregister_out;
}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c

index 431ec72655ec..e36fede7f74c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct 
mmu_interval_notifier *notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end);
  
-		/* Assuming 64MB takes maximum 1 second to fault page address */

-   timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
-   timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
-   timeout = jiffies + msecs_to_jiffies(timeout);
+   timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
  
  retry:

hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
r = hmm_range_fault(hmm_range);
if (unlikely(r)) {
-   schedule();
-   /*
-* FIXME: This timeout should encompass the retry from
-* mmu_interval_read_retry() as well.
-*/
if (r == -EBUSY && !time_after(jiffies, timeout))
goto retry;
goto out_free_pfns;
@@ -247,6 +239,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
  out_free_range:
kfree(hmm_range);
  
+	if (r == -EBUSY)

+   r = -EAGAIN;
return r;
  }
  
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index 94f83be2232d..e7040f809f33 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1670,11 +1670,8 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
   readonly, owner, NULL,
   &hmm_range);
WRITE_ONCE(p->svms.faulting_task, NULL);
-   if (r) {
+   if (r)
pr_debug("failed %d to get svm range pages\n", 
r);
-   if (r == -EBUSY)
-   r = -EAGAIN;
-   }
} else {
r = -EFAULT;
}
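
Userspace's half of this contract is simply to retry on -EAGAIN. A
minimal sketch of what a libdrm/Thunk-style caller would do; the
request code and argument struct are placeholders, not the actual KFD
ioctl definitions:

#include <errno.h>
#include <sys/ioctl.h>

/* Retry an ioctl while the kernel reports EAGAIN (hmm_range_fault
 * returned -EBUSY underneath). Real callers would pass the specific
 * KFD allocation ioctl and its args struct.
 */
static int kfd_ioctl_retry(int fd, unsigned long request, void *args)
{
	int ret;

	do {
		ret = ioctl(fd, request, args);
	} while (ret == -1 && errno == EAGAIN);

	return ret;
}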


Re: Proposal to add CRIU support to DRM render nodes

2024-05-03 Thread Felix Kuehling



On 2024-04-16 10:04, Tvrtko Ursulin wrote:
> 
> On 01/04/2024 18:58, Felix Kuehling wrote:
>>
>> On 2024-04-01 12:56, Tvrtko Ursulin wrote:
>>>
>>> On 01/04/2024 17:37, Felix Kuehling wrote:
>>>> On 2024-04-01 11:09, Tvrtko Ursulin wrote:
>>>>>
>>>>> On 28/03/2024 20:42, Felix Kuehling wrote:
>>>>>>
>>>>>> On 2024-03-28 12:03, Tvrtko Ursulin wrote:
>>>>>>>
>>>>>>> Hi Felix,
>>>>>>>
>>>>>>> I had one more thought while browsing around the amdgpu CRIU plugin. It 
>>>>>>> appears it relies on the KFD support being compiled in and /dev/kfd 
>>>>>>> present, correct? AFAICT at least, it relies on that to figure out the 
>>>>>>> amdgpu DRM node.
>>>>>>>
>>>>>>> It would probably be good to consider designing things without that 
>>>>>>> dependency. So that checkpointing an application which does not use 
>>>>>>> /dev/kfd is possible. Or if the kernel does not even have the KFD 
>>>>>>> support compiled in.
>>>>>>
>>>>>> Yeah, if we want to support graphics apps that don't use KFD, we should 
>>>>>> definitely do that. Currently we get a lot of topology information from 
>>>>>> KFD, not even from the /dev/kfd device but from the sysfs nodes exposed 
>>>>>> by KFD. We'd need to get GPU device info from the render nodes instead. 
>>>>>> And if KFD is available, we may need to integrate both sources of 
>>>>>> information.
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> It could perhaps mean no more than adding some GPU discovery code into 
>>>>>>> CRIU, which should be flexible enough to account for things like 
>>>>>>> re-assigned minor numbers due to driver reload.
>>>>>>
>>>>>> Do you mean adding GPU discovery to the core CRIU, or to the plugin. I 
>>>>>> was thinking this is still part of the plugin.
>>>>>
>>>>> Yes I agree. I was only thinking about adding some DRM device discovery 
>>>>> code in a more decoupled fashion from the current plugin, for both the 
>>>>> reason discussed above (decoupling a bit from reliance on kfd sysfs), and 
>>>>> then also if/when a new DRM driver might want to implement this the code 
>>>>> could be moved to some common plugin area.
>>>>>
>>>>> I am not sure how feasible that would be though. The "gpu id" concept and 
>>>>> its matching in the current kernel code and CRIU plugin - is that value 
>>>>> tied to the physical GPU instance or how it works?
>>>>
>>>> The concept of the GPU ID is that it's stable while the system is up, even 
>>>> when devices get added and removed dynamically. It was baked into the API 
>>>> early on, but I don't think we ever fully validated device hot plug. I 
>>>> think the closest we're getting is with our latest MI GPUs and dynamic 
>>>> partition mode change.
>>>
>>> Doesn't it read the saved gpu id from the image file while doing restore 
>>> and try to open the render node to match it? Maybe I am misreading the 
>>> code.. But if it does, does it imply that in practice it could be stable 
>>> across reboots? Or that it is not possible to restore to a different 
>>> instance of maybe the same GPU model installed in a system?
>>
>> Ah, the idea is, that when you restore on a different system, you may get 
>> different GPU IDs. Or you may checkpoint an app running on GPU 1 but restore 
>> it on GPU 2 on the same system. That's why we need to translate GPU IDs in 
>> restored applications. User mode still uses the old GPU IDs, but the kernel 
>> mode driver translates them to the actual GPU IDs of the GPUs that the 
>> process was restored on.
> 
> I see.. I think. Normal flow is ppd->user_gpu_id set during client init, but 
> for restored clients it gets overridden during restore so that any further 
> ioctls can actually not instantly fail.
> 
> And then in amdgpu_plugin_restore_file, when it is opening the render node, 
> it relies on the kfd topology to have filled in (more or less) the 
> target_gpu_id corresponding to the render node gpu id of the target GPU - the 
> one associated with the new kfd gpu_id?

Ye
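
To make the translation concrete: the checkpoint image stores
user/actual GPU ID pairs (the device buckets seen in
criu_checkpoint_devices earlier in this archive), and restore builds a
per-process map from them. A minimal illustrative sketch -- the
structure and helper names here are invented for illustration, not the
driver's actual implementation:

#include <stddef.h>
#include <stdint.h>

/* Mirrors the pairing in the checkpoint device buckets. */
struct gpu_id_pair {
	uint32_t user_gpu_id;	/* ID the app saw when checkpointed */
	uint32_t actual_gpu_id;	/* ID of the GPU chosen at restore */
};

/* Translate a GPU ID from a restored process into the ID of the GPU
 * it actually runs on now; returns 0 (reserved) if not found.
 */
static uint32_t translate_gpu_id(const struct gpu_id_pair *map,
				 size_t n, uint32_t user_gpu_id)
{
	for (size_t i = 0; i < n; i++)
		if (map[i].user_gpu_id == user_gpu_id)
			return map[i].actual_gpu_id;
	return 0;
}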

Re: [PATCH v3 2/3] drm/amdgpu: Reduce mem_type to domain double indirection

2024-05-02 Thread Felix Kuehling



On 2024-04-30 13:16, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

All memory domains apart from AMDGPU_GEM_DOMAIN_GTT map 1:1 to TTM
placements. And the former can be either AMDGPU_PL_PREEMPT or TTM_PL_TT,
depending on AMDGPU_GEM_CREATE_PREEMPTIBLE.

Simplify a few places in the code which convert the TTM placement into
a domain by checking against the current placement directly.

In the conversion, AMDGPU_PL_PREEMPT does not have to be handled
because amdgpu_mem_type_to_domain() cannot return that value anyway.

v2:
  * Remove AMDGPU_PL_PREEMPT handling.

v3:
  * Rebase.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Christian König  # v1
Reviewed-by: Felix Kuehling  # v2


I'm waiting for Christian to review patches 1 and 3. Then I can apply 
the whole series.


Regards,
  Felix



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  3 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 29 +
  2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 055ba2ea4c12..0b3b10d21952 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -165,8 +165,7 @@ static struct sg_table *amdgpu_dma_buf_map(struct 
dma_buf_attachment *attach,
if (r)
return ERR_PTR(r);
  
-	} else if (!(amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type) &

-AMDGPU_GEM_DOMAIN_GTT)) {
+   } else if (bo->tbo.resource->mem_type != TTM_PL_TT) {
return ERR_PTR(-EBUSY);
}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index b2a83c802bbd..c581e4952cbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -983,12 +983,11 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
  
  	ttm_bo_pin(&bo->tbo);
  
-	domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
+   if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
atomic64_add(amdgpu_bo_size(bo), &adev->vram_pin_size);
atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo),
 &adev->visible_pin_size);
-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);
}
  
@@ -1289,7 +1288,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,

struct ttm_resource *res = bo->tbo.resource;
uint64_t size = amdgpu_bo_size(bo);
struct drm_gem_object *obj;
-   unsigned int domain;
bool shared;
  
  	/* Abort if the BO doesn't currently have a backing store */

@@ -1299,21 +1297,20 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
obj = &bo->tbo.base;
shared = drm_gem_object_is_shared_for_memory_stats(obj);
  
-	domain = amdgpu_mem_type_to_domain(res->mem_type);

-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VRAM:
+   switch (res->mem_type) {
+   case TTM_PL_VRAM:
stats->vram += size;
-   if (amdgpu_res_cpu_visible(adev, bo->tbo.resource))
+   if (amdgpu_res_cpu_visible(adev, res))
stats->visible_vram += size;
if (shared)
stats->vram_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_GTT:
+   case TTM_PL_TT:
stats->gtt += size;
if (shared)
stats->gtt_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_CPU:
+   case TTM_PL_SYSTEM:
default:
stats->cpu += size;
if (shared)
@@ -1326,7 +1323,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->requested_visible_vram += size;
  
-		if (domain != AMDGPU_GEM_DOMAIN_VRAM) {

+   if (res->mem_type != TTM_PL_VRAM) {
stats->evicted_vram += size;
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->evicted_visible_vram += size;
@@ -1600,20 +1597,18 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, 
struct seq_file *m)
u64 size;
  
  	if (dma_resv_trylock(bo->tbo.base.resv)) {

-   unsigned int domain;
  
-		domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VRAM:
+   switch (bo->tbo.resource->mem_type) {
+   case TTM_PL_VRAM:
  

Re: [PATCH 1/2] drm/amdkfd: Use dev_error instead of pr_error

2024-05-01 Thread Felix Kuehling


On 2024-05-01 21:08, Harish Kasiviswanathan wrote:
> No functional change. This will help in moving gpu_id creation to next
> step while still being able to identify the correct GPU
> 
> Signed-off-by: Harish Kasiviswanathan 

Reviewed-by: Felix Kuehling 

> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index ba326b43bec5..b93913934b03 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1773,7 +1773,7 @@ static void kfd_fill_cache_non_crat_info(struct 
> kfd_topology_device *dev, struct
>   pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
>  }
>  
> -static int kfd_topology_add_device_locked(struct kfd_node *gpu, uint32_t 
> gpu_id,
> +static int kfd_topology_add_device_locked(struct kfd_node *gpu,
> struct kfd_topology_device **dev)
>  {
>   int proximity_domain = ++topology_crat_proximity_domain;
> @@ -1786,8 +1786,7 @@ static int kfd_topology_add_device_locked(struct 
> kfd_node *gpu, uint32_t gpu_id,
>   COMPUTE_UNIT_GPU, gpu,
>   proximity_domain);
>   if (res) {
> - pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
> -gpu_id);
> + dev_err(gpu->adev->dev, "Error creating VCRAT\n");
>   topology_crat_proximity_domain--;
>   goto err;
>   }
> @@ -1798,8 +1797,7 @@ static int kfd_topology_add_device_locked(struct 
> kfd_node *gpu, uint32_t gpu_id,
>  &temp_topology_device_list,
>  proximity_domain);
>   if (res) {
> - pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
> -gpu_id);
> + dev_err(gpu->adev->dev, "Error parsing VCRAT\n");
>   topology_crat_proximity_domain--;
>   goto err;
>   }
> @@ -1825,8 +1823,8 @@ static int kfd_topology_add_device_locked(struct 
> kfd_node *gpu, uint32_t gpu_id,
>   if (!res)
>   sys_props.generation_count++;
>   else
> - pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. 
> res=%d\n",
> -gpu_id, res);
> + dev_err(gpu->adev->dev, "Failed to update GPU to sysfs 
> topology. res=%d\n",
> + res);
>  
>  err:
>   kfd_destroy_crat_image(crat_image);
> @@ -1951,11 +1949,10 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   gpu_id = kfd_generate_gpu_id(gpu);
>   if (gpu->xcp && !gpu->xcp->ddev) {
>   dev_warn(gpu->adev->dev,
> - "Won't add GPU (ID: 0x%x) to topology since it has no drm node 
> assigned.",
> - gpu_id);
> +  "Won't add GPU to topology since it has no drm node 
> assigned.");
>   return 0;
>   } else {
> - pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
> + dev_dbg(gpu->adev->dev, "Adding new GPU to topology\n");
>   }
>  
>   /* Check to see if this gpu device exists in the topology_device_list.
> @@ -1967,7 +1964,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   down_write(&topology_lock);
>   dev = kfd_assign_gpu(gpu);
>   if (!dev)
> - res = kfd_topology_add_device_locked(gpu, gpu_id, &dev);
> + res = kfd_topology_add_device_locked(gpu, &dev);
>   up_write(&topology_lock);
>   if (res)
>   return res;


Re: [PATCH 2/2] drm/amdkfd: Improve chances of unique gpu_id

2024-05-01 Thread Felix Kuehling



On 2024-05-01 21:08, Harish Kasiviswanathan wrote:
> gpu_id needs to be unique for user space to identify GPUs via the KFD
> interface. Do a single-pass search to detect a collision. If one is
> detected, increment gpu_id by one.
> 
> The probability of a collision is very low. Hence, no more complexity
> is added to ensure uniqueness.
> Signed-off-by: Harish Kasiviswanathan 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index b93913934b03..f2d1e82e7bed 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1095,6 +1095,8 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node 
> *gpu)
>   uint32_t hashout;
>   uint32_t buf[8];
>   uint64_t local_mem_size;
> + struct kfd_topology_device *dev;
> + bool is_unique = true;
>   int i;
>  
>   if (!gpu)
> @@ -1115,7 +1117,13 @@ static uint32_t kfd_generate_gpu_id(struct kfd_node 
> *gpu)
>   for (i = 0, hashout = 0; i < 8; i++)
>   hashout ^= hash_32(buf[i], KFD_GPU_ID_HASH_WIDTH);
>  
> - return hashout;
> + down_read(&topology_lock);
> + list_for_each_entry(dev, &topology_device_list, list) {
> + if (dev->gpu && dev->gpu_id == hashout)
> + is_unique = false;

You can break early here.

> + }
> + up_read(&topology_lock);
> + return is_unique ? hashout : ++hashout;

We should make sure that hashout stays within the KFD_GPU_ID_HASH_WIDTH. And if 
we're already adding a collision check, we may as well make it air-tight. It 
should be easy enough by wrapping it in a do-while loop. While we're at it, can 
we also check that the hash is not 0, because that value is used for non-GPU 
nodes? I think this would satisfy all my requests:

do {
if (!hashout)
hashout++;
is_unique = true;
list_for_each_entry(dev, &topology_device_list, list) {
if (dev->gpu && dev->gpu_id == hashout) {
is_unique = false;
hashout = (hashout + 1) &
  ((1U << KFD_GPU_ID_HASH_WIDTH) - 1);
break;
}
}
} while (!is_unique);

Regards,
  Felix


>  }
>  /* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
>   *   the GPU device is not already present in the topology device
> @@ -1946,7 +1954,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config;
>   struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info;
>  
> - gpu_id = kfd_generate_gpu_id(gpu);
>   if (gpu->xcp && !gpu->xcp->ddev) {
>   dev_warn(gpu->adev->dev,
>"Won't add GPU to topology since it has no drm node 
> assigned.");
> @@ -1969,6 +1976,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
>   if (res)
>   return res;
>  
> + gpu_id = kfd_generate_gpu_id(gpu);
>   dev->gpu_id = gpu_id;
>   gpu->id = gpu_id;
>  


Re: [PATCH v2] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Felix Kuehling



On 2024-05-01 16:38, Ramesh Errabolu wrote:

Analysis of the code by Coverity, a static code analyser, has identified
a resource leak in the symbol hmm_range. The leak occurs when one of
the steps prior to its release encounters an error.

Signed-off-by: Ramesh Errabolu 


Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 386875e6eb96..481cb958e165 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
start = map_start << PAGE_SHIFT;
end = (map_last + 1) << PAGE_SHIFT;
for (addr = start; !r && addr < end; ) {
-   struct hmm_range *hmm_range;
+   struct hmm_range *hmm_range = NULL;
unsigned long map_start_vma;
unsigned long map_last_vma;
struct vm_area_struct *vma;
@@ -1696,7 +1696,12 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
}
  
  		svm_range_lock(prange);

-   if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
+
+   /* Free backing memory of hmm_range if it was initialized.
+    * Override return value to TRY AGAIN only if prior returns
+    * were successful
+    */
+   if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && 
!r) {
pr_debug("hmm update the range, need validate again\n");
r = -EAGAIN;
}
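
The pattern the review converged on is worth stating in isolation:
always release the range if it was set up, and only promote the "range
changed, revalidate" signal to -EAGAIN when no earlier error is
pending. A kernel-context sketch of that shape -- the helper is
illustrative, not the actual kfd_svm.c control flow:

/* Illustrative helper: hmm_range may be NULL if setup never ran.
 * amdgpu_hmm_range_get_pages_done() frees the range's backing memory
 * and returns true if the range was invalidated in the meantime.
 */
static int svm_finish_hmm_range(struct hmm_range *hmm_range, int r)
{
	if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r)
		r = -EAGAIN;	/* caller should validate and map again */

	return r;
}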


Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Felix Kuehling



On 2024-05-01 14:34, Felix Kuehling wrote:



On 2024-04-30 19:29, Ramesh Errabolu wrote:

Analysis of the code by Coverity, a static code analyser, has identified
a resource leak in the symbol hmm_range. The leak occurs when one of
the steps prior to its release encounters an error.

Signed-off-by: Ramesh Errabolu 
---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index 386875e6eb96..dcb1d5d3f860 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct 
mm_struct *mm,

  start = map_start << PAGE_SHIFT;
  end = (map_last + 1) << PAGE_SHIFT;
  for (addr = start; !r && addr < end; ) {
-    struct hmm_range *hmm_range;
+    struct hmm_range *hmm_range = NULL;
  unsigned long map_start_vma;
  unsigned long map_last_vma;
  struct vm_area_struct *vma;
@@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct 
mm_struct *mm,

  }
  svm_range_lock(prange);
-    if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
+
+    // Free backing memory of hmm_range if it was initialized
+    if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) {
  pr_debug("hmm update the range, need validate again\n");
  r = -EAGAIN;


Nack! This can now override other error codes that aren't meant to be 
overridden with -EAGAIN.


I think a better solution would be to just reverse this condition to 
ensure that amdgpu_hmm_range_get_pages_done is always called:


     if (amdgpu_hmm_range_get_pages_done(hmm_range) && !r) {


Correction: You still need the NULL check:

if (hmm_range &&
amdgpu_hmm_range_get_pages_done(hmm_range) &&
!r) {
...
}

Regards,
  Felix


     ...
     r = -EAGAIN;
     }

Regards,
   Felix


  }


Re: [PATCH] drm/amd/amdkfd: Fix a resource leak in svm_range_validate_and_map()

2024-05-01 Thread Felix Kuehling




On 2024-04-30 19:29, Ramesh Errabolu wrote:

Analysis of the code by Coverity, a static code analyser, has identified
a resource leak in the symbol hmm_range. The leak occurs when one of
the steps prior to its release encounters an error.

Signed-off-by: Ramesh Errabolu 
---
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 386875e6eb96..dcb1d5d3f860 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1658,7 +1658,7 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
start = map_start << PAGE_SHIFT;
end = (map_last + 1) << PAGE_SHIFT;
for (addr = start; !r && addr < end; ) {
-   struct hmm_range *hmm_range;
+   struct hmm_range *hmm_range = NULL;
unsigned long map_start_vma;
unsigned long map_last_vma;
struct vm_area_struct *vma;
@@ -1696,7 +1696,9 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
}
  
  		svm_range_lock(prange);

-   if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
+
+   // Free backing memory of hmm_range if it was initialized
+   if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range)) {
pr_debug("hmm update the range, need validate again\n");
r = -EAGAIN;


Nack! This can now override other error codes that aren't meant to be 
overridden with -EAGAIN.


I think a better solution would be to just reverse this condition to 
ensure that amdgpu_hmm_range_get_pages_done is always called:


if (amdgpu_hmm_range_get_pages_done(hmm_range) && !r) {
...
r = -EAGAIN;
}

Regards,
  Felix


}


Re: [PATCH v2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

2024-04-30 Thread Felix Kuehling



On 2024-04-30 6:08, Lang Yu wrote:

Small APUs (i.e., consumer and embedded products) usually have a small
device memory carveout which can't satisfy the memory allocation
requirements of most compute workloads.

We can't even run a Basic MNIST Example with a default 512MB carveout.
https://github.com/pytorch/examples/tree/main/mnist.
Error Log when running mnist:
"torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate
84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes
is free. Of the allocated memory 103.83 MiB is allocated by PyTorch,
and 22.17 MiB is reserved by PyTorch but unallocated"

Though we can change BIOS settings to enlarge the carveout size, that
is inflexible and may bring complaints. On the other hand, the memory
resource can't be effectively shared between host and device.

The solution is MI300A approach, i.e., let VRAM allocations go to GTT.
Then device and host can effectively share system memory.

v2: Report local_mem_size_private as 0. (Felix)

Signed-off-by: Lang Yu 


Reviewed-by: Felix Kuehling 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  5 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ++-
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c  |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  6 --
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |  3 ++-
  5 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7ba05f030dd1..e3738d417245 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device 
*adev,
else
mem_info->local_mem_size_private =
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
+   } else if (adev->flags & AMD_IS_APU) {
+   mem_info->local_mem_size_public = (ttm_tt_pages_limit() << 
PAGE_SHIFT);
+   mem_info->local_mem_size_private = 0;
} else {
mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
@@ -824,6 +827,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id)
}
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
return ALIGN_DOWN(tmp, PAGE_SIZE);
+   } else if (adev->flags & AMD_IS_APU) {
+   return (ttm_tt_pages_limit() << PAGE_SHIFT);
} else {
return adev->gmc.real_vram_size;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 4bdf59213384..5843c3d35cb9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
return -EINVAL;
  
  		vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);

-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_needed = size;
ttm_mem_needed = size;
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
  "adev reference can't be null when vram is used");
if (adev && xcp_id >= 0) {
adev->kfd.vram_used[xcp_id] += vram_needed;
-   adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
+   adev->kfd.vram_used_aligned[xcp_id] +=
+   (adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU) ?
vram_needed :
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device 
*adev,
  
  		if (adev) {

adev->kfd.vram_used[xcp_id] -= size;
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
adev->kfd.vram_used_aligned[xcp_id] -= size;
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 * if peer device has large BAR. In contrast, access over xGMI is
 * allowed for both small and large BAR configurations of peer device
 */
-   if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
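
As an aside, the `is_app_apu || (adev->flags & AMD_IS_APU)` test
repeated across these hunks could be collected into one predicate; a
sketch with a hypothetical helper name:

/* Hypothetical helper: true for devices whose "VRAM" is really
 * system memory (MI300A-style APUs and small consumer APUs).
 */
static inline bool amdgpu_uses_gtt_for_vram(struct amdgpu_device *adev)
{
	return adev->gmc.is_app_apu || (adev->flags & AMD_IS_APU);
}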

Re: [PATCH 2/3] drm/amdgpu: Reduce mem_type to domain double indirection

2024-04-29 Thread Felix Kuehling



On 2024-04-29 12:47, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

All memory domains apart from AMDGPU_GEM_DOMAIN_GTT map 1:1 to TTM
placements. And the former can be either AMDGPU_PL_PREEMPT or TTM_PL_TT,
depending on AMDGPU_GEM_CREATE_PREEMPTIBLE.

Simplify a few places in the code which convert the TTM placement into
a domain by checking against the current placement directly.

In the conversion, AMDGPU_PL_PREEMPT does not have to be handled
because amdgpu_mem_type_to_domain() cannot return that value anyway.

v2:
  * Remove AMDGPU_PL_PREEMPT handling.

Signed-off-by: Tvrtko Ursulin 
Reviewed-by: Christian König  # v1

Reviewed-by: Felix Kuehling 

I also ran kfdtest on a multi-GPU system just to make sure this didn't 
break our multi-GPU support. BTW, I had to fix up some things when I 
tried to apply your patch to the current amd-staging-drm-next branch. 
That branch was just rebased on Linux 6.8, so maybe that's part of the 
reason.




---
  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c |  3 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 27 +
  2 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index 055ba2ea4c12..0b3b10d21952 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -165,8 +165,7 @@ static struct sg_table *amdgpu_dma_buf_map(struct 
dma_buf_attachment *attach,
if (r)
return ERR_PTR(r);
  
-	} else if (!(amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type) &

-AMDGPU_GEM_DOMAIN_GTT)) {
+   } else if (bo->tbo.resource->mem_type != TTM_PL_TT) {
return ERR_PTR(-EBUSY);
}
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 8bc79924d171..eb5bd6962560 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -976,12 +976,11 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
  
  	ttm_bo_pin(&bo->tbo);
  
-	domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
+   if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
atomic64_add(amdgpu_bo_size(bo), &adev->vram_pin_size);
atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo),
 &adev->visible_pin_size);
-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);
}
  
@@ -1280,7 +1279,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,

  {
uint64_t size = amdgpu_bo_size(bo);
struct drm_gem_object *obj;
-   unsigned int domain;
bool shared;
  
  	/* Abort if the BO doesn't currently have a backing store */

@@ -1290,21 +1288,20 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
obj = &bo->tbo.base;
shared = drm_gem_object_is_shared_for_memory_stats(obj);
  
-	domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);

-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VRAM:
+   switch (bo->tbo.resource->mem_type) {
+   case TTM_PL_VRAM:
stats->vram += size;
if (amdgpu_bo_in_cpu_visible_vram(bo))
stats->visible_vram += size;
if (shared)
stats->vram_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_GTT:
+   case TTM_PL_TT:
stats->gtt += size;
if (shared)
stats->gtt_shared += size;
break;
-   case AMDGPU_GEM_DOMAIN_CPU:
+   case TTM_PL_SYSTEM:
default:
stats->cpu += size;
if (shared)
@@ -1317,7 +1314,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->requested_visible_vram += size;
  
-		if (domain != AMDGPU_GEM_DOMAIN_VRAM) {

+   if (bo->tbo.resource->mem_type != TTM_PL_VRAM) {
stats->evicted_vram += size;
if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)
stats->evicted_visible_vram += size;
@@ -1592,19 +1589,17 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, 
struct seq_file *m)
u64 size;
  
  	if (dma_resv_trylock(bo->tbo.base.resv)) {

-   unsigned int domain;
-   domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
-   switch (domain) {
-   case AMDGPU_GEM_DOMAIN_VR

Re: [PATCH] drm/amdkfd: update buffer_{store,load}_* modifiers for gfx940

2024-04-29 Thread Felix Kuehling

On 2024-04-29 17:50, Jay Cornwall wrote:

On 4/29/2024 06:06, Lancelot SIX wrote:

Instruction modifiers of the untyped vector memory buffer instructions
(MUBUF encoded) changed in gfx940.  The slc, scc and glc modifiers have
been replaced with sc0, sc1 and nt.

The current CWSR trap handler is written using pre-gfx940 modifier
names, making the source incompatible with a strict gfx940 assembler.

This patch updates the cwsr_trap_handler_gfx9.s source file to be
compatible with all gfx9 variants of the ISA.  The binary assembled code
is unchanged (so the behaviour is unchanged as well), only the source
representation is updated.

Signed-off-by: Lancelot SIX 
---
  .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 24 ---
  1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm

index bb26338204f4..a2d597d7fb57 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -48,6 +48,12 @@ var ACK_SQC_STORE    = 1    
//workaround for suspected SQC store bug causing
  var SAVE_AFTER_XNACK_ERROR    =    1 //workaround for TCP store 
failure after XNACK error when ALLOW_REPLAY=0, for debugger
  var SINGLE_STEP_MISSED_WORKAROUND   =    (ASIC_FAMILY <= 
CHIP_ALDEBARAN)    //workaround for lost MODE.DEBUG_EN exception when 
SAVECTX raised

  +#if ASIC_FAMILY < CHIP_GC_9_4_3
+#define VMEM_MODIFIERS slc:1 glc:1
+#else
+#define VMEM_MODIFIERS sc0:1 nt:1
+#endif
+
/**/
  /*    variables  */
/**/
@@ -581,7 +587,7 @@ end
  L_SAVE_LDS_LOOP_VECTOR:
    ds_read_b64 v[0:1], v2    //x =LDS[a], byte address
    s_waitcnt lgkmcnt(0)
-  buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, 
s_save_mem_offset offen:1  glc:1  slc:1
+  buffer_store_dwordx2  v[0:1], v2, s_save_buf_rsrc0, 
s_save_mem_offset VMEM_MODIFIERS offen:1

  //    s_waitcnt vmcnt(0)
  //    v_add_u32 v2, vcc[0:1], v2, v3
    v_add_u32 v2, v2, v3
@@ -979,17 +985,17 @@ L_TCP_STORE_CHECK_DONE:
  end
    function write_4vgprs_to_mem(s_rsrc, s_mem_offset)
-    buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
-    buffer_store_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256
-    buffer_store_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256*2
-    buffer_store_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1  
offset:256*3

+    buffer_store_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+    buffer_store_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256
+    buffer_store_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*2
+    buffer_store_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*3

  end
    function read_4vgprs_from_mem(s_rsrc, s_mem_offset)
-    buffer_load_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
-    buffer_load_dword v1, v0, s_rsrc, s_mem_offset slc:1 glc:1 
offset:256
-    buffer_load_dword v2, v0, s_rsrc, s_mem_offset slc:1 glc:1 
offset:256*2
-    buffer_load_dword v3, v0, s_rsrc, s_mem_offset slc:1 glc:1 
offset:256*3

+    buffer_load_dword v0, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS
+    buffer_load_dword v1, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256
+    buffer_load_dword v2, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*2
+    buffer_load_dword v3, v0, s_rsrc, s_mem_offset VMEM_MODIFIERS 
offset:256*3

  s_waitcnt vmcnt(0)
  end

base-commit: cf743996352e327f483dc7d66606c90276f57380


Reviewed-by: Jay Cornwall 


Acked-by: Felix Kuehling 

Do you need me to submit the patch to amd-staging-drm-next?

Thanks,
  Felix




Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs

2024-04-29 Thread Felix Kuehling

On 2024-04-29 06:38, Yu, Lang wrote:

[Public]


-Original Message-
From: Kuehling, Felix 
Sent: Saturday, April 27, 2024 6:45 AM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Yang, Philip ; Koenig, Christian
; Zhang, Yifan ; Liu,
Aaron 
Subject: Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on
small APUs

On 2024-04-26 04:37, Lang Yu wrote:

The default ttm_tt_pages_limit is 1/2 of system memory.
Such a configuration is prone to running out of memory.

Indiscriminately allowing the violation of all memory limits is not a good
solution. It will lead to poor performance once you actually reach
ttm_pages_limit and TTM starts swapping out BOs.

Hi Felix,

I just feel it's like a bug that 1/2 of system memory is free, yet the
driver tells users it is out of memory.
On the other hand, if memory is available, why not use it?


TTM does not allow us to use more than 1/2 system memory. I believe 
that's because TTM needs additional memory to swap out BOs. Any GTT 
allocation through the render node APIs is subject to the same limitations.


Render node APIs can handle memory overcommitment more gracefully 
because the kernel mode driver is in the loop for command submissions 
and fences. That doesn't work for KFD with user mode queues. The memory 
limits in KFD are there to prevent overcommitting memory because we need 
all of our memory (per process) to be resident at the same time. If we 
let KFD exceed the TTM limits, we get into situations where we're 
thrashing (processes evicting each other constantly) or even worse, 
where we're just not able to make all memory resident. So we end up with 
suspended user mode queues and extremely poor performance or soft hangs.
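
For a rough sense of scale, here is a standalone model of the limits in
play, assuming the defaults named in this discussion (TTM allowing
about 1/2 of system RAM, and KFD capping system memory at 15/16 of RAM,
per the limit comments quoted elsewhere in this archive). Real values
depend on kernel configuration; this is only for intuition:

#include <stdio.h>

int main(void)
{
	/* Strix system from the benchmarks quoted below. */
	unsigned long long ram_mib = 29412;

	printf("TTM pages limit (default): ~%lluM\n", ram_mib / 2);
	printf("KFD system mem limit:      ~%lluM\n", ram_mib * 15 / 16);
	return 0;
}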





By the way, can we use USERPTR for VRAM allocations?
Then we don't have ttm_tt_pages_limit limitations. Thanks.


No. There is an expectation that VRAM BOs can be shared between 
processes through DMABufs (for HIP IPC APIs). You can't export userptrs 
as DMABufs.


You can try to raise the TTM pages limit using a TTM module parameter. 
But this is taking a risk for system stability when TTM gets into a 
situation where it needs to swap out a large BO.


Regards,
  Felix




I actually did some tests on Strix (12 CU@2100 MHz, 29412M 128bits 
LPDDR5@937MHz) with
https://github.com/ROCm/pytorch-micro-benchmarking.

Command: python micro_benchmarking_pytorch.py --network resnet50 
--batch-size=64 --iterations=20

1, Run 1 resnet50 (FP32, batch size 64)
Memory usage:
 System mem used 6748M out of 29412M
 TTM mem used 6658M out of 15719M
Memory oversubscription percentage:  0
Throughput [img/sec] : 49.04

2,  Run 2 resnet50 simultaneously (FP32, batch size 64)
Memory usage:
 System mem used 13496M out of 29412M
 TTM mem used 13316M out of 15719M
Memory oversubscription percentage:  0
Throughput [img/sec] (respectively) : 25.27 / 26.70

3, Run 3 resnet50 simultaneously (FP32, batch size 64)
Memory usage:
 System mem used 20245M out of 29412M
 TTM mem used 19974M out of 15719M
Memory oversubscription percentage:  ~27%

Throughput [img/sec](respectively) : 10.62 / 7.47 / 6.90 (In theory: 16 / 16 / 
16)

 From my observations,

1, GPU is underutilized a lot, sometimes its loading is less than 50% and even 
0, when running 3 resnet50 simultaneously with ~27% memory oversubscription.
The driver is busy evicting and restoring processes. It takes ~2-5 seconds to 
restore all the BOs for one process (swap in and out BOs, actually allocate and 
copy pages),
even though the process doesn't need all the allocated BOs to be resident.

2, Sometimes, fairness can't be guaranteed between processes when memory is 
oversubscribed.
They can't share the GPU equally when created with default priority.

3, The less GPU underutilization time during evicting and restoring, the less 
performance degradation under memory oversubscription.

Regards,
Lang


Regards,
   Felix



Signed-off-by: Lang Yu 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   |  4 ++--
   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12

+---

   3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3295838e9a1d..c01c6f3ab562 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct

amdgpu_device *adev)

  int i;
  int last_valid_bit;

-amdgpu_amdkfd_gpuvm_init_mem_limits();
+amdgpu_amdkfd_gpuvm_init_mem_limits(adev);

  if (adev->kfd.dev) {
  struct kgd2kfd_shared_resources gpu_resources = { diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..13284dbd8c58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.

Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting

2024-04-29 Thread Felix Kuehling

On 2024-04-29 5:43, Tvrtko Ursulin wrote:


On 26/04/2024 23:24, Felix Kuehling wrote:


On 2024-04-26 12:43, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

When commit b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible
SG BOs") added a new TTM region it missed to notice the conceptual
imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin.

That imbalance leads to such objects getting accounted against the
resource, but are not un-accounted when unpinned.


AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be 
pinned. In any case you should make sure that the accounting is 
consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. This 
patch breaks that consistency.


You mean amdgpu_bo_pin(_restricted) and amdgpu_bo_unpin do not run for 
such objects, or something else?


Right. amdgpu_bo_pin_restricted will return an error for userptr BOs:

if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
return -EPERM;




If they run, then at the end of pin there is:

 domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
...
 } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
     atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);


You changed that in your patch 2:

-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT ||
+  bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {
atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);
}

I was suggesting you just change this in patch 2 like this, so it 
matches what's done on unpin:


-   } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
+   } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);
}




And unpin has no handling for AMDGPU_PL_PREEMPT.

Ah I see.. does it rely on amdgpu_mem_type_to_domain returning 0 for 
AMDGPU_PL_PREEMPT? My confusion was I misread the pinning check as 
checking the domain as stored in the bo at creation time.


Although I am still confused by the statement that userptr BOs are not 
pinned. Is it not needed to map them via GART on AMD hardware for the GPU to 
be able to access them?

Fix by extending the accounting criteria in amdgpu_bo_unpin.

What also appears to need fixing is not reporting their size from
amdgpu_bo_get_memory, which is used to implement fdinfo stats, so they are
not mixed with the regular userspace-created and driver-owned objects.


I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT 
does use system memory and it is GPU accessible, just like GTT. The 
only difference is, that it's not subject to the GTT limits because 
their eviction is handled by callbacks other than TTM evictions and 
doesn't need to wait for fences.


As in you think those two hunks of the patch are correct?


Yes. It seems, Christian agrees but wants to show preemptible memory 
separately in debugfs instead of not showing it at all.


Regards,
  Felix




Regards,

Tvrtko



Regards,
   Felix




And also amdgpu_bo_print_info for debugfs reporting.

Note that the patch depends on the previous one, which converted the
relevant checks from domain based to placement based.

Signed-off-by: Tvrtko Ursulin 
Fixes: b453e42a6e8b ("drm/amdgpu: Add new placement for preemptible 
SG BOs")

Cc: Felix Kuehling 
Cc: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index fb984669fc3a..5a2bbc793953 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1032,7 +1032,8 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
  atomic64_sub(amdgpu_bo_size(bo), &adev->vram_pin_size);
  atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo),
   &adev->visible_pin_size);
-    } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
+    } else if (bo->tbo.resource->mem_type == TTM_PL_TT ||
+   bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {
  atomic64_sub(amdgpu_bo_size(bo), &adev->gart_pin_size);
  }
@@ -1298,7 +1299,6 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
  stats->vram_shared += size;
  break;
  case TTM_PL_TT:
-    case AMDGPU_PL_PREEMPT:
  stats->gtt += size;
  if (shared)
  stats->gtt_shared += size;
@@ -1599,7 +1599,6 @@ u64 amdgpu_bo_print_info(int id, struct 
amdgpu_bo *bo, struct seq_file *m)

  placement = "VRAM";
  break;
  case TTM_PL_TT:
-    case AMDGPU_PL_PREEMPT:
  placement = "GTT";
  break;
  case TTM_PL_SYSTEM:
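
Taking a step back, the symmetry being requested in this thread can be
sketched as one accounting helper keyed on the placement and shared by
pin and unpin. Illustrative only -- it uses the thread's counter names
and folds away the visible_pin_size handling:

static void amdgpu_bo_account_pin(struct amdgpu_bo *bo, bool pin)
{
	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
	u64 size = amdgpu_bo_size(bo);

	switch (bo->tbo.resource->mem_type) {
	case TTM_PL_VRAM:
		if (pin)
			atomic64_add(size, &adev->vram_pin_size);
		else
			atomic64_sub(size, &adev->vram_pin_size);
		break;
	case TTM_PL_TT:
		if (pin)
			atomic64_add(size, &adev->gart_pin_size);
		else
			atomic64_sub(size, &adev->gart_pin_size);
		break;
	default:
		/* AMDGPU_PL_PREEMPT and system placements: unaccounted */
		break;
	}
}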


Re: [PATCH 3/3] drm/amdgpu: Fix pinned GART area accounting and fdinfo reporting

2024-04-29 Thread Felix Kuehling




On 2024-04-29 9:45, Tvrtko Ursulin wrote:


On 29/04/2024 12:11, Christian König wrote:

Am 29.04.24 um 11:43 schrieb Tvrtko Ursulin:


On 26/04/2024 23:24, Felix Kuehling wrote:


On 2024-04-26 12:43, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

When commit b453e42a6e8b ("drm/amdgpu: Add new placement for 
preemptible

SG BOs") added a new TTM region it missed to notice the conceptual
imbalance in GART pin size accounting as done in amdgpu_bo_pin/unpin.

That imbalance leads to such objects getting accounted against the
resource, but are not un-accounted when unpinned.


AMDGPU_PL_PREEMPT is mostly used for userptr BOs, which cannot be 
pinned. In any case you should make sure that the accounting is 
consistent between amdgpu_bo_pin_restricted and amdgpu_bo_unpin. 
This patch breaks that consistency.


You mean amdgpu_bo_pin(_restricted) and amdgpu_bo_unpin do not run 
for such objects, or something else?


If they run, then at the end of pin there is:

domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
...
} else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
    atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);

And unpin has no handling for AMDGPU_PL_PREEMPT.

Ah I see.. does it rely on amdgpu_mem_type_to_domain returning 0 for 
AMDGPU_PL_PREEMPT? My confusion was I misread the pinning check as 
checking the domain as stored in the bo at creation time.


Although I am still confused by the statement that userptr BOs are not 
pinned. Is it not needed to map them via GART on AMD hardware for the GPU 
to be able to access them?


No, a GART mapping is only needed if you want to scan out from them or 
otherwise use them from the kernel on the GPU.


Background is that the kernel doesn't have a VM with page tables.


Got it, thanks!

Presumably somewhere else in the code, then, calling pin/unpin on those 
is prevented?


I was referring to this condition in amdgpu_bo_pin_restricted:

if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
return -EPERM;

However, when I look into it more, I see that AMDGPU_PL_PREEMPT is used 
for other SG BOs that actually are pinned, specifically BOs created by 
KFD with KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL or 
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP. These are very small BOs (one or two 
pages), and only one per process, per GPU, so I'm not sure it's worth 
adding special handling for them in the BO pin accounting.


Regards,
  Felix




What to do, if anything, with the attempt to address the asymmetry in 
the accounting criteria between the pin and unpin?


I mean domain based on pin:

 domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);
 if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
     atomic64_add(amdgpu_bo_size(bo), &adev->vram_pin_size);
     atomic64_add(amdgpu_vram_mgr_bo_visible_size(bo),
  &adev->visible_pin_size);
 } else if (domain == AMDGPU_GEM_DOMAIN_GTT) {
     atomic64_add(amdgpu_bo_size(bo), &adev->gart_pin_size);
 }

Versus placement based on unpin:

 if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
     atomic64_sub(amdgpu_bo_size(bo), &adev->vram_pin_size);
     atomic64_sub(amdgpu_vram_mgr_bo_visible_size(bo),
  &adev->visible_pin_size);
 } else if (bo->tbo.resource->mem_type == TTM_PL_TT) {
     atomic64_sub(amdgpu_bo_size(bo), &adev->gart_pin_size);
 }

The fact amdgpu_mem_type_to_domain never translates back to 
AMDGPU_PL_PREEMPT means there is indeed currently no bug.


Is 2/3 still desirable to convert the check in pin to me mem_type based?


Fix by extending the accounting criteria in amdgpu_bo_unpin.

What also appears to need fixing is not reporting their size from
amdgpu_bo_get_memory, which is used to implement fdinfo stats, so they are
not mixed with the regular userspace-created and driver-owned objects.


I think that's true. It's a very fine distinction. AMDGPU_PL_PREEMPT 
does use system memory and it is GPU accessible, just like GTT. The 
only difference is, that it's not subject to the GTT limits because 
their eviction is handled by callbacks other than TTM evictions and 
doesn't need to wait for fences.


As in you think those two hunks of the patch are correct?


I think so as well, yes. But we still need a name for preemptible BOs 
while printing them in debugfs.


Currently it looks like the name is 'CPU':

amdgpu_bo_print_info()
...
     case AMDGPU_GEM_DOMAIN_CPU:
     default:
     placement = "CPU";
     break;


Also, where to account them in struct amdgpu_mem_stats?

Regards,

Tvrtko



Regards,
Christian.



Regards,

Tvrtko



Regards,
   Felix




And also amdgpu_bo_print_info for debugfs reporting.

Note that the patch depends on the previous one, which converted the
relevant checks from domain based to placement based.

Signed-off-by: Tvrtko U

Re: [PATCH 1/2] drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs

2024-04-26 Thread Felix Kuehling



On 2024-04-26 04:37, Lang Yu wrote:

Small APUs (i.e., consumer and embedded products) usually have a small
device memory carveout which can't satisfy the memory allocation
requirements of most compute workloads.

We can't even run a Basic MNIST Example with a default 512MB carveout.
https://github.com/pytorch/examples/tree/main/mnist.

Though we can change BIOS settings to enlarge the carveout size, that
is inflexible and may bring complaints. On the other hand, the memory
resource can't be effectively shared between host and device.

The solution is MI300A approach, i.e., let VRAM allocations go to GTT.

Signed-off-by: Lang Yu 


Two nit-picks inline. Other than that, this patch looks reasonable to me.



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  6 +-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 21 +++
  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c  |  2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  6 --
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h  |  3 ++-
  5 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7ba05f030dd1..3295838e9a1d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -456,7 +456,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device 
*adev,
mem_info->local_mem_size_private =
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
} else {
-   mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
+   mem_info->local_mem_size_public = adev->flags & AMD_IS_APU ?
+ (ttm_tt_pages_limit() << 
PAGE_SHIFT) :
+ adev->gmc.visible_vram_size;
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
adev->gmc.visible_vram_size;


On an APU the private size should be reported as 0.



}
@@ -824,6 +826,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id)
}
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
return ALIGN_DOWN(tmp, PAGE_SIZE);
+   } else if (adev->flags & AMD_IS_APU) {
+   return (ttm_tt_pages_limit() << PAGE_SHIFT);
} else {
return adev->gmc.real_vram_size;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index c4f9960dafbb..7eb5afcc4895 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
return -EINVAL;
  
  		vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);

-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
system_mem_needed = size;
ttm_mem_needed = size;
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
  "adev reference can't be null when vram is used");
if (adev && xcp_id >= 0) {
adev->kfd.vram_used[xcp_id] += vram_needed;
-   adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
+   adev->kfd.vram_used_aligned[xcp_id] +=
+   (adev->gmc.is_app_apu || adev->flags & 
AMD_IS_APU) ?
vram_needed :
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device 
*adev,
  
  		if (adev) {

adev->kfd.vram_used[xcp_id] -= size;
-   if (adev->gmc.is_app_apu) {
+   if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
adev->kfd.vram_used_aligned[xcp_id] -= size;
kfd_mem_limit.system_mem_used -= size;
kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, 
struct kgd_mem *mem,
 * if peer device has large BAR. In contrast, access over xGMI is
 * allowed for both small and large BAR configurations of peer device
 */
-   if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
+   if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) 
&&
((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
 (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
 (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
@@ -1657,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct 
amdgpu_device *ad

Re: [PATCH 2/2] drm/amdkfd: Allow memory oversubscription on small APUs

2024-04-26 Thread Felix Kuehling

On 2024-04-26 04:37, Lang Yu wrote:

The default ttm_tt_pages_limit is 1/2 of system memory.
Such a configuration is prone to running out of memory.

Indiscriminately allowing the violation of all memory limits is not a 
good solution. It will lead to poor performance once you actually reach 
ttm_pages_limit and TTM starts swapping out BOs.


Regards,
  Felix




Signed-off-by: Lang Yu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   |  4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +---
  3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 3295838e9a1d..c01c6f3ab562 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -167,7 +167,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
int i;
int last_valid_bit;
  
-	amdgpu_amdkfd_gpuvm_init_mem_limits();

+   amdgpu_amdkfd_gpuvm_init_mem_limits(adev);
  
  	if (adev->kfd.dev) {

struct kgd2kfd_shared_resources gpu_resources = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1de021ebdd46..13284dbd8c58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -363,7 +363,7 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device 
*adev, int xcp_id);
  
  
  #if IS_ENABLED(CONFIG_HSA_AMD)

-void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev);
  void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
struct amdgpu_vm *vm);
  
@@ -376,7 +376,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo);

  void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
  #else
  static inline
-void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
  {
  }
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index 7eb5afcc4895..a3e623a320b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -60,6 +60,7 @@ static struct {
int64_t system_mem_used;
int64_t ttm_mem_used;
spinlock_t mem_limit_lock;
+   bool allow_oversubscribe;
  } kfd_mem_limit;
  
  static const char * const domain_bit_to_string[] = {

@@ -110,7 +111,7 @@ static bool reuse_dmamap(struct amdgpu_device *adev, struct 
amdgpu_device *bo_ad
   *  System (TTM + userptr) memory - 15/16th System RAM
   *  TTM memory - 3/8th System RAM
   */
-void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
+void amdgpu_amdkfd_gpuvm_init_mem_limits(struct amdgpu_device *adev)
  {
struct sysinfo si;
uint64_t mem;
@@ -130,6 +131,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT;
  
  	kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT;

+   kfd_mem_limit.allow_oversubscribe = !!(adev->flags & AMD_IS_APU);
pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
(kfd_mem_limit.max_system_mem_limit >> 20),
(kfd_mem_limit.max_ttm_mem_limit >> 20));
@@ -221,8 +223,12 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
 kfd_mem_limit.max_ttm_mem_limit) ||
(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed >
 vram_size - reserved_for_pt - 
atomic64_read(&adev->vram_pin_size))) {
-   ret = -ENOMEM;
-   goto release;
+   if (kfd_mem_limit.allow_oversubscribe) {
+   pr_warn_ratelimited("Memory is getting 
oversubscribed.\n");
+   } else {
+   ret = -ENOMEM;
+   goto release;
+   }
}
  
  	/* Update memory accounting by decreasing available system

